From ccb1535577c019312b69b95a60bb75c8a3ee22a3 Mon Sep 17 00:00:00 2001 From: Apple Date: Fri, 1 May 2020 18:26:57 +0000 Subject: [PATCH] hfs-522.100.5.tar.gz --- core/.open_source_exclude | 2 + core/BTree.c | 2088 +++++ core/BTreeAllocate.c | 748 ++ core/BTreeMiscOps.c | 676 ++ core/BTreeNodeOps.c | 1036 +++ core/BTreeNodeReserve.c | 336 + core/BTreeScanner.c | 410 + core/BTreeScanner.h | 122 + core/BTreeTreeOps.c | 1338 +++ core/BTreeWrapper.c | 278 + core/BTreesInternal.h | 368 + core/BTreesPrivate.h | 516 ++ core/CatalogPrivate.h | 129 + core/CatalogUtilities.c | 343 + core/FileExtentMapping.c | 2249 +++++ core/FileIDsServices.c | 794 ++ core/FileMgrInternal.h | 397 + core/HFSUnicodeWrappers.h | 120 + core/MacOSStubs.c | 143 + core/UCStringCompareData.h | 329 + core/UnicodeWrappers.c | 508 ++ core/VolumeAllocation.c | 6198 ++++++++++++++ core/hfs.h | 1171 +++ core/hfs_alloc_trace.h | 34 + core/hfs_attrlist.c | 1743 ++++ core/hfs_attrlist.h | 108 + core/hfs_btreeio.c | 948 ++ core/hfs_btreeio.h | 59 + core/hfs_catalog.c | 4813 +++++++++++ core/hfs_catalog.h | 512 ++ core/hfs_chash.c | 578 ++ core/hfs_cnode.c | 2561 ++++++ core/hfs_cnode.h | 630 ++ core/hfs_cprotect.c | 2773 ++++++ core/hfs_cprotect.h | 424 + core/hfs_dbg.h | 92 + core/hfs_endian.c | 1227 +++ core/hfs_endian.h | 105 + core/hfs_extents.c | 771 ++ core/hfs_extents.h | 74 + core/hfs_format.h | 818 ++ core/hfs_fsctl.h | 387 + core/hfs_fsinfo.c | 889 ++ core/hfs_hotfiles.c | 3929 +++++++++ core/hfs_hotfiles.h | 136 + core/hfs_iokit.cpp | 307 + core/hfs_iokit.h | 57 + core/hfs_journal.c | 4892 +++++++++++ core/hfs_journal.h | 378 + core/hfs_kdebug.h | 114 + core/hfs_link.c | 1419 +++ core/hfs_lookup.c | 680 ++ core/hfs_macos_defs.h | 299 + core/hfs_mount.h | 83 + core/hfs_notification.c | 198 + core/hfs_quota.c | 1014 +++ core/hfs_quota.h | 111 + core/hfs_readwrite.c | 5876 +++++++++++++ core/hfs_resize.c | 3432 ++++++++ core/hfs_search.c | 1395 +++ core/hfs_unistr.h | 64 + core/hfs_vfsops.c | 4751 ++++++++++ core/hfs_vfsutils.c | 4462 ++++++++++ core/hfs_vnops.c | 7622 +++++++++++++++++ core/hfs_xattr.c | 2633 ++++++ core/install | 35 + core/iphoneos-Info.plist | 59 + core/kext-config.h | 56 + core/kext.xcconfig | 52 + core/macosx-Info.plist | 61 + core/mk-root.sh | 39 + core/rangelist.c | 429 + core/rangelist.h | 86 + hfs.xcodeproj/project.pbxproj | 3 +- .../xcschemes/livefiles_hfs_tester.xcscheme | 8 +- livefiles_hfs_plugin/lf_hfs_btree_node_ops.c | 2 +- livefiles_hfs_plugin/lf_hfs_btree_tree_ops.c | 2 +- livefiles_hfs_plugin/lf_hfs_chash.c | 7 +- livefiles_hfs_plugin/lf_hfs_cnode.c | 203 +- livefiles_hfs_plugin/lf_hfs_common.h | 7 +- livefiles_hfs_plugin/lf_hfs_dirops_handler.c | 2 +- livefiles_hfs_plugin/lf_hfs_endian.c | 5 +- .../lf_hfs_file_extent_mapping.c | 3 +- livefiles_hfs_plugin/lf_hfs_fileops_handler.c | 102 + livefiles_hfs_plugin/lf_hfs_fileops_handler.h | 3 + livefiles_hfs_plugin/lf_hfs_fsops_handler.c | 31 +- livefiles_hfs_plugin/lf_hfs_raw_read_write.c | 7 +- livefiles_hfs_plugin/lf_hfs_readwrite_ops.c | 141 + livefiles_hfs_plugin/lf_hfs_readwrite_ops.h | 1 + livefiles_hfs_plugin/lf_hfs_vfsops.c | 1 + livefiles_hfs_plugin/lf_hfs_vnode.c | 70 +- livefiles_hfs_plugin/lf_hfs_vnode.h | 13 +- livefiles_hfs_plugin/lf_hfs_vnops.c | 188 +- livefiles_hfs_plugin/lf_hfs_vnops.h | 2 + make_opensource.sh | 118 - 95 files changed, 85154 insertions(+), 279 deletions(-) create mode 100644 core/.open_source_exclude create mode 100644 core/BTree.c create mode 100644 core/BTreeAllocate.c create mode 100644 core/BTreeMiscOps.c 
create mode 100644 core/BTreeNodeOps.c create mode 100644 core/BTreeNodeReserve.c create mode 100644 core/BTreeScanner.c create mode 100644 core/BTreeScanner.h create mode 100644 core/BTreeTreeOps.c create mode 100644 core/BTreeWrapper.c create mode 100644 core/BTreesInternal.h create mode 100644 core/BTreesPrivate.h create mode 100644 core/CatalogPrivate.h create mode 100644 core/CatalogUtilities.c create mode 100644 core/FileExtentMapping.c create mode 100644 core/FileIDsServices.c create mode 100644 core/FileMgrInternal.h create mode 100644 core/HFSUnicodeWrappers.h create mode 100644 core/MacOSStubs.c create mode 100644 core/UCStringCompareData.h create mode 100644 core/UnicodeWrappers.c create mode 100644 core/VolumeAllocation.c create mode 100644 core/hfs.h create mode 100644 core/hfs_alloc_trace.h create mode 100644 core/hfs_attrlist.c create mode 100644 core/hfs_attrlist.h create mode 100644 core/hfs_btreeio.c create mode 100644 core/hfs_btreeio.h create mode 100644 core/hfs_catalog.c create mode 100644 core/hfs_catalog.h create mode 100644 core/hfs_chash.c create mode 100644 core/hfs_cnode.c create mode 100644 core/hfs_cnode.h create mode 100644 core/hfs_cprotect.c create mode 100644 core/hfs_cprotect.h create mode 100644 core/hfs_dbg.h create mode 100644 core/hfs_endian.c create mode 100644 core/hfs_endian.h create mode 100644 core/hfs_extents.c create mode 100644 core/hfs_extents.h create mode 100644 core/hfs_format.h create mode 100644 core/hfs_fsctl.h create mode 100644 core/hfs_fsinfo.c create mode 100644 core/hfs_hotfiles.c create mode 100644 core/hfs_hotfiles.h create mode 100644 core/hfs_iokit.cpp create mode 100644 core/hfs_iokit.h create mode 100644 core/hfs_journal.c create mode 100644 core/hfs_journal.h create mode 100644 core/hfs_kdebug.h create mode 100644 core/hfs_link.c create mode 100644 core/hfs_lookup.c create mode 100644 core/hfs_macos_defs.h create mode 100644 core/hfs_mount.h create mode 100644 core/hfs_notification.c create mode 100644 core/hfs_quota.c create mode 100644 core/hfs_quota.h create mode 100644 core/hfs_readwrite.c create mode 100644 core/hfs_resize.c create mode 100644 core/hfs_search.c create mode 100644 core/hfs_unistr.h create mode 100644 core/hfs_vfsops.c create mode 100644 core/hfs_vfsutils.c create mode 100644 core/hfs_vnops.c create mode 100644 core/hfs_xattr.c create mode 100755 core/install create mode 100644 core/iphoneos-Info.plist create mode 100644 core/kext-config.h create mode 100644 core/kext.xcconfig create mode 100644 core/macosx-Info.plist create mode 100755 core/mk-root.sh create mode 100644 core/rangelist.c create mode 100644 core/rangelist.h delete mode 100755 make_opensource.sh diff --git a/core/.open_source_exclude b/core/.open_source_exclude new file mode 100644 index 0000000..0c36529 --- /dev/null +++ b/core/.open_source_exclude @@ -0,0 +1,2 @@ +hfs_key_roll.c +hfs_key_roll.h diff --git a/core/BTree.c b/core/BTree.c new file mode 100644 index 0000000..cd7803d --- /dev/null +++ b/core/BTree.c @@ -0,0 +1,2088 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: BTree.c + + Contains: Implementation of public interface routines for B-tree manager. + + Version: HFS Plus 1.0 + + Written by: Gordon Sheridan and Bill Bruffey + + Copyright: (c) 1992-1999 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Don Brady + + Other Contact: Mark Day + + Technology: File Systems + + Writers: + + (msd) Mark Day + (DSH) Deric Horn + (djb) Don Brady + + Change History (most recent first): + 9/22/99 ser Added routines BTGetLastSync and BTSetLastSync + 6/1/99 djb Sync up with Mac OS 8.6. + 6/30/98 djb In BTOpenPath make sure nodes are contiguous on disk (radar #2249539). + 4/15/98 djb In BTOpenPath need to clear nodeRec.buffer if GetBlockProc fails. + 4/11/98 djb Add RequireFileLock checking to all external entry points. + + 03/23/98 djb In BTOpenPath use kTrashBlock option when releasing the header so + that we get a full node when we call GetNode. + + 12/12/97 djb Radar #2202682, BTIterateRecord with kBTreeCurrentRecord was not + checking if we had a record and could call BlockMove with an + uninitialize source pointer (causing a bus error). + 10/24/97 msd In BTIterateRecord, when moving to the previous or next record + and we have to move to another node, see if we need to release + the node about to be "shifted out" (opposite sibling of the + direction we need to move). + 7/25/97 DSH BTSearchRecord now takes a heuristicHint, nodeNum, and tries it + before calling SearchBTree + 7/24/97 djb GetBlockProc now take a file refnum instead of an FCB ptr. + 7/22/97 djb Move trace points from BTreeWrapper.c to here. + 7/21/97 djb LogEndTime now takes an error code. + 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name + collision + 5/19/97 djb Add summary traces to BTIterateRecord. + 4/23/97 djb first checked in + + 2/19/97 djb Enable variable sized index keys for HFS+ volumes. Added node + cache to support nodes larger than 512 bytes. + 1/27/97 djb Calls to InsertTree and DeleteTree are now recursive (to support + variable sized index keys). + 1/13/97 djb Added support for getting current record to BTIterateRecord. + 1/6/97 djb Initialize "BigKeys" attribute in BTOpen. + 1/3/97 djb Added support for large keys. + 12/23/96 djb On exit map fsBTEmptyErr and fsBTEndOfIterationErr to + fsBTRecordNotFoundErr. + 12/19/96 djb first checked in + + History applicable to original Scarecrow Design: + + <13> 10/25/96 ser Changing for new VFPI + <12> 10/18/96 ser Converting over VFPI changes + <11> 9/17/96 dkh More BTree statistics. 
Modified hint checks to not bail out when + an error is returned from GetNode. + <10> 9/16/96 dkh Revised BTree statistics. + <9> 8/23/96 dkh Remove checks for multiple paths to BTree file. Need to add + equivalent mechanism later. + <8> 6/20/96 dkh Radar #1358740. Switch from using Pools to debug MemAllocators. + <7> 3/14/96 jev Fix BTreeSetRecord, recordFound was not set for the case of a + simple replace causing the leafRecords count to get bumped even + though we didn't have to add a record. + <6> 3/1/96 prp Fix lint problems. Bug in BTSetRecord that does not initialize + recordFound. + <5> 1/22/96 dkh Add #include Memory.h + <4> 1/10/96 msd Use the real function names from Math64.i. + <3> 1/4/96 jev Fix BTItererateRecord for the condition when the iterator + position routine does not find the record and we are looking for + the next record. In such a case, if the node's forrward link is + non-zero, we have to keep iterating next and not return + fsBTEndOfIterationErr error. + <2> 12/7/95 dkh D10E2 build. Changed usage of Ref data type to LogicalAddress. + <1> 10/18/95 rst Moved from Scarecrow project. + + <24> 7/18/95 mbb Change MoveData & ClearBytes to BlockMoveData & BlockZero. + <23> 1/31/95 prp GetBlockProc interface uses a 64 bit node number. + <22> 1/12/95 wjk Adopt Model FileSystem changes in D5. + <21> 11/16/94 prp Add IsItAHint routine and use it whenever hint's node number was + used for testing. + <20> 11/10/94 prp BTGetInfo name collides with the same name in FileManagerPriv.i. + Change it to BTGetInformation. + <19> 9/30/94 prp Get in sync with D2 interface changes. + <18> 7/22/94 wjk Convert to the new set of header files. + <17> 12/9/93 wjk Cleanup usage of char, Byte, int8, UInt8, etc. + <16> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and + NRCmds environments. + <15> 11/30/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and + NRCmds environments. + <14> 9/30/93 gs Rename E_NoGetNodeProc and E_NoReleaseNodeProc to + E_NoXxxxBlockProc. + <13> 8/31/93 prp Use Set64U instead of Set64. + <12> 8/16/93 prp In BTSearchRecord, if the input hint found the node and record, + set the local nodeNum variable correctly so that the resultant + iterator gets set correctly. + <11> 7/1/93 gs Fix bug in BTIterateRecord related to kBTreePrevRecord + operation. + <10> 6/2/93 gs Update for changes to FSErrors.h and add some comments. + <9> 5/24/93 gs Fix bug in BTInsert/Set/ReplaceRecord which didn't set node hint + properly in some cases. + <8> 5/24/93 gs Do NOT map fsBTEmptyErr to fsBTRecordNotFoundErr in BTSearchRecord. + <7> 5/24/93 gs Rename BTFlush to BTFlushPath. + <6> 5/21/93 gs Add hint optimization to Set/Replace routines. + <5> 5/10/93 gs Remove Panic from BTInitialize for small logicalEOF. Implement + Insert, Set, Replace, and Delete. + <4> 3/23/93 gs Finish BTInitialize. + <3> 2/8/93 gs Implement BTSearchRecord and BTIterateRecord. + <2> 12/8/92 gs Implement Open and Close routines. + <1> 11/15/92 gs first checked in + +*/ + +#include "BTreesPrivate.h" +#include "hfs_btreeio.h" + +//////////////////////////////////// Globals //////////////////////////////////// + + +/////////////////////////// BTree Module Entry Points /////////////////////////// + + + +/*------------------------------------------------------------------------------- +Routine: BTOpenPath - Open a file for access as a B*Tree. + +Function: Create BTree control block for a file, if necessary. Validates the + file to be sure it looks like a BTree file. 
+ + +Input: filePtr - pointer to file to open as a B-tree + keyCompareProc - pointer to client's KeyCompare function + +Result: noErr - success + paramErr - required ptr was nil + fsBTInvalidFileErr - + memFullErr - + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + BTHeaderRec *header; + NodeRec nodeRec; + + ////////////////////// Preliminary Error Checking /////////////////////////// + + if ( filePtr == nil ) + { + return paramErr; + } + + /* + * Subsequent opens allow key compare proc to be changed. + */ + if ( filePtr->fcbBTCBPtr != nil && keyCompareProc != nil) { + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + btreePtr->keyCompareProc = keyCompareProc; + return noErr; + } + + if ( filePtr->fcbEOF < kMinNodeSize ) + return fsBTInvalidFileErr; + + + //////////////////////// Allocate Control Block ///////////////////////////// + + btreePtr = hfs_mallocz(sizeof(BTreeControlBlock)); + + btreePtr->getBlockProc = GetBTreeBlock; + btreePtr->releaseBlockProc = ReleaseBTreeBlock; + btreePtr->setEndOfForkProc = ExtendBTreeFile; + btreePtr->keyCompareProc = keyCompareProc; + + /////////////////////////// Read Header Node //////////////////////////////// + + nodeRec.buffer = nil; // so we can call ReleaseNode + btreePtr->fileRefNum = GetFileRefNumFromFCB(filePtr); + filePtr->fcbBTCBPtr = (Ptr) btreePtr; // attach btree cb to file + + /* Prefer doing I/O a physical block at a time */ + nodeRec.blockSize = VTOHFS(btreePtr->fileRefNum)->hfs_physical_block_size; + + /* Start with the allocation block size for regular files. */ + if (FTOC(filePtr)->c_fileid >= kHFSFirstUserCatalogNodeID) + { + nodeRec.blockSize = FCBTOVCB(filePtr)->blockSize; + } + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + // it is now safe to call M_ExitOnError (err) + + err = SetBTreeBlockSize (btreePtr->fileRefNum, nodeRec.blockSize, 1); + M_ExitOnError (err); + + + err = GetBTreeBlock(btreePtr->fileRefNum, + kHeaderNodeNum, + kGetBlock, + &nodeRec ); + if (err != noErr) + { + nodeRec.buffer = nil; + nodeRec.blockHeader = nil; + Panic("BTOpen: getNodeProc returned error getting header node."); + goto ErrorExit; + } + ++btreePtr->numGetNodes; + header = (BTHeaderRec*) ((uintptr_t)nodeRec.buffer + sizeof(BTNodeDescriptor)); + + + ///////////////////////////// verify header ///////////////////////////////// + + err = VerifyHeader (filePtr, header); + M_ExitOnError (err); + + + ///////////////////// Initalize fields from header ////////////////////////// + + PanicIf ( (FCBTOVCB(filePtr)->vcbSigWord != 0x4244) && (header->nodeSize == 512), " BTOpenPath: wrong node size for HFS+ volume!"); // 0x4244 = 'BD' + + btreePtr->treeDepth = header->treeDepth; + btreePtr->rootNode = header->rootNode; + btreePtr->leafRecords = header->leafRecords; + btreePtr->firstLeafNode = header->firstLeafNode; + btreePtr->lastLeafNode = header->lastLeafNode; + btreePtr->nodeSize = header->nodeSize; + btreePtr->maxKeyLength = header->maxKeyLength; + btreePtr->totalNodes = header->totalNodes; + btreePtr->freeNodes = header->freeNodes; + if (FTOC(filePtr)->c_fileid >= kHFSFirstUserCatalogNodeID) + filePtr->ff_clumpsize = header->clumpSize; + btreePtr->btreeType = header->btreeType; + + btreePtr->keyCompareType = header->keyCompareType; + + btreePtr->attributes = header->attributes; + + if ( btreePtr->maxKeyLength > 40 ) + btreePtr->attributes |= (kBTBigKeysMask + 
kBTVariableIndexKeysMask); //€€ we need a way to save these attributes + + /////////////////////// Initialize dynamic fields /////////////////////////// + + btreePtr->version = kBTreeVersion; + btreePtr->flags = 0; + btreePtr->writeCount = 1; + + /////////////////////////// Check Header Node /////////////////////////////// + + // set kBadClose attribute bit, and UpdateNode + + /* b-tree node size must be at least as big as the logical block size */ + if (btreePtr->nodeSize < VTOHFS(btreePtr->fileRefNum)->hfs_logical_block_size) + { + /* + * If this tree has any records or the media is writeable then + * we cannot mount using the current physical block size. + */ + if (btreePtr->leafRecords > 0 || + VTOHFS(btreePtr->fileRefNum)->hfs_flags & HFS_WRITEABLE_MEDIA) + { + err = fsBTBadNodeSize; + goto ErrorExit; + } + } + + /* + * If the actual node size is different than the amount we read, + * then release and trash this block, and re-read with the correct + * node size. + */ + if ( btreePtr->nodeSize != nodeRec.blockSize ) + { + err = SetBTreeBlockSize (btreePtr->fileRefNum, btreePtr->nodeSize, 32); + M_ExitOnError (err); + + /* + * Need to use kTrashBlock option to force the + * buffer cache to read the entire node + */ + err = ReleaseBTreeBlock(btreePtr->fileRefNum, &nodeRec, kTrashBlock); + ++btreePtr->numReleaseNodes; + M_ExitOnError (err); + + err = GetNode (btreePtr, kHeaderNodeNum, 0, &nodeRec ); + M_ExitOnError (err); + } + + //€€ total nodes * node size <= LEOF? + + + err = ReleaseNode (btreePtr, &nodeRec); + M_ExitOnError (err); + + /* + * Under Mac OS, b-tree nodes can be non-contiguous on disk when the + * allocation block size is smaller than the b-tree node size. + * + * If journaling is turned on for this volume we can't deal with this + * situation and so we bail out. If journaling isn't on it's ok as + * hfs_strategy_fragmented() deals with it. Journaling can't support + * this because it assumes that if you give it a block that it's + * contiguous on disk. + */ + if ( FCBTOHFS(filePtr)->jnl && !NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize) ) { + return fsBTInvalidNodeErr; + } + + //////////////////////////////// Success //////////////////////////////////// + + //€€ align LEOF to multiple of node size? - just on close + + return noErr; + + + /////////////////////// Error - Clean up and Exit /////////////////////////// + +ErrorExit: + + filePtr->fcbBTCBPtr = nil; + (void) ReleaseNode (btreePtr, &nodeRec); + hfs_free(btreePtr, sizeof(*btreePtr)); + + return err; +} + + + +/*------------------------------------------------------------------------------- +Routine: BTClosePath - Flush BTree Header and Deallocate Memory for BTree. + +Function: Flush the BTreeControlBlock fields to header node, and delete BTree control + block and key descriptor associated with the file if filePtr is last + path of type kBTreeType ('btre'). + + +Input: filePtr - pointer to file to delete BTree control block for. 
+ +Result: noErr - success + fsBTInvalidFileErr - + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus BTClosePath (FCB *filePtr) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + if (btreePtr == nil) + return fsBTInvalidFileErr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + ////////////////////// Check for other BTree Paths ////////////////////////// + + btreePtr->attributes &= ~kBTBadCloseMask; // clear "bad close" attribute bit + err = UpdateHeader (btreePtr, true); + M_ExitOnError (err); + + hfs_free(btreePtr, sizeof(*btreePtr)); + filePtr->fcbBTCBPtr = nil; + + return noErr; + + /////////////////////// Error - Clean Up and Exit /////////////////////////// + +ErrorExit: + + return err; +} + + + +/*------------------------------------------------------------------------------- +Routine: BTSearchRecord - Search BTree for a record with a matching key. + +Function: Search for position in B*Tree indicated by searchKey. If a valid node hint + is provided, it will be searched first, then SearchTree will be called. + If a BTreeIterator is provided, it will be set to the position found as + a result of the search. If a record exists at that position, and a BufferDescriptor + is supplied, the record will be copied to the buffer (as much as will fit), + and recordLen will be set to the length of the record. + + If an error other than fsBTRecordNotFoundErr occurs, the BTreeIterator, if any, + is invalidated, and recordLen is set to 0. + + +Input: pathPtr - pointer to path for BTree file. + searchKey - pointer to search key to match. + hintPtr - pointer to hint (may be nil) + +Output: record - pointer to BufferDescriptor containing record + recordLen - length of data at recordPtr + iterator - pointer to BTreeIterator indicating position result of search + +Result: noErr - success, record contains copy of record found + fsBTRecordNotFoundErr - record was not found, no data copied + fsBTInvalidFileErr - no BTreeControlBlock is allocated for the fork + fsBTInvalidKeyLengthErr - + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus BTSearchRecord (FCB *filePtr, + BTreeIterator *searchIterator, + FSBufferDescriptor *record, + u_int16_t *recordLen, + BTreeIterator *resultIterator ) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + TreePathTable treePathTable; + u_int32_t nodeNum = 0; + BlockDescriptor node; + u_int16_t index = 0; + BTreeKeyPtr keyPtr = NULL; + RecordPtr recordPtr; + u_int16_t len; + Boolean foundRecord; + Boolean validHint; + + if (filePtr == nil) + { + return paramErr; + } + + if (searchIterator == nil) + { + return paramErr; + } + + node.buffer = nil; + node.blockHeader = nil; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + { + return fsBTInvalidFileErr; + } + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + foundRecord = false; + + ////////////////////////////// Take A Hint ////////////////////////////////// + + err = IsItAHint (btreePtr, searchIterator, &validHint); + M_ExitOnError (err); + + if (validHint) + { + nodeNum = searchIterator->hint.nodeNum; + + err = GetNode (btreePtr, nodeNum, kGetNodeHint, &node); + if( err == noErr ) + { + if ( ((BTNodeDescriptor*) node.buffer)->kind == kBTLeafNode && + ((BTNodeDescriptor*) node.buffer)->numRecords > 0 ) + { + foundRecord = SearchNode (btreePtr, node.buffer, &searchIterator->key, &index); + + 
//€€ if !foundRecord, we could still skip tree search if ( 0 < index < numRecords ) + } + + if (foundRecord == false) + { + err = ReleaseNode (btreePtr, &node); + M_ExitOnError (err); + } + else + { + ++btreePtr->numValidHints; + } + } + + if( foundRecord == false ) + (void) BTInvalidateHint( searchIterator ); + } + + + //////////////////////////// Search The Tree //////////////////////////////// + + if (foundRecord == false) + { + err = SearchTree ( btreePtr, &searchIterator->key, treePathTable, &nodeNum, &node, &index); + switch (err) + { + case noErr: + foundRecord = true; + break; + case fsBTRecordNotFoundErr: + break; + default: + goto ErrorExit; + } + } + + + //////////////////////////// Get the Record ///////////////////////////////// + + if (foundRecord == true) + { + //XXX Should check for errors! Or BlockMove could choke on recordPtr!!! + GetRecordByIndex (btreePtr, node.buffer, index, &keyPtr, &recordPtr, &len); + + if (recordLen != nil) *recordLen = len; + + if (record != nil) + { + ByteCount recordSize; + + recordSize = record->itemCount * record->itemSize; + + if (len > recordSize) len = recordSize; + + BlockMoveData (recordPtr, record->bufferAddress, len); + } + } + + + /////////////////////// Success - Update Iterator /////////////////////////// + + if (resultIterator != nil) + { + if (foundRecord) { + resultIterator->hint.writeCount = btreePtr->writeCount; + resultIterator->hint.nodeNum = nodeNum; + resultIterator->hint.index = index; + } +#if DEBUG + resultIterator->hint.reserved1 = 0; + resultIterator->hint.reserved2 = 0; + resultIterator->version = 0; + resultIterator->reserved = 0; +#endif + // copy the key in the BTree when found rather than searchIterator->key to get proper case/diacriticals + if (foundRecord == true) + BlockMoveData ((Ptr)keyPtr, (Ptr)&resultIterator->key, CalcKeySize(btreePtr, keyPtr)); + else + BlockMoveData ((Ptr)&searchIterator->key, (Ptr)&resultIterator->key, CalcKeySize(btreePtr, &searchIterator->key)); + } + + err = ReleaseNode (btreePtr, &node); + M_ExitOnError (err); + + if (foundRecord == false) return fsBTRecordNotFoundErr; + else return noErr; + + + /////////////////////// Error - Clean Up and Exit /////////////////////////// + +ErrorExit: + + if (recordLen != nil) + *recordLen = 0; + + if (resultIterator != nil) + { + resultIterator->hint.writeCount = 0; + resultIterator->hint.nodeNum = 0; + resultIterator->hint.index = 0; + resultIterator->hint.reserved1 = 0; + resultIterator->hint.reserved2 = 0; + + resultIterator->version = 0; + resultIterator->reserved = 0; + resultIterator->key.length16 = 0; // zero out two bytes to cover both types of keys + } + + if ( err == fsBTEmptyErr ) + err = fsBTRecordNotFoundErr; + + return err; +} + + + +/*------------------------------------------------------------------------------- +Routine: BTIterateRecord - Find the first, next, previous, or last record. + +Function: Find the first, next, previous, or last record in the BTree + +Input: pathPtr - pointer to path iterate records for. 
+ operation - iteration operation (first,next,prev,last) + iterator - pointer to iterator indicating start position + +Output: iterator - iterator is updated to indicate new position + newKeyPtr - pointer to buffer to copy key found by iteration + record - pointer to buffer to copy record found by iteration + recordLen - length of record + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus BTIterateRecord (FCB *filePtr, + BTreeIterationOperation operation, + BTreeIterator *iterator, + FSBufferDescriptor *record, + u_int16_t *recordLen ) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + BTreeKeyPtr keyPtr; + RecordPtr recordPtr; + u_int16_t len; + + Boolean foundRecord; + u_int32_t nodeNum; + + BlockDescriptor left, node, right; + u_int16_t index; + + + ////////////////////////// Priliminary Checks /////////////////////////////// + + left.buffer = nil; + left.blockHeader = nil; + right.buffer = nil; + right.blockHeader = nil; + node.buffer = nil; + node.blockHeader = nil; + + + if (filePtr == nil) + { + return paramErr; + } + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + { + return fsBTInvalidFileErr; //€€ handle properly + } + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + if ((operation != kBTreeFirstRecord) && + (operation != kBTreeNextRecord) && + (operation != kBTreeCurrentRecord) && + (operation != kBTreePrevRecord) && + (operation != kBTreeLastRecord)) + { + err = fsInvalidIterationMovmentErr; + goto ErrorExit; + } + + /////////////////////// Find First or Last Record /////////////////////////// + + if ((operation == kBTreeFirstRecord) || (operation == kBTreeLastRecord)) + { + if (operation == kBTreeFirstRecord) nodeNum = btreePtr->firstLeafNode; + else nodeNum = btreePtr->lastLeafNode; + + if (nodeNum == 0) + { + err = fsBTEmptyErr; + goto ErrorExit; + } + + err = GetNode (btreePtr, nodeNum, 0, &node); + M_ExitOnError (err); + + if ( ((NodeDescPtr) node.buffer)->kind != kBTLeafNode || + ((NodeDescPtr) node.buffer)->numRecords <= 0 ) + { + err = ReleaseNode (btreePtr, &node); + M_ExitOnError (err); + + err = fsBTInvalidNodeErr; + printf ("hfs: BTIterateRecord() found invalid btree node on volume %s\n", FCBTOVCB(filePtr)->vcbVN); + hfs_mark_inconsistent(FCBTOVCB(filePtr), HFS_INCONSISTENCY_DETECTED); + goto ErrorExit; + } + + if (operation == kBTreeFirstRecord) index = 0; + else index = ((BTNodeDescriptor*) node.buffer)->numRecords - 1; + + goto CopyData; //€€ is there a cleaner way? + } + + + //////////////////////// Find Iterator Position ///////////////////////////// + + // Not called for (operation == kBTreeFirstRecord || operation == kBTreeLastRecord) + err = FindIteratorPosition (btreePtr, iterator, + &left, &node, &right, &nodeNum, &index, &foundRecord); + M_ExitOnError (err); + + + ///////////////////// Find Next Or Previous Record ////////////////////////// + + if (operation == kBTreePrevRecord) + { + if (index > 0) + { + --index; + } + else + { + if (left.buffer == nil) + { + nodeNum = ((NodeDescPtr) node.buffer)->bLink; + if ( nodeNum > 0) + { + // BTree nodes are always grabbed in left to right order. + // Therefore release the current node before looking up the + // left node. 
+ err = ReleaseNode(btreePtr, &node); + M_ExitOnError(err); + + // Look up the left node + err = GetNode (btreePtr, nodeNum, 0, &left); + M_ExitOnError (err); + + // Look up the current node again + err = GetRightSiblingNode (btreePtr, left.buffer, &node); + M_ExitOnError (err); + } else { + err = fsBTStartOfIterationErr; + goto ErrorExit; + } + } + // Before we stomp on "right", we'd better release it if needed + if (right.buffer != nil) { + err = ReleaseNode(btreePtr, &right); + M_ExitOnError(err); + } + right = node; + node = left; + left.buffer = nil; + index = ((NodeDescPtr) node.buffer)->numRecords -1; + } + } + else if (operation == kBTreeNextRecord) + { + if ((foundRecord != true) && + (((NodeDescPtr) node.buffer)->fLink == 0) && + (index == ((NodeDescPtr) node.buffer)->numRecords)) + { + err = fsBTEndOfIterationErr; + goto ErrorExit; + } + + // we did not find the record but the index is already positioned correctly + if ((foundRecord == false) && (index != ((NodeDescPtr) node.buffer)->numRecords)) + goto CopyData; + + // we found the record OR we have to look in the next node + if (index < ((NodeDescPtr) node.buffer)->numRecords -1) + { + ++index; + } + else + { + if (right.buffer == nil) + { + nodeNum = ((NodeDescPtr) node.buffer)->fLink; + if ( nodeNum > 0) + { + err = GetNode (btreePtr, nodeNum, 0, &right); + M_ExitOnError (err); + } else { + err = fsBTEndOfIterationErr; + goto ErrorExit; + } + } + // Before we stomp on "left", we'd better release it if needed + if (left.buffer != nil) { + err = ReleaseNode(btreePtr, &left); + M_ExitOnError(err); + } + left = node; + node = right; + right.buffer = nil; + index = 0; + } + } + else // operation == kBTreeCurrentRecord + { + // make sure we have something... + if ((foundRecord != true) && + (index >= ((NodeDescPtr) node.buffer)->numRecords)) + { + err = fsBTEndOfIterationErr; + goto ErrorExit; + } + } + + //////////////////// Copy Record And Update Iterator //////////////////////// + +CopyData: + + // added check for errors + err = GetRecordByIndex (btreePtr, node.buffer, index, &keyPtr, &recordPtr, &len); + M_ExitOnError (err); + + if (recordLen != nil) + *recordLen = len; + + if (record != nil) + { + ByteCount recordSize; + + recordSize = record->itemCount * record->itemSize; + + if (len > recordSize) len = recordSize; + + BlockMoveData (recordPtr, record->bufferAddress, len); + } + + if (iterator != nil) // first & last do not require iterator + { + iterator->hint.writeCount = btreePtr->writeCount; + iterator->hint.nodeNum = nodeNum; + iterator->hint.index = index; + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + + iterator->version = 0; + iterator->reserved = 0; + + /* SER + * Check for infinite loops by making sure we do not + * process more leaf records, than can possibly be (or the BTree header + * is seriously damaged)....a brute force method. 
+ */ + if ((operation == kBTreeFirstRecord) || (operation == kBTreeLastRecord)) + iterator->hitCount = 1; + else if (operation != kBTreeCurrentRecord) + iterator->hitCount += 1; + /* Always use the highest max, in case the grows while iterating */ + iterator->maxLeafRecs = max(btreePtr->leafRecords, iterator->maxLeafRecs); + +#if 0 + if (iterator->hitCount > iterator->maxLeafRecs + kNumLeafRecSlack) + { + err = fsBTInvalidNodeErr; + printf ("hfs: BTIterateRecord() found invalid btree node on volume %s\n", FCBTOVCB(filePtr)->vcbVN); + hfs_mark_inconsistent(FCBTOVCB(filePtr), HFS_INCONSISTENCY_DETECTED); + goto ErrorExit; + } +#endif + + BlockMoveData ((Ptr)keyPtr, (Ptr)&iterator->key, CalcKeySize(btreePtr, keyPtr)); + } + + + ///////////////////////////// Release Nodes ///////////////////////////////// + + err = ReleaseNode (btreePtr, &node); + M_ExitOnError (err); + + if (left.buffer != nil) + { + err = ReleaseNode (btreePtr, &left); + M_ExitOnError (err); + } + + if (right.buffer != nil) + { + err = ReleaseNode (btreePtr, &right); + M_ExitOnError (err); + } + + return noErr; + + /////////////////////// Error - Clean Up and Exit /////////////////////////// + +ErrorExit: + + (void) ReleaseNode (btreePtr, &left); + (void) ReleaseNode (btreePtr, &node); + (void) ReleaseNode (btreePtr, &right); + + if (recordLen != nil) + *recordLen = 0; + + if (iterator != nil) + { + iterator->hint.writeCount = 0; + iterator->hint.nodeNum = 0; + iterator->hint.index = 0; + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + + iterator->version = 0; + iterator->reserved = 0; + iterator->key.length16 = 0; + } + + if ( err == fsBTEmptyErr || err == fsBTEndOfIterationErr ) + err = fsBTRecordNotFoundErr; + + return err; +} + + +/*------------------------------------------------------------------------------- +Routine: BTIterateRecords + +Function: Find a series of records + +Input: filePtr - b-tree file + operation - iteration operation (first,next,prev,last) + iterator - pointer to iterator indicating start position + callBackProc - pointer to routince to process a record + callBackState - pointer to state data (used by callBackProc) + +Output: iterator - iterator is updated to indicate new position + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus +BTIterateRecords(FCB *filePtr, BTreeIterationOperation operation, BTreeIterator *iterator, + IterateCallBackProcPtr callBackProc, void * callBackState) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + BTreeKeyPtr keyPtr; + RecordPtr recordPtr; + u_int16_t len; + Boolean foundRecord; + u_int32_t nodeNum; + BlockDescriptor left, node, right; + u_int16_t index; + + + ////////////////////////// Priliminary Checks /////////////////////////////// + + left.buffer = nil; + left.blockHeader = nil; + right.buffer = nil; + right.blockHeader = nil; + node.buffer = nil; + node.blockHeader = nil; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + if ((operation != kBTreeFirstRecord) && + (operation != kBTreeNextRecord) && + (operation != kBTreeCurrentRecord) && + (operation != kBTreePrevRecord) && + (operation != kBTreeLastRecord)) + { + err = fsInvalidIterationMovmentErr; + goto ErrorExit; + } + + /////////////////////// Find First or Last Record /////////////////////////// + + if ((operation == kBTreeFirstRecord) || (operation == kBTreeLastRecord)) + { + if (operation == kBTreeFirstRecord) + nodeNum = 
btreePtr->firstLeafNode; + else + nodeNum = btreePtr->lastLeafNode; + + if (nodeNum == 0) + { + err = fsBTEmptyErr; + goto ErrorExit; + } + + err = GetNode(btreePtr, nodeNum, 0, &node); + M_ExitOnError(err); + + if ( ((NodeDescPtr)node.buffer)->kind != kBTLeafNode || + ((NodeDescPtr)node.buffer)->numRecords <= 0 ) + { + err = ReleaseNode(btreePtr, &node); + M_ExitOnError(err); + + err = fsBTInvalidNodeErr; + printf ("hfs: BTIterateRecords() found invalid btree node on volume %s\n", FCBTOVCB(filePtr)->vcbVN); + hfs_mark_inconsistent(FCBTOVCB(filePtr), HFS_INCONSISTENCY_DETECTED); + goto ErrorExit; + } + + if (operation == kBTreeFirstRecord) + index = 0; + else + index = ((BTNodeDescriptor*) node.buffer)->numRecords - 1; + + goto ProcessData; + } + + //////////////////////// Find Iterator Position ///////////////////////////// + + // Not called for (operation == kBTreeFirstRecord || operation == kBTreeLastRecord) + err = FindIteratorPosition(btreePtr, iterator, &left, &node, &right, + &nodeNum, &index, &foundRecord); + if (err == fsBTRecordNotFoundErr) + err = 0; + M_ExitOnError(err); + + + ///////////////////// Find Next Or Previous Record ////////////////////////// + + if (operation == kBTreePrevRecord) + { + if (index > 0) + { + --index; + } + else + { + if (left.buffer == nil) + { + nodeNum = ((NodeDescPtr) node.buffer)->bLink; + if ( nodeNum > 0) + { + // BTree nodes are always grabbed in left to right order. + // Therefore release the current node before looking up the + // left node. + err = ReleaseNode(btreePtr, &node); + M_ExitOnError(err); + + // Look up the left node + err = GetNode (btreePtr, nodeNum, 0, &left); + M_ExitOnError (err); + + // Look up the current node again + err = GetRightSiblingNode (btreePtr, left.buffer, &node); + M_ExitOnError (err); + } else { + err = fsBTStartOfIterationErr; + goto ErrorExit; + } + } + // Before we stomp on "right", we'd better release it if needed + if (right.buffer != nil) { + err = ReleaseNode(btreePtr, &right); + M_ExitOnError(err); + } + right = node; + node = left; + left.buffer = nil; + index = ((NodeDescPtr) node.buffer)->numRecords -1; + } + } + else if (operation == kBTreeNextRecord) + { + if ((foundRecord != true) && + (((NodeDescPtr)node.buffer)->fLink == 0) && + (index == ((NodeDescPtr)node.buffer)->numRecords)) + { + err = fsBTEndOfIterationErr; + goto ErrorExit; + } + + // we did not find the record but the index is already positioned correctly + if ((foundRecord == false) && (index != ((NodeDescPtr)node.buffer)->numRecords)) + goto ProcessData; + + // we found the record OR we have to look in the next node + if (index < ((NodeDescPtr)node.buffer)->numRecords -1) + { + ++index; + } + else + { + if (right.buffer == nil) + { + nodeNum = ((NodeDescPtr)node.buffer)->fLink; + if ( nodeNum > 0) + { + err = GetNode(btreePtr, nodeNum, 0, &right); + M_ExitOnError(err); + } else { + err = fsBTEndOfIterationErr; + goto ErrorExit; + } + } + // Before we stomp on "left", we'd better release it if needed + if (left.buffer != nil) { + err = ReleaseNode(btreePtr, &left); + M_ExitOnError(err); + } + left = node; + node = right; + right.buffer = nil; + index = 0; + } + } + else // operation == kBTreeCurrentRecord + { + // make sure we have something... 
+ if ((foundRecord != true) && + (index >= ((NodeDescPtr)node.buffer)->numRecords)) + { + err = fsBTEndOfIterationErr; + goto ErrorExit; + } + } + + //////////////////// Process Records Using Callback //////////////////////// + +ProcessData: + err = GetRecordByIndex(btreePtr, node.buffer, index, &keyPtr, &recordPtr, &len); + if (err) { + err = btBadNode; + goto ErrorExit; + } + + while (err == 0) { + if (callBackProc(keyPtr, recordPtr, callBackState) == 0) + break; + + if ((index+1) < ((NodeDescPtr)node.buffer)->numRecords) { + ++index; + } else { + if (right.buffer == nil) + { + nodeNum = ((NodeDescPtr)node.buffer)->fLink; + if ( nodeNum > 0) + { + err = GetNode(btreePtr, nodeNum, 0, &right); + M_ExitOnError(err); + } else { + err = fsBTEndOfIterationErr; + break; + } + } + // Before we stomp on "left", we'd better release it if needed + if (left.buffer != nil) { + err = ReleaseNode(btreePtr, &left); + M_ExitOnError(err); + } + left = node; + node = right; + right.buffer = nil; + index = 0; + } + err = GetRecordByIndex(btreePtr, node.buffer, index, + &keyPtr, &recordPtr, &len); + if (err) { + err = btBadNode; + goto ErrorExit; + } + } + + + ///////////////// Update Iterator to Last Item Processed ///////////////////// + + + if (iterator != nil) // first & last have optional iterator + { + iterator->hint.writeCount = btreePtr->writeCount; + iterator->hint.nodeNum = nodeNum; + iterator->hint.index = index; + iterator->version = 0; + + BlockMoveData((Ptr)keyPtr, (Ptr)&iterator->key, CalcKeySize(btreePtr, keyPtr)); + } + M_ExitOnError(err); + + + ///////////////////////////// Release Nodes ///////////////////////////////// + + err = ReleaseNode(btreePtr, &node); + M_ExitOnError(err); + + if (left.buffer != nil) + { + err = ReleaseNode(btreePtr, &left); + M_ExitOnError(err); + } + + if (right.buffer != nil) + { + err = ReleaseNode(btreePtr, &right); + M_ExitOnError(err); + } + + return noErr; + + /////////////////////// Error - Clean Up and Exit /////////////////////////// + +ErrorExit: + + (void) ReleaseNode(btreePtr, &left); + (void) ReleaseNode(btreePtr, &node); + (void) ReleaseNode(btreePtr, &right); + + if (iterator != nil) + { + iterator->hint.writeCount = 0; + iterator->hint.nodeNum = 0; + iterator->hint.index = 0; + iterator->version = 0; + iterator->key.length16 = 0; + } + + if ( err == fsBTEmptyErr || err == fsBTEndOfIterationErr ) + err = fsBTRecordNotFoundErr; + + return err; +} + + +//////////////////////////////// BTInsertRecord ///////////////////////////////// + +OSStatus BTInsertRecord (FCB *filePtr, + BTreeIterator *iterator, + FSBufferDescriptor *record, + u_int16_t recordLen ) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + TreePathTable treePathTable; + u_int32_t nodesNeeded; + BlockDescriptor nodeRec; + u_int32_t insertNodeNum; + u_int16_t index; + Boolean recordFit; + + ////////////////////////// Priliminary Checks /////////////////////////////// + + nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; + + err = CheckInsertParams (filePtr, iterator, record, recordLen); + if (err != noErr) + return err; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + + ///////////////////////// Find Insert Position ////////////////////////////// + + // always call SearchTree for Insert + err = SearchTree (btreePtr, &iterator->key, treePathTable, &insertNodeNum, &nodeRec, &index); + + switch (err) // set/replace/insert decision point + { + case noErr: err = fsBTDuplicateRecordErr; + goto 
ErrorExit; + + case fsBTRecordNotFoundErr: break; + + case fsBTEmptyErr: // if tree empty add 1st leaf node + + if (btreePtr->freeNodes == 0) + { + err = ExtendBTree (btreePtr, btreePtr->totalNodes + 1); + M_ExitOnError (err); + } + + err = AllocateNode (btreePtr, &insertNodeNum); + M_ExitOnError (err); + + err = GetNewNode (btreePtr, insertNodeNum, &nodeRec); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + + ((NodeDescPtr)nodeRec.buffer)->kind = kBTLeafNode; + ((NodeDescPtr)nodeRec.buffer)->height = 1; + + recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, 0, + &iterator->key, KeyLength(btreePtr, &iterator->key), + record->bufferAddress, recordLen ); + if (recordFit != true) + { + err = fsBTRecordTooLargeErr; + goto ErrorExit; + } + + /* + * Update the B-tree control block. Do this before + * calling UpdateNode since it will compare the node's + * height with treeDepth. + */ + btreePtr->treeDepth = 1; + btreePtr->rootNode = insertNodeNum; + btreePtr->firstLeafNode = insertNodeNum; + btreePtr->lastLeafNode = insertNodeNum; + + err = UpdateNode (btreePtr, &nodeRec, 0, kLockTransaction); + M_ExitOnError (err); + + M_BTreeHeaderDirty (btreePtr); + + goto Success; + + default: goto ErrorExit; + } + + if (index > 0) + { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + + recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, index, + &iterator->key, KeyLength(btreePtr, &iterator->key), + record->bufferAddress, recordLen); + if (recordFit == true) + { + err = UpdateNode (btreePtr, &nodeRec, 0, kLockTransaction); + M_ExitOnError (err); + + goto Success; + } + } + + /////////////////////// Extend File If Necessary //////////////////////////// + + if ((btreePtr->treeDepth + 1UL) > btreePtr->freeNodes) + { + nodesNeeded = btreePtr->treeDepth + 1 + btreePtr->totalNodes - btreePtr->freeNodes; + if (nodesNeeded > CalcMapBits (btreePtr)) // we'll need to add a map node too! 
+ ++nodesNeeded; + + err = ExtendBTree (btreePtr, nodesNeeded); + M_ExitOnError (err); + } + + // no need to delete existing record + + err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress, + recordLen, &nodeRec, index, 1, kInsertRecord, &insertNodeNum); + M_ExitOnError (err); + + + //////////////////////////////// Success //////////////////////////////////// + +Success: + ++btreePtr->writeCount; + ++btreePtr->leafRecords; + M_BTreeHeaderDirty (btreePtr); + + // create hint + iterator->hint.writeCount = btreePtr->writeCount; + iterator->hint.nodeNum = insertNodeNum; + iterator->hint.index = 0; // unused + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + + return noErr; + + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + (void) ReleaseNode (btreePtr, &nodeRec); + + iterator->hint.writeCount = 0; + iterator->hint.nodeNum = 0; + iterator->hint.index = 0; + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + + if (err == fsBTEmptyErr) + err = fsBTRecordNotFoundErr; + + return err; +} + + +//////////////////////////////// BTReplaceRecord //////////////////////////////// + +OSStatus BTReplaceRecord (FCB *filePtr, + BTreeIterator *iterator, + FSBufferDescriptor *record, + u_int16_t recordLen ) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + TreePathTable treePathTable; + u_int32_t nodesNeeded; + BlockDescriptor nodeRec; + u_int32_t insertNodeNum; + u_int16_t index; + Boolean recordFit; + Boolean validHint; + + + ////////////////////////// Priliminary Checks /////////////////////////////// + + nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; + + err = CheckInsertParams (filePtr, iterator, record, recordLen); + if (err != noErr) + return err; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + ////////////////////////////// Take A Hint ////////////////////////////////// + + err = IsItAHint (btreePtr, iterator, &validHint); + M_ExitOnError (err); + + if (validHint) + { + insertNodeNum = iterator->hint.nodeNum; + + err = GetNode (btreePtr, insertNodeNum, kGetNodeHint, &nodeRec); + if( err == noErr ) + { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + + err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); + M_ExitOnError (err); + + if (recordFit) + { + err = UpdateNode (btreePtr, &nodeRec, 0, 0); + M_ExitOnError (err); + + ++btreePtr->numValidHints; + + goto Success; + } + else + { + (void) BTInvalidateHint( iterator ); + } + + err = ReleaseNode (btreePtr, &nodeRec); + M_ExitOnError (err); + } + else + { + (void) BTInvalidateHint( iterator ); + } + } + + + ////////////////////////////// Get A Clue /////////////////////////////////// + + err = SearchTree (btreePtr, &iterator->key, treePathTable, &insertNodeNum, &nodeRec, &index); + M_ExitOnError (err); // record must exit for Replace + + // optimization - if simple replace will work then don't extend btree + // €€ if we tried this before, and failed because it wouldn't fit then we shouldn't try this again... 
+ + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + + err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); + M_ExitOnError (err); + + if (recordFit) + { + err = UpdateNode (btreePtr, &nodeRec, 0, 0); + M_ExitOnError (err); + + goto Success; + } + + + //////////////////////////// Make Some Room ///////////////////////////////// + + if ((btreePtr->treeDepth + 1UL) > btreePtr->freeNodes) + { + nodesNeeded = btreePtr->treeDepth + 1 + btreePtr->totalNodes - btreePtr->freeNodes; + if (nodesNeeded > CalcMapBits (btreePtr)) // we'll need to add a map node too! + ++nodesNeeded; + + err = ExtendBTree (btreePtr, nodesNeeded); + M_ExitOnError (err); + } + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + + DeleteRecord (btreePtr, nodeRec.buffer, index); // delete existing key/record + + err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress, + recordLen, &nodeRec, index, 1, kReplaceRecord, &insertNodeNum); + M_ExitOnError (err); + + ++btreePtr->writeCount; /* writeCount changes only if the tree structure changed */ + +Success: + // create hint + iterator->hint.writeCount = btreePtr->writeCount; + iterator->hint.nodeNum = insertNodeNum; + iterator->hint.index = 0; // unused + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + + return noErr; + + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + (void) ReleaseNode (btreePtr, &nodeRec); + + iterator->hint.writeCount = 0; + iterator->hint.nodeNum = 0; + iterator->hint.index = 0; + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + + return err; +} + + + +//////////////////////////////// BTUpdateRecord //////////////////////////////// + +OSStatus +BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, + IterateCallBackProcPtr callBackProc, void * callBackState) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + TreePathTable treePathTable; + BlockDescriptor nodeRec; + RecordPtr recordPtr; + BTreeKeyPtr keyPtr; + u_int32_t insertNodeNum; + u_int16_t recordLen; + u_int16_t index; + Boolean validHint; + + + ////////////////////////// Priliminary Checks /////////////////////////////// + + nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + ////////////////////////////// Take A Hint ////////////////////////////////// + + err = IsItAHint (btreePtr, iterator, &validHint); + M_ExitOnError (err); + + if (validHint) + { + insertNodeNum = iterator->hint.nodeNum; + + err = GetNode (btreePtr, insertNodeNum, kGetNodeHint, &nodeRec); + if (err == noErr) + { + if (((NodeDescPtr)nodeRec.buffer)->kind == kBTLeafNode && + SearchNode (btreePtr, nodeRec.buffer, &iterator->key, &index)) + { + err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + + err = callBackProc(keyPtr, recordPtr, callBackState); + M_ExitOnError (err); + + err = UpdateNode (btreePtr, &nodeRec, 0, 0); + M_ExitOnError (err); + + ++btreePtr->numValidHints; + + goto Success; + } + else + { + (void) BTInvalidateHint( iterator ); + } + + err = ReleaseNode (btreePtr, &nodeRec); + M_ExitOnError (err); + } + else + { + (void) BTInvalidateHint( iterator ); + } + } + + ////////////////////////////// Get A Clue /////////////////////////////////// + + err = SearchTree (btreePtr, 
&iterator->key, treePathTable, &insertNodeNum, &nodeRec, &index); + M_ExitOnError (err); + + err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + + err = callBackProc(keyPtr, recordPtr, callBackState); + M_ExitOnError (err); + + err = UpdateNode (btreePtr, &nodeRec, 0, 0); + M_ExitOnError (err); + +Success: + // create hint + iterator->hint.writeCount = btreePtr->writeCount; + iterator->hint.nodeNum = insertNodeNum; + iterator->hint.index = 0; + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + return noErr; + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + (void) ReleaseNode (btreePtr, &nodeRec); + + iterator->hint.writeCount = 0; + iterator->hint.nodeNum = 0; + iterator->hint.index = 0; + iterator->hint.reserved1 = 0; + iterator->hint.reserved2 = 0; + return err; +} + + + +//////////////////////////////// BTDeleteRecord ///////////////////////////////// + +OSStatus BTDeleteRecord (FCB *filePtr, + BTreeIterator *iterator ) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + TreePathTable treePathTable; + BlockDescriptor nodeRec; + u_int32_t nodesNeeded; + u_int32_t nodeNum; + u_int16_t index; + + + ////////////////////////// Priliminary Checks /////////////////////////////// + + nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; + + M_ReturnErrorIf (filePtr == nil, paramErr); + M_ReturnErrorIf (iterator == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + { + err = fsBTInvalidFileErr; + goto ErrorExit; + } + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + + /////////////////////////////// Find Key //////////////////////////////////// + + // check hint for simple delete case (index > 0, numRecords > 2) + + err = SearchTree (btreePtr, &iterator->key, treePathTable, &nodeNum, &nodeRec, &index); + M_ExitOnError (err); // record must exit for Delete + + + /////////////////////// Extend File If Necessary //////////////////////////// + + /* + * Worst case: we delete the first record in the tree and + * following key is sufficiently larger to cause all parents to + * require splitting and we need a new root node and a new map + * node. 
+ */ + if (index == 0 && btreePtr->treeDepth + 1 > btreePtr->freeNodes) + { + nodesNeeded = btreePtr->treeDepth + btreePtr->totalNodes; + if (nodesNeeded > CalcMapBits (btreePtr)) + ++nodesNeeded; + + if (nodesNeeded - btreePtr->totalNodes > btreePtr->freeNodes) { + err = ExtendBTree (btreePtr, nodesNeeded); + M_ExitOnError (err); + } + } + + ///////////////////////////// Delete Record ///////////////////////////////// + + err = DeleteTree (btreePtr, treePathTable, &nodeRec, index, 1); + M_ExitOnError (err); + + ++btreePtr->writeCount; + --btreePtr->leafRecords; + M_BTreeHeaderDirty (btreePtr); + + iterator->hint.nodeNum = 0; + + return noErr; + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + (void) ReleaseNode (btreePtr, &nodeRec); + + return err; +} + + + +OSStatus BTGetInformation (FCB *filePtr, + u_int16_t file_version, + BTreeInfoRec *info ) +{ +#pragma unused (file_version) + + BTreeControlBlockPtr btreePtr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + /* + * XXX SER + * This should not require the whole tree to be locked, just maybe the BTreeControlBlockPtr + * + * REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + */ + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + M_ReturnErrorIf (info == nil, paramErr); + + //€€ check version? + + info->nodeSize = btreePtr->nodeSize; + info->maxKeyLength = btreePtr->maxKeyLength; + info->treeDepth = btreePtr->treeDepth; + info->numRecords = btreePtr->leafRecords; + info->numNodes = btreePtr->totalNodes; + info->numFreeNodes = btreePtr->freeNodes; + info->lastfsync = btreePtr->lastfsync; + info->keyCompareType = btreePtr->keyCompareType; + return noErr; +} + +// XXXdbg +OSStatus +BTIsDirty(FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + return TreeIsDirty(btreePtr); +} + +/*------------------------------------------------------------------------------- +Routine: BTFlushPath - Flush BTreeControlBlock to Header Node. + +Function: Brief_description_of_the_function_and_any_side_effects + + +Input: pathPtr - pointer to path control block for B*Tree file to flush + +Output: none + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus BTFlushPath (FCB *filePtr) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + err = UpdateHeader (btreePtr, false); + + return err; +} + + +/*------------------------------------------------------------------------------- +Routine: BTReload - Reload B-tree Header Data. + +Function: Reload B-tree header data from disk. This is called after fsck + has made repairs to the root filesystem. The filesystem is + mounted read-only when BTReload is caled. 
+ + +Input: filePtr - the B*Tree file that needs its header updated + +Output: none + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus +BTReloadData(FCB *filePtr) +{ + OSStatus err; + BTreeControlBlockPtr btreePtr; + BlockDescriptor node; + BTHeaderRec *header; + + + node.buffer = nil; + node.blockHeader = nil; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + return (fsBTInvalidFileErr); + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + err = GetNode(btreePtr, kHeaderNodeNum, 0, &node); + if (err != noErr) + return (err); + + header = (BTHeaderRec*)((char *)node.buffer + sizeof(BTNodeDescriptor)); + if ((err = VerifyHeader (filePtr, header)) == 0) { + btreePtr->treeDepth = header->treeDepth; + btreePtr->rootNode = header->rootNode; + btreePtr->leafRecords = header->leafRecords; + btreePtr->firstLeafNode = header->firstLeafNode; + btreePtr->lastLeafNode = header->lastLeafNode; + btreePtr->maxKeyLength = header->maxKeyLength; + btreePtr->totalNodes = header->totalNodes; + btreePtr->freeNodes = header->freeNodes; + btreePtr->btreeType = header->btreeType; + + btreePtr->flags &= (~kBTHeaderDirty); + } + + (void) ReleaseNode(btreePtr, &node); + + return err; +} + + +/*------------------------------------------------------------------------------- +Routine: BTInvalidateHint - Invalidates the hint within a BTreeInterator. + +Function: Invalidates the hint within a BTreeInterator. + + +Input: iterator - pointer to BTreeIterator + +Output: iterator - iterator with the hint.nodeNum cleared + +Result: noErr - success + paramErr - iterator == nil +-------------------------------------------------------------------------------*/ + + +OSStatus BTInvalidateHint (BTreeIterator *iterator ) +{ + if (iterator == nil) + return paramErr; + + iterator->hint.nodeNum = 0; + + return noErr; +} + + + + +/*------------------------------------------------------------------------------- +Routine: BTGetLastSync + +Function: Returns the last time that this btree was flushed, does not include header. + +Input: filePtr - pointer file control block + +Output: lastfsync - time in seconds of last update + +Result: noErr - success + paramErr - iterator == nil +-------------------------------------------------------------------------------*/ + + +OSStatus BTGetLastSync (FCB *filePtr, + u_int32_t *lastsync) +{ + BTreeControlBlockPtr btreePtr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + /* Maybe instead of requiring a lock..an atomic set might be more appropriate */ + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + M_ReturnErrorIf (lastsync == nil, paramErr); + + *lastsync = btreePtr->lastfsync; + + return noErr; +} + + + + +/*------------------------------------------------------------------------------- +Routine: BTSetLastSync + +Function: Sets the last time that this btree was flushed, does not include header. 
+ + +Input: fcb - pointer file control block + +Output: lastfsync - time in seconds of last update + +Result: noErr - success + paramErr - iterator == nil +-------------------------------------------------------------------------------*/ + + +OSStatus BTSetLastSync (FCB *filePtr, + u_int32_t lastsync) +{ + BTreeControlBlockPtr btreePtr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + /* Maybe instead of requiring a lock..an atomic set might be more appropriate */ + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + M_ReturnErrorIf (lastsync == 0, paramErr); + + btreePtr->lastfsync = lastsync; + + return noErr; +} + +OSStatus BTHasContiguousNodes (FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + + return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize); +} + + +/*------------------------------------------------------------------------------- +Routine: BTGetUserData + +Function: Read the user data area of the b-tree header node. + +-------------------------------------------------------------------------------*/ +OSStatus +BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize) +{ + BTreeControlBlockPtr btreePtr; + BlockDescriptor node; + char * offset; + OSStatus err; + + if (dataSize > kBTreeHeaderUserBytes) + return (EINVAL); + node.buffer = nil; + node.blockHeader = nil; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + return (fsBTInvalidFileErr); + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + err = GetNode(btreePtr, kHeaderNodeNum, 0, &node); + if (err) + return (err); + + offset = (char *)node.buffer + sizeof(BTNodeDescriptor) + sizeof(BTHeaderRec); + bcopy(offset, dataPtr, dataSize); + + (void) ReleaseNode(btreePtr, &node); + + return (0); +} + + +/*------------------------------------------------------------------------------- +Routine: BTSetUserData + +Function: Write the user data area of the b-tree header node. +-------------------------------------------------------------------------------*/ +OSStatus +BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize) +{ + BTreeControlBlockPtr btreePtr; + BlockDescriptor node; + char * offset; + OSStatus err; + + if (dataSize > kBTreeHeaderUserBytes) + return (EINVAL); + node.buffer = nil; + node.blockHeader = nil; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) + return (fsBTInvalidFileErr); + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, false); + + err = GetNode(btreePtr, kHeaderNodeNum, 0, &node); + if (err) + return (err); + + ModifyBlockStart(btreePtr->fileRefNum, &node); + + offset = (char *)node.buffer + sizeof(BTNodeDescriptor) + sizeof(BTHeaderRec); + bcopy(dataPtr, offset, dataSize); + + err = UpdateNode (btreePtr, &node, 0, 0); + + return (err); +} + diff --git a/core/BTreeAllocate.c b/core/BTreeAllocate.c new file mode 100644 index 0000000..d9b3b63 --- /dev/null +++ b/core/BTreeAllocate.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2000-2003, 2005-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: BTreeAllocate.c + + Contains: BTree Node Allocation routines for the BTree Module. + + Version: xxx put the technology version here xxx + + Written by: Gordon Sheridan and Bill Bruffey + + Copyright: (c) 1992-1999 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Don Brady + + Other Contact: Mark Day + + Technology: File Systems + + Writers: + + (djb) Don Brady + (ser) Scott Roberts + (msd) Mark Day + + Change History (most recent first): + + 6/1/99 djb Sync up with Mac OS 8.6. + 11/24/97 djb Remove some debug code (Panic calls). + 7/24/97 djb CallbackProcs now take refnum instead of an FCB. + 4/23/97 djb first checked in + + 2/19/97 djb Change E_BadNodeType to fsBTBadNodeType. + 12/19/96 djb first checked in + + History applicable to original Scarecrow Design: + + <4> 10/25/96 ser Changing for new VFPI + <3> 10/18/96 ser Converting over VFPI changes + <2> 1/10/96 msd Change 64-bit math to use real function names from Math64.i. + <1> 10/18/95 rst Moved from Scarecrow project. + + <8> 1/12/95 wjk Adopt Model FileSystem changes in D5. + <7> 9/30/94 prp Get in sync with D2 interface changes. + <6> 7/22/94 wjk Convert to the new set of header files. + <5> 8/31/93 prp Use U64SetU instead of S64Set. + <4> 5/21/93 gs Fix ExtendBTree bug. + <3> 5/10/93 gs Fix pointer arithmetic bug in AllocateNode. + <2> 3/23/93 gs finish ExtendBTree routine. + <1> 2/8/93 gs first checked in + <0> 1/1/93 gs begin AllocateNode and FreeNode + +*/ + +#include "hfs_btreeio.h" +#include "hfs_endian.h" +#include "BTreesPrivate.h" + +///////////////////// Routines Internal To BTreeAllocate.c ////////////////////// + +static OSStatus GetMapNode (BTreeControlBlockPtr btreePtr, + BlockDescriptor *nodePtr, + u_int16_t **mapPtr, + u_int16_t *mapSize ); + +///////////////////////////////////////////////////////////////////////////////// + +/*------------------------------------------------------------------------------- + +Routine: AllocateNode - Find Free Node, Mark It Used, and Return Node Number. + +Function: Searches the map records for the first free node, marks it "in use" and + returns the node number found. This routine should really only be called + when we know there are free blocks, otherwise it's just a waste of time. 
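+
+Example: (illustrative only) if, within the header node's map record, the
+ first word that is not 0xFFFF is word 2 and its big-endian value is
+ 0xFFBF, the scan below stops at bitOffset 6 (mask 0x0040), so the node
+ allocated is (2 << 4) + (15 - 6) = 41.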
+ +Note: We have to examine map nodes a word at a time rather than a long word + because the External BTree Mgr used map records that were not an integral + number of long words. Too bad. In our spare time could develop a more + sophisticated algorithm that read map records by long words (and long + word aligned) and handled the spare bytes at the beginning and end + appropriately. + +Input: btreePtr - pointer to control block for BTree file + +Output: nodeNum - number of node allocated + + +Result: noErr - success + fsBTNoMoreMapNodesErr - no free blocks were found + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus AllocateNode (BTreeControlBlockPtr btreePtr, u_int32_t *nodeNum) +{ + OSStatus err; + BlockDescriptor node; + u_int16_t *mapPtr, *pos; + u_int16_t mapSize, size; + u_int16_t freeWord; + u_int16_t mask; + u_int16_t bitOffset; + u_int32_t nodeNumber; + + + nodeNumber = 0; // first node number of header map record + node.buffer = nil; // clear node.buffer to get header node + // - and for ErrorExit + node.blockHeader = nil; + + while (true) + { + err = GetMapNode (btreePtr, &node, &mapPtr, &mapSize); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + + //////////////////////// Find Word with Free Bit //////////////////////////// + + pos = mapPtr; + size = mapSize; + size >>= 1; // convert to number of words + //€€ assumes mapRecords contain an integral number of words + + while ( size-- ) + { + if ( *pos++ != 0xFFFF ) // assume test fails, and increment pos + break; + } + + --pos; // whoa! backup + + if (*pos != 0xFFFF) // hey, we got one! + break; + + nodeNumber += mapSize << 3; // covert to number of bits (nodes) + } + + ///////////////////////// Find Free Bit in Word ///////////////////////////// + + freeWord = SWAP_BE16 (*pos); + bitOffset = 15; + mask = 0x8000; + + do { + if ( (freeWord & mask) == 0) + break; + mask >>= 1; + } while (--bitOffset); + + ////////////////////// Calculate Free Node Number /////////////////////////// + + nodeNumber += ((pos - mapPtr) << 4) + (15 - bitOffset); // (pos-mapPtr) = # of words! + + + ///////////////////////// Check for End of Map ////////////////////////////// + + if (nodeNumber >= btreePtr->totalNodes) + { + err = fsBTFullErr; + goto ErrorExit; + } + + /////////////////////////// Allocate the Node /////////////////////////////// + + *pos |= SWAP_BE16 (mask); // set the map bit for the node + + err = UpdateNode (btreePtr, &node, 0, kLockTransaction); + M_ExitOnError (err); + + --btreePtr->freeNodes; + M_BTreeHeaderDirty(btreePtr); + + /* Account for allocations from node reserve */ + BTUpdateReserve(btreePtr, 1); + + *nodeNum = nodeNumber; + + return noErr; + +////////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + (void) ReleaseNode (btreePtr, &node); + *nodeNum = 0; + + return err; +} + + + +/*------------------------------------------------------------------------------- + +Routine: FreeNode - Clear allocation bit for node. + +Function: Finds the bit representing the node specified by nodeNum in the node + map and clears the bit. 
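+
+Example: (illustrative only) freeing node 41, relative to a map record whose
+ first bit represents node 0, clears mask 0x0040 in word 41 >> 4 = 2 of
+ the record, since bitOffset = 15 - (41 & 0xF) = 6.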
+ + +Input: btreePtr - pointer to control block for BTree file + nodeNum - number of node to mark free + +Output: none + +Result: noErr - success + fsBTNoMoreMapNodesErr - node number is beyond end of node map + != noErr - GetNode or ReleaseNode encountered some difficulty +-------------------------------------------------------------------------------*/ + +OSStatus FreeNode (BTreeControlBlockPtr btreePtr, u_int32_t nodeNum) +{ + OSStatus err; + BlockDescriptor node; + u_int32_t nodeIndex; + u_int16_t mapSize = 0; + u_int16_t *mapPos = NULL; + u_int16_t bitOffset; + + + //////////////////////////// Find Map Record //////////////////////////////// + nodeIndex = 0; // first node number of header map record + node.buffer = nil; // invalidate node.buffer to get header node + node.blockHeader = nil; + + while (nodeNum >= nodeIndex) + { + err = GetMapNode (btreePtr, &node, &mapPos, &mapSize); + M_ExitOnError (err); + + nodeIndex += mapSize << 3; // covert to number of bits (nodes) + } + + //////////////////////////// Mark Node Free ///////////////////////////////// + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + + nodeNum -= (nodeIndex - (mapSize << 3)); // relative to this map record + bitOffset = 15 - (nodeNum & 0x0000000F); // last 4 bits are bit offset + mapPos += nodeNum >> 4; // point to word containing map bit + + M_SWAP_BE16_ClearBitNum (*mapPos, bitOffset); // clear it + + err = UpdateNode (btreePtr, &node, 0, kLockTransaction); + M_ExitOnError (err); + + ++btreePtr->freeNodes; + M_BTreeHeaderDirty(btreePtr); + + return noErr; + +ErrorExit: + + (void) ReleaseNode (btreePtr, &node); + + return err; +} + + + +/*------------------------------------------------------------------------------- + +Routine: ExtendBTree - Call FSAgent to extend file, and allocate necessary map nodes. + +Function: This routine calls the the FSAgent to extend the end of fork, if necessary, + to accomodate the number of nodes requested. It then allocates as many + map nodes as are necessary to account for all the nodes in the B*Tree. + If newTotalNodes is less than the current number of nodes, no action is + taken. + +Note: Internal HFS File Manager BTree Module counts on an integral number of + long words in map records, although they are not long word aligned. + +Input: btreePtr - pointer to control block for BTree file + newTotalNodes - total number of nodes the B*Tree is to extended to + +Output: none + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, + u_int32_t newTotalNodes ) +{ + OSStatus err; + FCB *filePtr; + FSSize minEOF, maxEOF; + u_int16_t nodeSize; + u_int32_t oldTotalNodes; + u_int32_t newMapNodes; + u_int32_t mapBits, totalMapBits; + u_int32_t recStartBit; + u_int32_t nodeNum, nextNodeNum; + u_int32_t firstNewMapNodeNum, lastNewMapNodeNum; + BlockDescriptor mapNode, newNode; + u_int16_t *mapPos; + u_int16_t *mapStart; + u_int16_t mapSize; + u_int16_t mapNodeRecSize; + u_int32_t bitInWord, bitInRecord; + u_int16_t mapIndex; + + + oldTotalNodes = btreePtr->totalNodes; + if (newTotalNodes <= oldTotalNodes) // we're done! 
+ return noErr; + + nodeSize = btreePtr->nodeSize; + filePtr = GetFileControlBlock(btreePtr->fileRefNum); + + mapNode.buffer = nil; + mapNode.blockHeader = nil; + newNode.buffer = nil; + newNode.blockHeader = nil; + + mapNodeRecSize = nodeSize - sizeof(BTNodeDescriptor) - 6; // 2 bytes of free space (see note) + + + //////////////////////// Count Bits In Node Map ///////////////////////////// + + totalMapBits = 0; + do { + err = GetMapNode (btreePtr, &mapNode, &mapStart, &mapSize); + M_ExitOnError (err); + + mapBits = mapSize << 3; // mapSize (in bytes) * 8 + recStartBit = totalMapBits; // bit number of first bit in map record + totalMapBits += mapBits; + + } while ( ((BTNodeDescriptor*)mapNode.buffer)->fLink != 0 ); + +#if DEBUG + if (totalMapBits != CalcMapBits (btreePtr)) + Panic ("ExtendBTree: totalMapBits != CalcMapBits"); +#endif + + /////////////////////// Extend LEOF If Necessary //////////////////////////// + + minEOF = (u_int64_t)newTotalNodes * (u_int64_t)nodeSize; + if ( (u_int64_t)filePtr->fcbEOF < minEOF ) + { + maxEOF = (u_int64_t)0x7fffffffLL * (u_int64_t)nodeSize; + + err = btreePtr->setEndOfForkProc (btreePtr->fileRefNum, minEOF, maxEOF); + M_ExitOnError (err); + } + + + //////////////////// Calc New Total Number Of Nodes ///////////////////////// + + newTotalNodes = filePtr->fcbEOF / nodeSize; // hack! + // do we wish to perform any verification of newTotalNodes at this point? + + btreePtr->totalNodes = newTotalNodes; // do we need to update freeNodes here too? + + + ////////////// Calculate Number Of New Map Nodes Required /////////////////// + + newMapNodes = 0; + if (newTotalNodes > totalMapBits) + { + newMapNodes = (((newTotalNodes - totalMapBits) >> 3) / mapNodeRecSize) + 1; + firstNewMapNodeNum = oldTotalNodes; + lastNewMapNodeNum = firstNewMapNodeNum + newMapNodes - 1; + } + else + { + err = ReleaseNode (btreePtr, &mapNode); + M_ExitOnError (err); + + goto Success; + } + + + /////////////////////// Initialize New Map Nodes //////////////////////////// + // XXXdbg - this is the correct place for this: + ModifyBlockStart(btreePtr->fileRefNum, &mapNode); + + ((BTNodeDescriptor*)mapNode.buffer)->fLink = firstNewMapNodeNum; + + nodeNum = firstNewMapNodeNum; + while (true) + { + err = GetNewNode (btreePtr, nodeNum, &newNode); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &newNode); + + ((NodeDescPtr)newNode.buffer)->numRecords = 1; + ((NodeDescPtr)newNode.buffer)->kind = kBTMapNode; + + // set free space offset + *(u_int16_t *)((Ptr)newNode.buffer + nodeSize - 4) = nodeSize - 6; + + if (nodeNum++ == lastNewMapNodeNum) + break; + + ((BTNodeDescriptor*)newNode.buffer)->fLink = nodeNum; // point to next map node + + err = UpdateNode (btreePtr, &newNode, 0, kLockTransaction); + M_ExitOnError (err); + } + + err = UpdateNode (btreePtr, &newNode, 0, kLockTransaction); + M_ExitOnError (err); + + + ///////////////////// Mark New Map Nodes Allocated ////////////////////////// + + nodeNum = firstNewMapNodeNum; + do { + bitInRecord = nodeNum - recStartBit; + + while (bitInRecord >= mapBits) + { + nextNodeNum = ((NodeDescPtr)mapNode.buffer)->fLink; + if ( nextNodeNum == 0) + { + err = fsBTNoMoreMapNodesErr; + goto ErrorExit; + } + + err = UpdateNode (btreePtr, &mapNode, 0, kLockTransaction); + M_ExitOnError (err); + + err = GetNode (btreePtr, nextNodeNum, 0, &mapNode); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &mapNode); + + mapIndex = 0; + + mapStart = (u_int16_t *) GetRecordAddress (btreePtr, mapNode.buffer, 
mapIndex); + mapSize = GetRecordSize (btreePtr, mapNode.buffer, mapIndex); + +#if DEBUG + if (mapSize != M_MapRecordSize (btreePtr->nodeSize) ) + { + Panic ("ExtendBTree: mapSize != M_MapRecordSize"); + } +#endif + + mapBits = mapSize << 3; // mapSize (in bytes) * 8 + recStartBit = totalMapBits; // bit number of first bit in map record + totalMapBits += mapBits; + + bitInRecord = nodeNum - recStartBit; + } + + mapPos = mapStart + ((nodeNum - recStartBit) >> 4); + bitInWord = 15 - ((nodeNum - recStartBit) & 0x0000000F); + + M_SWAP_BE16_SetBitNum (*mapPos, bitInWord); + + ++nodeNum; + + } while (nodeNum <= lastNewMapNodeNum); + + err = UpdateNode (btreePtr, &mapNode, 0, kLockTransaction); + M_ExitOnError (err); + + + //////////////////////////////// Success //////////////////////////////////// + +Success: + + btreePtr->totalNodes = newTotalNodes; + btreePtr->freeNodes += (newTotalNodes - oldTotalNodes) - newMapNodes; + + M_BTreeHeaderDirty(btreePtr); + + /* Force the b-tree header changes to disk */ + (void) UpdateHeader (btreePtr, true); + + return noErr; + + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + (void) ReleaseNode (btreePtr, &mapNode); + (void) ReleaseNode (btreePtr, &newNode); + + return err; +} + + + +/*------------------------------------------------------------------------------- + +Routine: GetMapNode - Get the next map node and pointer to the map record. + +Function: Given a BlockDescriptor to a map node in nodePtr, GetMapNode releases + it and gets the next node. If nodePtr->buffer is nil, then the header + node is retrieved. + + +Input: btreePtr - pointer to control block for BTree file + nodePtr - pointer to a BlockDescriptor of a map node + +Output: nodePtr - pointer to the BlockDescriptor for the next map node + mapPtr - pointer to the map record within the map node + mapSize - number of bytes in the map record + +Result: noErr - success + fsBTNoMoreMapNodesErr - we've run out of map nodes + fsBTInvalidNodeErr - bad node, or not node type kMapNode + != noErr - failure +-------------------------------------------------------------------------------*/ + +static +OSStatus GetMapNode (BTreeControlBlockPtr btreePtr, + BlockDescriptor *nodePtr, + u_int16_t **mapPtr, + u_int16_t *mapSize ) +{ + OSStatus err; + u_int16_t mapIndex; + u_int32_t nextNodeNum; + + if (nodePtr->buffer != nil) // if iterator is valid... 
+ { + nextNodeNum = ((NodeDescPtr)nodePtr->buffer)->fLink; + if (nextNodeNum == 0) + { + err = fsBTNoMoreMapNodesErr; + goto ErrorExit; + } + + err = ReleaseNode (btreePtr, nodePtr); + M_ExitOnError (err); + + err = GetNode (btreePtr, nextNodeNum, 0, nodePtr); + M_ExitOnError (err); + + if ( ((NodeDescPtr)nodePtr->buffer)->kind != kBTMapNode) + { + err = fsBTBadNodeType; + goto ErrorExit; + } + + ++btreePtr->numMapNodesRead; + mapIndex = 0; + } else { + err = GetNode (btreePtr, kHeaderNodeNum, 0, nodePtr); + M_ExitOnError (err); + + if ( ((NodeDescPtr)nodePtr->buffer)->kind != kBTHeaderNode) + { + err = fsBTInvalidHeaderErr; //€€ or fsBTBadNodeType + goto ErrorExit; + } + + mapIndex = 2; + } + + + *mapPtr = (u_int16_t *) GetRecordAddress (btreePtr, nodePtr->buffer, mapIndex); + *mapSize = GetRecordSize (btreePtr, nodePtr->buffer, mapIndex); + + return noErr; + + +ErrorExit: + + (void) ReleaseNode (btreePtr, nodePtr); + + *mapPtr = nil; + *mapSize = 0; + + return err; +} + + + +////////////////////////////////// CalcMapBits ////////////////////////////////// + +u_int32_t CalcMapBits (BTreeControlBlockPtr btreePtr) +{ + u_int32_t mapBits; + + mapBits = M_HeaderMapRecordSize (btreePtr->nodeSize) << 3; + + while (mapBits < btreePtr->totalNodes) + mapBits += M_MapRecordSize (btreePtr->nodeSize) << 3; + + return mapBits; +} + + +/*------------------------------------------------------------------------------- +Routine: BTZeroUnusedNodes + +Function: Write zeros to all nodes in the B-tree that are not currently in use. +-------------------------------------------------------------------------------*/ +int +BTZeroUnusedNodes(FCB *filePtr) +{ + int err; + vnode_t vp; + BTreeControlBlockPtr btreePtr; + BlockDescriptor mapNode; + buf_t bp; + u_int32_t nodeNumber; + u_int16_t *mapPtr, *pos; + u_int16_t mapSize, size; + u_int16_t mask; + u_int16_t bitNumber; + u_int16_t word; + int numWritten; + + vp = FTOV(filePtr); + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + bp = NULL; + nodeNumber = 0; + mapNode.buffer = nil; + mapNode.blockHeader = nil; + numWritten = 0; + + /* Iterate over map nodes. */ + while (true) + { + err = GetMapNode (btreePtr, &mapNode, &mapPtr, &mapSize); + if (err) + { + err = MacToVFSError(err); + goto ErrorExit; + } + + pos = mapPtr; + size = mapSize; + size >>= 1; /* convert to number of 16-bit words */ + + /* Iterate over 16-bit words in the map record. */ + while (size--) + { + if (*pos != 0xFFFF) /* Anything free in this word? */ + { + word = SWAP_BE16(*pos); + + /* Iterate over bits in the word. */ + for (bitNumber = 0, mask = 0x8000; + bitNumber < 16; + ++bitNumber, mask >>= 1) + { + if (word & mask) + continue; /* This node is in use. */ + + if (nodeNumber + bitNumber >= btreePtr->totalNodes) + { + /* We've processed all of the nodes. */ + goto done; + } + + /* + * Get a buffer full of zeros and write it to the unused + * node. Since we'll probably be writing a lot of nodes, + * bypass the journal (to avoid a transaction that's too + * big). Instead, this behaves more like clearing out + * nodes when extending a B-tree (eg., ClearBTNodes). + */ + bp = buf_getblk(vp, nodeNumber + bitNumber, btreePtr->nodeSize, 0, 0, BLK_META); + if (bp == NULL) + { + printf("hfs: BTZeroUnusedNodes: unable to read node %u\n", nodeNumber + bitNumber); + err = EIO; + goto ErrorExit; + } + + if (buf_flags(bp) & B_LOCKED) { + /* + * This node is already part of a transaction and will be written when + * the transaction is committed, so don't write it here. 
If we did, then + * we'd hit a panic in hfs_vnop_bwrite because the B_LOCKED bit is still set. + */ + buf_brelse(bp); + continue; + } + + buf_clear(bp); + buf_markaged(bp); + + /* + * Try not to hog the buffer cache. Wait for the write + * every 32 nodes. If VNOP_BWRITE reports an error, bail out and bubble + * it up to the function calling us. If we tried to update a read-only + * mount on read-only media, for example, catching the error will let + * us alert the callers of this function that they should maintain + * the mount in read-only mode. + + */ + ++numWritten; + if (numWritten % 32 == 0) { + err = VNOP_BWRITE(bp); + if (err) { + goto ErrorExit; + } + } + else { + buf_bawrite(bp); + } + } + } + + /* Go to the next word in the bitmap */ + ++pos; + nodeNumber += 16; + } + } + +ErrorExit: +done: + (void) ReleaseNode(btreePtr, &mapNode); + + return err; +} diff --git a/core/BTreeMiscOps.c b/core/BTreeMiscOps.c new file mode 100644 index 0000000..a8682ef --- /dev/null +++ b/core/BTreeMiscOps.c @@ -0,0 +1,676 @@ +/* + * Copyright (c) 2000-2003, 2005-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: BTreeMiscOps.c + + Contains: Miscellaneous operations for the BTree Module. + + Version: xxx put the technology version here xxx + + Written by: Gordon Sheridan and Bill Bruffey + + Copyright: (c) 1992-1999 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Don Brady + + Other Contact: Mark Day + + Technology: File Systems + + Writers: + + (DSH) Deric Horn + (msd) Mark Day + (djb) Don Brady + + Change History (most recent first): + + 6/1/99 djb Sync up with Mac OS 8.6. + 9/4/97 djb Optimize TrySimpleReplace for the case where record size is not + changing. + 4/23/97 djb first checked in + + 3/31/97 djb Move ClearMemory to Utilities.c. + 3/17/97 DSH Casting for DFA + 2/27/97 msd Remove temporary fix from last revision. BTree EOF's should be + correct now, so check for strict equality. + 2/26/97 msd Fix a casting problem in ClearMemory. TEMPORARY FIX: Made + VerifyHeader more lenient, allowing the EOF to be greater than + the amount actually used by nodes; this should really be fixed + in the formatting code (which needs to compute the real BTree + sizes before writing the volume header). 
+ 2/19/97 djb Added ClearMemory. Changed CalcKeyLength to KeyLength. + 1/3/97 djb Added support for large keys. + 12/19/96 djb first checked in + + History applicable to original Scarecrow Design: + + <9> 10/25/96 ser Changing for new VFPI + <8> 10/18/96 ser Converting over VFPI changes + <7> 9/17/96 dkh More BTree statistics. Change IsItAHint to not always check to + see if the hint node is allocated. + <6> 9/16/96 dkh Revised BTree statistics. + <5> 6/20/96 dkh Radar #1358740. Change from using Pools to debug MemAllocators. + <4> 1/22/96 dkh Change Pools.i inclusion to PoolsPriv.i + <3> 1/10/96 msd Change 64-bit math to use real function names from Math64.i. + <2> 12/7/95 dkh D10E2 build. Changed usage of Ref data type to LogicalAddress. + <1> 10/18/95 rst Moved from Scarecrow project. + + <19> 4/26/95 prp In UpdateHeader, clear the dirty flag after the BTree is updated. + <18> 1/12/95 wjk Adopt Model FileSystem changes in D5. + <17> 11/16/94 prp Add IsItAHint routine and use it whenever hint's node number was + used for testing. + <16> 10/5/94 bk add pools.h include file + <15> 9/30/94 prp Get in sync with D2 interface changes. + <14> 7/22/94 wjk Convert to the new set of header files. + <13> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and + NRCmds environments. + <12> 11/30/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and + NRCmds environments. + <11> 11/23/93 wjk Changes required to compile on the RS6000. + <10> 8/31/93 prp Use U64SetU instead of S64Set. + <9> 6/2/93 gs Update for changes to FSErrors.h and add some comments. + <8> 5/21/93 gs Modify UpdateHeader to write out attributes. Remove + Get/UpdateNode from TrySimpleReplace. + <7> 5/10/93 gs Add TrySimpleReplace routine. + <6> 3/23/93 gs Change MoveData to take void * instead of Ptr. Add UpdateHeader + and ClearBytes routines. + <5> 2/8/93 gs Add FindIteratorPosition. + <4> 12/10/92 gs Implement CheckKeyDescriptor and the KeyDescriptor interpreter. + <3> 12/8/92 gs Add GetKeyDescriptor, VerifyHeader, and Alloc/Dealloc memory + routines. + <2> 12/2/92 gs Add CompareKeys routine. + <1> 11/15/92 gs first checked in + +*/ + +#include "BTreesPrivate.h" +#include "hfs_btreeio.h" + + +////////////////////////////// Routine Definitions ////////////////////////////// + +/*------------------------------------------------------------------------------- +Routine: CalcKeyRecordSize - Return size of combined key/record structure. + +Function: Rounds keySize and recSize so they will end on word boundaries. + Does NOT add size of offset. + +Input: keySize - length of key (including length field) + recSize - length of record data + +Output: none + +Result: u_int16_t - size of combined key/record that will be inserted in btree +-------------------------------------------------------------------------------*/ + +u_int16_t CalcKeyRecordSize (u_int16_t keySize, + u_int16_t recSize ) +{ + if ( M_IsOdd (keySize) ) keySize += 1; // pad byte + + if (M_IsOdd (recSize) ) recSize += 1; // pad byte + + return (keySize + recSize); +} + + + +/*------------------------------------------------------------------------------- +Routine: VerifyHeader - Validate fields of the BTree header record. + +Function: Examines the fields of the BTree header record to determine if the + fork appears to contain a valid BTree. 
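+
+Note: The checks below require nodeSize to be a power of two between 512 and
+ 32768, the fork to hold at least totalNodes * nodeSize bytes, freeNodes and
+ every recorded node number to be less than totalNodes, treeDepth to be at
+ most kMaxTreeDepth, and btreeType to be a recognized type.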
+ +Input: forkPtr - pointer to fork control block + header - pointer to BTree header + + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus VerifyHeader (FCB *filePtr, + BTHeaderRec *header ) +{ + u_int64_t forkSize; + u_int32_t totalNodes; + + + switch (header->nodeSize) // node size == 512*2^n + { + case 512: + case 1024: + case 2048: + case 4096: + case 8192: + case 16384: + case 32768: break; + default: return fsBTInvalidHeaderErr; //€€ E_BadNodeType + } + + totalNodes = header->totalNodes; + + forkSize = (u_int64_t)totalNodes * (u_int64_t)header->nodeSize; + + if ( forkSize > (u_int64_t)filePtr->fcbEOF ) + return fsBTInvalidHeaderErr; + + if ( header->freeNodes >= totalNodes ) + return fsBTInvalidHeaderErr; + + if ( header->rootNode >= totalNodes ) + return fsBTInvalidHeaderErr; + + if ( header->firstLeafNode >= totalNodes ) + return fsBTInvalidHeaderErr; + + if ( header->lastLeafNode >= totalNodes ) + return fsBTInvalidHeaderErr; + + if ( header->treeDepth > kMaxTreeDepth ) + return fsBTInvalidHeaderErr; + + + /////////////////////////// Check BTree Type //////////////////////////////// + + switch (header->btreeType) + { + case 0: // HFS Type - no Key Descriptor + case kUserBTreeType: // with Key Descriptors etc. + case kReservedBTreeType: // Desktop Mgr BTree ? + break; + + default: return fsBTUnknownVersionErr; + } + + return noErr; +} + + + +OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr) +{ + return (btreePtr->flags & kBTHeaderDirty); +} + + + +/*------------------------------------------------------------------------------- +Routine: UpdateHeader - Write BTreeInfoRec fields to Header node. + +Function: Checks the kBTHeaderDirty flag in the BTreeInfoRec and updates the + header node if necessary. + +Input: btreePtr - pointer to BTreeInfoRec + + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus UpdateHeader(BTreeControlBlockPtr btreePtr, Boolean forceWrite) +{ + OSStatus err; + BlockDescriptor node; + BTHeaderRec *header; + u_int32_t options; + + if ((btreePtr->flags & kBTHeaderDirty) == 0) // btree info already flushed + return noErr; + + err = GetNode (btreePtr, kHeaderNodeNum, 0, &node ); + if (err != noErr) { + return err; + } + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + + header = (BTHeaderRec*) ((char *)node.buffer + sizeof(BTNodeDescriptor)); + + header->treeDepth = btreePtr->treeDepth; + header->rootNode = btreePtr->rootNode; + header->leafRecords = btreePtr->leafRecords; + header->firstLeafNode = btreePtr->firstLeafNode; + header->lastLeafNode = btreePtr->lastLeafNode; + header->nodeSize = btreePtr->nodeSize; //€€ this shouldn't change + header->maxKeyLength = btreePtr->maxKeyLength; //€€ neither should this + header->totalNodes = btreePtr->totalNodes; + header->freeNodes = btreePtr->freeNodes; + header->btreeType = btreePtr->btreeType; + + // ignore header->clumpSize; //€€ rename this field? + + if (forceWrite) + options = kForceWriteBlock; + else + options = kLockTransaction; + + err = UpdateNode (btreePtr, &node, 0, options); + + btreePtr->flags &= (~kBTHeaderDirty); + + return err; +} + + + +/*------------------------------------------------------------------------------- +Routine: FindIteratorPosition - One_line_description. 
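+ (i.e. find the leaf node and record index at which the iterator's key is,
+ or would be, positioned; the hint node is used when still valid, otherwise
+ the tree is searched.)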
+ +Function: Brief_description_of_the_function_and_any_side_effects + +Algorithm: see FSC.BT.BTIterateRecord.PICT + +Note: //€€ document side-effects of bad node hints + +Input: btreePtr - description + iterator - description + + +Output: iterator - description + left - description + middle - description + right - description + nodeNum - description + returnIndex - description + foundRecord - description + + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus FindIteratorPosition (BTreeControlBlockPtr btreePtr, + BTreeIteratorPtr iterator, + BlockDescriptor *left, + BlockDescriptor *middle, + BlockDescriptor *right, + u_int32_t *returnNodeNum, + u_int16_t *returnIndex, + Boolean *foundRecord ) +{ + OSStatus err; + Boolean foundIt; + u_int32_t nodeNum; + u_int16_t leftIndex, index, rightIndex; + Boolean validHint; + + // assume btreePtr valid + // assume left, middle, right point to BlockDescriptors + // assume nodeNum points to u_int32_t + // assume index points to u_int16_t + // assume foundRecord points to Boolean + + left->buffer = nil; + left->blockHeader = nil; + middle->buffer = nil; + middle->blockHeader = nil; + right->buffer = nil; + right->blockHeader = nil; + + foundIt = false; + + if (iterator == nil) // do we have an iterator? + { + err = fsBTInvalidIteratorErr; + goto ErrorExit; + } + + err = IsItAHint (btreePtr, iterator, &validHint); + M_ExitOnError (err); + + nodeNum = iterator->hint.nodeNum; + if (! validHint) // does the hint appear to be valid? + { + goto SearchTheTree; + } + + err = GetNode (btreePtr, nodeNum, kGetNodeHint, middle); + if( err == fsBTInvalidNodeErr ) // returned if nodeNum is out of range + goto SearchTheTree; + + M_ExitOnError (err); + + if ( ((NodeDescPtr) middle->buffer)->kind != kBTLeafNode || + ((NodeDescPtr) middle->buffer)->numRecords <= 0 ) + { + goto SearchTheTree; + } + + foundIt = SearchNode (btreePtr, middle->buffer, &iterator->key, &index); + if (foundIt == true) + { + ++btreePtr->numValidHints; + goto SuccessfulExit; + } + iterator->hint.nodeNum = 0; + + if (index == 0) + { + if (((NodeDescPtr) middle->buffer)->bLink == 0) // before 1st btree record + { + goto SuccessfulExit; + } + + nodeNum = ((NodeDescPtr) middle->buffer)->bLink; + + // BTree nodes are always grabbed in left to right order. + // Therefore release the current node before looking up the + // left node. + err = ReleaseNode(btreePtr, middle); + M_ExitOnError(err); + + // Look up the left node + err = GetNode (btreePtr, nodeNum, 0, left); + M_ExitOnError (err); + + // Look up the current node again + err = GetRightSiblingNode (btreePtr, left->buffer, middle); + M_ExitOnError (err); + + if ( ((NodeDescPtr) left->buffer)->kind != kBTLeafNode || + ((NodeDescPtr) left->buffer)->numRecords <= 0 ) + { + goto SearchTheTree; + } + + foundIt = SearchNode (btreePtr, left->buffer, &iterator->key, &leftIndex); + if (foundIt == true) + { + *right = *middle; + *middle = *left; + left->buffer = nil; + index = leftIndex; + + goto SuccessfulExit; + } + + if (leftIndex == 0) // we're lost! + { + goto SearchTheTree; + } + else if (leftIndex >= ((NodeDescPtr) left->buffer)->numRecords) + { + nodeNum = ((NodeDescPtr) left->buffer)->fLink; + + PanicIf (index != 0, "FindIteratorPosition: index != 0"); //€€ just checking... 
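+ // The key sorts after every record in the left sibling, so it belongs at
+ // index 0 of the original (middle) node, which is the left node's fLink.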
+ goto SuccessfulExit; + } + else + { + *right = *middle; + *middle = *left; + left->buffer = nil; + index = leftIndex; + + goto SuccessfulExit; + } + } + else if (index >= ((NodeDescPtr) middle->buffer)->numRecords) + { + if (((NodeDescPtr) middle->buffer)->fLink == 0) // beyond last record + { + goto SuccessfulExit; + } + + nodeNum = ((NodeDescPtr) middle->buffer)->fLink; + + err = GetRightSiblingNode (btreePtr, middle->buffer, right); + M_ExitOnError (err); + + if ( ((NodeDescPtr) right->buffer)->kind != kBTLeafNode || + ((NodeDescPtr) right->buffer)->numRecords <= 0 ) + { + goto SearchTheTree; + } + + foundIt = SearchNode (btreePtr, right->buffer, &iterator->key, &rightIndex); + if (rightIndex >= ((NodeDescPtr) right->buffer)->numRecords) // we're lost + { + goto SearchTheTree; + } + else // we found it, or rightIndex==0, or rightIndexbuffer = nil; + index = rightIndex; + + goto SuccessfulExit; + } + } + + + //////////////////////////// Search The Tree //////////////////////////////// + +SearchTheTree: + { + TreePathTable treePathTable; // so we only use stack space if we need to + + err = ReleaseNode (btreePtr, left); M_ExitOnError (err); + err = ReleaseNode (btreePtr, middle); M_ExitOnError (err); + err = ReleaseNode (btreePtr, right); M_ExitOnError (err); + + err = SearchTree ( btreePtr, &iterator->key, treePathTable, &nodeNum, middle, &index); + switch (err) //€€ separate find condition from exceptions + { + case noErr: foundIt = true; break; + case fsBTRecordNotFoundErr: break; + default: goto ErrorExit; + } + } + + /////////////////////////////// Success! //////////////////////////////////// + +SuccessfulExit: + + *returnNodeNum = nodeNum; + *returnIndex = index; + *foundRecord = foundIt; + + return noErr; + + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + (void) ReleaseNode (btreePtr, left); + (void) ReleaseNode (btreePtr, middle); + (void) ReleaseNode (btreePtr, right); + + *returnNodeNum = 0; + *returnIndex = 0; + *foundRecord = false; + + return err; +} + + + +/////////////////////////////// CheckInsertParams /////////////////////////////// + +OSStatus CheckInsertParams (FCB *filePtr, + BTreeIterator *iterator, + FSBufferDescriptor *record, + u_int16_t recordLen ) +{ + BTreeControlBlockPtr btreePtr; + + if (filePtr == nil) return paramErr; + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + if (btreePtr == nil) return fsBTInvalidFileErr; + if (iterator == nil) return paramErr; + if (record == nil) return paramErr; + + // check total key/record size limit + if ( CalcKeyRecordSize (CalcKeySize(btreePtr, &iterator->key), recordLen) > (btreePtr->nodeSize >> 1)) + return fsBTRecordTooLargeErr; + + return noErr; +} + + + +/*------------------------------------------------------------------------------- +Routine: TrySimpleReplace - Attempts a simple insert, set, or replace. + +Function: If a hint exitst for the iterator, attempt to find the key in the hint + node. If the key is found, an insert operation fails. If the is not + found, a replace operation fails. If the key was not found, and the + insert position is greater than 0 and less than numRecords, the record + is inserted, provided there is enough freeSpace. If the key was found, + and there is more freeSpace than the difference between the new record + and the old record, the old record is deleted and the new record is + inserted. 
+ +Assumptions: iterator key has already been checked by CheckKey + + +Input: btreePtr - description + iterator - description + record - description + recordLen - description + operation - description + + +Output: recordInserted - description + + +Result: noErr - success + E_RecordExits - insert operation failure + != noErr - GetNode, ReleaseNode, UpdateNode returned an error +-------------------------------------------------------------------------------*/ + +OSStatus TrySimpleReplace (BTreeControlBlockPtr btreePtr, + NodeDescPtr nodePtr, + BTreeIterator *iterator, + FSBufferDescriptor *record, + u_int16_t recordLen, + Boolean *recordInserted ) +{ + u_int32_t oldSpace; + u_int32_t spaceNeeded; + u_int16_t index; + u_int16_t keySize; + Boolean foundIt; + Boolean didItFit; + + + *recordInserted = false; // we'll assume this won't work... + + if ( nodePtr->kind != kBTLeafNode ) + return noErr; // we're in the weeds! + + foundIt = SearchNode (btreePtr, nodePtr, &iterator->key, &index); + + if ( foundIt == false ) + return noErr; // we might be lost... + + keySize = CalcKeySize(btreePtr, &iterator->key); // includes length field + + spaceNeeded = CalcKeyRecordSize (keySize, recordLen); + + oldSpace = GetRecordSize (btreePtr, nodePtr, index); + + if ( spaceNeeded == oldSpace ) + { + u_int8_t * dst; + + dst = GetRecordAddress (btreePtr, nodePtr, index); + + if ( M_IsOdd (keySize) ) + ++keySize; // add pad byte + + dst += keySize; // skip over key to point at record + + BlockMoveData(record->bufferAddress, dst, recordLen); // blast away... + + *recordInserted = true; + } + else if ( (GetNodeFreeSize(btreePtr, nodePtr) + oldSpace) >= spaceNeeded) + { + DeleteRecord (btreePtr, nodePtr, index); + + didItFit = InsertKeyRecord (btreePtr, nodePtr, index, + &iterator->key, KeyLength(btreePtr, &iterator->key), + record->bufferAddress, recordLen); + PanicIf (didItFit == false, "TrySimpleInsert: InsertKeyRecord returned false!"); + + *recordInserted = true; + } + // else not enough space... + + return noErr; +} + + +/*------------------------------------------------------------------------------- +Routine: IsItAHint - checks the hint within a BTreeInterator. + +Function: checks the hint within a BTreeInterator. If it is non-zero, it may + possibly be valid. + +Input: btreePtr - pointer to control block for BTree file + iterator - pointer to BTreeIterator + +Output: answer - true if the hint looks reasonable + - false if the hint is 0 + +Result: noErr - success +-------------------------------------------------------------------------------*/ + + +OSStatus IsItAHint (BTreeControlBlockPtr btreePtr, BTreeIterator *iterator, Boolean *answer) +{ + ++btreePtr->numHintChecks; + +#if DEBUG + if (iterator->hint.nodeNum >= btreePtr->totalNodes) + { + *answer = false; + } else + +#endif + if (iterator->hint.nodeNum == 0) + { + *answer = false; + } + else + { + *answer = true; + ++btreePtr->numPossibleHints; + } + + return noErr; +} diff --git a/core/BTreeNodeOps.c b/core/BTreeNodeOps.c new file mode 100644 index 0000000..9fee0b4 --- /dev/null +++ b/core/BTreeNodeOps.c @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2000, 2002, 2005-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: BTreeNodeOps.c + + Contains: Single-node operations for the BTree Module. + + Version: xxx put the technology version here xxx + + Written by: Gordon Sheridan and Bill Bruffey + + Copyright: (c) 1992-1999 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Don Brady + + Other Contact: Mark Day + + Technology: File Systems + + Writers: + + (msd) Mark Day + (djb) Don Brady + + Change History (most recent first): + + 6/1/99 djb Sync up with Mac OS 8.6. + 4/113/99 djb Fix key size checking bug in CheckNode. + 3/19/99 djb Added key size checking to CheckNode. + 3/26/98 djb Added PrintNode for debugging. + 9/4/97 djb Removed GetRightSiblingNode and GetLeftSiblingNode - they are + now macros. SearchNode is now in BTreeSearchNode.a. + 8/22/97 djb Turn off debugging code in CheckKey. + 7/24/97 djb Add summary traces for Get/Rel Node. Made GetRecordOffset into a + macro. Only call CheckNode if the node came from disk. + 7/21/97 msd Make GetRecordByIndex check its record index input; it now + returns an OSStatus. + 4/23/97 djb first checked in + + 2/19/97 djb Changes to support big node cache. + 1/3/97 djb Added support for large keys. + 12/19/96 djb first checked in + + + History applicable to original Scarecrow Design: + + <6> 10/25/96 ser Changing for new VFPI + <5> 9/17/96 dkh Add bounds checking to GetNode. Update GetNode to not assert + that CheckNode failed if the node is all zeroes. This can happen + if the hint case if the fetched node has been deallocated + <4> 3/7/96 dkh Change GetNewNode() to not use kGetEmptyBlock. Instead use + kGetBlock to fetch a block from the disk itself. €€€ Why? + <3> 1/22/96 dkh Add #include Memory.h + <2> 1/10/96 msd Change 64-bit math to use real function names from Math64.i. + <1> 10/18/95 rst Moved from Scarecrow project. + + <17> 7/18/95 mbb Change MoveData & ClearBytes to BlockMoveData & BlockZero. + <16> 1/31/95 prp GetBlockProc interface uses a 64 bit node number. + <15> 1/12/95 wjk Adopt Model FileSystem changes in D5. + <14> 9/30/94 prp Get in sync with D2 interface changes. + <13> 7/25/94 wjk Eliminate usage of BytePtr in favor of UInt8 *. + <12> 7/22/94 wjk Convert to the new set of header files. + <11> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and + NRCmds environments. + <10> 11/30/93 wjk Change some Ptr's to BytePtr's in function definitions so they + agree with their prototypes. + <9> 8/31/93 prp Use U64SetU instead of S64Set. + <8> 5/21/93 gs Maintain statistical counters on Get/Release node routines. 
+ <7> 5/10/93 gs Change keySize parameter to keyLength for InsertKeyRecord + routine. Calculate number of bytes in key from keyLength to + account for length and pad bytes. Add GetChildNodeNum routine. + <6> 3/23/93 gs Add InsertKeyRecord routine. + <5> 2/8/93 gs Fix bug in SearchNode that caused "off by 1" error when final + compare was searchKey > trialKey. Add UpdateNode. + <4> 12/10/92 gs Change keyLength field of key to 'length'. + <3> 12/8/92 gs Incorporate suggestions from preliminary code review. + <2> 12/2/92 gs Implement routines. + <1> 11/15/92 gs Define routine interfaces. + +*/ + +#include "BTreesPrivate.h" + + + +///////////////////////// BTree Module Node Operations ////////////////////////// +// +// GetNode - Call FS Agent to get node +// GetNewNode - Call FS Agent to get a new node +// ReleaseNode - Call FS Agent to release node obtained by GetNode. +// UpdateNode - Mark a node as dirty and call FS Agent to release it. +// +// ClearNode - Clear a node to all zeroes. +// +// InsertRecord - Inserts a record into a BTree node. +// InsertKeyRecord - Inserts a key and record pair into a BTree node. +// DeleteRecord - Deletes a record from a BTree node. +// +// SearchNode - Return index for record that matches key. +// LocateRecord - Return pointer to key and data, and size of data. +// +// GetNodeDataSize - Return the amount of space used for data in the node. +// GetNodeFreeSize - Return the amount of free space in the node. +// +// GetRecordOffset - Return the offset for record "index". +// GetRecordAddress - Return address of record "index". +// GetOffsetAddress - Return address of offset for record "index". +// +// InsertOffset - Inserts a new offset into a node. +// DeleteOffset - Deletes an offset from a node. +// +///////////////////////////////////////////////////////////////////////////////// + + + +////////////////////// Routines Internal To BTreeNodeOps.c ////////////////////// + +u_int16_t GetRecordOffset (BTreeControlBlockPtr btree, + NodeDescPtr node, + u_int16_t index ); + +u_int16_t *GetOffsetAddress (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ); + +void InsertOffset (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index, + u_int16_t delta ); + +void DeleteOffset (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ); + + +///////////////////////////////////////////////////////////////////////////////// + +#define GetRecordOffset(btreePtr,node,index) (*(short *) ((u_int8_t *)(node) + (btreePtr)->nodeSize - ((index) << 1) - kOffsetSize)) + + +/*------------------------------------------------------------------------------- + +Routine: GetNode - Call FS Agent to get node + +Function: Gets an existing BTree node from FS Agent and verifies it. + +Input: btreePtr - pointer to BTree control block + nodeNum - number of node to request + +Output: nodePtr - pointer to beginning of node (nil if error) + +Result: + noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus GetNode (BTreeControlBlockPtr btreePtr, + u_int32_t nodeNum, + u_int32_t flags, + NodeRec *nodePtr ) +{ + OSStatus err; + GetBlockProcPtr getNodeProc; + u_int32_t options; + + + // is nodeNum within proper range? 
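+ // (valid node numbers run from 0, the header node, through totalNodes - 1)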
+ if( nodeNum >= btreePtr->totalNodes ) + { + Panic("GetNode:nodeNum >= totalNodes"); + err = fsBTInvalidNodeErr; + goto ErrorExit; + } + + nodePtr->blockSize = btreePtr->nodeSize; // indicate the size of a node + + options = kGetBlock; + if ( flags & kGetNodeHint ) + { + options |= kGetBlockHint; + } + + getNodeProc = btreePtr->getBlockProc; + err = getNodeProc (btreePtr->fileRefNum, + nodeNum, + options, + nodePtr ); + + if (err != noErr) + { + Panic ("GetNode: getNodeProc returned error."); + goto ErrorExit; + } + ++btreePtr->numGetNodes; + + return noErr; + +ErrorExit: + nodePtr->buffer = nil; + nodePtr->blockHeader = nil; + + return err; +} + + + +/*------------------------------------------------------------------------------- + +Routine: GetNewNode - Call FS Agent to get a new node + +Function: Gets a new BTree node from FS Agent and initializes it to an empty + state. + +Input: btreePtr - pointer to BTree control block + nodeNum - number of node to request + +Output: returnNodePtr - pointer to beginning of node (nil if error) + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus GetNewNode (BTreeControlBlockPtr btreePtr, + u_int32_t nodeNum, + NodeRec *returnNodePtr ) +{ + OSStatus err; + NodeDescPtr node; + void *pos; + GetBlockProcPtr getNodeProc; + + + //////////////////////// get buffer for new node //////////////////////////// + + returnNodePtr->blockSize = btreePtr->nodeSize; // indicate the size of a node + + getNodeProc = btreePtr->getBlockProc; + err = getNodeProc (btreePtr->fileRefNum, + nodeNum, + kGetBlock+kGetEmptyBlock, + returnNodePtr ); + + if (err != noErr) + { + Panic ("GetNewNode: getNodeProc returned error."); + // returnNodePtr->buffer = nil; + return err; + } + ++btreePtr->numGetNewNodes; + + + ////////////////////////// initialize the node ////////////////////////////// + + node = returnNodePtr->buffer; + + ClearNode (btreePtr, node); // clear the node + + pos = (char *)node + btreePtr->nodeSize - 2; // find address of last offset + *(u_int16_t *)pos = sizeof (BTNodeDescriptor); // set offset to beginning of free space + + + return noErr; +} + + + +/*------------------------------------------------------------------------------- + +Routine: ReleaseNode - Call FS Agent to release node obtained by GetNode. + +Function: Informs the FS Agent that a BTree node may be released. + +Input: btreePtr - pointer to BTree control block + nodeNum - number of node to release + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus ReleaseNode (BTreeControlBlockPtr btreePtr, + NodePtr nodePtr ) +{ + OSStatus err; + ReleaseBlockProcPtr releaseNodeProc; + + + err = noErr; + + if (nodePtr->buffer != nil) + { + releaseNodeProc = btreePtr->releaseBlockProc; + err = releaseNodeProc (btreePtr->fileRefNum, + nodePtr, + kReleaseBlock ); + PanicIf (err, "ReleaseNode: releaseNodeProc returned error."); + ++btreePtr->numReleaseNodes; + } + + nodePtr->buffer = nil; + nodePtr->blockHeader = nil; + + return err; +} + + + + +/*------------------------------------------------------------------------------- + +Routine: TrashNode - Call FS Agent to release node obtained by GetNode, and + not store it...mark it as bad. + +Function: Informs the FS Agent that a BTree node may be released and thrown away. 
+ +Input: btreePtr - pointer to BTree control block + nodeNum - number of node to release + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus TrashNode (BTreeControlBlockPtr btreePtr, + NodePtr nodePtr ) +{ + OSStatus err; + ReleaseBlockProcPtr releaseNodeProc; + + + err = noErr; + + if (nodePtr->buffer != nil) + { + releaseNodeProc = btreePtr->releaseBlockProc; + err = releaseNodeProc (btreePtr->fileRefNum, + nodePtr, + kReleaseBlock | kTrashBlock ); + PanicIf (err, "TrashNode: releaseNodeProc returned error."); + ++btreePtr->numReleaseNodes; + } + + nodePtr->buffer = nil; + nodePtr->blockHeader = nil; + + return err; +} + + + +/*------------------------------------------------------------------------------- + +Routine: UpdateNode - Mark a node as dirty and call FS Agent to release it. + +Function: Marks a BTree node dirty and informs the FS Agent that it may be released. + +Input: btreePtr - pointer to BTree control block + nodeNum - number of node to release + transactionID - ID of transaction this node update is a part of + flags - special flags to pass to ReleaseNodeProc + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus UpdateNode (BTreeControlBlockPtr btreePtr, + NodePtr nodePtr, + u_int32_t transactionID, + u_int32_t flags ) +{ +#pragma unused(transactionID) + + OSStatus err; + ReleaseBlockProcPtr releaseNodeProc; + + + err = noErr; + + if (nodePtr->buffer != nil) // Why call UpdateNode if nil ?!? + { + releaseNodeProc = btreePtr->releaseBlockProc; + err = releaseNodeProc (btreePtr->fileRefNum, + nodePtr, + flags | kMarkBlockDirty ); + ++btreePtr->numUpdateNodes; + M_ExitOnError (err); + } + + nodePtr->buffer = nil; + nodePtr->blockHeader = nil; + + return noErr; + +ErrorExit: + + return err; +} + +/*------------------------------------------------------------------------------- + +Routine: ClearNode - Clear a node to all zeroes. + +Function: Writes zeroes from beginning of node for nodeSize bytes. + +Input: btreePtr - pointer to BTree control block + node - pointer to node to clear + +Result: none +-------------------------------------------------------------------------------*/ + +void ClearNode (BTreeControlBlockPtr btreePtr, NodeDescPtr node ) +{ + ClearMemory( node, btreePtr->nodeSize ); +} + +/*------------------------------------------------------------------------------- + +Routine: InsertRecord - Inserts a record into a BTree node. + +Function: + +Note: Record size must be even! + +Input: btreePtr - pointer to BTree control block + node - pointer to node to insert the record + index - position record is to be inserted + recPtr - pointer to record to insert + +Result: noErr - success + fsBTFullErr - record larger than remaining free space. +-------------------------------------------------------------------------------*/ + +Boolean InsertRecord (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index, + RecordPtr recPtr, + u_int16_t recSize ) +{ + u_int16_t freeSpace; + u_int16_t indexOffset; + u_int16_t freeOffset; + u_int16_t bytesToMove; + void *src; + void *dst; + + //// will new record fit in node? 
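+ // (the record needs recSize bytes of data space plus a 2-byte entry in the
+ // offset table at the end of the node, hence the "+ 2" below)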
+ + freeSpace = GetNodeFreeSize (btreePtr, node); + //€€ we could get freeOffset & calc freeSpace + if ( freeSpace < recSize + 2) + { + return false; + } + + + //// make hole for new record + + indexOffset = GetRecordOffset (btreePtr, node, index); + freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); + + src = ((Ptr) node) + indexOffset; + dst = ((Ptr) src) + recSize; + bytesToMove = freeOffset - indexOffset; + if (bytesToMove) + MoveRecordsRight (src, dst, bytesToMove); + + + //// adjust offsets for moved records + + InsertOffset (btreePtr, node, index, recSize); + + + //// move in the new record + + dst = ((Ptr) node) + indexOffset; + MoveRecordsLeft (recPtr, dst, recSize); + + return true; +} + + + +/*------------------------------------------------------------------------------- + +Routine: InsertKeyRecord - Inserts a record into a BTree node. + +Function: + +Note: Record size must be even! + +Input: btreePtr - pointer to BTree control block + node - pointer to node to insert the record + index - position record is to be inserted + keyPtr - pointer to key for record to insert + keyLength - length of key (or maxKeyLength) + recPtr - pointer to record to insert + recSize - number of bytes to copy for record + +Result: noErr - success + fsBTFullErr - record larger than remaining free space. +-------------------------------------------------------------------------------*/ + +Boolean InsertKeyRecord (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index, + KeyPtr keyPtr, + u_int16_t keyLength, + RecordPtr recPtr, + u_int16_t recSize ) +{ + u_int16_t freeSpace; + u_int16_t indexOffset; + u_int16_t freeOffset; + u_int16_t bytesToMove; + u_int8_t * src; + u_int8_t * dst; + u_int16_t keySize; + u_int16_t rawKeyLength; + u_int16_t sizeOfLength; + + //// calculate actual key size + + if ( btreePtr->attributes & kBTBigKeysMask ) + keySize = keyLength + sizeof(u_int16_t); + else + keySize = keyLength + sizeof(u_int8_t); + + if ( M_IsOdd (keySize) ) + ++keySize; // add pad byte + + + //// will new record fit in node? + + freeSpace = GetNodeFreeSize (btreePtr, node); + //€€ we could get freeOffset & calc freeSpace + if ( freeSpace < keySize + recSize + 2) + { + return false; + } + + + //// make hole for new record + + indexOffset = GetRecordOffset (btreePtr, node, index); + freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); + + src = ((u_int8_t *) node) + indexOffset; + dst = ((u_int8_t *) src) + keySize + recSize; + bytesToMove = freeOffset - indexOffset; + if (bytesToMove) + MoveRecordsRight (src, dst, bytesToMove); + + + //// adjust offsets for moved records + + InsertOffset (btreePtr, node, index, keySize + recSize); + + + //// copy record key + + dst = ((u_int8_t *) node) + indexOffset; + + if ( btreePtr->attributes & kBTBigKeysMask ) + { + *((u_int16_t *)dst) = keyLength; // use keyLength rather than key.length + dst = (u_int8_t *) (((u_int16_t *)dst) + 1); + rawKeyLength = keyPtr->length16; + sizeOfLength = 2; + } + else + { + *dst++ = keyLength; // use keyLength rather than key.length + rawKeyLength = keyPtr->length8; + sizeOfLength = 1; + } + + MoveRecordsLeft ( ((u_int8_t *) keyPtr) + sizeOfLength, dst, rawKeyLength); // copy key + + // any pad bytes? 
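+	// keySize includes the length field and any rounding up to an even size,
+	// and keyLength may exceed the key's stored length (e.g. maxKeyLength for
+	// an index key), so zero everything past the copied key bytes before the
+	// record data is moved in below.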
+ bytesToMove = keySize - rawKeyLength; + if (bytesToMove) + ClearMemory (dst + rawKeyLength, bytesToMove); // clear pad bytes in index key + + + //// copy record data + + dst = ((u_int8_t *) node) + indexOffset + keySize; + MoveRecordsLeft (recPtr, dst, recSize); + + return true; +} + + + +/*------------------------------------------------------------------------------- + +Routine: DeleteRecord - Deletes a record from a BTree node. + +Function: + +Input: btreePtr - pointer to BTree control block + node - pointer to node to insert the record + index - position record is to be inserted + +Result: none +-------------------------------------------------------------------------------*/ + +void DeleteRecord (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ) +{ + int16_t indexOffset; + int16_t nextOffset; + int16_t freeOffset; + int16_t bytesToMove; + void *src; + void *dst; + + //// compress records + indexOffset = GetRecordOffset (btreePtr, node, index); + nextOffset = GetRecordOffset (btreePtr, node, index + 1); + freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); + + src = ((Ptr) node) + nextOffset; + dst = ((Ptr) node) + indexOffset; + bytesToMove = freeOffset - nextOffset; + if (bytesToMove) + MoveRecordsLeft (src, dst, bytesToMove); + + //// Adjust the offsets + DeleteOffset (btreePtr, node, index); + + /* clear out new free space */ + bytesToMove = nextOffset - indexOffset; + ClearMemory(GetRecordAddress(btreePtr, node, node->numRecords), bytesToMove); + +} + + + +/*------------------------------------------------------------------------------- + +Routine: SearchNode - Return index for record that matches key. + +Function: Returns the record index for the record that matches the search key. + If no record was found that matches the search key, the "insert index" + of where the record should go is returned instead. + +Algorithm: A binary search algorithm is used to find the specified key. + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + searchKey - pointer to the key to match + +Output: index - pointer to beginning of key for record + +Result: true - success (index = record index) + false - key did not match anything in node (index = insert index) +-------------------------------------------------------------------------------*/ +Boolean +SearchNode( BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + KeyPtr searchKey, + u_int16_t *returnIndex ) +{ + int32_t lowerBound; + int32_t upperBound; + int32_t index; + int32_t result; + KeyPtr trialKey; + u_int16_t *offset; + KeyCompareProcPtr compareProc = btreePtr->keyCompareProc; + + lowerBound = 0; + upperBound = node->numRecords - 1; + offset = (u_int16_t *) ((u_int8_t *)(node) + (btreePtr)->nodeSize - kOffsetSize); + + while (lowerBound <= upperBound) { + index = (lowerBound + upperBound) >> 1; + + trialKey = (KeyPtr) ((u_int8_t *)node + *(offset - index)); + + result = compareProc(searchKey, trialKey); + + if (result < 0) { + upperBound = index - 1; /* search < trial */ + } else if (result > 0) { + lowerBound = index + 1; /* search > trial */ + } else { + *returnIndex = index; /* search == trial */ + return true; + } + } + + *returnIndex = lowerBound; /* lowerBound is insert index */ + return false; +} + + +/*------------------------------------------------------------------------------- + +Routine: GetRecordByIndex - Return pointer to key and data, and size of data. 
+ +Function: Returns a pointer to beginning of key for record, a pointer to the + beginning of the data for the record, and the size of the record data + (does not include the size of the key). + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + index - index of record to get + +Output: keyPtr - pointer to beginning of key for record + dataPtr - pointer to beginning of data for record + dataSize - size of the data portion of the record + +Result: none +-------------------------------------------------------------------------------*/ + +OSStatus GetRecordByIndex (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index, + KeyPtr *keyPtr, + u_int8_t * *dataPtr, + u_int16_t *dataSize ) +{ + u_int16_t offset; + u_int16_t nextOffset; + u_int16_t keySize; + + // + // Make sure index is valid (in range 0..numRecords-1) + // + if (index >= node->numRecords) + return fsBTRecordNotFoundErr; + + //// find keyPtr + offset = GetRecordOffset (btreePtr, node, index); + *keyPtr = (KeyPtr) ((Ptr)node + offset); + + //// find dataPtr + keySize = CalcKeySize(btreePtr, *keyPtr); + if ( M_IsOdd (keySize) ) + ++keySize; // add pad byte + + offset += keySize; // add the key length to find data offset + *dataPtr = (u_int8_t *) node + offset; + + //// find dataSize + nextOffset = GetRecordOffset (btreePtr, node, index + 1); + *dataSize = nextOffset - offset; + + return noErr; +} + + + +/*------------------------------------------------------------------------------- + +Routine: GetNodeDataSize - Return the amount of space used for data in the node. + +Function: Gets the size of the data currently contained in a node, excluding + the node header. (record data + offset overhead) + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + +Result: - number of bytes used for data and offsets in the node. +-------------------------------------------------------------------------------*/ + +u_int16_t GetNodeDataSize (BTreeControlBlockPtr btreePtr, NodeDescPtr node ) +{ + u_int16_t freeOffset; + + freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); + + return freeOffset + (node->numRecords << 1) - sizeof (BTNodeDescriptor); +} + + + +/*------------------------------------------------------------------------------- + +Routine: GetNodeFreeSize - Return the amount of free space in the node. + +Function: + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + +Result: - number of bytes of free space in the node. +-------------------------------------------------------------------------------*/ + +u_int16_t GetNodeFreeSize (BTreeControlBlockPtr btreePtr, NodeDescPtr node ) +{ + u_int16_t freeOffset; + + freeOffset = GetRecordOffset (btreePtr, node, node->numRecords); //€€ inline? + + return btreePtr->nodeSize - freeOffset - (node->numRecords << 1) - kOffsetSize; +} + + + +/*------------------------------------------------------------------------------- + +Routine: GetRecordOffset - Return the offset for record "index". 
+ +Function: + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + index - record to obtain offset for + +Result: - offset (in bytes) from beginning of node of record specified by index +-------------------------------------------------------------------------------*/ +// make this a macro (for inlining) +#if 0 +u_int16_t GetRecordOffset (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ) +{ + void *pos; + + + pos = (u_int8_t *)node + btreePtr->nodeSize - (index << 1) - kOffsetSize; + + return *(short *)pos; +} +#endif + + + +/*------------------------------------------------------------------------------- + +Routine: GetRecordAddress - Return address of record "index". + +Function: + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + index - record to obtain offset address for + +Result: - pointer to record "index". +-------------------------------------------------------------------------------*/ +// make this a macro (for inlining) +#if 0 +u_int8_t * GetRecordAddress (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ) +{ + u_int8_t * pos; + + pos = (u_int8_t *)node + GetRecordOffset (btreePtr, node, index); + + return pos; +} +#endif + + + +/*------------------------------------------------------------------------------- + +Routine: GetRecordSize - Return size of record "index". + +Function: + +Note: This does not work on the FreeSpace index! + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + index - record to obtain record size for + +Result: - size of record "index". +-------------------------------------------------------------------------------*/ + +u_int16_t GetRecordSize (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ) +{ + u_int16_t *pos; + + pos = (u_int16_t *) ((Ptr)node + btreePtr->nodeSize - (index << 1) - kOffsetSize); + + return *(pos-1) - *pos; +} + + + +/*------------------------------------------------------------------------------- +Routine: GetOffsetAddress - Return address of offset for record "index". + +Function: + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + index - record to obtain offset address for + +Result: - pointer to offset for record "index". +-------------------------------------------------------------------------------*/ + +u_int16_t *GetOffsetAddress (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ) +{ + void *pos; + + pos = (Ptr)node + btreePtr->nodeSize - (index << 1) -2; + + return (u_int16_t *)pos; +} + + + +/*------------------------------------------------------------------------------- +Routine: GetChildNodeNum - Return child node number from index record "index". + +Function: Returns the first u_int32_t stored after the key for record "index". + +Assumes: The node is an Index Node. + The key.length stored at record "index" is ODD. //€€ change for variable length index keys + +Input: btreePtr - pointer to BTree control block + node - pointer to node that contains the record + index - record to obtain child node number from + +Result: - child node number from record "index". 
+-------------------------------------------------------------------------------*/ + +u_int32_t GetChildNodeNum (BTreeControlBlockPtr btreePtr, + NodeDescPtr nodePtr, + u_int16_t index ) +{ + u_int8_t * pos; + + pos = GetRecordAddress (btreePtr, nodePtr, index); + pos += CalcKeySize(btreePtr, (BTreeKey *) pos); // key.length + size of length field + + return *(u_int32_t *)pos; +} + + + +/*------------------------------------------------------------------------------- +Routine: InsertOffset - Add an offset and adjust existing offsets by delta. + +Function: Add an offset at 'index' by shifting 'index+1' through the last offset + and adjusting them by 'delta', the size of the record to be inserted. + The number of records contained in the node is also incremented. + +Input: btreePtr - pointer to BTree control block + node - pointer to node + index - index at which to insert record + delta - size of record to be inserted + +Result: none +-------------------------------------------------------------------------------*/ + +void InsertOffset (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index, + u_int16_t delta ) +{ + u_int16_t *src, *dst; + u_int16_t numOffsets; + + src = GetOffsetAddress (btreePtr, node, node->numRecords); // point to free offset + dst = src - 1; // point to new offset + numOffsets = node->numRecords++ - index; // subtract index & postincrement + + do { + *dst++ = *src++ + delta; // to tricky? + } while (numOffsets--); +} + + + +/*------------------------------------------------------------------------------- + +Routine: DeleteOffset - Delete an offset. + +Function: Delete the offset at 'index' by shifting 'index+1' through the last offset + and adjusting them by the size of the record 'index'. + The number of records contained in the node is also decremented. + +Input: btreePtr - pointer to BTree control block + node - pointer to node + index - index at which to delete record + +Result: none +-------------------------------------------------------------------------------*/ + +void DeleteOffset (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index ) +{ + u_int16_t *src, *dst; + u_int16_t numOffsets; + u_int16_t delta; + + dst = GetOffsetAddress (btreePtr, node, index); + src = dst - 1; + delta = *src - *dst; + numOffsets = --node->numRecords - index; // predecrement numRecords & subtract index + + while (numOffsets--) + { + *--dst = *--src - delta; // work our way left + } +} + + diff --git a/core/BTreeNodeReserve.c b/core/BTreeNodeReserve.c new file mode 100644 index 0000000..c75af1f --- /dev/null +++ b/core/BTreeNodeReserve.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2004-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include "BTreesPrivate.h" +#include "sys/malloc.h" +#include + + +/* + * B-tree Node Reserve + * + * BTReserveSpace + * BTReleaseReserve + * BTUpdateReserve + * + * Each kernel thread can have it's own reserve of b-tree + * nodes. This reserve info is kept in a hash table. + * + * Don't forget to call BTReleaseReserve when you're finished + * or you will leave stale node reserves in the hash. + */ + + +/* + * BE CAREFUL WHEN INCREASING THE SIZE OF THIS STRUCT! + * + * It must remain equal in size to the opaque cat_cookie_t + * struct (in hfs_catalog.h). + */ +struct nreserve { + LIST_ENTRY(nreserve) nr_hash; /* hash chain */ + int nr_nodecnt; /* count of nodes held in reserve */ + int nr_newnodes; /* nodes that were allocated */ + struct vnode *nr_btvp; /* b-tree file vnode */ + void *nr_tag; /* unique tag (per thread) */ +}; + +#define NR_GET_TAG() (current_thread()) + +#define NR_CACHE 17 + +#define NR_HASH(btvp, tag) \ + (&nr_hashtbl[((((intptr_t)(btvp)) >> 8) ^ ((intptr_t)(tag) >> 4)) & nr_hashmask]) + +LIST_HEAD(nodereserve, nreserve) *nr_hashtbl; + +u_long nr_hashmask; + +lck_grp_t * nr_lck_grp; +lck_grp_attr_t * nr_lck_grp_attr; +lck_attr_t * nr_lck_attr; + +lck_mtx_t nr_mutex; + +/* Internal Node Reserve Hash Routines (private) */ +static void nr_insert (struct vnode *, struct nreserve *nrp, int); +static void nr_delete (struct vnode *, struct nreserve *nrp, int *); +static void nr_update (struct vnode *, int); + + +/* + * BTReserveSetup - initialize the node reserve hash table + */ +void BTReserveSetup(void) +{ + if (sizeof(struct nreserve) != sizeof(cat_cookie_t)) + panic("hfs: BTReserveSetup: nreserve size != opaque struct size"); + + nr_hashtbl = hashinit(NR_CACHE, M_TEMP, &nr_hashmask); + + nr_lck_grp_attr= lck_grp_attr_alloc_init(); + nr_lck_grp = lck_grp_alloc_init("btree_node_reserve", nr_lck_grp_attr); + + nr_lck_attr = lck_attr_alloc_init(); + + lck_mtx_init(&nr_mutex, nr_lck_grp, nr_lck_attr); +} + + +/* + * BTReserveSpace - obtain a node reserve (for current thread) + * + * Used by the Catalog Layer (hfs_catalog.c) to reserve space. + * + * When data is NULL, we only insure that there's enough space + * but it is not reserved (assumes you keep the b-tree lock). + */ +int +BTReserveSpace(FCB *file, int operations, void* data) +{ + BTreeControlBlock *btree; + int rsrvNodes, availNodes, totalNodes; + int height; + int inserts, deletes; + u_int32_t clumpsize; + int err = 0; + + btree = (BTreeControlBlockPtr)file->fcbBTCBPtr; + clumpsize = file->ff_clumpsize; + + REQUIRE_FILE_LOCK(btree->fileRefNum, true); + + /* + * The node reserve is based on the number of b-tree + * operations (insert/deletes) and the height of the + * tree. + */ + height = btree->treeDepth; + if (height < 2) + height = 2; /* prevent underflow in rsrvNodes calculation */ + inserts = operations & 0xffff; + deletes = operations >> 16; + + /* + * Allow for at least one root split. + * + * Each delete operation can propogate a big key up the + * index. 
This can cause a split at each level up. + * + * Each insert operation can cause a local split and a + * split at each level up. + */ + rsrvNodes = 1 + (deletes * (height - 2)) + (inserts * (height - 1)); + + availNodes = btree->freeNodes - btree->reservedNodes; + + if (rsrvNodes > availNodes) { + u_int32_t reqblks, freeblks, rsrvblks; + uint32_t bt_rsrv; + struct hfsmount *hfsmp; + + /* + * For UNIX conformance, we try and reserve the MIN of either 5% of + * total file blocks or 10MB worth of blocks, for growing existing + * files. On non-HFS filesystems, creating a new directory entry may + * not cause additional disk space to be allocated, but on HFS, creating + * a new entry could cause the b-tree to grow. As a result, we take + * some precautions here to prevent that on configurations that try to + * satisfy conformance. + */ + hfsmp = VTOVCB(btree->fileRefNum); + rsrvblks = ((u_int64_t)hfsmp->allocLimit * 5) / 100; + if (hfsmp->blockSize > HFS_BT_MAXRESERVE) { + bt_rsrv = 1; + } + else { + bt_rsrv = (HFS_BT_MAXRESERVE / hfsmp->blockSize); + } + rsrvblks = MIN(rsrvblks, bt_rsrv); + + freeblks = hfs_freeblks(hfsmp, 0); + if (freeblks <= rsrvblks) { + /* When running low, disallow adding new items. */ + if ((inserts > 0) && (deletes == 0)) { + return (ENOSPC); + } + freeblks = 0; + } else { + freeblks -= rsrvblks; + } + reqblks = clumpsize / hfsmp->blockSize; + + if (reqblks > freeblks) { + reqblks = ((rsrvNodes - availNodes) * btree->nodeSize) / hfsmp->blockSize; + /* When running low, disallow adding new items. */ + if ((reqblks > freeblks) && (inserts > 0) && (deletes == 0)) { + return (ENOSPC); + } + file->ff_clumpsize = freeblks * hfsmp->blockSize; + } + totalNodes = rsrvNodes + btree->totalNodes - availNodes; + + /* See if we also need a map node */ + if (totalNodes > (int)CalcMapBits(btree)) { + ++totalNodes; + } + if ((err = ExtendBTree(btree, totalNodes))) { + goto out; + } + } + /* Save this reserve if this is a persistent request. */ + if (data) { + btree->reservedNodes += rsrvNodes; + nr_insert(btree->fileRefNum, (struct nreserve *)data, rsrvNodes); + } +out: + /* Put clump size back if it was changed. */ + if (file->ff_clumpsize != clumpsize) + file->ff_clumpsize = clumpsize; + + return (err); +} + + +/* + * BTReleaseReserve - release the node reserve held by current thread + * + * Used by the Catalog Layer (hfs_catalog.c) to relinquish reserved space. + */ +int +BTReleaseReserve(FCB *file, void* data) +{ + BTreeControlBlock *btree; + int nodecnt; + + btree = (BTreeControlBlockPtr)file->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btree->fileRefNum, true); + + nr_delete(btree->fileRefNum, (struct nreserve *)data, &nodecnt); + + if (nodecnt) + btree->reservedNodes -= nodecnt; + + return (0); +} + +/* + * BTUpdateReserve - update a node reserve for allocations that occurred. + */ +void +BTUpdateReserve(BTreeControlBlockPtr btreePtr, int nodes) +{ + nr_update(btreePtr->fileRefNum, nodes); +} + + +/*----------------------------------------------------------------------------*/ +/* Node Reserve Hash Functions (private) */ + + +int nrinserts = 0; +int nrdeletes = 0; + +/* + * Insert a new node reserve. 
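+ *
+ * If the current thread already holds a reserve for this b-tree vnode, the
+ * new count is folded into the existing entry and nrp->nr_tag is cleared,
+ * which later tells nr_delete that this cookie was never linked into the hash.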
+ */ +static void +nr_insert(struct vnode * btvp, struct nreserve *nrp, int nodecnt) +{ + struct nodereserve *nrhead; + struct nreserve *tmp_nrp; + void * tag = NR_GET_TAG(); + + /* + * Check the cache - there may already be a reserve + */ + lck_mtx_lock(&nr_mutex); + nrhead = NR_HASH(btvp, tag); + for (tmp_nrp = nrhead->lh_first; tmp_nrp; + tmp_nrp = tmp_nrp->nr_hash.le_next) { + if ((tmp_nrp->nr_tag == tag) && (tmp_nrp->nr_btvp == btvp)) { + nrp->nr_tag = 0; + tmp_nrp->nr_nodecnt += nodecnt; + lck_mtx_unlock(&nr_mutex); + return; + } + } + + nrp->nr_nodecnt = nodecnt; + nrp->nr_newnodes = 0; + nrp->nr_btvp = btvp; + nrp->nr_tag = tag; + LIST_INSERT_HEAD(nrhead, nrp, nr_hash); + ++nrinserts; + lck_mtx_unlock(&nr_mutex); +} + +/* + * Delete a node reserve. + */ +static void +nr_delete(struct vnode * btvp, struct nreserve *nrp, int *nodecnt) +{ + void * tag = NR_GET_TAG(); + + lck_mtx_lock(&nr_mutex); + if (nrp->nr_tag) { + if ((nrp->nr_tag != tag) || (nrp->nr_btvp != btvp)) + panic("hfs: nr_delete: invalid NR (%p)", nrp); + LIST_REMOVE(nrp, nr_hash); + *nodecnt = nrp->nr_nodecnt; + bzero(nrp, sizeof(struct nreserve)); + ++nrdeletes; + } else { + *nodecnt = 0; + } + lck_mtx_unlock(&nr_mutex); +} + + +/* + * Update a node reserve for any allocations that occurred. + */ +static void +nr_update(struct vnode * btvp, int nodecnt) +{ + struct nodereserve *nrhead; + struct nreserve *nrp; + void* tag = NR_GET_TAG(); + + lck_mtx_lock(&nr_mutex); + + nrhead = NR_HASH(btvp, tag); + for (nrp = nrhead->lh_first; nrp; nrp = nrp->nr_hash.le_next) { + if ((nrp->nr_tag == tag) && (nrp->nr_btvp == btvp)) { + nrp->nr_newnodes += nodecnt; + break; + } + } + lck_mtx_unlock(&nr_mutex); +} diff --git a/core/BTreeScanner.c b/core/BTreeScanner.c new file mode 100644 index 0000000..6ebf78a --- /dev/null +++ b/core/BTreeScanner.c @@ -0,0 +1,410 @@ +/* + * Copyright (c) 1996-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + * @(#)BTreeScanner.c + */ +#include +#include "hfs_endian.h" + +#include "BTreeScanner.h" + +static int FindNextLeafNode( BTScanState *scanState, Boolean avoidIO ); +static int ReadMultipleNodes( BTScanState *scanState ); + + +//_________________________________________________________________________________ +// +// Routine: BTScanNextRecord +// +// Purpose: Return the next leaf record in a scan. +// +// Inputs: +// scanState Scanner's current state +// avoidIO If true, don't do any I/O to refill the buffer +// +// Outputs: +// key Key of found record (points into buffer) +// data Data of found record (points into buffer) +// dataSize Size of data in found record +// +// Result: +// noErr Found a valid record +// btNotFound No more records +// ??? Needed to do I/O to get next node, but avoidIO set +// +// Notes: +// This routine returns pointers to the found record's key and data. It +// does not copy the key or data to a caller-supplied buffer (like +// GetBTreeRecord would). The caller must not modify the key or data. +//_________________________________________________________________________________ + +int BTScanNextRecord( BTScanState * scanState, + Boolean avoidIO, + void * * key, + void * * data, + u_int32_t * dataSize ) +{ + int err; + u_int16_t dataSizeShort; + + err = noErr; + + // + // If this is the first call, there won't be any nodes in the buffer, so go + // find the first first leaf node (if any). + // + if ( scanState->nodesLeftInBuffer == 0 ) + { + err = FindNextLeafNode( scanState, avoidIO ); + } + + while ( err == noErr ) + { + // See if we have a record in the current node + err = GetRecordByIndex( scanState->btcb, scanState->currentNodePtr, + scanState->recordNum, (KeyPtr *) key, + (u_int8_t **) data, &dataSizeShort ); + + if ( err == noErr ) + { + ++scanState->recordsFound; + ++scanState->recordNum; + if (dataSize != NULL) + *dataSize = dataSizeShort; + return noErr; + } + else if (err > 0) + { + // We didn't get the node through the cache, so we can't invalidate it. + //XXX Should we do something else to avoid seeing the same record again? + return err; + } + + // We're done with the current node. See if we've returned all the records + if ( scanState->recordsFound >= scanState->btcb->leafRecords ) + { + return btNotFound; + } + + // Move to the first record of the next leaf node + scanState->recordNum = 0; + err = FindNextLeafNode( scanState, avoidIO ); + } + + // + // If we got an EOF error from FindNextLeafNode, then there are no more leaf + // records to be found. + // + if ( err == fsEndOfIterationErr ) + err = btNotFound; + + return err; + +} /* BTScanNextRecord */ + + +//_________________________________________________________________________________ +// +// Routine: FindNextLeafNode +// +// Purpose: Point to the next leaf node in the buffer. Read more nodes +// into the buffer if needed (and allowed). +// +// Inputs: +// scanState Scanner's current state +// avoidIO If true, don't do any I/O to refill the buffer +// +// Result: +// noErr Found a valid record +// fsEndOfIterationErr No more nodes in file +// ??? 
Needed to do I/O to get next node, but avoidIO set +//_________________________________________________________________________________ + +static int FindNextLeafNode( BTScanState *scanState, Boolean avoidIO ) +{ + int err; + BlockDescriptor block; + FileReference fref; + + err = noErr; // Assume everything will be OK + + while ( 1 ) + { + if ( scanState->nodesLeftInBuffer == 0 ) + { + // Time to read some more nodes into the buffer + if ( avoidIO ) + { + return fsBTTimeOutErr; + } + else + { + // read some more nodes into buffer + err = ReadMultipleNodes( scanState ); + if ( err != noErr ) + break; + } + } + else + { + // Adjust the node counters and point to the next node in the buffer + ++scanState->nodeNum; + --scanState->nodesLeftInBuffer; + + // If we've looked at all nodes in the tree, then we're done + if ( scanState->nodeNum >= scanState->btcb->totalNodes ) + return fsEndOfIterationErr; + + if ( scanState->nodesLeftInBuffer == 0 ) + { + scanState->recordNum = 0; + continue; + } + + scanState->currentNodePtr = (BTNodeDescriptor *)(((u_int8_t *)scanState->currentNodePtr) + + scanState->btcb->nodeSize); + } + + /* Fake a BlockDescriptor */ + block.blockHeader = NULL; /* No buffer cache buffer */ + block.buffer = scanState->currentNodePtr; + block.blockNum = scanState->nodeNum; + block.blockSize = scanState->btcb->nodeSize; + block.blockReadFromDisk = 1; + block.isModified = 0; + + fref = scanState->btcb->fileRefNum; + + /* This node was read from disk, so it must be swapped/checked. + * Since we are reading multiple nodes, we might have read an + * unused node. Therefore we allow swapping of unused nodes. + */ + err = hfs_swap_BTNode(&block, fref, kSwapBTNodeBigToHost, true); + if ( err != noErr ) { + printf("hfs: FindNextLeafNode: Error from hfs_swap_BTNode (node %u)\n", scanState->nodeNum); + continue; + } + + if ( scanState->currentNodePtr->kind == kBTLeafNode ) + break; + } + + return err; + +} /* FindNextLeafNode */ + + +//_________________________________________________________________________________ +// +// Routine: ReadMultipleNodes +// +// Purpose: Read one or more nodes into the buffer. +// +// Inputs: +// theScanStatePtr Scanner's current state +// +// Result: +// noErr One or nodes were read +// fsEndOfIterationErr No nodes left in file, none in buffer +//_________________________________________________________________________________ + +static int ReadMultipleNodes( BTScanState *theScanStatePtr ) +{ + int myErr = E_NONE; + BTreeControlBlockPtr myBTreeCBPtr; + daddr64_t myPhyBlockNum; + u_int32_t myBufferSize; + struct vnode * myDevPtr; + unsigned int myBlockRun; + u_int32_t myBlocksInBufferCount; + + // release old buffer if we have one + if ( theScanStatePtr->bufferPtr != NULL ) + { + buf_markinvalid(theScanStatePtr->bufferPtr); + buf_brelse( theScanStatePtr->bufferPtr ); + theScanStatePtr->bufferPtr = NULL; + theScanStatePtr->currentNodePtr = NULL; + } + + myBTreeCBPtr = theScanStatePtr->btcb; + + // map logical block in catalog btree file to physical block on volume + myErr = hfs_bmap(myBTreeCBPtr->fileRefNum, theScanStatePtr->nodeNum, + &myDevPtr, &myPhyBlockNum, &myBlockRun); + if ( myErr != E_NONE ) + { + goto ExitThisRoutine; + } + + // bmap block run gives us the remaining number of valid blocks (number of blocks + // minus the first). so if there are 10 valid blocks our run number will be 9. 
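+	// for example (hypothetical numbers): a block run of 2 means 3 contiguous
+	// blocks are readable, so at most (2 + 1) * nodeSize bytes are read below,
+	// even if the scan buffer could hold more.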
+ // blocks, in our case is the same as nodes (both are 4K) + myBlocksInBufferCount = (theScanStatePtr->bufferSize / myBTreeCBPtr->nodeSize ); + myBufferSize = theScanStatePtr->bufferSize; + if ( (myBlockRun + 1) < myBlocksInBufferCount ) + { + myBufferSize = (myBlockRun + 1) * myBTreeCBPtr->nodeSize; + } + + // now read blocks from the device + myErr = (int)buf_meta_bread(myDevPtr, + myPhyBlockNum, + myBufferSize, + NOCRED, + &theScanStatePtr->bufferPtr ); + if ( myErr != E_NONE ) + { + goto ExitThisRoutine; + } + + theScanStatePtr->nodesLeftInBuffer = buf_count(theScanStatePtr->bufferPtr) / theScanStatePtr->btcb->nodeSize; + theScanStatePtr->currentNodePtr = (BTNodeDescriptor *) buf_dataptr(theScanStatePtr->bufferPtr); + +ExitThisRoutine: + return myErr; + +} /* ReadMultipleNodes */ + + + +//_________________________________________________________________________________ +// +// Routine: BTScanInitialize +// +// Purpose: Prepare to start a new BTree scan, or resume a previous one. +// +// Inputs: +// btreeFile The B-Tree's file control block +// startingNode Initial node number +// startingRecord Initial record number within node +// recordsFound Number of valid records found so far +// bufferSize Size (in bytes) of buffer +// +// Outputs: +// scanState Scanner's current state; pass to other scanner calls +// +// Notes: +// To begin a new scan and see all records in the B-Tree, pass zeroes for +// startingNode, startingRecord, and recordsFound. +// +// To resume a scan from the point of a previous BTScanTerminate, use the +// values returned by BTScanTerminate as input for startingNode, startingRecord, +// and recordsFound. +// +// When resuming a scan, the caller should check the B-tree's write count. If +// it is different from the write count when the scan was terminated, then the +// tree may have changed and the current state may be incorrect. In particular, +// you may see some records more than once, or never see some records. Also, +// the scanner may not be able to detect when all leaf records have been seen, +// and will have to scan through many empty nodes. +// +// XXXÊPerhaps the write count should be managed by BTScanInitialize and +// XXX BTScanTerminate? This would avoid the caller having to peek at +// XXX internal B-Tree structures. 
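+//
+// A minimal usage sketch (illustrative only; error handling and the caller's
+// declarations of btreeFCB, nextNode, nextRecord and found are assumed):
+//
+//		BTScanState	scanState;
+//		void		*key, *data;
+//		u_int32_t	dataSize;
+//
+//		if ( BTScanInitialize( btreeFCB, 0, 0, 0, kCatSearchBufferSize, &scanState ) == noErr )
+//		{
+//			while ( BTScanNextRecord( &scanState, false, &key, &data, &dataSize ) == noErr )
+//			{
+//				// key/data point into the scan buffer and must not be modified
+//			}
+//			(void) BTScanTerminate( &scanState, &nextNode, &nextRecord, &found );
+//		}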
+//_________________________________________________________________________________ + +int BTScanInitialize( const FCB * btreeFile, + u_int32_t startingNode, + u_int32_t startingRecord, + u_int32_t recordsFound, + u_int32_t bufferSize, + BTScanState * scanState ) +{ + BTreeControlBlock *btcb; + + // + // Make sure this is a valid B-Tree file + // + btcb = (BTreeControlBlock *) btreeFile->fcbBTCBPtr; + if (btcb == NULL) + return fsBTInvalidFileErr; + + // + // Make sure buffer size is big enough, and a multiple of the + // B-Tree node size + // + if ( bufferSize < btcb->nodeSize ) + return paramErr; + bufferSize = (bufferSize / btcb->nodeSize) * btcb->nodeSize; + + // + // Set up the scanner's state + // + scanState->bufferSize = bufferSize; + scanState->bufferPtr = NULL; + scanState->btcb = btcb; + scanState->nodeNum = startingNode; + scanState->recordNum = startingRecord; + scanState->currentNodePtr = NULL; + scanState->nodesLeftInBuffer = 0; // no nodes currently in buffer + scanState->recordsFound = recordsFound; + microuptime(&scanState->startTime); // initialize our throttle + + return noErr; + +} /* BTScanInitialize */ + + +//_________________________________________________________________________________ +// +// Routine: BTScanTerminate +// +// Purpose: Return state information about a scan so that it can be resumed +// later via BTScanInitialize. +// +// Inputs: +// scanState Scanner's current state +// +// Outputs: +// nextNode Node number to resume a scan (pass to BTScanInitialize) +// nextRecord Record number to resume a scan (pass to BTScanInitialize) +// recordsFound Valid records seen so far (pass to BTScanInitialize) +//_________________________________________________________________________________ + +int BTScanTerminate( BTScanState * scanState, + u_int32_t * startingNode, + u_int32_t * startingRecord, + u_int32_t * recordsFound ) +{ + *startingNode = scanState->nodeNum; + *startingRecord = scanState->recordNum; + *recordsFound = scanState->recordsFound; + + if ( scanState->bufferPtr != NULL ) + { + buf_markinvalid(scanState->bufferPtr); + buf_brelse( scanState->bufferPtr ); + scanState->bufferPtr = NULL; + scanState->currentNodePtr = NULL; + } + + return noErr; + +} /* BTScanTerminate */ + + diff --git a/core/BTreeScanner.h b/core/BTreeScanner.h new file mode 100644 index 0000000..05a1043 --- /dev/null +++ b/core/BTreeScanner.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1996-2004 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + * @(#)BTreeScanner.h + */ + +#ifndef _BTREESCANNER_H_ +#define _BTREESCANNER_H_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#include + +#include "FileMgrInternal.h" +#include "BTreesPrivate.h" + +// amount of time we are allowed to process a catalog search (in µ secs) +// NOTE - code assumes kMaxMicroSecsInKernel is less than 1,000,000 +enum { kMaxMicroSecsInKernel = (1000 * 100) }; // 1 tenth of a second + +// btree node scanner buffer size. at 32K we get 8 nodes. this is the size used +// in Mac OS 9 +enum { kCatSearchBufferSize = (32 * 1024) }; + + +/* + * ============ W A R N I N G ! ============ + * DO NOT INCREASE THE SIZE OF THIS STRUCT! + * It must be less than or equal to the size of + * the opaque searchstate struct (in sys/attr.h). + */ +/* Private description used in hfs_search */ +struct CatPosition +{ + u_int32_t writeCount; /* The BTree's write count (to see if the catalog writeCount */ + /* changed since the last search). If 0, the rest */ + /* of the record is invalid, start from beginning. */ + u_int32_t nextNode; /* node number to resume search */ + u_int32_t nextRecord; /* record number to resume search */ + u_int32_t recordsFound; /* number of leaf records seen so far */ +}; +typedef struct CatPosition CatPosition; + + +/* + BTScanState - This structure is used to keep track of the current state + of a BTree scan. It contains both the dynamic state information (like + the current node number and record number) and information that is static + for the duration of a scan (such as buffer pointers). + + NOTE: recordNum may equal or exceed the number of records in the node + number nodeNum. If so, then the next attempt to get a record will move + to a new node number. +*/ +struct BTScanState +{ + // The following fields are set up once at initialization time. + // They are not changed during a scan. + u_int32_t bufferSize; + struct buf * bufferPtr; + BTreeControlBlock * btcb; + + // The following fields are the dynamic state of the current scan. 
+ u_int32_t nodeNum; // zero is first node + u_int32_t recordNum; // zero is first record + BTNodeDescriptor * currentNodePtr; // points to current node within buffer + u_int32_t nodesLeftInBuffer; // number of valid nodes still in the buffer + u_int32_t recordsFound; // number of leaf records seen so far + struct timeval startTime; // time we started catalog search +}; +typedef struct BTScanState BTScanState; + + +/* *********************** PROTOTYPES *********************** */ + +int BTScanInitialize( const FCB * btreeFile, + u_int32_t startingNode, + u_int32_t startingRecord, + u_int32_t recordsFound, + u_int32_t bufferSize, + BTScanState * scanState ); + +int BTScanNextRecord( BTScanState * scanState, + Boolean avoidIO, + void * * key, + void * * data, + u_int32_t * dataSize ); + +int BTScanTerminate( BTScanState * scanState, + u_int32_t * startingNode, + u_int32_t * startingRecord, + u_int32_t * recordsFound ); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* !_BTREESCANNER_H_ */ diff --git a/core/BTreeTreeOps.c b/core/BTreeTreeOps.c new file mode 100644 index 0000000..74cd04e --- /dev/null +++ b/core/BTreeTreeOps.c @@ -0,0 +1,1338 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: BTreeTreeOps.c + + Contains: Multi-node tree operations for the BTree Module. + + Version: xxx put the technology version here xxx + + Written by: Gordon Sheridan and Bill Bruffey + + Copyright: (c) 1992-1999 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Don Brady + + Other Contact: Mark Day + + Technology: File Systems + + Writers: + + (msd) Mark Day + (DSH) Deric Horn + (djb) Don Brady + + Change History (most recent first): + + 6/1/99 djb Sync up with Mac OS 8.6. + 12/8/97 djb Radar #2200632, CollapseTree wasn't marking root node dirty. + 11/24/97 djb Radar #2005325, InsertLevel incorrectly handled root splits! + 10/17/97 msd Conditionalize DebugStrs. + 5/16/97 msd InsertNode() needs a return statement in ErrorExit. + 4/23/97 djb first checked in + + 3/17/97 DSH Conditionalize out Panic assertion for SC. + 3/3/97 djb Removed DebugStr in InsertLevel. + 2/19/97 djb Major re-write of insert code; added InsertLevel and InsertNode. 
+ 1/27/97 djb InsertTree and DeleteTree are now recursive and support variable + sized index keys. + 1/16/97 djb Removed DebugStr in SearchTree. Added initial support for + variable sized index keys. + 1/3/97 djb Changed len8 to length8. + 1/3/97 djb Added support for large keys. + 12/19/96 djb first checked in + + History applicable to original Scarecrow Design: + + <3> 10/25/96 ser Changing for new VFPI + <2> 1/22/96 dkh Add #include Memory.h + <1> 10/18/95 rst Moved from Scarecrow project. + + <12> 7/18/95 mbb Change MoveData & ClearBytes to BlockMoveData & BlockZero. + <11> 9/30/94 prp Get in sync with D2 interface changes. + <10> 7/25/94 wjk Eliminate usage of BytePtr in favor of UInt8 *. + <9> 7/22/94 wjk Convert to the new set of header files. + <8> 12/2/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and + NRCmds environments. + <7> 11/30/93 wjk Change some Ptr's to BytePtr's in function definitions so they + agree with their prototypes. + <6> 5/21/93 gs Debug DeleteTree. Modify InsertTree for BTReplaceRecord. + <5> 5/10/93 gs Modify RotateLeft, and add DeleteTree, CollapseTree routines. + <4> 3/23/93 gs revise RotateLeft to use InsertKeyRecord instead of + InsertRecord. + <3> 3/23/93 gs Implement SplitLeft, InsertTree routine. + <2> 2/8/93 gs Implement SearchTree, and RotateLeft. + <1> 11/15/92 gs first checked in + +*/ + +#include "BTreesPrivate.h" +#include "hfs_btreeio.h" + +// +/////////////////////// Routines Internal To BTree Module /////////////////////// +// +// SearchTree +// InsertTree +// +////////////////////// Routines Internal To BTreeTreeOps.c ////////////////////// + +static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, + NodeDescPtr leftNode, + NodeDescPtr rightNode ); + +static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, + BlockDescriptor *blockPtr ); + +static OSStatus RotateLeft (BTreeControlBlockPtr btreePtr, + NodeDescPtr leftNode, + NodeDescPtr rightNode, + u_int16_t rightInsertIndex, + KeyPtr keyPtr, + u_int8_t * recPtr, + u_int16_t recSize, + u_int16_t *insertIndex, + u_int32_t *insertNodeNum, + Boolean *recordFit, + u_int16_t *recsRotated ); + +static Boolean RotateRecordLeft (BTreeControlBlockPtr btreePtr, + NodeDescPtr leftNode, + NodeDescPtr rightNode ); + +static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, + BlockDescriptor *leftNode, + BlockDescriptor *rightNode, + u_int32_t rightNodeNum, + u_int16_t index, + KeyPtr keyPtr, + u_int8_t * recPtr, + u_int16_t recSize, + u_int16_t *insertIndex, + u_int32_t *insertNodeNum, + u_int16_t *recsRotated ); + + + +static OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, + TreePathTable treePathTable, + InsertKey *primaryKey, + InsertKey *secondaryKey, + BlockDescriptor *targetNode, + u_int16_t index, + u_int16_t level, + u_int32_t *insertNode ); + +static OSErr InsertNode (BTreeControlBlockPtr btreePtr, + InsertKey *key, + BlockDescriptor *rightNode, + u_int32_t node, + u_int16_t index, + u_int32_t *newNode, + u_int16_t *newIndex, + BlockDescriptor *leftNode, + Boolean *updateParent, + Boolean *insertParent, + Boolean *rootSplit ); + +static u_int16_t GetKeyLength (const BTreeControlBlock *btreePtr, + const BTreeKey *key, + Boolean forLeafNode ); + + + +//////////////////////// BTree Multi-node Tree Operations /////////////////////// + + +/*------------------------------------------------------------------------------- + +Routine: SearchTree - Search BTree for key and set up Tree Path Table. 
+ +Function: Searches BTree for specified key, setting up the Tree Path Table to + reflect the search path. + + +Input: btreePtr - pointer to control block of BTree to search + keyPtr - pointer to the key to search for + treePathTable - pointer to the tree path table to construct + +Output: nodeNum - number of the node containing the key position + iterator - BTreeIterator specifying record or insert position + +Result: noErr - key found, index is record index + fsBTRecordNotFoundErr - key not found, index is insert index + fsBTEmptyErr - key not found, return params are nil + otherwise - catastrophic failure (GetNode/ReleaseNode failed) +-------------------------------------------------------------------------------*/ + +OSStatus SearchTree (BTreeControlBlockPtr btreePtr, + BTreeKeyPtr searchKey, + TreePathTable treePathTable, + u_int32_t *nodeNum, + BlockDescriptor *nodePtr, + u_int16_t *returnIndex ) +{ + OSStatus err; + int16_t level; // Expected depth of current node + u_int32_t curNodeNum; // Current node we're searching + NodeRec nodeRec; + u_int16_t index; + Boolean keyFound; + int8_t nodeKind; // Kind of current node (index/leaf) + KeyPtr keyPtr; + u_int8_t * dataPtr; + u_int16_t dataSize; + + + curNodeNum = btreePtr->rootNode; + level = btreePtr->treeDepth; + + if (level == 0) // is the tree empty? + { + err = fsBTEmptyErr; + goto ErrorExit; + } + + //€€ for debugging... + treePathTable [0].node = 0; + treePathTable [0].index = 0; + + while (true) + { + // + // [2550929] Node number 0 is the header node. It is never a valid + // index or leaf node. If we're ever asked to search through node 0, + // something has gone wrong (typically a bad child node number, or + // we found a node full of zeroes that we thought was an index node). + // + if (curNodeNum == 0) + { +// Panic("SearchTree: curNodeNum is zero!"); + err = btBadNode; + goto ErrorExit; + } + + err = GetNode (btreePtr, curNodeNum, 0, &nodeRec); + if (err != noErr) + { + goto ErrorExit; + } + + // + // [2550929] Sanity check the node height and node type. We expect + // particular values at each iteration in the search. This checking + // quickly finds bad pointers, loops, and other damage to the + // hierarchy of the B-tree. + // + if (((BTNodeDescriptor*)nodeRec.buffer)->height != level) + { +// Panic("Incorrect node height"); + err = btBadNode; + goto ReleaseAndExit; + } + nodeKind = ((BTNodeDescriptor*)nodeRec.buffer)->kind; + if (level == 1) + { + // Nodes at level 1 must be leaves, by definition + if (nodeKind != kBTLeafNode) + { + // Panic("Incorrect node type: expected leaf"); + err = btBadNode; + goto ReleaseAndExit; + } + } + else + { + // A node at any other depth must be an index node + if (nodeKind != kBTIndexNode) + { +// Panic("Incorrect node type: expected index"); + err = btBadNode; + goto ReleaseAndExit; + } + } + + keyFound = SearchNode (btreePtr, nodeRec.buffer, searchKey, &index); + + treePathTable [level].node = curNodeNum; + + if (nodeKind == kBTLeafNode) + { + treePathTable [level].index = index; + break; // were done... + } + + if ( (keyFound != true) && (index != 0)) + --index; + + treePathTable [level].index = index; + + err = GetRecordByIndex (btreePtr, nodeRec.buffer, index, &keyPtr, &dataPtr, &dataSize); + if (err != noErr) + { + // [2550929] If we got an error, it is probably because the index was bad + // (typically a corrupt node that confused SearchNode). Invalidate the node + // so we won't accidentally use the corrupted contents. NOTE: the Mac OS 9 + // sources call this InvalidateNode. 
+ + (void) TrashNode(btreePtr, &nodeRec); + goto ErrorExit; + } + + // Get the child pointer out of this index node. We're now done with the current + // node and can continue the search with the child node. + curNodeNum = *(u_int32_t *)dataPtr; + err = ReleaseNode (btreePtr, &nodeRec); + if (err != noErr) + { + goto ErrorExit; + } + + // The child node should be at a level one less than the parent. + --level; + } + + *nodeNum = curNodeNum; + *nodePtr = nodeRec; + *returnIndex = index; + + if (keyFound) + return noErr; // searchKey found, index identifies record in node + else + return fsBTRecordNotFoundErr; // searchKey not found, index identifies insert point + +ReleaseAndExit: + (void) ReleaseNode(btreePtr, &nodeRec); + // fall into ErrorExit + +ErrorExit: + + *nodeNum = 0; + nodePtr->buffer = nil; + nodePtr->blockHeader = nil; + *returnIndex = 0; + + return err; +} + + + + +////////////////////////////////// InsertTree /////////////////////////////////// + +OSStatus InsertTree ( BTreeControlBlockPtr btreePtr, + TreePathTable treePathTable, + KeyPtr keyPtr, + u_int8_t * recPtr, + u_int16_t recSize, + BlockDescriptor *targetNode, + u_int16_t index, + u_int16_t level, + Boolean replacingKey, + u_int32_t *insertNode ) +{ + InsertKey primaryKey; + OSStatus err; + + primaryKey.keyPtr = keyPtr; + primaryKey.keyLength = GetKeyLength(btreePtr, primaryKey.keyPtr, (level == 1)); + primaryKey.recPtr = recPtr; + primaryKey.recSize = recSize; + primaryKey.replacingKey = replacingKey; + primaryKey.skipRotate = false; + + err = InsertLevel (btreePtr, treePathTable, &primaryKey, nil, + targetNode, index, level, insertNode ); + + return err; + +} // End of InsertTree + + +////////////////////////////////// InsertLevel ////////////////////////////////// + +OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, + TreePathTable treePathTable, + InsertKey *primaryKey, + InsertKey *secondaryKey, + BlockDescriptor *targetNode, + u_int16_t index, + u_int16_t level, + u_int32_t *insertNode ) +{ + OSStatus err; + BlockDescriptor leftNode; + u_int32_t targetNodeNum; + u_int32_t newNodeNum; + u_int16_t newIndex; + Boolean insertParent; + Boolean updateParent; + Boolean newRoot; + InsertKey insertKey; + +#if defined(applec) && !defined(__SC__) + PanicIf ((level == 1) && (((NodeDescPtr)targetNode->buffer)->kind != kBTLeafNode), " InsertLevel: non-leaf at level 1! "); +#endif + leftNode.buffer = nil; + leftNode.blockHeader = nil; + targetNodeNum = treePathTable [level].node; + + insertParent = false; + updateParent = false; + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, targetNode); + + ////// process first insert ////// + + err = InsertNode (btreePtr, primaryKey, targetNode, targetNodeNum, index, + &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &newRoot ); + M_ExitOnError (err); + + if ( newRoot ) + { + // Extend the treePathTable by adding an entry for the new + // root node that references the current targetNode. + // + // If inserting the secondaryKey changes the first key of + // the target node, then we'll have to update the second + // key in the new root node. 
+ + treePathTable [level + 1].node = btreePtr->rootNode; + treePathTable [level + 1].index = 1; // 1 since we always split/rotate left + } + + if ( level == 1 ) + *insertNode = newNodeNum; + + ////// process second insert (if any) ////// + + if ( secondaryKey != nil ) + { + Boolean temp; + + err = InsertNode (btreePtr, secondaryKey, targetNode, newNodeNum, newIndex, + &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &temp); + M_ExitOnError (err); + } + + //////////////////////// Update Parent(s) /////////////////////////////// + + if ( insertParent || updateParent ) + { + BlockDescriptor parentNode; + u_int32_t parentNodeNum; + KeyPtr keyPtr; + u_int8_t * recPtr; + u_int16_t recSize; + + parentNode.buffer = nil; + parentNode.blockHeader = nil; + + secondaryKey = nil; + + PanicIf ( (level == btreePtr->treeDepth), " InsertLevel: unfinished insert!?"); + + ++level; + + // Get Parent Node data... + index = treePathTable [level].index; + parentNodeNum = treePathTable [level].node; + + PanicIf ( parentNodeNum == 0, " InsertLevel: parent node is zero!?"); + + err = GetNode (btreePtr, parentNodeNum, 0, &parentNode); // released as target node in next level up + M_ExitOnError (err); + ////////////////////////// Update Parent Index ////////////////////////////// + + if ( updateParent ) + { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &parentNode); + + //€€ debug: check if ptr == targetNodeNum + GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); + PanicIf( (*(u_int32_t *) recPtr) != targetNodeNum, " InsertLevel: parent ptr doesn't match target node!"); + + // need to delete and re-insert this parent key/ptr + // we delete it here and it gets re-inserted in the + // InsertLevel call below. + DeleteRecord (btreePtr, parentNode.buffer, index); + + primaryKey->keyPtr = (KeyPtr) GetRecordAddress( btreePtr, targetNode->buffer, 0 ); + primaryKey->keyLength = GetKeyLength(btreePtr, primaryKey->keyPtr, false); + primaryKey->recPtr = (u_int8_t *) &targetNodeNum; + primaryKey->recSize = sizeof(targetNodeNum); + primaryKey->replacingKey = kReplaceRecord; + primaryKey->skipRotate = insertParent; // don't rotate left if we have two inserts occuring + } + + ////////////////////////// Add New Parent Index ///////////////////////////// + + if ( insertParent ) + { + InsertKey *insertKeyPtr; + + if ( updateParent ) + { + insertKeyPtr = &insertKey; + secondaryKey = &insertKey; + } + else + { + insertKeyPtr = primaryKey; + } + + insertKeyPtr->keyPtr = (KeyPtr) GetRecordAddress (btreePtr, leftNode.buffer, 0); + insertKeyPtr->keyLength = GetKeyLength(btreePtr, insertKeyPtr->keyPtr, false); + insertKeyPtr->recPtr = (u_int8_t *) &((NodeDescPtr)targetNode->buffer)->bLink; + insertKeyPtr->recSize = sizeof(u_int32_t); + insertKeyPtr->replacingKey = kInsertRecord; + insertKeyPtr->skipRotate = false; // a rotate is OK during second insert + } + + err = InsertLevel (btreePtr, treePathTable, primaryKey, secondaryKey, + &parentNode, index, level, insertNode ); + M_ExitOnError (err); + } + + err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); // all done with target + M_ExitOnError (err); + + err = UpdateNode (btreePtr, &leftNode, 0, kLockTransaction); // all done with left sibling + M_ExitOnError (err); + + return noErr; + +ErrorExit: + + (void) ReleaseNode (btreePtr, targetNode); + (void) ReleaseNode (btreePtr, &leftNode); + + Panic (" InsertLevel: an error occurred!"); + + return err; + +} // End of InsertLevel + + + +////////////////////////////////// InsertNode 
/////////////////////////////////// + +static OSErr InsertNode (BTreeControlBlockPtr btreePtr, + InsertKey *key, + + BlockDescriptor *rightNode, + u_int32_t node, + u_int16_t index, + + u_int32_t *newNode, + u_int16_t *newIndex, + + BlockDescriptor *leftNode, + Boolean *updateParent, + Boolean *insertParent, + Boolean *rootSplit ) +{ + BlockDescriptor *targetNode = NULL; + u_int32_t leftNodeNum; + u_int16_t recsRotated; + OSErr err; + Boolean recordFit; + + *rootSplit = false; + + PanicIf ( rightNode->buffer == leftNode->buffer, " InsertNode: rightNode == leftNode, huh?"); + + leftNodeNum = ((NodeDescPtr) rightNode->buffer)->bLink; + + + /////////////////////// Try Simple Insert /////////////////////////////// + + /* sanity check our left and right nodes here. */ + if (node == leftNodeNum) { + if (leftNode->buffer == NULL) { + err = fsBTInvalidNodeErr; + M_ExitOnError(err); + } + else{ + targetNode = leftNode; + } + } + else { + // we can assume right node is initialized. + targetNode = rightNode; + } + + + recordFit = InsertKeyRecord (btreePtr, targetNode->buffer, index, key->keyPtr, key->keyLength, key->recPtr, key->recSize); + + if ( recordFit ) + { + *newNode = node; + *newIndex = index; + + if ( (index == 0) && (((NodeDescPtr) targetNode->buffer)->height != btreePtr->treeDepth) ) + *updateParent = true; // the first record changed so we need to update the parent + } + + + //////////////////////// Try Rotate Left //////////////////////////////// + + if ( !recordFit && leftNodeNum > 0 ) + { + PanicIf ( leftNode->buffer != nil, " InsertNode: leftNode already acquired!"); + + if ( leftNode->buffer == nil ) + { + err = GetNode (btreePtr, leftNodeNum, 0, leftNode); // will be released by caller or a split below + M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); + } + + PanicIf ( ((NodeDescPtr) leftNode->buffer)->fLink != node, " InsertNode, RotateLeft: invalid sibling link!" ); + + if ( !key->skipRotate ) // are rotates allowed? + { + err = RotateLeft (btreePtr, leftNode->buffer, rightNode->buffer, index, key->keyPtr, key->recPtr, + key->recSize, newIndex, newNode, &recordFit, &recsRotated ); + M_ExitOnError (err); + + if ( recordFit ) + { + if ( key->replacingKey || (recsRotated > 1) || (index > 0) ) + *updateParent = true; + } + } + } + + + //////////////////////// Try Split Left ///////////////////////////////// + + if ( !recordFit ) + { + // might not have left node... + err = SplitLeft (btreePtr, leftNode, rightNode, node, index, key->keyPtr, + key->recPtr, key->recSize, newIndex, newNode, &recsRotated); + M_ExitOnError (err); + + // if we split root node - add new root + + if ( ((NodeDescPtr) rightNode->buffer)->height == btreePtr->treeDepth ) + { + err = AddNewRootNode (btreePtr, leftNode->buffer, rightNode->buffer); // Note: does not update TPT + M_ExitOnError (err); + *rootSplit = true; + } + else + { + *insertParent = true; + + if ( key->replacingKey || (recsRotated > 1) || (index > 0) ) + *updateParent = true; + } + } + + return noErr; + +ErrorExit: + (void) ReleaseNode (btreePtr, leftNode); + return err; + +} // End of InsertNode + + +/*------------------------------------------------------------------------------- +Routine: DeleteTree - One_line_description. 
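+
+		[Summary added editorially in this excerpt, derived from the code below:
+		deletes the record at "index" from targetNode.  If the node becomes
+		empty, it is unlinked from its sibling chain, cleared and freed, and the
+		parent's index record is removed by recursing one level up.  If the
+		first record of the node was deleted, the parent's key for this node is
+		replaced via InsertTree.  At the root, CollapseTree shrinks the tree
+		while the root holds only a single index record.]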
+ +Function: Brief_description_of_the_function_and_any_side_effects + +ToDo: + +Input: btreePtr - description + treePathTable - description + targetNode - description + index - description + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, + TreePathTable treePathTable, + BlockDescriptor *targetNode, + u_int16_t index, + u_int16_t level ) +{ + OSStatus err; + BlockDescriptor parentNode; + BTNodeDescriptor *targetNodePtr; + u_int32_t targetNodeNum; + Boolean deleteRequired; + Boolean updateRequired; + + // XXXdbg - initialize these to null in case we get an + // error and try to exit before it's initialized + parentNode.buffer = nil; + parentNode.blockHeader = nil; + + deleteRequired = false; + updateRequired = false; + + targetNodeNum = treePathTable[level].node; + targetNodePtr = targetNode->buffer; + PanicIf (targetNodePtr == nil, "DeleteTree: targetNode has nil buffer!"); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, targetNode); + + DeleteRecord (btreePtr, targetNodePtr, index); + + //€€ coalesce remaining records? + + if ( targetNodePtr->numRecords == 0 ) // did we delete the last record? + { + BlockDescriptor siblingNode; + u_int32_t siblingNodeNum; + + deleteRequired = true; + + siblingNode.buffer = nil; + siblingNode.blockHeader = nil; + + ////////////////// Get Siblings & Update Links ////////////////////////// + + siblingNodeNum = targetNodePtr->bLink; // Left Sibling Node + if ( siblingNodeNum != 0 ) + { + err = GetNode (btreePtr, siblingNodeNum, 0, &siblingNode); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); + + ((NodeDescPtr)siblingNode.buffer)->fLink = targetNodePtr->fLink; + err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); + M_ExitOnError (err); + } + else if ( targetNodePtr->kind == kBTLeafNode ) // update firstLeafNode + { + btreePtr->firstLeafNode = targetNodePtr->fLink; + } + + siblingNodeNum = targetNodePtr->fLink; // Right Sibling Node + if ( siblingNodeNum != 0 ) + { + err = GetNode (btreePtr, siblingNodeNum, 0, &siblingNode); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); + + ((NodeDescPtr)siblingNode.buffer)->bLink = targetNodePtr->bLink; + err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); + M_ExitOnError (err); + } + else if ( targetNodePtr->kind == kBTLeafNode ) // update lastLeafNode + { + btreePtr->lastLeafNode = targetNodePtr->bLink; + } + + //////////////////////// Free Empty Node //////////////////////////////// + + ClearNode (btreePtr, targetNodePtr); + + err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); + M_ExitOnError (err); + + err = FreeNode (btreePtr, targetNodeNum); + M_ExitOnError (err); + } + else if ( index == 0 ) // did we delete the first record? 
+ { + updateRequired = true; // yes, so we need to update parent + } + + + if ( level == btreePtr->treeDepth ) // then targetNode->buffer is the root node + { + deleteRequired = false; + updateRequired = false; + + if ( targetNode->buffer == nil ) // then root was freed and the btree is empty + { + btreePtr->rootNode = 0; + btreePtr->treeDepth = 0; + } + else if ( ((NodeDescPtr)targetNode->buffer)->numRecords == 1 ) + { + err = CollapseTree (btreePtr, targetNode); + M_ExitOnError (err); + } + } + + + if ( updateRequired || deleteRequired ) + { + ++level; // next level + + //// Get Parent Node and index + index = treePathTable [level].index; + err = GetNode (btreePtr, treePathTable[level].node, 0, &parentNode); + M_ExitOnError (err); + + if ( updateRequired ) + { + KeyPtr keyPtr; + u_int8_t * recPtr; + u_int16_t recSize; + u_int32_t insertNode; + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &parentNode); + + //€€ debug: check if ptr == targetNodeNum + GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); + PanicIf( (*(u_int32_t *) recPtr) != targetNodeNum, " DeleteTree: parent ptr doesn't match targetNodeNum!!"); + + // need to delete and re-insert this parent key/ptr + DeleteRecord (btreePtr, parentNode.buffer, index); + + keyPtr = (KeyPtr) GetRecordAddress( btreePtr, targetNode->buffer, 0 ); + recPtr = (u_int8_t *) &targetNodeNum; + recSize = sizeof(targetNodeNum); + + err = InsertTree (btreePtr, treePathTable, keyPtr, recPtr, recSize, + &parentNode, index, level, kReplaceRecord, &insertNode); + M_ExitOnError (err); + } + else // deleteRequired + { + err = DeleteTree (btreePtr, treePathTable, &parentNode, index, level); + M_ExitOnError (err); + } + } + + + err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); + M_ExitOnError (err); + + return noErr; + +ErrorExit: + + (void) ReleaseNode (btreePtr, targetNode); + (void) ReleaseNode (btreePtr, &parentNode); + + return err; + +} // end DeleteTree + + + +///////////////////////////////// CollapseTree ////////////////////////////////// + +static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, + BlockDescriptor *blockPtr ) +{ + OSStatus err; + u_int32_t originalRoot; + u_int32_t nodeNum; + + originalRoot = btreePtr->rootNode; + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, blockPtr); + + while (true) + { + if ( ((NodeDescPtr)blockPtr->buffer)->numRecords > 1) + break; // this will make a fine root node + + if ( ((NodeDescPtr)blockPtr->buffer)->kind == kBTLeafNode) + break; // we've hit bottom + + nodeNum = btreePtr->rootNode; + btreePtr->rootNode = GetChildNodeNum (btreePtr, blockPtr->buffer, 0); + --btreePtr->treeDepth; + + //// Clear and Free Current Old Root Node //// + ClearNode (btreePtr, blockPtr->buffer); + err = UpdateNode (btreePtr, blockPtr, 0, kLockTransaction); + M_ExitOnError (err); + err = FreeNode (btreePtr, nodeNum); + M_ExitOnError (err); + + //// Get New Root Node + err = GetNode (btreePtr, btreePtr->rootNode, 0, blockPtr); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, blockPtr); + } + + if (btreePtr->rootNode != originalRoot) + M_BTreeHeaderDirty (btreePtr); + + err = UpdateNode (btreePtr, blockPtr, 0, kLockTransaction); // always update! 
+ M_ExitOnError (err); + + return noErr; + + +/////////////////////////////////// ErrorExit /////////////////////////////////// + +ErrorExit: + (void) ReleaseNode (btreePtr, blockPtr); + return err; +} + + + +////////////////////////////////// RotateLeft /////////////////////////////////// + +/*------------------------------------------------------------------------------- + +Routine: RotateLeft - One_line_description. + +Function: Brief_description_of_the_function_and_any_side_effects + +Algorithm: if rightIndex > insertIndex, subtract 1 for actual rightIndex + +Input: btreePtr - description + leftNode - description + rightNode - description + rightInsertIndex - description + keyPtr - description + recPtr - description + recSize - description + +Output: insertIndex + insertNodeNum - description + recordFit - description + recsRotated + +Result: noErr - success + != noErr - failure +-------------------------------------------------------------------------------*/ + +static OSStatus RotateLeft (BTreeControlBlockPtr btreePtr, + NodeDescPtr leftNode, + NodeDescPtr rightNode, + u_int16_t rightInsertIndex, + KeyPtr keyPtr, + u_int8_t * recPtr, + u_int16_t recSize, + u_int16_t *insertIndex, + u_int32_t *insertNodeNum, + Boolean *recordFit, + u_int16_t *recsRotated ) +{ + OSStatus err; + int32_t insertSize; + int32_t nodeSize; + int32_t leftSize, rightSize; + int32_t moveSize = 0; + u_int16_t keyLength; + u_int16_t lengthFieldSize; + u_int16_t index, moveIndex; + Boolean didItFit; + + ///////////////////// Determine If Record Will Fit ////////////////////////// + + keyLength = GetKeyLength(btreePtr, keyPtr, (rightNode->kind == kBTLeafNode)); + + // the key's length field is 8-bits in HFS and 16-bits in HFS+ + if ( btreePtr->attributes & kBTBigKeysMask ) + lengthFieldSize = sizeof(u_int16_t); + else + lengthFieldSize = sizeof(u_int8_t); + + insertSize = keyLength + lengthFieldSize + recSize + sizeof(u_int16_t); + + if ( M_IsOdd (insertSize) ) + ++insertSize; // add pad byte; + + nodeSize = btreePtr->nodeSize; + + // add size of insert record to right node + rightSize = nodeSize - GetNodeFreeSize (btreePtr, rightNode) + insertSize; + leftSize = nodeSize - GetNodeFreeSize (btreePtr, leftNode); + + moveIndex = 0; + + while ( leftSize < rightSize ) + { + if ( moveIndex < rightInsertIndex ) + { + moveSize = GetRecordSize (btreePtr, rightNode, moveIndex) + 2; + } + else if ( moveIndex == rightInsertIndex ) + { + moveSize = insertSize; + } + else // ( moveIndex > rightInsertIndex ) + { + moveSize = GetRecordSize (btreePtr, rightNode, moveIndex - 1) + 2; + } + + leftSize += moveSize; + rightSize -= moveSize; + ++moveIndex; + } + + if ( leftSize > nodeSize ) // undo last move + { + leftSize -= moveSize; + rightSize += moveSize; + --moveIndex; + } + + if ( rightSize > nodeSize ) // record won't fit - failure, but not error + { + *insertIndex = 0; + *insertNodeNum = 0; + *recordFit = false; + *recsRotated = 0; + + return noErr; + } + + // we've found balance point, moveIndex == number of records moved into leftNode + + + //////////////////////////// Rotate Records ///////////////////////////////// + + *recsRotated = moveIndex; + *recordFit = true; + index = 0; + + while ( index < moveIndex ) + { + if ( index == rightInsertIndex ) // insert new record in left node + { + u_int16_t leftInsertIndex; + + leftInsertIndex = leftNode->numRecords; + + didItFit = InsertKeyRecord (btreePtr, leftNode, leftInsertIndex, + keyPtr, keyLength, recPtr, recSize); + if ( !didItFit ) + { + Panic ("RotateLeft: InsertKeyRecord 
(left) returned false!"); + err = fsBTBadRotateErr; + goto ErrorExit; + } + + *insertIndex = leftInsertIndex; + *insertNodeNum = rightNode->bLink; + } + else + { + didItFit = RotateRecordLeft (btreePtr, leftNode, rightNode); + if ( !didItFit ) + { + Panic ("RotateLeft: RotateRecordLeft returned false!"); + err = fsBTBadRotateErr; + goto ErrorExit; + } + } + + ++index; + } + + if ( moveIndex <= rightInsertIndex ) // then insert new record in right node + { + rightInsertIndex -= index; // adjust for records already rotated + + didItFit = InsertKeyRecord (btreePtr, rightNode, rightInsertIndex, + keyPtr, keyLength, recPtr, recSize); + if ( !didItFit ) + { + Panic ("RotateLeft: InsertKeyRecord (right) returned false!"); + err = fsBTBadRotateErr; + goto ErrorExit; + } + + *insertIndex = rightInsertIndex; + *insertNodeNum = leftNode->fLink; + } + + + return noErr; + + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + *insertIndex = 0; + *insertNodeNum = 0; + *recordFit = false; + *recsRotated = 0; + + return err; +} + + + +/////////////////////////////////// SplitLeft /////////////////////////////////// + +static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, + BlockDescriptor *leftNode, + BlockDescriptor *rightNode, + u_int32_t rightNodeNum, + u_int16_t index, + KeyPtr keyPtr, + u_int8_t * recPtr, + u_int16_t recSize, + u_int16_t *insertIndex, + u_int32_t *insertNodeNum, + u_int16_t *recsRotated ) +{ + OSStatus err; + NodeDescPtr left, right; + u_int32_t newNodeNum; + Boolean recordFit; + + + ///////////////////////////// Compare Nodes ///////////////////////////////// + + right = rightNode->buffer; + left = leftNode->buffer; + + PanicIf ( right->bLink != 0 && left == 0, " SplitLeft: left sibling missing!?" ); + + /* type should be kBTLeafNode or kBTIndexNode */ + + if ( (right->height == 1) && (right->kind != kBTLeafNode) ) + return fsBTInvalidNodeErr; + + if ( left != nil ) + { + if ( left->fLink != rightNodeNum ) + return fsBTInvalidNodeErr; //€€ E_BadSibling ? + + if ( left->height != right->height ) + return fsBTInvalidNodeErr; //€€ E_BadNodeHeight ? + + if ( left->kind != right->kind ) + return fsBTInvalidNodeErr; //€€ E_BadNodeType ? + } + + + ///////////////////////////// Allocate Node ///////////////////////////////// + + err = AllocateNode (btreePtr, &newNodeNum); + M_ExitOnError (err); + + + /////////////// Update Forward Link In Original Left Node /////////////////// + + if ( left != nil ) + { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); + + left->fLink = newNodeNum; + err = UpdateNode (btreePtr, leftNode, 0, kLockTransaction); + M_ExitOnError (err); + } + + + /////////////////////// Initialize New Left Node //////////////////////////// + + err = GetNewNode (btreePtr, newNodeNum, leftNode); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); + + left = leftNode->buffer; + left->fLink = rightNodeNum; + + + // Steal Info From Right Node + + left->bLink = right->bLink; + left->kind = right->kind; + left->height = right->height; + + right->bLink = newNodeNum; // update Right bLink + + if ( (left->kind == kBTLeafNode) && (left->bLink == 0) ) + { + // if we're adding a new first leaf node - update BTreeInfoRec + + btreePtr->firstLeafNode = newNodeNum; + M_BTreeHeaderDirty (btreePtr); //€€ AllocateNode should have set the bit already... 
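+
+		// [Editorial note: comment added in this excerpt.]  left->bLink was
+		// copied from the right node above, so bLink == 0 means the right
+		// node used to be the first leaf; the newly allocated left sibling
+		// now takes its place as firstLeafNode.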
+ } + + ////////////////////////////// Rotate Left ////////////////////////////////// + + err = RotateLeft (btreePtr, left, right, index, keyPtr, recPtr, recSize, + insertIndex, insertNodeNum, &recordFit, recsRotated); + + M_ExitOnError (err); + + return noErr; + +ErrorExit: + + (void) ReleaseNode (btreePtr, leftNode); + (void) ReleaseNode (btreePtr, rightNode); + + //€€ Free new node if allocated? + + *insertIndex = 0; + *insertNodeNum = 0; + *recsRotated = 0; + + return err; +} + + + +/////////////////////////////// RotateRecordLeft //////////////////////////////// + +static Boolean RotateRecordLeft (BTreeControlBlockPtr btreePtr, + NodeDescPtr leftNode, + NodeDescPtr rightNode ) +{ + u_int16_t size; + u_int8_t * recPtr; + Boolean recordFit; + + size = GetRecordSize (btreePtr, rightNode, 0); + recPtr = GetRecordAddress (btreePtr, rightNode, 0); + + recordFit = InsertRecord (btreePtr, leftNode, leftNode->numRecords, recPtr, size); + + if ( !recordFit ) + return false; + + DeleteRecord (btreePtr, rightNode, 0); + + return true; +} + + +//////////////////////////////// AddNewRootNode ///////////////////////////////// + +static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, + NodeDescPtr leftNode, + NodeDescPtr rightNode ) +{ + OSStatus err; + BlockDescriptor rootNode; + u_int32_t rootNum; + KeyPtr keyPtr; + Boolean didItFit; + u_int16_t keyLength; + + rootNode.buffer = nil; + rootNode.blockHeader = nil; + + PanicIf (leftNode == nil, "AddNewRootNode: leftNode == nil"); + PanicIf (rightNode == nil, "AddNewRootNode: rightNode == nil"); + + + /////////////////////// Initialize New Root Node //////////////////////////// + + err = AllocateNode (btreePtr, &rootNum); + M_ExitOnError (err); + + err = GetNewNode (btreePtr, rootNum, &rootNode); + M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &rootNode); + + ((NodeDescPtr)rootNode.buffer)->kind = kBTIndexNode; + ((NodeDescPtr)rootNode.buffer)->height = ++btreePtr->treeDepth; + + + ///////////////////// Insert Left Node Index Record ///////////////////////// + + keyPtr = (KeyPtr) GetRecordAddress (btreePtr, leftNode, 0); + keyLength = GetKeyLength(btreePtr, keyPtr, false); + + didItFit = InsertKeyRecord ( btreePtr, rootNode.buffer, 0, keyPtr, keyLength, + (u_int8_t *) &rightNode->bLink, 4 ); + + PanicIf ( !didItFit, "AddNewRootNode:InsertKeyRecord failed for left index record"); + + + //////////////////// Insert Right Node Index Record ///////////////////////// + + keyPtr = (KeyPtr) GetRecordAddress (btreePtr, rightNode, 0); + keyLength = GetKeyLength(btreePtr, keyPtr, false); + + didItFit = InsertKeyRecord ( btreePtr, rootNode.buffer, 1, keyPtr, keyLength, + (u_int8_t *) &leftNode->fLink, 4 ); + + PanicIf ( !didItFit, "AddNewRootNode:InsertKeyRecord failed for right index record"); + + + /////////////////////////// Release Root Node /////////////////////////////// + + err = UpdateNode (btreePtr, &rootNode, 0, kLockTransaction); + M_ExitOnError (err); + + // update BTreeInfoRec + + btreePtr->rootNode = rootNum; + M_BTreeHeaderDirty(btreePtr); + + return noErr; + + + ////////////////////////////// Error Exit /////////////////////////////////// + +ErrorExit: + + return err; +} + + +static u_int16_t GetKeyLength ( const BTreeControlBlock *btreePtr, const BTreeKey *key, Boolean forLeafNode ) +{ + u_int16_t length; + + if ( forLeafNode || btreePtr->attributes & kBTVariableIndexKeysMask ) + length = KeyLength (btreePtr, key); // just use actual key length + else + length = btreePtr->maxKeyLength; // fixed sized index key 
(i.e. HFS) //€€ shouldn't we clear the pad bytes? + + return length; +} + diff --git a/core/BTreeWrapper.c b/core/BTreeWrapper.c new file mode 100644 index 0000000..12ce54d --- /dev/null +++ b/core/BTreeWrapper.c @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2000, 2002, 2005-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "BTreesPrivate.h" +#include +#include + + +// local routines +static OSErr CheckBTreeKey(const BTreeKey *key, const BTreeControlBlock *btcb); + +#if DEBUG +static Boolean ValidHFSRecord(const void *record, const BTreeControlBlock *btcb, u_int16_t recordSize); +#endif + +OSErr ReplaceBTreeRecord(FileReference refNum, const void* key, u_int32_t hint, void *newData, u_int16_t dataSize, u_int32_t *newHint) +{ + FSBufferDescriptor btRecord; + struct BTreeIterator *iterator = NULL; + FCB *fcb; + BTreeControlBlock *btcb; + OSStatus result; + + iterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + fcb = GetFileControlBlock(refNum); + btcb = (BTreeControlBlock*) fcb->fcbBTCBPtr; + + btRecord.bufferAddress = newData; + btRecord.itemSize = dataSize; + btRecord.itemCount = 1; + + iterator->hint.nodeNum = hint; + + result = CheckBTreeKey((const BTreeKey *) key, btcb); + if (result) { + goto ErrorExit; + } + + BlockMoveData(key, &iterator->key, CalcKeySize(btcb, (const BTreeKey *) key)); //€€ should we range check against maxkeylen? 
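+
+	// [Editorial note: comment added in this excerpt.]  CalcKeySize (see
+	// BTreesPrivate.h) copies the key length field plus the key bytes:
+	// length16 + 2 for trees with kBTBigKeysMask set (HFS+), otherwise
+	// length8 + 1 (HFS).  CheckBTreeKey above has already verified the
+	// length against btcb->maxKeyLength.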
+ +#if DEBUG + if ( !ValidHFSRecord(newData, btcb, dataSize) ) + DebugStr("ReplaceBTreeRecord: bad record?"); +#endif + + result = BTReplaceRecord( fcb, iterator, &btRecord, dataSize ); + + *newHint = iterator->hint.nodeNum; + +ErrorExit: + + hfs_free(iterator, sizeof(*iterator)); + return result; +} + + + +static OSErr CheckBTreeKey(const BTreeKey *key, const BTreeControlBlock *btcb) +{ + u_int16_t keyLen; + + if ( btcb->attributes & kBTBigKeysMask ) + keyLen = key->length16; + else + keyLen = key->length8; + + if ( (keyLen < 6) || (keyLen > btcb->maxKeyLength) ) + { + hfs_debug("CheckBTreeKey: bad key length!"); + return fsBTInvalidKeyLengthErr; + } + + return noErr; +} + +#if DEBUG + +static Boolean ValidHFSRecord(const void *record, const BTreeControlBlock *btcb, u_int16_t recordSize) +{ + u_int32_t cNodeID; + + if (btcb->maxKeyLength == kHFSPlusExtentKeyMaximumLength ) + { + return ( recordSize == sizeof(HFSPlusExtentRecord) ); + } +#if CONFIG_HFS_STD + else if ( btcb->maxKeyLength == kHFSExtentKeyMaximumLength ) + { + return ( recordSize == sizeof(HFSExtentRecord) ); + } +#endif + + else // Catalog record + { + const CatalogRecord *catalogRecord = (const CatalogRecord*) record; + + switch(catalogRecord->recordType) + { + +#if CONFIG_HFS_STD + /* + * HFS standard File/folder records and File/Folder Thread records + * are only valid on configs that support HFS standard. + */ + case kHFSFolderRecord: + { + if ( recordSize != sizeof(HFSCatalogFolder) ) + return false; + if ( catalogRecord->hfsFolder.flags != 0 ) + return false; + if ( catalogRecord->hfsFolder.valence > 0x7FFF ) + return false; + + cNodeID = catalogRecord->hfsFolder.folderID; + + if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) + return false; + } + break; + + case kHFSFileRecord: + { + const HFSExtentDescriptor *dataExtent; + const HFSExtentDescriptor *rsrcExtent; + + if ( recordSize != sizeof(HFSCatalogFile) ) + return false; + if ( (catalogRecord->hfsFile.flags & ~(0x83)) != 0 ) + return false; + + cNodeID = catalogRecord->hfsFile.fileID; + + if ( cNodeID < 16 ) + return false; + + // make sure 0 ¾ LEOF ¾ PEOF for both forks + + if ( catalogRecord->hfsFile.dataLogicalSize < 0 ) + return false; + if ( catalogRecord->hfsFile.dataPhysicalSize < catalogRecord->hfsFile.dataLogicalSize ) + return false; + if ( catalogRecord->hfsFile.rsrcLogicalSize < 0 ) + return false; + if ( catalogRecord->hfsFile.rsrcPhysicalSize < catalogRecord->hfsFile.rsrcLogicalSize ) + return false; + + dataExtent = (const HFSExtentDescriptor*) &catalogRecord->hfsFile.dataExtents; + rsrcExtent = (const HFSExtentDescriptor*) &catalogRecord->hfsFile.rsrcExtents; + +#if 0 + for (i = 0; i < kHFSExtentDensity; ++i) + { + if ( (dataExtent[i].blockCount > 0) && (dataExtent[i].startBlock == 0) ) + return false; + if ( (rsrcExtent[i].blockCount > 0) && (rsrcExtent[i].startBlock == 0) ) + return false; + } +#endif + } + break; + + case kHFSFileThreadRecord: + case kHFSFolderThreadRecord: + { + if ( recordSize != sizeof(HFSCatalogThread) ) + return false; + + cNodeID = catalogRecord->hfsThread.parentID; + if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) + return false; + + if ( (catalogRecord->hfsThread.nodeName[0] == 0) || + (catalogRecord->hfsThread.nodeName[0] > 31) ) + return false; + } + break; +#endif + + case kHFSPlusFolderRecord: + { + if ( recordSize != sizeof(HFSPlusCatalogFolder) ) + return false; + if ( catalogRecord->hfsPlusFolder.flags != 0 ) + return false; + if ( catalogRecord->hfsPlusFolder.valence > 0x7FFF ) + return false; + + 
cNodeID = catalogRecord->hfsPlusFolder.folderID; + + if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) + return false; + } + break; + + case kHFSPlusFileRecord: + { +// u_int16_t i; + const HFSPlusExtentDescriptor *dataExtent; + const HFSPlusExtentDescriptor *rsrcExtent; + + if ( recordSize != sizeof(HFSPlusCatalogFile) ) + return false; + if ( (catalogRecord->hfsPlusFile.flags & ~(0x83)) != 0 ) + return false; + + cNodeID = catalogRecord->hfsPlusFile.fileID; + + if ( cNodeID < 16 ) + return false; + + // make sure 0 ¾ LEOF ¾ PEOF for both forks + + dataExtent = (const HFSPlusExtentDescriptor*) &catalogRecord->hfsPlusFile.dataFork.extents; + rsrcExtent = (const HFSPlusExtentDescriptor*) &catalogRecord->hfsPlusFile.resourceFork.extents; + +#if 0 + for (i = 0; i < kHFSPlusExtentDensity; ++i) + { + if ( (dataExtent[i].blockCount > 0) && (dataExtent[i].startBlock == 0) ) + return false; + if ( (rsrcExtent[i].blockCount > 0) && (rsrcExtent[i].startBlock == 0) ) + return false; + } +#endif + } + break; + + case kHFSPlusFileThreadRecord: + case kHFSPlusFolderThreadRecord: + { + if ( recordSize > sizeof(HFSPlusCatalogThread) || recordSize < (sizeof(HFSPlusCatalogThread) - sizeof(HFSUniStr255))) + return false; + + cNodeID = catalogRecord->hfsPlusThread.parentID; + if ( (cNodeID == 0) || (cNodeID < 16 && cNodeID > 2) ) + return false; + + if ( (catalogRecord->hfsPlusThread.nodeName.length == 0) || + (catalogRecord->hfsPlusThread.nodeName.length > 255) ) + return false; + } + break; + + default: + return false; + } + } + + return true; // record appears to be OK +} + +#endif // DEBUG diff --git a/core/BTreesInternal.h b/core/BTreesInternal.h new file mode 100644 index 0000000..11a7842 --- /dev/null +++ b/core/BTreesInternal.h @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: BTreesInternal.h + + Contains: IPI to File Manager B-tree + + Version: HFS Plus 1.0 + + Copyright: (c) 1996-1998 by Apple Inc., all rights reserved. 
+ + File Ownership: + + DRI: Don Brady + + Other Contact: Mark Day + + Technology: File Systems + + Writers: + + (msd) Mark Day + (DSH) Deric Horn + (djb) Don Brady + + Change History (most recent first): + + 9/22/99 ser Added prototypes for BTGetLastSync and BTSetLastSync + 6/22/98 djb Add ERR_BASE to btree error codes to make them negative (for MacOS X only). + + 7/28/97 msd Add enum for fsBTTimeOutErr. + 7/25/97 DSH Added heuristicHint as parameter to BTSearchRecord. + 7/24/97 djb Add blockReadFromDisk flag to BlockDescriptor. Callbacks now use + a file refNum instead of an FCB. + 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name + collision + 6/2/97 DSH Added SetEndOfForkProc() prototype, so Attributes.c can call it + directly. + 5/19/97 djb kMaxKeyLength is now 520. + 4/28/97 djb first checked in + + 3/17/97 DSH Remove Key Comparison prototype, already in FilesInternal.h. + 2/19/97 djb Add SetBlockSizeProcPtr. Add blockSize field to BlockDescriptor. + Remove E_ type error enums. + 1/27/97 djb Include Types.h and FilesInternal.h. + 1/13/97 djb Added kBTreeCurrentRecord for BTIterateRecord. + 1/3/97 djb Added support for large keys. + 12/19/96 djb first checked in + +*/ + +#ifndef __BTREESINTERNAL__ +#define __BTREESINTERNAL__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + +#ifndef __FILEMGRINTERNAL__ +#include "FileMgrInternal.h" +#endif + +enum { + fsBTInvalidHeaderErr = btBadHdr, + fsBTBadRotateErr = dsBadRotate, + fsBTInvalidNodeErr = btBadNode, + fsBTRecordTooLargeErr = btNoFit, + fsBTRecordNotFoundErr = btNotFound, + fsBTDuplicateRecordErr = btExists, + fsBTFullErr = btNoSpaceAvail, + + fsBTInvalidFileErr = ERR_BASE + 0x0302, /* no BTreeCB has been allocated for fork*/ + fsBTrFileAlreadyOpenErr = ERR_BASE + 0x0303, + fsBTInvalidIteratorErr = ERR_BASE + 0x0308, + fsBTEmptyErr = ERR_BASE + 0x030A, + fsBTNoMoreMapNodesErr = ERR_BASE + 0x030B, + fsBTBadNodeSize = ERR_BASE + 0x030C, + fsBTBadNodeType = ERR_BASE + 0x030D, + fsBTInvalidKeyLengthErr = ERR_BASE + 0x030E, + fsBTStartOfIterationErr = ERR_BASE + 0x0353, + fsBTEndOfIterationErr = ERR_BASE + 0x0354, + fsBTUnknownVersionErr = ERR_BASE + 0x0355, + fsBTTreeTooDeepErr = ERR_BASE + 0x0357, + fsIteratorExitedScopeErr = ERR_BASE + 0x0A02, /* iterator exited the scope*/ + fsIteratorScopeExceptionErr = ERR_BASE + 0x0A03, /* iterator is undefined due to error or movement of scope locality*/ + fsUnknownIteratorMovementErr = ERR_BASE + 0x0A04, /* iterator movement is not defined*/ + fsInvalidIterationMovmentErr = ERR_BASE + 0x0A05, /* iterator movement is invalid in current context*/ + fsClientIDMismatchErr = ERR_BASE + 0x0A06, /* wrong client process ID*/ + fsEndOfIterationErr = ERR_BASE + 0x0A07, /* there were no objects left to return on iteration*/ + fsBTTimeOutErr = ERR_BASE + 0x0A08 /* BTree scan interrupted -- no time left for physical I/O */ +}; + +struct BlockDescriptor{ + void *buffer; + void *blockHeader; + daddr64_t blockNum; /* logical block number (used by hfs_swap_BTNode) */ + ByteCount blockSize; + Boolean blockReadFromDisk; + Byte isModified; // XXXdbg - for journaling + Byte reserved[2]; +}; +typedef struct BlockDescriptor BlockDescriptor; +typedef BlockDescriptor *BlockDescPtr; + + +struct FSBufferDescriptor { + void * bufferAddress; + ByteCount itemSize; + ItemCount itemCount; +}; +typedef struct FSBufferDescriptor FSBufferDescriptor; + +typedef FSBufferDescriptor *FSBufferDescriptorPtr; + + +/* + Fork Level Access Method Block get options +*/ +enum { + kGetBlock = 0x00000000, + 
kGetBlockHint = 0x00000001, // if set, the block is being looked up using hint + kForceReadBlock = 0x00000002, //€€ how does this relate to Read/Verify? Do we need this? + kGetEmptyBlock = 0x00000008 +}; +typedef u_int32_t GetBlockOptions; + +/* + Fork Level Access Method Block release options +*/ +enum { + kReleaseBlock = 0x00000000, + kForceWriteBlock = 0x00000001, + kMarkBlockDirty = 0x00000002, + kTrashBlock = 0x00000004, + kLockTransaction = 0x00000100 +}; +typedef u_int32_t ReleaseBlockOptions; + +typedef u_int64_t FSSize; +typedef u_int32_t ForkBlockNumber; + +/*============================================================================ + Fork Level Buffered I/O Access Method +============================================================================*/ + +typedef OSStatus (* GetBlockProcPtr) (FileReference fileRefNum, + u_int32_t blockNum, + GetBlockOptions options, + BlockDescriptor *block ); + + +typedef OSStatus (* ReleaseBlockProcPtr) (FileReference fileRefNum, + BlockDescPtr blockPtr, + ReleaseBlockOptions options ); + +typedef OSStatus (* SetEndOfForkProcPtr) (FileReference fileRefNum, + FSSize minEOF, + FSSize maxEOF ); + +typedef OSStatus (* SetBlockSizeProcPtr) (FileReference fileRefNum, + ByteCount blockSize, + ItemCount minBlockCount ); + +OSStatus SetEndOfForkProc ( FileReference fileRefNum, FSSize minEOF, FSSize maxEOF ); + + +/* + B*Tree Information Version +*/ + +enum BTreeInformationVersion{ + kBTreeInfoVersion = 0 +}; + +/* + B*Tree Iteration Operation Constants +*/ + +enum BTreeIterationOperations{ + kBTreeFirstRecord, + kBTreeNextRecord, + kBTreePrevRecord, + kBTreeLastRecord, + kBTreeCurrentRecord +}; +typedef u_int16_t BTreeIterationOperation; + + +/* + Btree types: 0 is HFS CAT/EXT file, 1~127 are AppleShare B*Tree files, 128~254 unused + hfsBtreeType EQU 0 ; control file + validBTType EQU $80 ; user btree type starts from 128 + userBT1Type EQU $FF ; 255 is our Btree type. Used by BTInit and BTPatch +*/ + +enum BTreeTypes{ + kHFSBTreeType = 0, // control file + kUserBTreeType = 128, // user btree type starts from 128 + kReservedBTreeType = 255 // +}; + +#define kBTreeHeaderUserBytes 128 + + +typedef BTreeKey *BTreeKeyPtr; + + +/* + BTreeInfoRec Structure - for BTGetInformation +*/ +struct BTreeInfoRec{ + u_int16_t version; + u_int16_t nodeSize; + u_int16_t maxKeyLength; + u_int16_t treeDepth; + u_int32_t lastfsync; /* Last time that this was fsynced */ + ItemCount numRecords; + ItemCount numNodes; + ItemCount numFreeNodes; + u_int8_t keyCompareType; + u_int8_t reserved[3]; +}; +typedef struct BTreeInfoRec BTreeInfoRec; +typedef BTreeInfoRec *BTreeInfoPtr; + +/* + BTreeHint can never be exported to the outside. Use u_int32_t BTreeHint[4], + u_int8_t BTreeHint[16], etc. 
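+
+	(Editorial clarification added in this excerpt: callers outside the B-tree
+	layer should treat the hint as an opaque 16-byte blob; the struct below is
+	4 + 4 + 2 + 2 + 4 = 16 bytes, assuming ItemCount is 32-bit, as the
+	u_int32_t BTreeHint[4] and u_int8_t BTreeHint[16] suggestions above imply.)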
+ */ +struct BTreeHint{ + ItemCount writeCount; + u_int32_t nodeNum; // node the key was last seen in + u_int16_t index; // index then key was last seen at + u_int16_t reserved1; + u_int32_t reserved2; +}; +typedef struct BTreeHint BTreeHint; +typedef BTreeHint *BTreeHintPtr; + +/* + BTree Iterator +*/ +struct BTreeIterator{ + BTreeHint hint; + u_int16_t version; + u_int16_t reserved; + u_int32_t hitCount; // Total number of leaf records hit + u_int32_t maxLeafRecs; // Max leaf records over iteration + BTreeKey key; +}; +typedef struct BTreeIterator BTreeIterator; +typedef BTreeIterator *BTreeIteratorPtr; + + +/*============================================================================ + B*Tree SPI +============================================================================*/ + +/* + Key Comparison Function ProcPtr Type - for BTOpenPath +*/ +//typedef int32_t (* KeyCompareProcPtr)(BTreeKeyPtr a, BTreeKeyPtr b); + + +typedef int32_t (* IterateCallBackProcPtr)(BTreeKeyPtr key, void * record, void * state); + + +extern OSStatus BTOpenPath(FCB *filePtr, KeyCompareProcPtr keyCompareProc); + +extern OSStatus BTClosePath (FCB *filePtr ); + + +extern OSStatus BTSearchRecord (FCB *filePtr, + BTreeIterator *searchIterator, + FSBufferDescriptor *btRecord, + u_int16_t *recordLen, + BTreeIterator *resultIterator ); + +extern OSStatus BTIterateRecord (FCB *filePtr, + BTreeIterationOperation operation, + BTreeIterator *iterator, + FSBufferDescriptor *btRecord, + u_int16_t *recordLen ); + + +extern OSStatus BTIterateRecords(FCB *filePtr, BTreeIterationOperation operation, BTreeIterator *iterator, + IterateCallBackProcPtr callBackProc, void * callBackState); + +extern OSStatus BTInsertRecord (FCB *filePtr, + BTreeIterator *iterator, + FSBufferDescriptor *btrecord, + u_int16_t recordLen ); + +extern OSStatus BTReplaceRecord (FCB *filePtr, + BTreeIterator *iterator, + FSBufferDescriptor *btRecord, + u_int16_t recordLen ); + +extern OSStatus BTUpdateRecord (FCB *filePtr, + BTreeIterator *iterator, + IterateCallBackProcPtr callBackProc, + void *callBackState ); + +extern OSStatus BTDeleteRecord (FCB *filePtr, + BTreeIterator *iterator ); + +extern OSStatus BTGetInformation (FCB *filePtr, + u_int16_t vers, + BTreeInfoRec *info ); + +extern OSStatus BTIsDirty(FCB *filePtr); + +extern OSStatus BTFlushPath (FCB *filePtr ); + +extern OSStatus BTReloadData (FCB *filePtr); + +extern OSStatus BTInvalidateHint (BTreeIterator *iterator ); + +extern OSStatus BTGetLastSync (FCB *filePtr, + u_int32_t *lastfsync ); + +extern OSStatus BTSetLastSync (FCB *filePtr, + u_int32_t lastfsync ); + +extern OSStatus BTHasContiguousNodes(FCB *filePtr); + +extern OSStatus BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize); + +extern OSStatus BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize); + +/* B-tree node reserve routines. */ +extern void BTReserveSetup(void); + +extern int BTReserveSpace(FCB *file, int operations, void * data); + +extern int BTReleaseReserve(FCB *file, void * data); + +extern int BTZeroUnusedNodes(FCB *file); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif // __BTREESINTERNAL__ diff --git a/core/BTreesPrivate.h b/core/BTreesPrivate.h new file mode 100644 index 0000000..260de38 --- /dev/null +++ b/core/BTreesPrivate.h @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: BTreesPrivate.h + + Contains: Private interface file for the BTree Module. + + Version: xxx put the technology version here xxx + + Written by: Gordon Sheridan and Bill Bruffey + + Copyright: (c) 1992-1999 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Don Brady + + Other Contact: Mark Day + + Technology: File Systems + + Writers: + + (msd) Mark Day + (DSH) Deric Horn + (djb) Don Brady + (ser) Scott Roberts + (dkh) Dave Heller + + Change History (most recent first): + 3/19/99 djb Disable MoveRecordsLeft/Right macros since bcopy is broken. + + 8/10/98 djb Removed unused BTreeIterator from BTreeControlBlock, fixed alignment. + + 9/4/97 djb Convert MoveRecordsLeft and GetLeftSiblingNode to macros. + 7/24/97 djb Add macro for GetRecordAddress (was a function before). + 7/21/97 msd GetRecordByIndex now returns an OSStatus. + 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name + collision + 4/23/97 djb first checked in + + 3/17/97 DSH Added a refCon field to BTreeControlBlock, for DFA use, to point + to additional data. Fixed Panic macros for use with SC. + 2/19/97 djb Add InsertKey struct. Moved on-disk definitions to + HFSBTreesPriv.h + 1/27/97 djb InsertTree and DeleteTree are now recursive and support variable + sized index keys. + 1/15/97 djb Move GetFileRefNumFromFCB macro to FilesInternal.h. Added + kBTVariableIndexKeysMask. + 1/3/97 djb Added support for large keys. + 12/19/96 djb first checked in + + History applicable to original Scarecrow Design: + + <7> 10/25/96 ser Changing for new VFPI + <6> 10/18/96 ser Converting over VFPI changes + <5> 9/17/96 dkh More BTree statistics + <4> 9/16/96 dkh Revised BTree statistics + <3> 6/20/96 dkh Radar #1358740. Switch from using Pools to debug MemAllocators. + <2> 12/7/95 dkh D10E2 build. Changed usage of Ref data type to LogicalAddress. + <1> 10/18/95 rst Moved from Scarecrow project. + + <19> 11/22/94 djb Add prototype for GetMapNode + <18> 11/16/94 prp Add IsItAHint routine prototype. + <17> 9/30/94 prp Get in sync with D2 interface changes. + <16> 7/25/94 wjk Eliminate usage of BytePtr in favor of UInt8 *. + <15> 7/22/94 wjk Convert to the new set of header files. 
+ <14> 5/31/94 srs Moved Btree types to public interface + <13> 12/9/93 wjk Add 68k alignment pragma's around persistent structures. + <12> 11/30/93 wjk Move from Makefiles to BuildFiles. Fit into the ModernOS and + NRCmds environments. + <11> 11/23/93 wjk Changes required to compile on the RS6000. + <10> 8/30/93 CH Removed the M_ExitOnError and M_ReturnErrorIf macros which were + already defined in FileSystemPriv.h (included here). + <9> 8/30/93 CH Added parens around the M_ReturnErrorIf macro. + <8> 5/21/93 gs Add kBadClose flag. Add some prototypes for internal routines. + <7> 5/10/93 gs Change Ptr to BytePtr. Move BTreeTypes to BTree.h. Add + DeleteTree prototype. + <6> 3/23/93 gs Remove mysterious "flags" field from HeaderRec structure. Move + prototypes of private functions to top of respective source + files. + <5> 2/8/93 gs Update to use FSAgent.h Get/Release/SetEOF/SetBlockSize + procPtrs. Add UpdateNode routine. + <4> 12/10/92 gs Add Key Descriptor function declarations. + <3> 12/8/92 gs Add HeaderRec structure and incorporate review feedback. + <2> 12/2/92 gs Add GetNode and ReleaseNode callback procptrs to BTree CB, and + add internal function declarations. + <1> 11/15/92 gs first checked in + +*/ + +#ifndef __BTREESPRIVATE__ +#define __BTREESPRIVATE__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + +#include "hfs_macos_defs.h" + +#ifndef __FILEMGRINTERNAL__ +#include "FileMgrInternal.h" +#endif + +#ifndef __BTREESINTERNAL__ +#include "BTreesInternal.h" +#endif + + +/////////////////////////////////// Constants /////////////////////////////////// + +#define kBTreeVersion 1 +#define kMaxTreeDepth 16 + + +#define kHeaderNodeNum 0 +#define kKeyDescRecord 1 + + +// Header Node Record Offsets +enum { + kHeaderRecOffset = 0x000E, + kKeyDescRecOffset = 0x0078, + kHeaderMapRecOffset = 0x00F8 +}; + +#define kMinNodeSize 512 + +#define kMinRecordSize 6 + // where is minimum record size enforced? 
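+
+// [Editorial illustration added in this excerpt -- not part of the original
+// change.]  The header node record offsets above appear to follow directly
+// from the on-disk layout of node 0: a BTNodeDescriptor, then the BTHeaderRec,
+// then kBTreeHeaderUserBytes of user data, then the initial map record.  The
+// struct sizes used below (14 and 106 bytes) are assumed from hfs_format.h,
+// which is not shown in this excerpt; the sketch only checks the arithmetic.
+#if 0 /* illustrative sketch only */
+#include <assert.h>
+
+int main(void)
+{
+	const int kNodeDescSize  = 14;	/* assumed sizeof(BTNodeDescriptor) */
+	const int kHeaderRecSize = 106;	/* assumed sizeof(BTHeaderRec)      */
+	const int kUserBytes     = 128;	/* kBTreeHeaderUserBytes            */
+
+	assert(kNodeDescSize                               == 0x000E);	/* kHeaderRecOffset    */
+	assert(kNodeDescSize + kHeaderRecSize              == 0x0078);	/* kKeyDescRecOffset   */
+	assert(kNodeDescSize + kHeaderRecSize + kUserBytes == 0x00F8);	/* kHeaderMapRecOffset */
+	return 0;
+}
+#endif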
+ +// miscellaneous BTree constants +enum { + kOffsetSize = 2 +}; + +// Insert Operations +typedef enum { + kInsertRecord = 0, + kReplaceRecord = 1 +} InsertType; + +// illegal string attribute bits set in mask +#define kBadStrAttribMask 0xCF + + + +//////////////////////////////////// Macros ///////////////////////////////////// + +#define M_NodesInMap(mapSize) ((mapSize) << 3) + +#define M_ClearBitNum(integer,bitNumber) ((integer) &= (~(1<<(bitNumber)))) +#define M_SetBitNum(integer,bitNumber) ((integer) |= (1<<(bitNumber))) +#define M_IsOdd(integer) (((integer) & 1) != 0) +#define M_IsEven(integer) (((integer) & 1) == 0) + +#define M_MapRecordSize(nodeSize) (nodeSize - sizeof (BTNodeDescriptor) - 6) +#define M_HeaderMapRecordSize(nodeSize) (nodeSize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec) - 128 - 8) + +#define M_SWAP_BE16_ClearBitNum(integer,bitNumber) ((integer) &= SWAP_BE16(~(1<<(bitNumber)))) +#define M_SWAP_BE16_SetBitNum(integer,bitNumber) ((integer) |= SWAP_BE16(1<<(bitNumber))) + +///////////////////////////////////// Types ///////////////////////////////////// + +typedef struct BTreeControlBlock { // fields specific to BTree CBs + + u_int8_t keyCompareType; /* Key string Comparison Type */ + u_int8_t btreeType; + u_int16_t treeDepth; + FileReference fileRefNum; // refNum of btree file + KeyCompareProcPtr keyCompareProc; + u_int32_t rootNode; + u_int32_t leafRecords; + u_int32_t firstLeafNode; + u_int32_t lastLeafNode; + u_int16_t nodeSize; + u_int16_t maxKeyLength; + u_int32_t totalNodes; + u_int32_t freeNodes; + + u_int16_t reserved3; // 4-byte alignment + + // new fields + int16_t version; + u_int32_t flags; // dynamic flags + u_int32_t attributes; // persistent flags + u_int32_t writeCount; + u_int32_t lastfsync; /* Last time that this was fsynced */ + + GetBlockProcPtr getBlockProc; + ReleaseBlockProcPtr releaseBlockProc; + SetEndOfForkProcPtr setEndOfForkProc; + + // statistical information + u_int32_t numGetNodes; + u_int32_t numGetNewNodes; + u_int32_t numReleaseNodes; + u_int32_t numUpdateNodes; + u_int32_t numMapNodesRead; // map nodes beyond header node + u_int32_t numHintChecks; + u_int32_t numPossibleHints; // Looks like a formated hint + u_int32_t numValidHints; // Hint used to find correct record. + u_int32_t reservedNodes; + BTreeIterator iterator; // useable when holding exclusive b-tree lock + +#if DEBUG + void *madeDirtyBy[2]; +#endif +} BTreeControlBlock, *BTreeControlBlockPtr; + +u_int32_t CalcKeySize(const BTreeControlBlock *btcb, const BTreeKey *key); +#define CalcKeySize(btcb, key) ( ((btcb)->attributes & kBTBigKeysMask) ? ((key)->length16 + 2) : ((key)->length8 + 1) ) + +u_int32_t KeyLength(const BTreeControlBlock *btcb, const BTreeKey *key); +#define KeyLength(btcb, key) ( ((btcb)->attributes & kBTBigKeysMask) ? (key)->length16 : (key)->length8 ) + + + +typedef enum { + kBTHeaderDirty = 0x00000001 +} BTreeFlags; + +static inline void M_BTreeHeaderDirty(BTreeControlBlock *bt) { +#if DEBUG + bt->madeDirtyBy[0] = __builtin_return_address(0); + bt->madeDirtyBy[1] = __builtin_return_address(1); +#endif + bt->flags |= kBTHeaderDirty; +} + +typedef int8_t *NodeBuffer; +typedef BlockDescriptor NodeRec, *NodePtr; //€€ remove this someday... 
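+
+// [Editorial illustration added in this excerpt -- not part of the original
+// change.]  CalcKeySize and KeyLength above encode the HFS/HFS+ difference in
+// the key length field: trees with kBTBigKeysMask set (HFS+) store a 16-bit
+// length, others (HFS) an 8-bit length, and CalcKeySize also counts the length
+// field itself (+2 or +1).  Below is a stand-alone sketch of the same logic,
+// using local stand-ins for BTreeKey and kBTBigKeysMask (its value here, 0x2,
+// is assumed from hfs_format.h).
+#if 0 /* illustrative sketch only */
+#include <stdint.h>
+#include <stdio.h>
+
+union mini_key {
+	uint8_t		length8;	/* HFS:  8-bit key length  */
+	uint16_t	length16;	/* HFS+: 16-bit key length */
+};
+
+#define MINI_BIG_KEYS	0x00000002	/* stand-in for kBTBigKeysMask */
+
+static uint32_t mini_calc_key_size(uint32_t attributes, const union mini_key *key)
+{
+	/* length field plus key bytes, mirroring the CalcKeySize macro */
+	return (attributes & MINI_BIG_KEYS) ? (uint32_t)key->length16 + 2
+	                                    : (uint32_t)key->length8  + 1;
+}
+
+int main(void)
+{
+	union mini_key k;
+
+	k.length16 = 6;		/* HFS+ style key */
+	printf("%u\n", (unsigned)mini_calc_key_size(MINI_BIG_KEYS, &k));	/* 8 = 6 + 2 */
+
+	k.length8 = 6;		/* HFS style key  */
+	printf("%u\n", (unsigned)mini_calc_key_size(0, &k));			/* 7 = 6 + 1 */
+	return 0;
+}
+#endif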
+ + + + +//// Tree Path Table - constructed by SearchTree, used by InsertTree and DeleteTree + +typedef struct { + u_int32_t node; // node number + u_int16_t index; + u_int16_t reserved; // align size to a power of 2 +} TreePathRecord, *TreePathRecordPtr; + +typedef TreePathRecord TreePathTable [kMaxTreeDepth]; + + +//// InsertKey - used by InsertTree, InsertLevel and InsertNode + +struct InsertKey { + BTreeKeyPtr keyPtr; + u_int8_t * recPtr; + u_int16_t keyLength; + u_int16_t recSize; + Boolean replacingKey; + Boolean skipRotate; +}; + +typedef struct InsertKey InsertKey; + + +//// For Notational Convenience + +typedef BTNodeDescriptor* NodeDescPtr; +typedef u_int8_t *RecordPtr; +typedef BTreeKeyPtr KeyPtr; + + +//////////////////////////////////// Globals //////////////////////////////////// + + +//////////////////////////////////// Macros ///////////////////////////////////// + +#if DEBUG + #define Panic( message ) DebugStr( message ) + #define PanicIf( condition, message ) do { if ( (condition) != 0 ) DebugStr( message ); } while(0) +#else + #define Panic( message ) do { } while(0) + #define PanicIf( condition, message ) do { } while(0) +#endif + +// Exit function on error +#define M_ExitOnError( result ) do { if ( ( result ) != noErr ) goto ErrorExit; } while(0) + +// Test for passed condition and return if true +#define M_ReturnErrorIf( condition, error ) do { if ( condition ) return( error ); } while(0) + +//////////////////////////////// Key Operations ///////////////////////////////// + +int32_t CompareKeys (BTreeControlBlockPtr btreePtr, + KeyPtr searchKey, + KeyPtr trialKey ); + +//////////////////////////////// Map Operations ///////////////////////////////// + +OSStatus AllocateNode (BTreeControlBlockPtr btreePtr, + u_int32_t *nodeNum); + +OSStatus FreeNode (BTreeControlBlockPtr btreePtr, + u_int32_t nodeNum); + +OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, + u_int32_t nodes ); + +u_int32_t CalcMapBits (BTreeControlBlockPtr btreePtr); + + +void BTUpdateReserve (BTreeControlBlockPtr btreePtr, + int nodes); + +//////////////////////////////// Misc Operations //////////////////////////////// + +u_int16_t CalcKeyRecordSize (u_int16_t keySize, + u_int16_t recSize ); + +OSStatus VerifyHeader (FCB *filePtr, + BTHeaderRec *header ); + +OSStatus UpdateHeader (BTreeControlBlockPtr btreePtr, + Boolean forceWrite ); + +OSStatus FindIteratorPosition (BTreeControlBlockPtr btreePtr, + BTreeIteratorPtr iterator, + BlockDescriptor *left, + BlockDescriptor *middle, + BlockDescriptor *right, + u_int32_t *nodeNum, + u_int16_t *index, + Boolean *foundRecord ); + +OSStatus CheckInsertParams (FCB *filePtr, + BTreeIterator *iterator, + FSBufferDescriptor *record, + u_int16_t recordLen ); + +OSStatus TrySimpleReplace (BTreeControlBlockPtr btreePtr, + NodeDescPtr nodePtr, + BTreeIterator *iterator, + FSBufferDescriptor *record, + u_int16_t recordLen, + Boolean *recordInserted ); + +OSStatus IsItAHint (BTreeControlBlockPtr btreePtr, + BTreeIterator *iterator, + Boolean *answer ); + +extern OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr); + +//////////////////////////////// Node Operations //////////////////////////////// + +//// Node Operations + +OSStatus GetNode (BTreeControlBlockPtr btreePtr, + u_int32_t nodeNum, + u_int32_t flags, + NodeRec *returnNodePtr ); + +/* Flags for GetNode() */ +#define kGetNodeHint 0x1 /* If set, the node is being looked up using a hint */ + +OSStatus GetLeftSiblingNode (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + NodeRec *left ); + +#define 
GetLeftSiblingNode(btree,node,left) GetNode ((btree), ((NodeDescPtr)(node))->bLink, 0, (left)) + +OSStatus GetRightSiblingNode (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + NodeRec *right ); + +#define GetRightSiblingNode(btree,node,right) GetNode ((btree), ((NodeDescPtr)(node))->fLink, 0, (right)) + + +OSStatus GetNewNode (BTreeControlBlockPtr btreePtr, + u_int32_t nodeNum, + NodeRec *returnNodePtr ); + +OSStatus ReleaseNode (BTreeControlBlockPtr btreePtr, + NodePtr nodePtr ); + +OSStatus TrashNode (BTreeControlBlockPtr btreePtr, + NodePtr nodePtr ); + +OSStatus UpdateNode (BTreeControlBlockPtr btreePtr, + NodePtr nodePtr, + u_int32_t transactionID, + u_int32_t flags ); + +//// Node Buffer Operations + +void ClearNode (BTreeControlBlockPtr btreePtr, + NodeDescPtr node ); + +u_int16_t GetNodeDataSize (BTreeControlBlockPtr btreePtr, + NodeDescPtr node ); + +u_int16_t GetNodeFreeSize (BTreeControlBlockPtr btreePtr, + NodeDescPtr node ); + + +//// Record Operations + +Boolean InsertRecord (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index, + RecordPtr recPtr, + u_int16_t recSize ); + +Boolean InsertKeyRecord (BTreeControlBlockPtr btreePtr, + NodeDescPtr node, + u_int16_t index, + KeyPtr keyPtr, + u_int16_t keyLength, + RecordPtr recPtr, + u_int16_t recSize ); + +void DeleteRecord (BTreeControlBlockPtr btree, + NodeDescPtr node, + u_int16_t index ); + + +Boolean SearchNode (BTreeControlBlockPtr btree, + NodeDescPtr node, + KeyPtr searchKey, + u_int16_t *index ); + +OSStatus GetRecordByIndex (BTreeControlBlockPtr btree, + NodeDescPtr node, + u_int16_t index, + KeyPtr *keyPtr, + u_int8_t * *dataPtr, + u_int16_t *dataSize ); + +u_int8_t * GetRecordAddress (BTreeControlBlockPtr btree, + NodeDescPtr node, + u_int16_t index ); + +#define GetRecordAddress(btreePtr,node,index) ((u_int8_t *)(node) + (*(short *) ((u_int8_t *)(node) + (btreePtr)->nodeSize - ((index) << 1) - kOffsetSize))) + + +u_int16_t GetRecordSize (BTreeControlBlockPtr btree, + NodeDescPtr node, + u_int16_t index ); + +u_int32_t GetChildNodeNum (BTreeControlBlockPtr btreePtr, + NodeDescPtr nodePtr, + u_int16_t index ); + +void MoveRecordsLeft (u_int8_t * src, + u_int8_t * dst, + u_int16_t bytesToMove ); + +#define MoveRecordsLeft(src,dst,bytes) bcopy((src),(dst),(bytes)) + +void MoveRecordsRight (u_int8_t * src, + u_int8_t * dst, + u_int16_t bytesToMove ); + +#define MoveRecordsRight(src,dst,bytes) bcopy((src),(dst),(bytes)) + + +//////////////////////////////// Tree Operations //////////////////////////////// + +OSStatus SearchTree (BTreeControlBlockPtr btreePtr, + BTreeKeyPtr keyPtr, + TreePathTable treePathTable, + u_int32_t *nodeNum, + BlockDescriptor *nodePtr, + u_int16_t *index ); + +OSStatus InsertTree (BTreeControlBlockPtr btreePtr, + TreePathTable treePathTable, + KeyPtr keyPtr, + u_int8_t * recPtr, + u_int16_t recSize, + BlockDescriptor *targetNode, + u_int16_t index, + u_int16_t level, + Boolean replacingKey, + u_int32_t *insertNode ); + +OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, + TreePathTable treePathTable, + BlockDescriptor *targetNode, + u_int16_t index, + u_int16_t level ); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif //__BTREESPRIVATE__ diff --git a/core/CatalogPrivate.h b/core/CatalogPrivate.h new file mode 100644 index 0000000..72abbfe --- /dev/null +++ b/core/CatalogPrivate.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2000-2005, 2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: CatalogPrivate.h + + Contains: Private Catalog Manager interfaces. + + Version: HFS Plus 1.0 + + Copyright: (c) 1997-1998 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Don Brady + + Other Contact: xxx put other contact here xxx + + Technology: xxx put technology here xxx + + Writers: + + (JL) Jim Luther + (msd) Mark Day + (DSH) Deric Horn + (djb) Don Brady + + Change History (most recent first): + 11/10/98 djb Remove obsolete PrepareInputName prototype; + 4/6/98 djb Added lock data stuctures and ReleaseCatalogIterator prototype; + 4/6/98 djb Removed CatalogDataCache since its no longer used. + 4/2/98 djb InvalidateCatalogNodeCache does nothing under MacOS X. + 3/31/98 djb Sync up with final HFSVolumes.h header file. + + 11/20/97 djb Radar #2002357. Fixing retry mechanism. + 11/17/97 djb PrepareInputName routine now returns an error. + 11/13/97 djb Radar #1683572. Move CatalogIterator to this file from + FileMgrInternal.i. Double size of short unicode name. + 10/31/97 JL #2000184 - Changed prototypes for CreateFileThreadID and + ExchangeFiles. + 10/17/97 msd In CatalogCacheGlobals, add room for a single UniStr255 so + catalog iterators can step over long Unicode names. + 10/17/97 djb Add ConvertInputNameToUnicode for Catalog Create/Rename. + 10/1/97 djb Change catalog iterator implementation. + 7/16/97 DSH FilesInternal.i renamed FileMgrInternal.i to avoid name + collision + 6/24/97 djb Add LocateCatalogNodeByMangledName routine. 
+ 6/24/97 djb first checked in +*/ + +#ifndef __CATALOGPRIVATE__ +#define __CATALOGPRIVATE__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + +#include "hfs_format.h" + +#include "FileMgrInternal.h" +#include "BTreesInternal.h" + +// +// Private Catalog Manager Routines (for use only by Catalog Manager, CatSearch and FileID Services) +// + + +extern OSErr LocateCatalogNodeByKey ( const ExtendedVCB *volume, u_int32_t hint, CatalogKey *keyPtr, + CatalogRecord *dataPtr, u_int32_t *newHint ); + +extern OSErr LocateCatalogRecord( const ExtendedVCB *volume, HFSCatalogNodeID folderID, const CatalogName *name, + u_int32_t hint, CatalogKey *keyPtr, CatalogRecord *dataPtr, u_int32_t *newHint); + +extern OSErr LocateCatalogNodeWithRetry ( const ExtendedVCB *volume, HFSCatalogNodeID folderID, ConstStr31Param pascalName, + CatalogName *unicodeName, u_int32_t hint, CatalogKey *keyPtr, CatalogRecord *dataPtr, + u_int32_t *newHint ); +extern OSErr FlushCatalog( ExtendedVCB *volume); + + +extern void ConvertInputNameToUnicode(ConstStr31Param name, TextEncoding encodingHint, + TextEncoding *actualEncoding, CatalogName *catalogName); + +extern void BuildCatalogKey( HFSCatalogNodeID parentID, const CatalogName *name, Boolean isHFSPlus, + CatalogKey *key); + +extern OSErr BuildCatalogKeyUTF8(ExtendedVCB *volume, HFSCatalogNodeID parentID, const unsigned char *name, + u_int32_t length, CatalogKey *key); + +extern void CopyCatalogName( const CatalogName *srcName, CatalogName *dstName, Boolean isHFSPLus); + +extern OSErr ResolveFileID( ExtendedVCB *vcb, HFSCatalogNodeID fileID, HFSCatalogNodeID *parentID, Str31 name ); + +#if 0 +extern OSErr CreateFileThreadID( FIDParam *filePB, WDCBRecPtr *wdcbPtr ); + +extern OSErr ExchangeFiles( FIDParam *filePB, WDCBRecPtr *wdcbPtr ); +#endif + +extern void UpdateCatalogName( ConstStr31Param srcName, Str31 destName ); + + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif //__CATALOGPRIVATE__ diff --git a/core/CatalogUtilities.c b/core/CatalogUtilities.c new file mode 100644 index 0000000..e56473d --- /dev/null +++ b/core/CatalogUtilities.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2000-2002, 2004-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include + +#include "FileMgrInternal.h" +#include "BTreesInternal.h" +#include "CatalogPrivate.h" +#include "HFSUnicodeWrappers.h" +#include "BTreesPrivate.h" +#include + +// +// Routine: LocateCatalogNodeByKey +// +// Function: Locates the catalog record for an existing folder or file +// CNode and returns the key and data records. +// + +OSErr +LocateCatalogNodeByKey(const ExtendedVCB *volume, u_int32_t hint, CatalogKey *keyPtr, + CatalogRecord *dataPtr, u_int32_t *newHint) +{ + OSErr result; + CatalogName *nodeName = NULL; + HFSCatalogNodeID threadParentID; + u_int16_t tempSize; + FSBufferDescriptor btRecord; + struct BTreeIterator *searchIterator; + FCB *fcb; + + searchIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + fcb = GetFileControlBlock(volume->catalogRefNum); + + btRecord.bufferAddress = dataPtr; + btRecord.itemCount = 1; + btRecord.itemSize = sizeof(CatalogRecord); + + searchIterator->hint.nodeNum = hint; + + bcopy(keyPtr, &searchIterator->key, sizeof(CatalogKey)); + + result = BTSearchRecord( fcb, searchIterator, &btRecord, &tempSize, searchIterator ); + + if (result == noErr) + { + *newHint = searchIterator->hint.nodeNum; + + BlockMoveData(&searchIterator->key, keyPtr, sizeof(CatalogKey)); + } + + if (result == btNotFound) { + result = cmNotFound; + } + + if (result) { + hfs_free(searchIterator, sizeof(*searchIterator)); + return result; + } + + // if we got a thread record, then go look up real record + switch ( dataPtr->recordType ) + { + +#if CONFIG_HFS_STD + case kHFSFileThreadRecord: + case kHFSFolderThreadRecord: + threadParentID = dataPtr->hfsThread.parentID; + nodeName = (CatalogName *) &dataPtr->hfsThread.nodeName; + break; +#endif + + case kHFSPlusFileThreadRecord: + case kHFSPlusFolderThreadRecord: + threadParentID = dataPtr->hfsPlusThread.parentID; + nodeName = (CatalogName *) &dataPtr->hfsPlusThread.nodeName; + break; + + default: + threadParentID = 0; + break; + } + + if ( threadParentID ) // found a thread + result = LocateCatalogRecord(volume, threadParentID, nodeName, kNoHint, keyPtr, dataPtr, newHint); + + hfs_free(searchIterator, sizeof(*searchIterator)); + return result; +} + + + +//******************************************************************************* +// Routine: LocateCatalogRecord +// +// Function: Locates the catalog record associated with folderID and name +// +//******************************************************************************* + +OSErr +LocateCatalogRecord(const ExtendedVCB *volume, HFSCatalogNodeID folderID, const CatalogName *name, + __unused u_int32_t hint, CatalogKey *keyPtr, CatalogRecord *dataPtr, u_int32_t *newHint) +{ + OSErr result; + uint16_t tempSize; + FSBufferDescriptor btRecord; + struct BTreeIterator *searchIterator = NULL; + FCB *fcb; + BTreeControlBlock *btcb; + + searchIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + fcb = GetFileControlBlock(volume->catalogRefNum); + btcb = (BTreeControlBlock *)fcb->fcbBTCBPtr; + + btRecord.bufferAddress = dataPtr; + btRecord.itemCount = 1; + btRecord.itemSize = sizeof(CatalogRecord); + + BuildCatalogKey(folderID, name, (volume->vcbSigWord == kHFSPlusSigWord), (CatalogKey *)&searchIterator->key); + + result = BTSearchRecord(fcb, searchIterator, &btRecord, &tempSize, searchIterator); + if (result == noErr) { + *newHint = searchIterator->hint.nodeNum; + BlockMoveData(&searchIterator->key, keyPtr, CalcKeySize(btcb, &searchIterator->key)); + } + + 
hfs_free(searchIterator, sizeof(*searchIterator)); + return (result == btNotFound ? cmNotFound : result); +} + + + +/* + * Routine: BuildCatalogKey + * + * Function: Constructs a catalog key record (ckr) given the parent + * folder ID and CName. Works for both classic and extended + * HFS volumes. + * + */ + +void +BuildCatalogKey(HFSCatalogNodeID parentID, const CatalogName *cName, Boolean isHFSPlus, CatalogKey *key) +{ + if ( isHFSPlus ) + { + key->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength; // initial key length (4 + 2) + key->hfsPlus.parentID = parentID; // set parent ID + key->hfsPlus.nodeName.length = 0; // null CName length + if ( cName != NULL ) + { + CopyCatalogName(cName, (CatalogName *) &key->hfsPlus.nodeName, isHFSPlus); + key->hfsPlus.keyLength += sizeof(UniChar) * cName->ustr.length; // add CName size to key length + } + } +#if CONFIG_HFS_STD + else + { + key->hfs.keyLength = kHFSCatalogKeyMinimumLength; // initial key length (1 + 4 + 1) + key->hfs.reserved = 0; // clear unused byte + key->hfs.parentID = parentID; // set parent ID + key->hfs.nodeName[0] = 0; // null CName length + if ( cName != NULL ) + { + UpdateCatalogName(cName->pstr, key->hfs.nodeName); + key->hfs.keyLength += key->hfs.nodeName[0]; // add CName size to key length + } + } +#endif + +} + +OSErr +BuildCatalogKeyUTF8(ExtendedVCB *volume, HFSCatalogNodeID parentID, const unsigned char *name, u_int32_t nameLength, + CatalogKey *key) +{ + OSErr err = 0; + + if ( name == NULL) + nameLength = 0; + else if (nameLength == kUndefinedStrLen) + nameLength = strlen((const char *)name); + + if ( volume->vcbSigWord == kHFSPlusSigWord ) { + size_t unicodeBytes = 0; + + key->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength; // initial key length (4 + 2) + key->hfsPlus.parentID = parentID; // set parent ID + key->hfsPlus.nodeName.length = 0; // null CName length + if ( nameLength > 0 ) { + err = utf8_decodestr(name, nameLength, key->hfsPlus.nodeName.unicode, + &unicodeBytes, sizeof(key->hfsPlus.nodeName.unicode), ':', UTF_DECOMPOSED); + key->hfsPlus.nodeName.length = unicodeBytes / sizeof(UniChar); + key->hfsPlus.keyLength += unicodeBytes; + } + } +#if CONFIG_HFS_STD + else { + key->hfs.keyLength = kHFSCatalogKeyMinimumLength; // initial key length (1 + 4 + 1) + key->hfs.reserved = 0; // clear unused byte + key->hfs.parentID = parentID; // set parent ID + key->hfs.nodeName[0] = 0; // null CName length + if ( nameLength > 0 ) { + err = utf8_to_hfs(volume, nameLength, name, &key->hfs.nodeName[0]); + /* + * Retry with MacRoman in case that's how it was exported. + * When textEncoding != NULL we know that this is a create + * or rename call and can skip the retry (ugly but it works). + */ + if (err) + err = utf8_to_mac_roman(nameLength, name, &key->hfs.nodeName[0]); + key->hfs.keyLength += key->hfs.nodeName[0]; // add CName size to key length + } + } +#endif + + if (err) { + if (err == ENAMETOOLONG) + err = bdNamErr; /* name is too long */ + else + err = paramErr; /* name has invalid characters */ + } + + return err; +} + + +//******************************************************************************* +// Routine: FlushCatalog +// +// Function: Flushes the catalog for a specified volume. 
+// +//******************************************************************************* + +OSErr +FlushCatalog(ExtendedVCB *volume) +{ + FCB * fcb; + OSErr result; + struct hfsmount *hfsmp = VCBTOHFS (volume); + + fcb = GetFileControlBlock(volume->catalogRefNum); + result = BTFlushPath(fcb); + + if (result == noErr) + { + //--- check if catalog's fcb is dirty... + + if ( (0) /*fcb->fcbFlags & fcbModifiedMask*/ ) + { + hfs_lock_mount (hfsmp); + MarkVCBDirty(volume); // Mark the VCB dirty + volume->vcbLsMod = GetTimeUTC(); // update last modified date + hfs_unlock_mount (hfsmp); + + // result = FlushVolumeControlBlock(volume); + } + } + + return result; +} + + +//ÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑ +// Routine: UpdateCatalogName +// +// Function: Updates a CName. +// +//ÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑÑ + +void +UpdateCatalogName(ConstStr31Param srcName, Str31 destName) +{ + Size length = srcName[0]; + + if (length > CMMaxCName) + length = CMMaxCName; // truncate to max + + destName[0] = length; // set length byte + + BlockMoveData(&srcName[1], &destName[1], length); +} + +//_______________________________________________________________________ + +void +CopyCatalogName(const CatalogName *srcName, CatalogName *dstName, Boolean isHFSPlus) +{ + u_int32_t length = 0; + + if ( srcName == NULL ) + { + if ( dstName != NULL ) + dstName->ustr.length = 0; // set length byte to zero (works for both unicode and pascal) + return; + } + + if (isHFSPlus) { + length = sizeof(UniChar) * (srcName->ustr.length + 1); + } +#if CONFIG_HFS_STD + else { + length = sizeof(u_int8_t) + srcName->pstr[0]; + } +#endif + + if ( length > 1 ) + BlockMoveData(srcName, dstName, length); + else + dstName->ustr.length = 0; // set length byte to zero (works for both unicode and pascal) +} + diff --git a/core/FileExtentMapping.c b/core/FileExtentMapping.c new file mode 100644 index 0000000..1ea93f9 --- /dev/null +++ b/core/FileExtentMapping.c @@ -0,0 +1,2249 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#include "hfs.h" +#include "hfs_format.h" +#include "hfs_endian.h" + +#include "FileMgrInternal.h" +#include "BTreesInternal.h" + +#include + +/* +============================================================ +Public (Exported) Routines: +============================================================ + + ExtendFileC Allocate more space to a given file. + + CompareExtentKeys + Compare two extents file keys (a search key and a trial + key). Used by the BTree manager when searching for, + adding, or deleting keys in the extents file of an HFS + volume. + + CompareExtentKeysPlus + Compare two extents file keys (a search key and a trial + key). Used by the BTree manager when searching for, + adding, or deleting keys in the extents file of an HFS+ + volume. + + MapFileBlockC Convert (map) an offset within a given file into a + physical disk address. + + TruncateFileC Truncates the disk space allocated to a file. The file + space is truncated to a specified new physical EOF, rounded + up to the next allocation block boundry. There is an option + to truncate to the end of the extent containing the new EOF. + + FlushExtentFile + Flush the extents file for a given volume. + + SearchExtentFile + Search the FCB and extents file for an extent record that + contains a given file position (in bytes). + + +============================================================ +Internal Routines: +============================================================ + FindExtentRecord + Search the extents BTree for a particular extent record. + SearchExtentRecord + Search a given extent record to see if it contains a given + file position (in bytes). Used by SearchExtentFile. + ReleaseExtents + Deallocate all allocation blocks in all extents of an extent + data record. + TruncateExtents + Deallocate blocks and delete extent records for all allocation + blocks beyond a certain point in a file. The starting point + must be the first file allocation block for some extent record + for the file. + DeallocateFork + Deallocate all allocation blocks belonging to a given fork. + UpdateExtentRecord + If the extent record came from the extents file, write out + the updated record; otherwise, copy the updated record into + the FCB resident extent record. If the record has no extents, + and was in the extents file, then delete the record instead. 
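    For orientation, the self-contained sketch below models the extent-record
    walk that SearchExtentRecord and MapFileBlockC rely on.  The sketch_ names
    and the two-field struct are illustrative only; the real on-disk layouts
    (HFSPlusExtentDescriptor / HFSPlusExtentRecord) are declared in hfs_format.h.

        #include <stdbool.h>
        #include <stdint.h>

        #define kSketchExtentDensity 8          // mirrors kHFSPlusExtentDensity

        struct sketch_extent { uint32_t startBlock, blockCount; };

        // Map a file allocation block number (FABN) to a volume allocation
        // block using one extent record.  recordStartFABN is the FABN at
        // which the record's first descriptor begins.
        bool sketch_map_fabn(const struct sketch_extent rec[kSketchExtentDensity],
                             uint32_t recordStartFABN, uint32_t fabn,
                             uint32_t *volBlock)
        {
            uint32_t next = recordStartFABN;
            for (int i = 0; i < kSketchExtentDensity && rec[i].blockCount != 0; ++i) {
                if (fabn < next + rec[i].blockCount) {
                    *volBlock = rec[i].startBlock + (fabn - next);
                    return true;
                }
                next += rec[i].blockCount;
            }
            return false;   // not mapped here; the next overflow record continues the fork
        }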
+*/ + +#if CONFIG_HFS_STD +static const int64_t kTwoGigabytes = 0x80000000LL; +#endif + +enum +{ + kDataForkType = 0, + kResourceForkType = 0xFF, + + kPreviousRecord = -1 +}; + + +#if CONFIG_HFS_STD +static OSErr HFSPlusToHFSExtents( + const HFSPlusExtentRecord oldExtents, + HFSExtentRecord newExtents); +#endif + +static OSErr FindExtentRecord( + const ExtendedVCB *vcb, + u_int8_t forkType, + u_int32_t fileID, + u_int32_t startBlock, + Boolean allowPrevious, + HFSPlusExtentKey *foundKey, + HFSPlusExtentRecord foundData, + u_int32_t *foundHint); + +static OSErr DeleteExtentRecord( + const ExtendedVCB *vcb, + u_int8_t forkType, + u_int32_t fileID, + u_int32_t startBlock); + +static OSErr CreateExtentRecord( + ExtendedVCB *vcb, + HFSPlusExtentKey *key, + HFSPlusExtentRecord extents, + u_int32_t *hint); + + +static OSErr GetFCBExtentRecord( + const FCB *fcb, + HFSPlusExtentRecord extents); + +static OSErr SearchExtentRecord( + ExtendedVCB *vcb, + u_int32_t searchFABN, + const HFSPlusExtentRecord extentData, + u_int32_t extentDataStartFABN, + u_int32_t *foundExtentDataOffset, + u_int32_t *endingFABNPlusOne, + Boolean *noMoreExtents); + +static OSErr ReleaseExtents( + ExtendedVCB *vcb, + const HFSPlusExtentRecord extentRecord, + u_int32_t *numReleasedAllocationBlocks, + Boolean *releasedLastExtent); + +static OSErr DeallocateFork( + ExtendedVCB *vcb, + HFSCatalogNodeID fileID, + u_int8_t forkType, + HFSPlusExtentRecord catalogExtents, + Boolean * recordDeleted); + +static OSErr TruncateExtents( + ExtendedVCB *vcb, + u_int8_t forkType, + u_int32_t fileID, + u_int32_t startBlock, + Boolean * recordDeleted); + +static OSErr UpdateExtentRecord ( + ExtendedVCB *vcb, + FCB *fcb, + int deleted, + const HFSPlusExtentKey *extentFileKey, + const HFSPlusExtentRecord extentData, + u_int32_t extentBTreeHint); + +static Boolean ExtentsAreIntegral( + const HFSPlusExtentRecord extentRecord, + u_int32_t mask, + u_int32_t *blocksChecked, + Boolean *checkedLastExtent); + +//_________________________________________________________________________________ +// +// Routine: FindExtentRecord +// +// Purpose: Search the extents BTree for an extent record matching the given +// FileID, fork, and starting file allocation block number. +// +// Inputs: +// vcb Volume to search +// forkType 0 = data fork, -1 = resource fork +// fileID File's FileID (CatalogNodeID) +// startBlock Starting file allocation block number +// allowPrevious If the desired record isn't found and this flag is set, +// then see if the previous record belongs to the same fork. +// If so, then return it. +// +// Outputs: +// foundKey The key data for the record actually found +// foundData The extent record actually found (NOTE: on an HFS volume, the +// fourth entry will be zeroes. 
+// foundHint The BTree hint to find the node again +//_________________________________________________________________________________ +static OSErr FindExtentRecord( + const ExtendedVCB *vcb, + u_int8_t forkType, + u_int32_t fileID, + u_int32_t startBlock, + Boolean allowPrevious, + HFSPlusExtentKey *foundKey, + HFSPlusExtentRecord foundData, + u_int32_t *foundHint) +{ + FCB * fcb; + struct BTreeIterator *btIterator = NULL; + FSBufferDescriptor btRecord; + OSErr err; + u_int16_t btRecordSize; + + err = noErr; + if (foundHint) + *foundHint = 0; + fcb = GetFileControlBlock(vcb->extentsRefNum); + + btIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + /* HFS Plus / HFSX */ + if (vcb->vcbSigWord != kHFSSigWord) { + HFSPlusExtentKey * extentKeyPtr; + HFSPlusExtentRecord extentData; + + extentKeyPtr = (HFSPlusExtentKey*) &btIterator->key; + extentKeyPtr->keyLength = kHFSPlusExtentKeyMaximumLength; + extentKeyPtr->forkType = forkType; + extentKeyPtr->pad = 0; + extentKeyPtr->fileID = fileID; + extentKeyPtr->startBlock = startBlock; + + btRecord.bufferAddress = &extentData; + btRecord.itemSize = sizeof(HFSPlusExtentRecord); + btRecord.itemCount = 1; + + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); + + if (err == btNotFound && allowPrevious) { + err = BTIterateRecord(fcb, kBTreePrevRecord, btIterator, &btRecord, &btRecordSize); + + // A previous record may not exist, so just return btNotFound (like we would if + // it was for the wrong file/fork). + if (err == (OSErr) fsBTStartOfIterationErr) //¥¥ fsBTStartOfIterationErr is type unsigned long + err = btNotFound; + + if (err == noErr) { + // Found a previous record. Does it belong to the same fork of the same file? + if (extentKeyPtr->fileID != fileID || extentKeyPtr->forkType != forkType) + err = btNotFound; + } + } + + if (err == noErr) { + // Copy the found key back for the caller + if (foundKey) + BlockMoveData(extentKeyPtr, foundKey, sizeof(HFSPlusExtentKey)); + // Copy the found data back for the caller + BlockMoveData(&extentData, foundData, sizeof(HFSPlusExtentRecord)); + } + } +#if CONFIG_HFS_STD + else { + HFSExtentKey * extentKeyPtr; + HFSExtentRecord extentData; + + extentKeyPtr = (HFSExtentKey*) &btIterator->key; + extentKeyPtr->keyLength = kHFSExtentKeyMaximumLength; + extentKeyPtr->forkType = forkType; + extentKeyPtr->fileID = fileID; + extentKeyPtr->startBlock = startBlock; + + btRecord.bufferAddress = &extentData; + btRecord.itemSize = sizeof(HFSExtentRecord); + btRecord.itemCount = 1; + + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); + + if (err == btNotFound && allowPrevious) { + err = BTIterateRecord(fcb, kBTreePrevRecord, btIterator, &btRecord, &btRecordSize); + + // A previous record may not exist, so just return btNotFound (like we would if + // it was for the wrong file/fork). + if (err == (OSErr) fsBTStartOfIterationErr) //¥¥ fsBTStartOfIterationErr is type unsigned long + err = btNotFound; + + if (err == noErr) { + // Found a previous record. Does it belong to the same fork of the same file? 
+ if (extentKeyPtr->fileID != fileID || extentKeyPtr->forkType != forkType) + err = btNotFound; + } + } + + if (err == noErr) { + u_int16_t i; + + // Copy the found key back for the caller + if (foundKey) { + foundKey->keyLength = kHFSPlusExtentKeyMaximumLength; + foundKey->forkType = extentKeyPtr->forkType; + foundKey->pad = 0; + foundKey->fileID = extentKeyPtr->fileID; + foundKey->startBlock = extentKeyPtr->startBlock; + } + // Copy the found data back for the caller + foundData[0].startBlock = extentData[0].startBlock; + foundData[0].blockCount = extentData[0].blockCount; + foundData[1].startBlock = extentData[1].startBlock; + foundData[1].blockCount = extentData[1].blockCount; + foundData[2].startBlock = extentData[2].startBlock; + foundData[2].blockCount = extentData[2].blockCount; + + for (i = 3; i < kHFSPlusExtentDensity; ++i) + { + foundData[i].startBlock = 0; + foundData[i].blockCount = 0; + } + } + } +#endif + + if (foundHint) + *foundHint = btIterator->hint.nodeNum; + + hfs_free(btIterator, sizeof(*btIterator)); + return err; +} + + + +static OSErr CreateExtentRecord( + ExtendedVCB *vcb, + HFSPlusExtentKey *key, + HFSPlusExtentRecord extents, + u_int32_t *hint) +{ + struct BTreeIterator *btIterator = NULL; + FSBufferDescriptor btRecord; + u_int16_t btRecordSize; + int lockflags; + OSErr err; + + err = noErr; + *hint = 0; + + btIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + /* + * The lock taken by callers of ExtendFileC is speculative and + * only occurs when the file already has overflow extents. So + * We need to make sure we have the lock here. The extents + * btree lock can be nested (its recursive) so we always take + * it here. + */ + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + /* HFS+/HFSX */ + if (vcb->vcbSigWord != kHFSSigWord) { + btRecordSize = sizeof(HFSPlusExtentRecord); + btRecord.bufferAddress = extents; + btRecord.itemSize = btRecordSize; + btRecord.itemCount = 1; + + BlockMoveData(key, &btIterator->key, sizeof(HFSPlusExtentKey)); + } +#if CONFIG_HFS_STD + else { + /* HFS Standard */ + HFSExtentKey * keyPtr; + HFSExtentRecord data; + + btRecordSize = sizeof(HFSExtentRecord); + btRecord.bufferAddress = &data; + btRecord.itemSize = btRecordSize; + btRecord.itemCount = 1; + + keyPtr = (HFSExtentKey*) &btIterator->key; + keyPtr->keyLength = kHFSExtentKeyMaximumLength; + keyPtr->forkType = key->forkType; + keyPtr->fileID = key->fileID; + keyPtr->startBlock = key->startBlock; + + err = HFSPlusToHFSExtents(extents, data); + } +#endif + + if (err == noErr) + err = BTInsertRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator, &btRecord, btRecordSize); + + if (err == noErr) + *hint = btIterator->hint.nodeNum; + + (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); + + hfs_systemfile_unlock(vcb, lockflags); + + hfs_free(btIterator, sizeof(*btIterator)); + return err; +} + + +static OSErr DeleteExtentRecord( + const ExtendedVCB *vcb, + u_int8_t forkType, + u_int32_t fileID, + u_int32_t startBlock) +{ + struct BTreeIterator *btIterator = NULL; + OSErr err; + + err = noErr; + + btIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + /* HFS+ / HFSX */ + if (vcb->vcbSigWord != kHFSSigWord) { // HFS Plus volume + HFSPlusExtentKey * keyPtr; + + keyPtr = (HFSPlusExtentKey*) &btIterator->key; + keyPtr->keyLength = kHFSPlusExtentKeyMaximumLength; + keyPtr->forkType = forkType; + keyPtr->pad = 0; + keyPtr->fileID = fileID; + keyPtr->startBlock = startBlock; + } +#if CONFIG_HFS_STD + else { + /* HFS standard */ + HFSExtentKey 
* keyPtr; + + keyPtr = (HFSExtentKey*) &btIterator->key; + keyPtr->keyLength = kHFSExtentKeyMaximumLength; + keyPtr->forkType = forkType; + keyPtr->fileID = fileID; + keyPtr->startBlock = startBlock; + } +#endif + + err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator); + (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); + + + hfs_free(btIterator, sizeof(*btIterator)); + return err; +} + + + +//_________________________________________________________________________________ +// +// Routine: MapFileBlock +// +// Function: Maps a file position into a physical disk address. +// +//_________________________________________________________________________________ + +OSErr MapFileBlockC ( + ExtendedVCB *vcb, // volume that file resides on + FCB *fcb, // FCB of file + size_t numberOfBytes, // number of contiguous bytes desired + off_t offset, // starting offset within file (in bytes) + daddr64_t *startSector, // first sector (NOT an allocation block) + size_t *availableBytes) // number of contiguous bytes (up to numberOfBytes) +{ + OSErr err; + u_int32_t allocBlockSize; // Size of the volume's allocation block + u_int32_t sectorSize; + HFSPlusExtentKey foundKey; + HFSPlusExtentRecord foundData; + u_int32_t foundIndex; + u_int32_t hint; + u_int32_t firstFABN = 0; // file allocation block of first block in found extent + u_int32_t nextFABN; // file allocation block of block after end of found extent + off_t dataEnd; // (offset) end of range that is contiguous + u_int32_t sectorsPerBlock; // Number of sectors per allocation block + u_int32_t startBlock = 0; // volume allocation block corresponding to firstFABN + daddr64_t temp; + off_t tmpOff; + + allocBlockSize = vcb->blockSize; + sectorSize = VCBTOHFS(vcb)->hfs_logical_block_size; + + err = SearchExtentFile(vcb, fcb, offset, &foundKey, foundData, &foundIndex, &hint, &nextFABN); + if (err == noErr) { + startBlock = foundData[foundIndex].startBlock; + firstFABN = nextFABN - foundData[foundIndex].blockCount; + } + + if (err != noErr) + { + return err; + } + + // + // Determine the end of the available space. It will either be the end of the extent, + // or the file's PEOF, whichever is smaller. + // + dataEnd = (off_t)((off_t)(nextFABN) * (off_t)(allocBlockSize)); // Assume valid data through end of this extent + if (((off_t)fcb->ff_blocks * (off_t)allocBlockSize) < dataEnd) // Is PEOF shorter? + dataEnd = (off_t)fcb->ff_blocks * (off_t)allocBlockSize; // Yes, so only map up to PEOF + + // Compute the number of sectors in an allocation block + sectorsPerBlock = allocBlockSize / sectorSize; // sectors per allocation block + + // + // Compute the absolute sector number that contains the offset of the given file + // offset in sectors from start of the extent + + // offset in sectors from start of allocation block space + // + temp = (daddr64_t)((offset - (off_t)((off_t)(firstFABN) * (off_t)(allocBlockSize)))/sectorSize); + temp += (daddr64_t)startBlock * (daddr64_t)sectorsPerBlock; + + /* Add in any volume offsets */ + if (vcb->vcbSigWord == kHFSPlusSigWord) + temp += vcb->hfsPlusIOPosOffset / sectorSize; + else + temp += vcb->vcbAlBlSt; + + // Return the desired sector for file position "offset" + *startSector = temp; + + // + // Determine the number of contiguous bytes until the end of the extent + // (or the amount they asked for, whichever comes first). + // + if (availableBytes) + { + tmpOff = dataEnd - offset; + /* + * Disallow negative runs. 
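	 *
	 * Worked example of the mapping just computed (illustrative numbers, not
	 * from a real volume): with 4096-byte allocation blocks and 512-byte
	 * logical blocks (sectorsPerBlock = 8), an extent whose first file
	 * allocation block is firstFABN = 16 and whose volume start block is 100
	 * maps file offset 70000 to (70000 - 16*4096)/512 + 100*8 = 8 + 800 = 808
	 * logical blocks into the allocation space; the hfsPlusIOPosOffset (or
	 * vcbAlBlSt on HFS) adjustment above then makes that device-relative.
	 *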
+ */ + if (tmpOff <= 0) { + /* This shouldn't happen unless something is corrupt */ + hfs_corruption_debug("MapFileBlockC: tmpOff <= 0 (%lld)\n", tmpOff); + return EINVAL; + } + + if (tmpOff > (off_t)(numberOfBytes)) { + *availableBytes = numberOfBytes; // more there than they asked for, so pin the output + } + else { + *availableBytes = tmpOff; + } + } + + return noErr; +} + + +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: ReleaseExtents +// +// Function: Release the extents of a single extent data record. +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +static OSErr ReleaseExtents( + ExtendedVCB *vcb, + const HFSPlusExtentRecord extentRecord, + u_int32_t *numReleasedAllocationBlocks, + Boolean *releasedLastExtent) +{ + u_int32_t extentIndex; + u_int32_t numberOfExtents; + OSErr err = noErr; + + *numReleasedAllocationBlocks = 0; + *releasedLastExtent = false; + + if (vcb->vcbSigWord == kHFSPlusSigWord) + numberOfExtents = kHFSPlusExtentDensity; + else + numberOfExtents = kHFSExtentDensity; + + for( extentIndex = 0; extentIndex < numberOfExtents; extentIndex++) + { + u_int32_t numAllocationBlocks; + + // Loop over the extent record and release the blocks associated with each extent. + + numAllocationBlocks = extentRecord[extentIndex].blockCount; + if ( numAllocationBlocks == 0 ) + { + *releasedLastExtent = true; + break; + } + + err = BlockDeallocate( vcb, extentRecord[extentIndex].startBlock, numAllocationBlocks , 0); + if ( err != noErr ) + break; + + *numReleasedAllocationBlocks += numAllocationBlocks; // bump FABN to beg of next extent + } + + return( err ); +} + + + +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: TruncateExtents +// +// Purpose: Delete extent records whose starting file allocation block number +// is greater than or equal to a given starting block number. The +// allocation blocks represented by the extents are deallocated. +// +// Inputs: +// vcb Volume to operate on +// fileID Which file to operate on +// startBlock Starting file allocation block number for first extent +// record to delete. +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +static OSErr TruncateExtents( + ExtendedVCB *vcb, + u_int8_t forkType, + u_int32_t fileID, + u_int32_t startBlock, + Boolean * recordDeleted) +{ + OSErr err; + u_int32_t numberExtentsReleased; + Boolean releasedLastExtent; + u_int32_t hint; + HFSPlusExtentKey key; + HFSPlusExtentRecord extents; + int lockflags; + + /* + * The lock taken by callers of TruncateFileC is speculative and + * only occurs when the file already has overflow extents. So + * We need to make sure we have the lock here. The extents + * btree lock can be nested (its recursive) so we always take + * it here. 
+ */ + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + while (true) { + err = FindExtentRecord(vcb, forkType, fileID, startBlock, false, &key, extents, &hint); + if (err != noErr) { + if (err == btNotFound) + err = noErr; + break; + } + + err = ReleaseExtents( vcb, extents, &numberExtentsReleased, &releasedLastExtent ); + if (err != noErr) break; + + err = DeleteExtentRecord(vcb, forkType, fileID, startBlock); + if (err != noErr) break; + + *recordDeleted = true; + startBlock += numberExtentsReleased; + } + hfs_systemfile_unlock(vcb, lockflags); + + return err; +} + + + +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: DeallocateFork +// +// Function: De-allocates all disk space allocated to a specified fork. +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +static OSErr DeallocateFork( + ExtendedVCB *vcb, + HFSCatalogNodeID fileID, + u_int8_t forkType, + HFSPlusExtentRecord catalogExtents, + Boolean * recordDeleted) /* true if a record was deleted */ +{ + OSErr err; + u_int32_t numReleasedAllocationBlocks; + Boolean releasedLastExtent; + + // Release the catalog extents + err = ReleaseExtents( vcb, catalogExtents, &numReleasedAllocationBlocks, &releasedLastExtent ); + // Release the extra extents, if present + if (err == noErr && !releasedLastExtent) + err = TruncateExtents(vcb, forkType, fileID, numReleasedAllocationBlocks, recordDeleted); + + return( err ); +} + +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: FlushExtentFile +// +// Function: Flushes the extent file for a specified volume +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +OSErr FlushExtentFile( ExtendedVCB *vcb ) +{ + FCB * fcb; + OSErr err; + int lockflags; + + fcb = GetFileControlBlock(vcb->extentsRefNum); + + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + err = BTFlushPath(fcb); + hfs_systemfile_unlock(vcb, lockflags); + + if ( err == noErr ) + { + // If the FCB for the extent "file" is dirty, mark the VCB as dirty. + + if (FTOC(fcb)->c_flag & C_MODIFIED) + { + MarkVCBDirty( vcb ); + // err = FlushVolumeControlBlock( vcb ); + } + } + + return( err ); +} + + +#if CONFIG_HFS_STD +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: CompareExtentKeys +// +// Function: Compares two extent file keys (a search key and a trial key) for +// an HFS volume. 
+//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +int32_t CompareExtentKeys( const HFSExtentKey *searchKey, const HFSExtentKey *trialKey ) +{ + int32_t result; // ± 1 + + #if DEBUG + if (searchKey->keyLength != kHFSExtentKeyMaximumLength) + DebugStr("HFS: search Key is wrong length"); + if (trialKey->keyLength != kHFSExtentKeyMaximumLength) + DebugStr("HFS: trial Key is wrong length"); + #endif + + result = -1; // assume searchKey < trialKey + + if (searchKey->fileID == trialKey->fileID) { + // + // FileNum's are equal; compare fork types + // + if (searchKey->forkType == trialKey->forkType) { + // + // Fork types are equal; compare allocation block number + // + if (searchKey->startBlock == trialKey->startBlock) { + // + // Everything is equal + // + result = 0; + } + else { + // + // Allocation block numbers differ; determine sign + // + if (searchKey->startBlock > trialKey->startBlock) + result = 1; + } + } + else { + // + // Fork types differ; determine sign + // + if (searchKey->forkType > trialKey->forkType) + result = 1; + } + } + else { + // + // FileNums differ; determine sign + // + if (searchKey->fileID > trialKey->fileID) + result = 1; + } + + return( result ); +} +#endif + + +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: CompareExtentKeysPlus +// +// Function: Compares two extent file keys (a search key and a trial key) for +// an HFS volume. +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +int32_t CompareExtentKeysPlus( const HFSPlusExtentKey *searchKey, const HFSPlusExtentKey *trialKey ) +{ + int32_t result; // ± 1 + + #if DEBUG + if (searchKey->keyLength != kHFSPlusExtentKeyMaximumLength) + DebugStr("HFS: search Key is wrong length"); + if (trialKey->keyLength != kHFSPlusExtentKeyMaximumLength) + DebugStr("HFS: trial Key is wrong length"); + #endif + + result = -1; // assume searchKey < trialKey + + if (searchKey->fileID == trialKey->fileID) { + // + // FileNum's are equal; compare fork types + // + if (searchKey->forkType == trialKey->forkType) { + // + // Fork types are equal; compare allocation block number + // + if (searchKey->startBlock == trialKey->startBlock) { + // + // Everything is equal + // + result = 0; + } + else { + // + // Allocation block numbers differ; determine sign + // + if (searchKey->startBlock > trialKey->startBlock) + result = 1; + } + } + else { + // + // Fork types differ; determine sign + // + if (searchKey->forkType > trialKey->forkType) + result = 1; + } + } + else { + // + // FileNums differ; determine sign + // + if (searchKey->fileID > trialKey->fileID) + result = 1; + } + + return( result ); +} + +static int +should_pin_blocks(hfsmount_t *hfsmp, FCB *fcb) +{ + if (!ISSET(hfsmp->hfs_flags, HFS_CS_HOTFILE_PIN) + || fcb->ff_cp == NULL || fcb->ff_cp->c_vp == NULL) { + return 0; + } + + int pin_blocks; + + // + // File system metadata should get pinned + // + if (vnode_issystem(fcb->ff_cp->c_vp)) { + return 1; + } + + // + // If a file is AutoCandidate, we should not pin its blocks because + // it was an automatically added file and this function is intended + // to pin new blocks being added to user-generated content. + // + if (fcb->ff_cp->c_attr.ca_recflags & kHFSAutoCandidateMask) { + return 0; + } + + // + // If a file is marked FastDevPinned it is an existing pinned file + // or a new file that should be pinned. 
+ // + // If a file is marked FastDevCandidate it is a new file that is + // being written to for the first time so we don't want to pin it + // just yet as it may not meet the criteria (i.e. too large). + // + if ((fcb->ff_cp->c_attr.ca_recflags & (kHFSFastDevPinnedMask)) != 0) { + pin_blocks = 1; + } else { + pin_blocks = 0; + } + + return pin_blocks; +} + + + +static void +pin_blocks_if_needed(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount) +{ + if (!should_pin_blocks(vcb, fcb)) { + return; + } + + // ask CoreStorage to pin the new blocks being added to this file + if (hfs_pin_block_range((struct hfsmount *)vcb, HFS_PIN_IT, startBlock, blockCount) == 0) { + struct vnode *vp = fcb->ff_cp->c_vp; + + // and make sure to keep our accounting in order + hfs_hotfile_adjust_blocks(vp, -blockCount); + } +} + + + +/* + * Add a file extent to a file. + * + * Used by hfs_extendfs to extend the volume allocation bitmap file. + * + */ +int +AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount) +{ + HFSPlusExtentKey foundKey; + HFSPlusExtentRecord foundData; + u_int32_t foundIndex; + u_int32_t hint; + u_int32_t nextBlock; + int64_t peof; + int i; + int error; + + peof = (int64_t)(fcb->ff_blocks + blockCount) * (int64_t)vcb->blockSize; + + error = SearchExtentFile(vcb, fcb, peof-1, &foundKey, foundData, &foundIndex, &hint, &nextBlock); + if (error != fxRangeErr) + return (EBUSY); + + /* + * Add new extent. See if there is room in the current record. + */ + if (foundData[foundIndex].blockCount != 0) + ++foundIndex; + if (foundIndex == kHFSPlusExtentDensity) { + /* + * Existing record is full so create a new one. + */ + foundKey.keyLength = kHFSPlusExtentKeyMaximumLength; + foundKey.forkType = kDataForkType; + foundKey.pad = 0; + foundKey.fileID = FTOC(fcb)->c_fileid; + foundKey.startBlock = nextBlock; + + foundData[0].startBlock = startBlock; + foundData[0].blockCount = blockCount; + + /* zero out remaining extents. */ + for (i = 1; i < kHFSPlusExtentDensity; ++i) { + foundData[i].startBlock = 0; + foundData[i].blockCount = 0; + } + + foundIndex = 0; + + error = CreateExtentRecord(vcb, &foundKey, foundData, &hint); + if (error == fxOvFlErr) { + error = dskFulErr; + } else if (error == 0) { + pin_blocks_if_needed(vcb, fcb, startBlock, blockCount); + } + + } else { + /* + * Add a new extent into existing record. + */ + foundData[foundIndex].startBlock = startBlock; + foundData[foundIndex].blockCount = blockCount; + error = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); + if (error == 0) { + pin_blocks_if_needed(vcb, fcb, startBlock, blockCount); + } + } + (void) FlushExtentFile(vcb); + + return (error); +} + + +//_________________________________________________________________________________ +// +// Routine: Extendfile +// +// Function: Extends the disk space allocated to a file. 
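/*
 * Broadly, the loop below rounds the request up to whole allocation blocks
 * and then walks a retry ladder: a contiguous allocation of the whole
 * remaining amount first, then piecewise allocations, then the metadata
 * zone, and finally journal-flush assisted allocation (HFS_ALLOC_FLUSHTXN).
 * A minimal model of the rounding, assuming only that howmany() rounds up
 * as defined in <sys/param.h>:
 *
 *     int64_t blocksToAdd = howmany(bytesToAdd, volumeBlockSize); // blocks needed
 *     int64_t rounded = blocksToAdd * (int64_t)volumeBlockSize;   // bytes reserved
 *     // e.g. bytesToAdd = 10000 with 4096-byte blocks -> 3 blocks, 12288 bytes
 */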
+// +//_________________________________________________________________________________ + +OSErr ExtendFileC ( + ExtendedVCB *vcb, // volume that file resides on + FCB *fcb, // FCB of file to truncate + int64_t bytesToAdd, // number of bytes to allocate + u_int32_t blockHint, // desired starting allocation block + u_int32_t flags, // EFContig and/or EFAll + int64_t *actualBytesAdded) // number of bytes actually allocated +{ + OSErr err; + u_int32_t volumeBlockSize; + int64_t blocksToAdd; + int64_t bytesThisExtent; + HFSPlusExtentKey foundKey; + HFSPlusExtentRecord foundData; + u_int32_t foundIndex; + u_int32_t hint; + u_int32_t nextBlock; + u_int32_t startBlock; + Boolean allOrNothing; + Boolean forceContig; + Boolean wantContig; + Boolean useMetaZone; + Boolean needsFlush; + int allowFlushTxns; + u_int32_t actualStartBlock; + u_int32_t actualNumBlocks; + u_int32_t numExtentsPerRecord; + int64_t maximumBytes; + int64_t availbytes; + int64_t peof; + u_int32_t prevblocks; + uint32_t fastdev = 0; + + struct hfsmount *hfsmp = (struct hfsmount*)vcb; + allowFlushTxns = 0; + needsFlush = false; + *actualBytesAdded = 0; + volumeBlockSize = vcb->blockSize; + allOrNothing = ((flags & kEFAllMask) != 0); + forceContig = ((flags & kEFContigMask) != 0); + prevblocks = fcb->ff_blocks; + + if (vcb->vcbSigWord != kHFSSigWord) { + numExtentsPerRecord = kHFSPlusExtentDensity; + } +#if CONFIG_HFS_STD + else { + /* HFS Standard */ + numExtentsPerRecord = kHFSExtentDensity; + + /* Make sure the request and new PEOF are less than 2GB if HFS std*/ + if (bytesToAdd >= kTwoGigabytes) + goto HFS_Std_Overflow; + if ((((int64_t)fcb->ff_blocks * (int64_t)volumeBlockSize) + bytesToAdd) >= kTwoGigabytes) + goto HFS_Std_Overflow; + } +#endif + + // + // Determine how many blocks need to be allocated. + // Round up the number of desired bytes to add. + // + blocksToAdd = howmany(bytesToAdd, volumeBlockSize); + bytesToAdd = (int64_t)((int64_t)blocksToAdd * (int64_t)volumeBlockSize); + + /* + * For deferred allocations just reserve the blocks. + */ + if ((flags & kEFDeferMask) + && (vcb->vcbSigWord == kHFSPlusSigWord) + && (bytesToAdd < (int64_t)HFS_MAX_DEFERED_ALLOC) + && (blocksToAdd < hfs_freeblks(VCBTOHFS(vcb), 1))) { + hfs_lock_mount (hfsmp); + vcb->loanedBlocks += blocksToAdd; + hfs_unlock_mount(hfsmp); + + fcb->ff_unallocblocks += blocksToAdd; + FTOC(fcb)->c_blocks += blocksToAdd; + fcb->ff_blocks += blocksToAdd; + + /* + * We haven't touched the disk here; no blocks have been + * allocated and the volume will not be inconsistent if we + * don't update the catalog record immediately. + */ + FTOC(fcb)->c_flag |= C_MINOR_MOD; + *actualBytesAdded = bytesToAdd; + return (0); + } + /* + * Give back any unallocated blocks before doing real allocations. + */ + if (fcb->ff_unallocblocks > 0) { + u_int32_t loanedBlocks; + + loanedBlocks = fcb->ff_unallocblocks; + blocksToAdd += loanedBlocks; + bytesToAdd = (int64_t)blocksToAdd * (int64_t)volumeBlockSize; + FTOC(fcb)->c_blocks -= loanedBlocks; + fcb->ff_blocks -= loanedBlocks; + fcb->ff_unallocblocks = 0; + + hfs_lock_mount(hfsmp); + vcb->loanedBlocks -= loanedBlocks; + hfs_unlock_mount(hfsmp); + } + + // + // If the file's clump size is larger than the allocation block size, + // then set the maximum number of bytes to the requested number of bytes + // rounded up to a multiple of the clump size. 
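	// Worked example (illustrative numbers): with a 64 KiB clump size and a
	// 20000-byte request, maximumBytes = howmany(20000, 65536) * 65536 = 65536,
	// so the allocator may be offered up to one full clump below while
	// bytesToAdd still records the minimum the caller actually needs.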
+ // + if ((vcb->vcbClpSiz > (int32_t)volumeBlockSize) + && (bytesToAdd < (int64_t)HFS_MAX_DEFERED_ALLOC) + && (flags & kEFNoClumpMask) == 0) { + maximumBytes = (int64_t)howmany(bytesToAdd, vcb->vcbClpSiz); + maximumBytes *= vcb->vcbClpSiz; + } else { + maximumBytes = bytesToAdd; + } + +#if CONFIG_HFS_STD + // + // Compute new physical EOF, rounded up to a multiple of a block. + // + if ( (vcb->vcbSigWord == kHFSSigWord) && // Too big? + ((((int64_t)fcb->ff_blocks * (int64_t)volumeBlockSize) + bytesToAdd) >= kTwoGigabytes) ) { + if (allOrNothing) // Yes, must they have it all? + goto HFS_Std_Overflow; // Yes, can't have it + else { + --blocksToAdd; // No, give give 'em one block less + bytesToAdd -= volumeBlockSize; + } + } +#endif + + // + // If allocation is all-or-nothing, make sure there are + // enough free blocks on the volume (quick test). + // + if (allOrNothing && + (blocksToAdd > hfs_freeblks(VCBTOHFS(vcb), flags & kEFReserveMask))) { + err = dskFulErr; + goto ErrorExit; + } + + // + // See if there are already enough blocks allocated to the file. + // + peof = ((int64_t)fcb->ff_blocks * (int64_t)volumeBlockSize) + bytesToAdd; // potential new PEOF + err = SearchExtentFile(vcb, fcb, peof-1, &foundKey, foundData, &foundIndex, &hint, &nextBlock); + if (err == noErr) { + // Enough blocks are already allocated. Just update the FCB to reflect the new length. + fcb->ff_blocks = peof / volumeBlockSize; + FTOC(fcb)->c_blocks += (bytesToAdd / volumeBlockSize); + FTOC(fcb)->c_flag |= C_MODIFIED; + goto Exit; + } + if (err != fxRangeErr) // Any real error? + goto ErrorExit; // Yes, so exit immediately + + // + // Adjust the PEOF to the end of the last extent. + // + peof = (int64_t)((int64_t)nextBlock * (int64_t)volumeBlockSize); // currently allocated PEOF + bytesThisExtent = (int64_t)(nextBlock - fcb->ff_blocks) * (int64_t)volumeBlockSize; + if (bytesThisExtent != 0) { + fcb->ff_blocks = nextBlock; + FTOC(fcb)->c_blocks += (bytesThisExtent / volumeBlockSize); + FTOC(fcb)->c_flag |= C_MODIFIED; + bytesToAdd -= bytesThisExtent; + } + + // + // Allocate some more space. + // + // First try a contiguous allocation (of the whole amount). + // If that fails, get whatever we can. + // If forceContig, then take whatever we got + // else, keep getting bits and pieces (non-contig) + + /* + * Note that for sparse devices (like sparse bundle dmgs), we + * should only be aggressive with re-using once-allocated pieces + * if we're not dealing with system files. If we're trying to operate + * on behalf of a system file, we need the maximum contiguous amount + * possible. For non-system files we favor locality and fragmentation over + * contiguity as it can result in fewer blocks being needed from the underlying + * filesystem that the sparse image resides upon. + */ + err = noErr; + if ( (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) + && (fcb->ff_cp->c_fileid >= kHFSFirstUserCatalogNodeID) + && (flags & kEFMetadataMask) == 0) { + /* + * We want locality over contiguity so by default we set wantContig to + * false unless we hit one of the circumstances below. + */ + wantContig = false; + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + /* + * If the red-black tree is acive, we can always find a suitable contiguous + * chunk. So if the user specifically requests contiguous files, we should + * honor that no matter what kind of device it is. 
+ */ + if (forceContig) { + wantContig = true; + } + } + else { + /* + * If the red-black tree is not active, then only set wantContig to true + * if we have never done a contig scan on the device, which would populate + * the free extent cache. Note that the caller may explicitly unset the + * DID_CONTIG_SCAN bit in order to force us to vend a contiguous extent here + * if the caller wants to get a contiguous chunk. + */ + if ((vcb->hfs_flags & HFS_DID_CONTIG_SCAN) == 0) { + vcb->hfs_flags |= HFS_DID_CONTIG_SCAN; + wantContig = true; + } + } + } + else { + wantContig = true; + } + + if (should_pin_blocks(hfsmp, fcb)) + fastdev = HFS_ALLOC_FAST_DEV; + + useMetaZone = flags & kEFMetadataMask; + do { + if (blockHint != 0) + startBlock = blockHint; + else + startBlock = foundData[foundIndex].startBlock + foundData[foundIndex].blockCount; + + actualNumBlocks = 0; + actualStartBlock = 0; + + /* Find number of free blocks based on reserved block flag option */ + availbytes = (int64_t)hfs_freeblks(VCBTOHFS(vcb), flags & kEFReserveMask) * + (int64_t)volumeBlockSize; + if (availbytes <= 0) { + err = dskFulErr; + } else { + if (wantContig && (availbytes < bytesToAdd)) { + err = dskFulErr; + } + else { + uint32_t ba_flags = fastdev; + + if (wantContig) { + ba_flags |= HFS_ALLOC_FORCECONTIG; + } + if (useMetaZone) { + ba_flags |= HFS_ALLOC_METAZONE; + } + if (allowFlushTxns) { + ba_flags |= HFS_ALLOC_FLUSHTXN; + } + + err = BlockAllocate( + vcb, + startBlock, + howmany(MIN(bytesToAdd, availbytes), volumeBlockSize), + howmany(MIN(maximumBytes, availbytes), volumeBlockSize), + ba_flags, + &actualStartBlock, + &actualNumBlocks); + } + } + if (err == dskFulErr) { + if (forceContig) { + if (allowFlushTxns == 0) { + /* If we're forcing contiguity, re-try but allow plucking from recently freed regions */ + allowFlushTxns = 1; + wantContig = 1; + err = noErr; + continue; + } + else { + break; // AllocContig failed because not enough contiguous space + } + } + if (wantContig) { + // Couldn't get one big chunk, so get whatever we can. + err = noErr; + wantContig = false; + continue; + } + if (actualNumBlocks != 0) + err = noErr; + + if (useMetaZone == 0) { + /* Couldn't get anything so dip into metadat zone */ + err = noErr; + useMetaZone = 1; + continue; + } + + /* If we couldn't find what we needed without flushing the journal, then go ahead and do it now */ + if (allowFlushTxns == 0) { + allowFlushTxns = 1; + err = noErr; + continue; + } + + } + if (err == noErr) { + // Add the new extent to the existing extent record, or create a new one. + if ((actualStartBlock == startBlock) && (blockHint == 0)) { + // We grew the file's last extent, so just adjust the number of blocks. + foundData[foundIndex].blockCount += actualNumBlocks; + err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); + if (err != noErr) break; + } + else { + u_int16_t i; + + // Need to add a new extent. See if there is room in the current record. + if (foundData[foundIndex].blockCount != 0) // Is current extent free to use? + ++foundIndex; // No, so use the next one. + if (foundIndex == numExtentsPerRecord) { + // This record is full. Need to create a new one. + if (FTOC(fcb)->c_fileid == kHFSExtentsFileID) { + (void) BlockDeallocate(vcb, actualStartBlock, actualNumBlocks, 0); + err = dskFulErr; // Oops. Can't extend extents file past first record. 
+ break; + } + + foundKey.keyLength = kHFSPlusExtentKeyMaximumLength; + if (FORK_IS_RSRC(fcb)) + foundKey.forkType = kResourceForkType; + else + foundKey.forkType = kDataForkType; + foundKey.pad = 0; + foundKey.fileID = FTOC(fcb)->c_fileid; + foundKey.startBlock = nextBlock; + + foundData[0].startBlock = actualStartBlock; + foundData[0].blockCount = actualNumBlocks; + + // zero out remaining extents... + for (i = 1; i < kHFSPlusExtentDensity; ++i) + { + foundData[i].startBlock = 0; + foundData[i].blockCount = 0; + } + + foundIndex = 0; + + err = CreateExtentRecord(vcb, &foundKey, foundData, &hint); + if (err == fxOvFlErr) { + // We couldn't create an extent record because extents B-tree + // couldn't grow. Dellocate the extent just allocated and + // return a disk full error. + (void) BlockDeallocate(vcb, actualStartBlock, actualNumBlocks, 0); + err = dskFulErr; + } + if (err != noErr) break; + + needsFlush = true; // We need to update the B-tree header + } + else { + // Add a new extent into this record and update. + foundData[foundIndex].startBlock = actualStartBlock; + foundData[foundIndex].blockCount = actualNumBlocks; + err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); + if (err != noErr) break; + } + } + + // Figure out how many bytes were actually allocated. + // NOTE: BlockAllocate could have allocated more than we asked for. + // Don't set the PEOF beyond what our client asked for. + nextBlock += actualNumBlocks; + bytesThisExtent = (int64_t)((int64_t)actualNumBlocks * (int64_t)volumeBlockSize); + if (bytesThisExtent > bytesToAdd) { + bytesToAdd = 0; + } + else { + bytesToAdd -= bytesThisExtent; + maximumBytes -= bytesThisExtent; + } + fcb->ff_blocks += (bytesThisExtent / volumeBlockSize); + FTOC(fcb)->c_blocks += (bytesThisExtent / volumeBlockSize); + FTOC(fcb)->c_flag |= C_MODIFIED; + + // If contiguous allocation was requested, then we've already got one contiguous + // chunk. If we didn't get all we wanted, then adjust the error to disk full. + if (forceContig) { + if (bytesToAdd != 0) + err = dskFulErr; + break; // We've already got everything that's contiguous + } + } + } while (err == noErr && bytesToAdd); + +ErrorExit: +Exit: + if (VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) { + /* Keep the roving allocator out of the metadata zone. */ + if (vcb->nextAllocation >= VCBTOHFS(vcb)->hfs_metazone_start && + vcb->nextAllocation <= VCBTOHFS(vcb)->hfs_metazone_end) { + hfs_lock_mount (hfsmp); + HFS_UPDATE_NEXT_ALLOCATION(vcb, VCBTOHFS(vcb)->hfs_metazone_end + 1); + MarkVCBDirty(vcb); + hfs_unlock_mount(hfsmp); + } + } + if (prevblocks < fcb->ff_blocks) { + *actualBytesAdded = (int64_t)(fcb->ff_blocks - prevblocks) * (int64_t)volumeBlockSize; + } else { + *actualBytesAdded = 0; + } + + if (fastdev) { + hfs_hotfile_adjust_blocks(fcb->ff_cp->c_vp, + (int64_t)prevblocks - fcb->ff_blocks); + } + + if (needsFlush) + (void) FlushExtentFile(vcb); + + return err; + +#if CONFIG_HFS_STD +HFS_Std_Overflow: + err = fileBoundsErr; + goto ErrorExit; +#endif +} + + + +//_________________________________________________________________________________ +// +// Routine: TruncateFileC +// +// Function: Truncates the disk space allocated to a file. The file space is +// truncated to a specified new PEOF rounded up to the next allocation +// block boundry. If the 'TFTrunExt' option is specified, the file is +// truncated to the end of the extent containing the new PEOF. 
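/*
 * The new PEOF is first rounded up to whole allocation blocks:
 * nextBlock = howmany(peof, blockSize) is the number of blocks kept, and
 * file allocation blocks from nextBlock onward are released.  Worked
 * example (illustrative numbers): blockSize = 4096 and peof = 10000 give
 * nextBlock = 3 and a rounded PEOF of 12288 bytes; with truncateToExtent
 * set, the extent containing byte 9999 is kept whole rather than being
 * shortened to the rounded PEOF.
 */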
+// +//_________________________________________________________________________________ + +OSErr TruncateFileC ( + ExtendedVCB *vcb, // volume that file resides on + FCB *fcb, // FCB of file to truncate + int64_t peof, // new physical size for file + int deleted, // if nonzero, the file's catalog record has already been deleted. + int rsrc, // does this represent a resource fork or not? + uint32_t fileid, // the fileid of the file we're manipulating. + Boolean truncateToExtent) // if true, truncate to end of extent containing newPEOF + +{ + OSErr err; + u_int32_t nextBlock; // next file allocation block to consider + u_int32_t startBlock; // Physical (volume) allocation block number of start of a range + u_int32_t physNumBlocks; // Number of allocation blocks in file (according to PEOF) + u_int32_t numBlocks; + HFSPlusExtentKey key; // key for current extent record; key->keyLength == 0 if FCB's extent record + u_int32_t hint; // BTree hint corresponding to key + HFSPlusExtentRecord extentRecord; + u_int32_t extentIndex; + u_int32_t extentNextBlock; + u_int32_t numExtentsPerRecord; + int64_t temp64; + u_int8_t forkType; + Boolean extentChanged; // true if we actually changed an extent + Boolean recordDeleted; // true if an extent record got deleted + + recordDeleted = false; + + if (vcb->vcbSigWord == kHFSPlusSigWord) { + numExtentsPerRecord = kHFSPlusExtentDensity; + } + else { + numExtentsPerRecord = kHFSExtentDensity; + } + + if (rsrc) { + forkType = kResourceForkType; + } + else { + forkType = kDataForkType; + } + + temp64 = fcb->ff_blocks; + physNumBlocks = (u_int32_t)temp64; + + // + // Round newPEOF up to a multiple of the allocation block size. If new size is + // two gigabytes or more, then round down by one allocation block (??? really? + // shouldn't that be an error?). + // + nextBlock = howmany(peof, vcb->blockSize); // number of allocation blocks to remain in file + peof = (int64_t)((int64_t)nextBlock * (int64_t)vcb->blockSize); // number of bytes in those blocks + +#if CONFIG_HFS_STD + if ((vcb->vcbSigWord == kHFSSigWord) && (peof >= kTwoGigabytes)) { + #if DEBUG + DebugStr("HFS: Trying to truncate a file to 2GB or more"); + #endif + err = fileBoundsErr; + goto ErrorExit; + } +#endif + + // + // Update FCB's length + // + /* + * XXX Any errors could cause ff_blocks and c_blocks to get out of sync... + */ + numBlocks = peof / vcb->blockSize; + if (!deleted) { + FTOC(fcb)->c_blocks -= (fcb->ff_blocks - numBlocks); + } + fcb->ff_blocks = numBlocks; + + // this catalog entry is modified and *must* get forced + // to disk when hfs_update() is called + if (!deleted) { + /* + * If the file is already C_NOEXISTS, then the catalog record + * has been removed from disk already. We wouldn't need to force + * another update + */ + FTOC(fcb)->c_flag |= C_MODIFIED; + } + // + // If the new PEOF is 0, then truncateToExtent has no meaning (we should always deallocate + // all storage). + // + if (peof == 0) { + int i; + + // Deallocate all the extents for this fork + err = DeallocateFork(vcb, fileid, forkType, fcb->fcbExtents, &recordDeleted); + if (err != noErr) goto ErrorExit; // got some error, so return it + + // Update the catalog extent record (making sure it's zeroed out) + if (err == noErr) { + for (i=0; i < kHFSPlusExtentDensity; i++) { + fcb->fcbExtents[i].startBlock = 0; + fcb->fcbExtents[i].blockCount = 0; + } + } + goto Done; + } + + // + // Find the extent containing byte (peof-1). This is the last extent we'll keep. 
+ // (If truncateToExtent is true, we'll keep the whole extent; otherwise, we'll only + // keep up through peof). The search will tell us how many allocation blocks exist + // in the found extent plus all previous extents. + // + err = SearchExtentFile(vcb, fcb, peof-1, &key, extentRecord, &extentIndex, &hint, &extentNextBlock); + if (err != noErr) goto ErrorExit; + + extentChanged = false; // haven't changed the extent yet + + if (!truncateToExtent) { + // + // Shorten this extent. It may be the case that the entire extent gets + // freed here. + // + numBlocks = extentNextBlock - nextBlock; // How many blocks in this extent to free up + if (numBlocks != 0) { + // Compute first volume allocation block to free + startBlock = extentRecord[extentIndex].startBlock + extentRecord[extentIndex].blockCount - numBlocks; + // Free the blocks in bitmap + err = BlockDeallocate(vcb, startBlock, numBlocks, 0); + if (err != noErr) goto ErrorExit; + // Adjust length of this extent + extentRecord[extentIndex].blockCount -= numBlocks; + // If extent is empty, set start block to 0 + if (extentRecord[extentIndex].blockCount == 0) + extentRecord[extentIndex].startBlock = 0; + // Remember that we changed the extent record + extentChanged = true; + } + } + + // + // Now move to the next extent in the record, and set up the file allocation block number + // + nextBlock = extentNextBlock; // Next file allocation block to free + ++extentIndex; // Its index within the extent record + + // + // Release all following extents in this extent record. Update the record. + // + while (extentIndex < numExtentsPerRecord && extentRecord[extentIndex].blockCount != 0) { + numBlocks = extentRecord[extentIndex].blockCount; + // Deallocate this extent + err = BlockDeallocate(vcb, extentRecord[extentIndex].startBlock, numBlocks, 0); + if (err != noErr) goto ErrorExit; + // Update next file allocation block number + nextBlock += numBlocks; + // Zero out start and length of this extent to delete it from record + extentRecord[extentIndex].startBlock = 0; + extentRecord[extentIndex].blockCount = 0; + // Remember that we changed an extent + extentChanged = true; + // Move to next extent in record + ++extentIndex; + } + + // + // If any of the extents in the current record were changed, then update that + // record (in the FCB, or extents file). + // + if (extentChanged) { + err = UpdateExtentRecord(vcb, fcb, deleted, &key, extentRecord, hint); + if (err != noErr) goto ErrorExit; + } + + // + // If there are any following allocation blocks, then we need + // to seach for their extent records and delete those allocation + // blocks. + // + if (nextBlock < physNumBlocks) + err = TruncateExtents(vcb, forkType, fileid, nextBlock, &recordDeleted); + +Done: +ErrorExit: + if (recordDeleted) + (void) FlushExtentFile(vcb); + + return err; +} + + +/* + * HFS Plus only + * + */ +OSErr HeadTruncateFile ( + ExtendedVCB *vcb, + FCB *fcb, + u_int32_t headblks) +{ + HFSPlusExtentRecord extents; + HFSPlusExtentRecord tailExtents; + HFSCatalogNodeID fileID; + u_int8_t forkType; + u_int32_t blkcnt = 0; + u_int32_t startblk; + u_int32_t blksfreed; + int i, j; + int error = 0; + int lockflags; + + + if (vcb->vcbSigWord != kHFSPlusSigWord) + return (-1); + + forkType = FORK_IS_RSRC(fcb) ? 
kResourceForkType : kDataForkType; + fileID = FTOC(fcb)->c_fileid; + bzero(tailExtents, sizeof(tailExtents)); + + blksfreed = 0; + startblk = 0; + + /* + * Process catalog resident extents + */ + for (i = 0, j = 0; i < kHFSPlusExtentDensity; ++i) { + blkcnt = fcb->fcbExtents[i].blockCount; + if (blkcnt == 0) + break; /* end of extents */ + + if (blksfreed < headblks) { + error = BlockDeallocate(vcb, fcb->fcbExtents[i].startBlock, blkcnt, 0); + /* + * Any errors after the first BlockDeallocate + * must be ignored so we can put the file in + * a known state. + */ + if (error ) { + if (i == 0) + goto ErrorExit; /* uh oh */ + else { + error = 0; + printf("hfs: HeadTruncateFile: problems deallocating %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? (const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); + } + } + + blksfreed += blkcnt; + fcb->fcbExtents[i].startBlock = 0; + fcb->fcbExtents[i].blockCount = 0; + } else { + tailExtents[j].startBlock = fcb->fcbExtents[i].startBlock; + tailExtents[j].blockCount = blkcnt; + ++j; + } + startblk += blkcnt; + } + + if (blkcnt == 0) + goto CopyExtents; + + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + /* + * Process overflow extents + */ + for (;;) { + u_int32_t extblks; + + error = FindExtentRecord(vcb, forkType, fileID, startblk, false, NULL, extents, NULL); + if (error) { + /* + * Any errors after the first BlockDeallocate + * must be ignored so we can put the file in + * a known state. + */ + if (error != btNotFound) + printf("hfs: HeadTruncateFile: problems finding extents %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? (const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); + error = 0; + break; + } + + for(i = 0, extblks = 0; i < kHFSPlusExtentDensity; ++i) { + blkcnt = extents[i].blockCount; + if (blkcnt == 0) + break; /* end of extents */ + + if (blksfreed < headblks) { + error = BlockDeallocate(vcb, extents[i].startBlock, blkcnt, 0); + if (error) { + printf("hfs: HeadTruncateFile: problems deallocating %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? (const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); + error = 0; + } + blksfreed += blkcnt; + } else { + tailExtents[j].startBlock = extents[i].startBlock; + tailExtents[j].blockCount = blkcnt; + ++j; + } + extblks += blkcnt; + } + + error = DeleteExtentRecord(vcb, forkType, fileID, startblk); + if (error) { + printf("hfs: HeadTruncateFile: problems deallocating %s (%d)\n", + FTOC(fcb)->c_desc.cd_nameptr ? (const char *)FTOC(fcb)->c_desc.cd_nameptr : "", error); + error = 0; + } + + if (blkcnt == 0) + break; /* all done */ + + startblk += extblks; + } + hfs_systemfile_unlock(vcb, lockflags); + +CopyExtents: + if (blksfreed) { + bcopy(tailExtents, fcb->fcbExtents, sizeof(tailExtents)); + blkcnt = fcb->ff_blocks - headblks; + FTOC(fcb)->c_blocks -= headblks; + fcb->ff_blocks = blkcnt; + + FTOC(fcb)->c_flag |= C_MODIFIED; + FTOC(fcb)->c_touch_chgtime = TRUE; + + (void) FlushExtentFile(vcb); + } + +ErrorExit: + return MacToVFSError(error); +} + + + +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: SearchExtentRecord (was XRSearch) +// +// Function: Searches extent record for the extent mapping a given file +// allocation block number (FABN). 
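+
+/*
+ * Sketch of the compaction HeadTruncateFile performs above: extents covering
+ * the leading headblks blocks are freed and the surviving tail extents are
+ * slid to the front of the descriptor array.  The struct, the helper and the
+ * free_run callback are simplified stand-ins for the kernel types and for
+ * BlockDeallocate, and overflow extent records are ignored here.
+ */
+#include <stdint.h>
+
+struct hdesc { uint32_t startBlock; uint32_t blockCount; };
+
+static void head_truncate(struct hdesc ext[], int density, uint32_t headblks,
+			  void (*free_run)(uint32_t start, uint32_t count))
+{
+	uint32_t freed = 0;
+	int i, j = 0;
+
+	for (i = 0; i < density && ext[i].blockCount != 0; ++i) {
+		if (freed < headblks) {			/* still inside the head being removed */
+			free_run(ext[i].startBlock, ext[i].blockCount);
+			freed += ext[i].blockCount;
+		} else {				/* tail extent: keep it, compacted forward */
+			ext[j++] = ext[i];
+		}
+	}
+	for (; j < density; ++j) {			/* zero the now-unused slots */
+		ext[j].startBlock = 0;
+		ext[j].blockCount = 0;
+	}
+}
+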
+// +// Input: searchFABN - desired FABN +// extentData - pointer to extent data record (xdr) +// extentDataStartFABN - beginning FABN for extent record +// +// Output: foundExtentDataOffset - offset to extent entry within xdr +// result = noErr, offset to extent mapping desired FABN +// result = FXRangeErr, offset to last extent in record +// endingFABNPlusOne - ending FABN +1 +// noMoreExtents - True if the extent was not found, and the +// extent record was not full (so don't bother +// looking in subsequent records); false otherwise. +// +// Result: noErr = ok +// FXRangeErr = desired FABN > last mapped FABN in record +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +static OSErr SearchExtentRecord( + ExtendedVCB *vcb, + u_int32_t searchFABN, + const HFSPlusExtentRecord extentData, + u_int32_t extentDataStartFABN, + u_int32_t *foundExtentIndex, + u_int32_t *endingFABNPlusOne, + Boolean *noMoreExtents) +{ + OSErr err = noErr; + u_int32_t extentIndex; + /* Set it to the HFS std value */ + u_int32_t numberOfExtents = kHFSExtentDensity; + u_int32_t numAllocationBlocks; + Boolean foundExtent; + + *endingFABNPlusOne = extentDataStartFABN; + *noMoreExtents = false; + foundExtent = false; + + /* Override numberOfExtents for HFS+/HFSX */ + if (vcb->vcbSigWord != kHFSSigWord) { + numberOfExtents = kHFSPlusExtentDensity; + } + + for( extentIndex = 0; extentIndex < numberOfExtents; ++extentIndex ) + { + + // Loop over the extent record and find the search FABN. + + numAllocationBlocks = extentData[extentIndex].blockCount; + if ( numAllocationBlocks == 0 ) + { + break; + } + + *endingFABNPlusOne += numAllocationBlocks; + + if( searchFABN < *endingFABNPlusOne ) + { + // Found the extent. + foundExtent = true; + break; + } + } + + if( foundExtent ) + { + // Found the extent. Note the extent offset + *foundExtentIndex = extentIndex; + } + else + { + // Did not find the extent. Set foundExtentDataOffset accordingly + if( extentIndex > 0 ) + { + *foundExtentIndex = extentIndex - 1; + } + else + { + *foundExtentIndex = 0; + } + + // If we found an empty extent, then set noMoreExtents. + if (extentIndex < numberOfExtents) + *noMoreExtents = true; + + // Finally, return an error to the caller + err = fxRangeErr; + } + + return( err ); +} + +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ +// Routine: SearchExtentFile (was XFSearch) +// +// Function: Searches extent file (including the FCB resident extent record) +// for the extent mapping a given file position. +// +// Input: vcb - VCB pointer +// fcb - FCB pointer +// filePosition - file position (byte address) +// +// Output: foundExtentKey - extent key record (xkr) +// If extent was found in the FCB's resident extent record, +// then foundExtentKey->keyLength will be set to 0. 
+// foundExtentData - extent data record(xdr) +// foundExtentIndex - index to extent entry in xdr +// result = 0, offset to extent mapping desired FABN +// result = FXRangeErr, offset to last extent in record +// (i.e., kNumExtentsPerRecord-1) +// extentBTreeHint - BTree hint for extent record +// kNoHint = Resident extent record +// endingFABNPlusOne - ending FABN +1 +// +// Result: +// noErr Found an extent that contains the given file position +// FXRangeErr Given position is beyond the last allocated extent +// (other) (some other internal I/O error) +//‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹‹ + +OSErr SearchExtentFile( + ExtendedVCB *vcb, + const FCB *fcb, + int64_t filePosition, + HFSPlusExtentKey *foundExtentKey, + HFSPlusExtentRecord foundExtentData, + u_int32_t *foundExtentIndex, + u_int32_t *extentBTreeHint, + u_int32_t *endingFABNPlusOne ) +{ + OSErr err; + u_int32_t filePositionBlock; + int64_t temp64; + Boolean noMoreExtents; + int lockflags; + + temp64 = filePosition / (int64_t)vcb->blockSize; + filePositionBlock = (u_int32_t)temp64; + + bcopy ( fcb->fcbExtents, foundExtentData, sizeof(HFSPlusExtentRecord)); + + // Search the resident FCB first. + err = SearchExtentRecord( vcb, filePositionBlock, foundExtentData, 0, + foundExtentIndex, endingFABNPlusOne, &noMoreExtents ); + + if( err == noErr ) { + // Found the extent. Set results accordingly + *extentBTreeHint = kNoHint; // no hint, because not in the BTree + foundExtentKey->keyLength = 0; // 0 = the FCB itself + + goto Exit; + } + + // Didn't find extent in FCB. If FCB's extent record wasn't full, there's no point + // in searching the extents file. Note that SearchExtentRecord left us pointing at + // the last valid extent (or the first one, if none were valid). This means we need + // to fill in the hint and key outputs, just like the "if" statement above. + if ( noMoreExtents ) { + *extentBTreeHint = kNoHint; // no hint, because not in the BTree + foundExtentKey->keyLength = 0; // 0 = the FCB itself + err = fxRangeErr; // There are no more extents, so must be beyond PEOF + goto Exit; + } + + // + // Find the desired record, or the previous record if it is the same fork + // + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + err = FindExtentRecord(vcb, FORK_IS_RSRC(fcb) ? kResourceForkType : kDataForkType, + FTOC(fcb)->c_fileid, filePositionBlock, true, foundExtentKey, foundExtentData, extentBTreeHint); + hfs_systemfile_unlock(vcb, lockflags); + + if (err == btNotFound) { + // + // If we get here, the desired position is beyond the extents in the FCB, and there are no extents + // in the extents file. Return the FCB's extents and a range error. + // + *extentBTreeHint = kNoHint; + foundExtentKey->keyLength = 0; + err = GetFCBExtentRecord(fcb, foundExtentData); + // Note: foundExtentIndex and endingFABNPlusOne have already been set as a result of the very + // first SearchExtentRecord call in this function (when searching in the FCB's extents, and + // we got a range error). + + return fxRangeErr; + } + + // + // If we get here, there was either a BTree error, or we found an appropriate record. + // If we found a record, then search it for the correct index into the extents. 
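+
+/*
+ * Minimal sketch of the FABN walk that SearchExtentRecord performs for
+ * SearchExtentFile above: keep a running total of blockCounts until the
+ * wanted file allocation block falls inside an extent.  The struct and the
+ * helper name are simplified stand-ins for the HFS+ types.
+ */
+#include <stdint.h>
+
+struct sdesc { uint32_t startBlock; uint32_t blockCount; };
+
+/* Returns the extent index, or -1 if searchFABN lies beyond this record. */
+static int find_fabn(const struct sdesc rec[], int density,
+		     uint32_t recStartFABN, uint32_t searchFABN,
+		     uint32_t *endingFABNPlusOne)
+{
+	uint32_t end = recStartFABN;
+	int i;
+
+	for (i = 0; i < density && rec[i].blockCount != 0; ++i) {
+		end += rec[i].blockCount;
+		if (searchFABN < end) {		/* the wanted block lives in extent i */
+			*endingFABNPlusOne = end;
+			return i;
+		}
+	}
+	*endingFABNPlusOne = end;		/* caller must look in the next record */
+	return -1;
+}
+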
+ // + if (err == noErr) { + // Find appropriate index into extent record + err = SearchExtentRecord(vcb, filePositionBlock, foundExtentData, foundExtentKey->startBlock, + foundExtentIndex, endingFABNPlusOne, &noMoreExtents); + } + +Exit: + return err; +} + + + +//============================================================================ +// Routine: UpdateExtentRecord +// +// Function: Write new extent data to an existing extent record with a given key. +// If all of the extents are empty, and the extent record is in the +// extents file, then the record is deleted. +// +// Input: vcb - the volume containing the extents +// fcb - the file that owns the extents +// deleted - whether or not the file is already deleted +// extentFileKey - pointer to extent key record (xkr) +// If the key length is 0, then the extents are actually part +// of the catalog record, stored in the FCB. +// extentData - pointer to extent data record (xdr) +// extentBTreeHint - hint for given key, or kNoHint +// +// Result: noErr = ok +// (other) = error from BTree +//============================================================================ + +static OSErr UpdateExtentRecord (ExtendedVCB *vcb, FCB *fcb, int deleted, + const HFSPlusExtentKey *extentFileKey, + const HFSPlusExtentRecord extentData, + u_int32_t extentBTreeHint) +{ + OSErr err = noErr; + + if (extentFileKey->keyLength == 0) { // keyLength == 0 means the FCB's extent record + BlockMoveData(extentData, fcb->fcbExtents, sizeof(HFSPlusExtentRecord)); + if (!deleted) { + FTOC(fcb)->c_flag |= C_MODIFIED; + } + } + else { + struct BTreeIterator *btIterator = NULL; + FSBufferDescriptor btRecord; + u_int16_t btRecordSize; + FCB * btFCB; + int lockflags; + + // + // Need to find and change a record in Extents BTree + // + btFCB = GetFileControlBlock(vcb->extentsRefNum); + + btIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + /* + * The lock taken by callers of ExtendFileC/TruncateFileC is + * speculative and only occurs when the file already has + * overflow extents. So we need to make sure we have the lock + * here. The extents btree lock can be nested (its recursive) + * so we always take it here. 
+ */ + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + /* HFS+/HFSX */ + if (vcb->vcbSigWord != kHFSSigWord) { // HFS Plus volume + HFSPlusExtentRecord foundData; // The extent data actually found + + BlockMoveData(extentFileKey, &btIterator->key, sizeof(HFSPlusExtentKey)); + + btIterator->hint.index = 0; + btIterator->hint.nodeNum = extentBTreeHint; + + btRecord.bufferAddress = &foundData; + btRecord.itemSize = sizeof(HFSPlusExtentRecord); + btRecord.itemCount = 1; + + err = BTSearchRecord(btFCB, btIterator, &btRecord, &btRecordSize, btIterator); + + if (err == noErr) { + BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord)); + err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); + } + (void) BTFlushPath(btFCB); + } +#if CONFIG_HFS_STD + else { + /* HFS Standard */ + HFSExtentKey * key; // Actual extent key used on disk in HFS + HFSExtentRecord foundData; // The extent data actually found + + key = (HFSExtentKey*) &btIterator->key; + key->keyLength = kHFSExtentKeyMaximumLength; + key->forkType = extentFileKey->forkType; + key->fileID = extentFileKey->fileID; + key->startBlock = extentFileKey->startBlock; + + btIterator->hint.index = 0; + btIterator->hint.nodeNum = extentBTreeHint; + + btRecord.bufferAddress = &foundData; + btRecord.itemSize = sizeof(HFSExtentRecord); + btRecord.itemCount = 1; + + err = BTSearchRecord(btFCB, btIterator, &btRecord, &btRecordSize, btIterator); + + if (err == noErr) + err = HFSPlusToHFSExtents(extentData, (HFSExtentDescriptor *)&foundData); + + if (err == noErr) + err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); + (void) BTFlushPath(btFCB); + + } +#endif + + hfs_systemfile_unlock(vcb, lockflags); + + hfs_free(btIterator, sizeof(*btIterator)); + } + + return err; +} + + + +#if CONFIG_HFS_STD +static OSErr HFSPlusToHFSExtents( + const HFSPlusExtentRecord oldExtents, + HFSExtentRecord newExtents) +{ + OSErr err; + + err = noErr; + + // copy the first 3 extents + newExtents[0].startBlock = oldExtents[0].startBlock; + newExtents[0].blockCount = oldExtents[0].blockCount; + newExtents[1].startBlock = oldExtents[1].startBlock; + newExtents[1].blockCount = oldExtents[1].blockCount; + newExtents[2].startBlock = oldExtents[2].startBlock; + newExtents[2].blockCount = oldExtents[2].blockCount; + + #if DEBUG + if (oldExtents[3].startBlock || oldExtents[3].blockCount) { + DebugStr("ExtentRecord with > 3 extents is invalid for HFS"); + err = fsDSIntErr; + } + #endif + + return err; +} +#endif + + + +static OSErr GetFCBExtentRecord( + const FCB *fcb, + HFSPlusExtentRecord extents) +{ + + BlockMoveData(fcb->fcbExtents, extents, sizeof(HFSPlusExtentRecord)); + + return noErr; +} + + +//_________________________________________________________________________________ +// +// Routine: ExtentsAreIntegral +// +// Purpose: Ensure that each extent can hold an integral number of nodes +// Called by the NodesAreContiguous function +//_________________________________________________________________________________ + +static Boolean ExtentsAreIntegral( + const HFSPlusExtentRecord extentRecord, + u_int32_t mask, + u_int32_t *blocksChecked, + Boolean *checkedLastExtent) +{ + u_int32_t blocks; + u_int32_t extentIndex; + + *blocksChecked = 0; + *checkedLastExtent = false; + + for(extentIndex = 0; extentIndex < kHFSPlusExtentDensity; extentIndex++) + { + blocks = extentRecord[extentIndex].blockCount; + + if ( blocks == 0 ) + { + *checkedLastExtent = true; + break; + } + + *blocksChecked += blocks; + + if (blocks & mask) 
+ return false; + } + + return true; +} + + +//_________________________________________________________________________________ +// +// Routine: NodesAreContiguous +// +// Purpose: Ensure that all b-tree nodes are contiguous on disk +// Called by BTOpenPath during volume mount +//_________________________________________________________________________________ + +Boolean NodesAreContiguous( + ExtendedVCB *vcb, + FCB *fcb, + u_int32_t nodeSize) +{ + u_int32_t mask; + u_int32_t startBlock; + u_int32_t blocksChecked; + u_int32_t hint; + HFSPlusExtentKey key; + HFSPlusExtentRecord extents; + OSErr result; + Boolean lastExtentReached; + int lockflags; + + + if (vcb->blockSize >= nodeSize) + return TRUE; + + mask = (nodeSize / vcb->blockSize) - 1; + + // check the local extents + (void) GetFCBExtentRecord(fcb, extents); + if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) + return FALSE; + + if ( lastExtentReached || + (int64_t)((int64_t)blocksChecked * (int64_t)vcb->blockSize) >= (int64_t)fcb->ff_size) + return TRUE; + + startBlock = blocksChecked; + + lockflags = hfs_systemfile_lock(vcb, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + // check the overflow extents (if any) + while ( !lastExtentReached ) + { + result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, startBlock, FALSE, &key, extents, &hint); + if (result) break; + + if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) { + hfs_systemfile_unlock(vcb, lockflags); + return FALSE; + } + startBlock += blocksChecked; + } + hfs_systemfile_unlock(vcb, lockflags); + return TRUE; +} + diff --git a/core/FileIDsServices.c b/core/FileIDsServices.c new file mode 100644 index 0000000..aba8940 --- /dev/null +++ b/core/FileIDsServices.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "hfs_macos_defs.h" +#include "hfs_format.h" + +#include "FileMgrInternal.h" +#include "HFSUnicodeWrappers.h" +#include "CatalogPrivate.h" +#include +#include +#include + +#include "hfs_dbg.h" + +struct ExtentsRecBuffer { + ExtentKey extentKey; + ExtentRecord extentData; +}; +typedef struct ExtentsRecBuffer ExtentsRecBuffer; + + +static u_int32_t CheckExtents( void *extents, u_int32_t blocks, Boolean isHFSPlus ); +static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileNumber, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); +static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); + +#if CONFIG_HFS_STD +static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); +#endif + +static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); +static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, u_int16_t bufferCount ); + +/* + * This function moves the overflow extents associated with srcID into the file associated with dstID. + * We should have already verified that 'srcID' has overflow extents. So now we move all of the overflow + * extent records. + */ +OSErr MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc) { + + OSErr err; + + /* + * Only the source file should have extents, so we just track those. + * We operate on the fork represented by the open FD that was used to call into this + * function + */ + if (rsrc) { + /* Copy the extent overflow blocks. */ + err = MoveExtents( vcb, srcID, destID, 1, (u_int8_t)0xff, 1); + if ( err != noErr ) { + if ( err != dskFulErr ) { + return( err ); + } + /* + * In case of error, we would have probably run into problems + * growing the extents b-tree. Since the move is actually a copy + delete + * just delete the new entries. Same for below. + */ + err = DeleteExtents( vcb, destID, 1, (u_int8_t)0xff, 1); + ReturnIfError( err ); // we are doomed. Just QUIT! + goto FlushAndReturn; + } + } + else { + /* Copy the extent overflow blocks. */ + err = MoveExtents( vcb, srcID, destID, 1, 0, 1); + if ( err != noErr ) { + if ( err != dskFulErr ) { + return( err ); + } + err = DeleteExtents( vcb, destID, 1, 0, 1); + ReturnIfError( err ); // we are doomed. Just QUIT! 
+ goto FlushAndReturn; + } + } + +FlushAndReturn: + /* Write out the catalog and extent overflow B-Tree changes */ + err = FlushCatalog( vcb ); + err = FlushExtentFile( vcb ); + + return( err ); +} + + +OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param destName, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, u_int32_t srcHint, u_int32_t destHint ) +{ + CatalogKey srcKey; // 518 bytes + CatalogKey destKey; // 518 bytes + CatalogRecord srcData; // 520 bytes + CatalogRecord destData; // 520 bytes + CatalogRecord swapData; // 520 bytes + int16_t numSrcExtentBlocks; + int16_t numDestExtentBlocks; + OSErr err; + Boolean isHFSPlus = ( vcb->vcbSigWord == kHFSPlusSigWord ); + + err = BuildCatalogKeyUTF8(vcb, srcID, srcName, kUndefinedStrLen, &srcKey); + ReturnIfError(err); + + err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey); + ReturnIfError(err); + + if ( isHFSPlus ) + { + //-- Step 1: Check the catalog nodes for extents + + //-- locate the source file, test for extents in extent file, and copy the cat record for later + err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); + ReturnIfError( err ); + + if ( srcData.recordType != kHFSPlusFileRecord ) + return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" + + //-- Check if there are any extents in the source file + //€€ I am only checling the extents in the low 32 bits, routine will fail if files extents after 2 gig are in overflow + numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.dataFork.extents, srcData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); + if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents + numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.resourceFork.extents, srcData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); + + //-- Check if there are any extents in the destination file + err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); + ReturnIfError( err ); + + if ( destData.recordType != kHFSPlusFileRecord ) + return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" + + numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.dataFork.extents, destData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); + if ( numDestExtentBlocks == 0 ) // then check the resource fork extents + numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.resourceFork.extents, destData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); + + //-- Step 2: Exchange the Extent key in the extent file + + //-- Exchange the extents key in the extent file + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); + + if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents + { + //-- Change the source extents file ids to our known bogus value + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, 0,0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) { + return( err ); + } + else { + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! 
+ + err = FlushCatalog( vcb ); // flush the catalog + err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) + return( dskFulErr ); + } + } + + //-- Change the destination extents file id's to the source id's + err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + +ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! + + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back + ReturnIfError( err ); // we are doomed. Just QUIT! + + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! + + err = FlushCatalog( vcb ); // flush the catalog + err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) + return( dskFulErr ); + + } + + //-- Change the bogus extents file id's to the dest id's + err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! + + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back + ReturnIfError( err ); // we are doomed. Just QUIT! + + goto ExUndo2aPlus; + } + + } + else if ( numSrcExtentBlocks ) // just the source file has extents + { + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + + err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! + + goto FlushAndReturn; + } + } + else if ( numDestExtentBlocks ) // just the destination file has extents + { + err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! 
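+
+/*
+ * Sketch of the three-step swap used above.  Extent keys embed the owning
+ * fileID, so the two files' overflow records cannot simply be renamed into
+ * each other in place; a temporary "bogus" ID breaks the cycle.  This model
+ * relabels owner tags in a plain array instead of B-tree records, and the
+ * BOGUS_ID value is a stand-in, not the real kHFSBogusExtentFileID.
+ */
+#include <stdint.h>
+
+#define BOGUS_ID	0xFFFFFFFDu
+
+static void relabel(uint32_t owner[], int n, uint32_t from, uint32_t to)
+{
+	int i;
+	for (i = 0; i < n; ++i)
+		if (owner[i] == from)
+			owner[i] = to;
+}
+
+static void swap_extent_owners(uint32_t owner[], int n,
+			       uint32_t srcID, uint32_t destID)
+{
+	relabel(owner, n, srcID,  BOGUS_ID);	/* step 1: park the source records */
+	relabel(owner, n, destID, srcID);	/* step 2: dest records now belong to src */
+	relabel(owner, n, BOGUS_ID, destID);	/* step 3: parked records belong to dest */
+}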
+ + goto FlushAndReturn; + } + } + + //-- Step 3: Change the data in the catalog nodes + + //-- find the source cnode and put dest info in it + err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); + if ( err != noErr ) + return( cmBadNews ); + + BlockMoveData( &srcData, &swapData, sizeof(CatalogRecord) ); + CopyBigCatalogNodeInfo( &destData, &srcData ); + + err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSPlusCatalogFile), &srcHint ); + ReturnIfError( err ); + + // find the destination cnode and put source info in it + err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); + if ( err != noErr ) + return( cmBadNews ); + + CopyBigCatalogNodeInfo( &swapData, &destData ); + err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSPlusCatalogFile), &destHint ); + ReturnIfError( err ); + } +#if CONFIG_HFS_STD + else // HFS // + { + //-- Step 1: Check the catalog nodes for extents + + //-- locate the source file, test for extents in extent file, and copy the cat record for later + err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); + ReturnIfError( err ); + + if ( srcData.recordType != kHFSFileRecord ) + return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" + + //-- Check if there are any extents in the source file + numSrcExtentBlocks = CheckExtents( srcData.hfsFile.dataExtents, srcData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); + if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents + numSrcExtentBlocks = CheckExtents( srcData.hfsFile.rsrcExtents, srcData.hfsFile.rsrcPhysicalSize / vcb->blockSize, isHFSPlus ); + + + //€€ Do we save the found source node for later use? + + + //-- Check if there are any extents in the destination file + err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); + ReturnIfError( err ); + + if ( destData.recordType != kHFSFileRecord ) + return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" + + numDestExtentBlocks = CheckExtents( destData.hfsFile.dataExtents, destData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); + if ( numDestExtentBlocks == 0 ) // then check the resource fork extents + numDestExtentBlocks = CheckExtents( destData.hfsFile.rsrcExtents, destData.hfsFile.rsrcPhysicalSize / vcb->blockSize, isHFSPlus ); + + //€€ Do we save the found destination node for later use? + + + //-- Step 2: Exchange the Extent key in the extent file + + //-- Exchange the extents key in the extent file + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); + + if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents + { + //-- Change the source extents file ids to our known bogus value + err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + +ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! 
+ + err = FlushCatalog( vcb ); // flush the catalog + err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) + return( dskFulErr ); + } + + //-- Change the destination extents file id's to the source id's + err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + +ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! + + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back + ReturnIfError( err ); // we are doomed. Just QUIT! + + goto ExUndo1a; + } + + //-- Change the bogus extents file id's to the dest id's + err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! + + err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back + ReturnIfError( err ); // we are doomed. Just QUIT! + + goto ExUndo2a; + } + + } + else if ( numSrcExtentBlocks ) // just the source file has extents + { + err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + + err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! + + goto FlushAndReturn; + } + } + else if ( numDestExtentBlocks ) // just the destination file has extents + { + err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); + if ( err != noErr ) + { + if ( err != dskFulErr ) + return( err ); + + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); + ReturnIfError( err ); // we are doomed. Just QUIT! 
+ + goto FlushAndReturn; + } + } + + //-- Step 3: Change the data in the catalog nodes + + //-- find the source cnode and put dest info in it + err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); + if ( err != noErr ) + return( cmBadNews ); + + BlockMoveData( &srcData, &swapData, sizeof(CatalogRecord) ); + //€€ Asm source copies from the saved dest catalog node + CopyCatalogNodeInfo( &destData, &srcData ); + + err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSCatalogFile), &srcHint ); + ReturnIfError( err ); + + + // find the destination cnode and put source info in it + err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); + if ( err != noErr ) + return( cmBadNews ); + + CopyCatalogNodeInfo( &swapData, &destData ); + err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSCatalogFile), &destHint ); + ReturnIfError( err ); + } +#endif + + err = noErr; + + //-- Step 4: Error Handling section + + +FlushAndReturn: + err = FlushCatalog( vcb ); // flush the catalog + err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) + return( err ); +} + + +#if CONFIG_HFS_STD +static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) +{ + dest->hfsFile.dataLogicalSize = src->hfsFile.dataLogicalSize; + dest->hfsFile.dataPhysicalSize = src->hfsFile.dataPhysicalSize; + dest->hfsFile.rsrcLogicalSize = src->hfsFile.rsrcLogicalSize; + dest->hfsFile.rsrcPhysicalSize = src->hfsFile.rsrcPhysicalSize; + dest->hfsFile.modifyDate = src->hfsFile.modifyDate; + BlockMoveData( src->hfsFile.dataExtents, dest->hfsFile.dataExtents, sizeof(HFSExtentRecord) ); + BlockMoveData( src->hfsFile.rsrcExtents, dest->hfsFile.rsrcExtents, sizeof(HFSExtentRecord) ); +} +#endif + +static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) +{ + BlockMoveData( &src->hfsPlusFile.dataFork, &dest->hfsPlusFile.dataFork, sizeof(HFSPlusForkData) ); + BlockMoveData( &src->hfsPlusFile.resourceFork, &dest->hfsPlusFile.resourceFork, sizeof(HFSPlusForkData) ); + dest->hfsPlusFile.contentModDate = src->hfsPlusFile.contentModDate; +} + + +static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) +{ + FCB * fcb; + ExtentsRecBuffer extentsBuffer[kNumExtentsToCache]; + ExtentKey * extentKeyPtr; + ExtentRecord extentData; + struct BTreeIterator *btIterator = NULL; + struct BTreeIterator *tmpIterator = NULL; + FSBufferDescriptor btRecord; + u_int16_t btKeySize; + u_int16_t btRecordSize; + int16_t i, j; + OSErr err; + + btIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + tmpIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + fcb = GetFileControlBlock(vcb->extentsRefNum); + + (void) BTInvalidateHint(btIterator); + extentKeyPtr = (ExtentKey*) &btIterator->key; + btRecord.bufferAddress = &extentData; + btRecord.itemCount = 1; + + //-- Collect the extent records + + // + // A search on the following key will cause the BTree to be positioned immediately + // before the first extent record for file #srcFileID, but not actually positioned + // on any record. This is because there cannot be an extent record with FABN = 0 + // (the first extent of the fork, which would be in the catalog entry, not an extent + // record). + // + // Using BTIterateRecord with kBTreeNextRecord will then get that first extent record. 
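+
+/*
+ * Model of the positioning trick described above: searching for
+ * (fileID, startBlock == 0) can never match a real overflow record, so the
+ * iterator lands just before the file's first record and kBTreeNextRecord
+ * then walks them in order.  A sorted array stands in for the B-tree, and the
+ * key only carries fileID/startBlock; forkType is omitted for brevity.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+struct okey { uint32_t fileID; uint32_t startBlock; };
+
+static size_t position_before(const struct okey keys[], size_t n, uint32_t fileID)
+{
+	size_t i = 0;
+	/* first index whose key is >= (fileID, 0); nothing equals it exactly */
+	while (i < n && keys[i].fileID < fileID)
+		++i;
+	return i;
+}
+
+static size_t count_records_for(const struct okey keys[], size_t n, uint32_t fileID)
+{
+	size_t i = position_before(keys, n, fileID);
+	size_t cnt = 0;
+	while (i < n && keys[i].fileID == fileID) {	/* stop at the first foreign key */
+		++cnt;
+		++i;
+	}
+	return cnt;
+}
+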
+ // + if (isHFSPlus) { + btRecord.itemSize = sizeof(HFSPlusExtentRecord); + btKeySize = sizeof(HFSPlusExtentKey); + + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; + extentKeyPtr->hfsPlus.forkType = forkType; + extentKeyPtr->hfsPlus.pad = 0; + extentKeyPtr->hfsPlus.fileID = srcFileID; + extentKeyPtr->hfsPlus.startBlock = 0; + } +#if CONFIG_HFS_STD + else { + btRecord.itemSize = sizeof(HFSExtentRecord); + btKeySize = sizeof(HFSExtentKey); + + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; + extentKeyPtr->hfs.forkType = 0; + extentKeyPtr->hfs.fileID = srcFileID; + extentKeyPtr->hfs.startBlock = 0; + } +#else + else { + hfs_free(tmpIterator, sizeof(*tmpIterator)); + hfs_free(btIterator, sizeof(*btIterator)); + return cmBadNews; + } +#endif + + // + // We do an initial BTSearchRecord to position the BTree's iterator just before any extent + // records for srcFileID. We then do a few BTIterateRecord and BTInsertRecord of those found + // records, but with destFileID as the file number in the key. Keep doing this sequence of + // BTIterateRecord and BTInsertRecord until we find an extent for another file, or there are + // no more extent records in the tree. + // + // Basically, we're copying records kNumExtentsToCache at a time. The copies have their file ID + // set to destFileID. + // + // This depends on BTInsertRecord not effecting the iterator used by BTIterateRecord. If it + // _did_ effect the iterator, then we would need to do a BTSearchRecord before each series + // of BTIterateRecord. We'd need to set up the key for BTSearchRecord to find the last record + // we found, so that BTIterateRecord would get the next one (the first we haven't processed). + // + + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); + + // We expect a btNotFound here, since there shouldn't be an extent record with FABN = 0. + if (err != btNotFound) + { + hfs_debug("hfs: unexpected error from SearchBTreeRecord\n"); + + if (err == noErr) // If we found such a bogus extent record, then the tree is really messed up + err = cmBadNews; // so return an error that conveys the disk is hosed. + + hfs_free(tmpIterator, sizeof(*tmpIterator)); + hfs_free(btIterator, sizeof(*btIterator)); + return err; + } + + do + { + btRecord.bufferAddress = &extentData; + btRecord.itemCount = 1; + + for ( i=0 ; ihfsPlus.fileID; + } +#if CONFIG_HFS_STD + else { + foundFileID = extentKeyPtr->hfs.fileID; + } +#endif + if ( foundFileID == srcFileID ) { + /* Check if we need to quit early. */ + if (quitEarly && isHFSPlus) { + if (extentKeyPtr->hfsPlus.forkType != forkType) { + break; + } + } + CopyExtentInfo(extentKeyPtr, &extentData, extentsBuffer, i); + } + else{ + /* The fileID's are of a different file. We're done here. 
*/ + break; + } + } + + + + //-- edit each extent key, and reinsert each extent record in the extent file + if (isHFSPlus) + btRecordSize = sizeof(HFSPlusExtentRecord); +#if CONFIG_HFS_STD + else + btRecordSize = sizeof(HFSExtentRecord); +#endif + + for ( j=0 ; jkey, btKeySize); + btRecord.bufferAddress = &(extentsBuffer[j].extentData); + + err = BTInsertRecord(fcb, tmpIterator, &btRecord, btRecordSize); + if ( err != noErr ) { + /* Parse the error and free iterators */ + hfs_free(btIterator, sizeof(*btIterator)); + hfs_free(tmpIterator, sizeof(*tmpIterator)); + if ( err == btExists ) + { + hfs_debug("hfs: can't insert record -- already exists\n"); + return( cmBadNews ); + } + else { + return( err ); + } + } + } + + //-- okay, done with this buffered batch, go get the next set of extent records + // If our buffer is not full, we must be done, or recieved an error + + if ( i != kNumExtentsToCache ) // if the buffer is not full, we must be done + { + err = DeleteExtents( vcb, srcFileID, quitEarly, forkType, isHFSPlus ); // Now delete all the extent entries with the sourceID + if (err != noErr ) + hfs_debug("hfs: error from DeleteExtents (%d)\n", err); + break; // we're done! + } + } while ( true ); + + hfs_free(tmpIterator, sizeof(*tmpIterator)); + hfs_free(btIterator, sizeof(*btIterator)); + + return( err ); +} + + +static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, u_int16_t bufferCount ) +{ + BlockMoveData( key, &(buffer[bufferCount].extentKey), sizeof( ExtentKey ) ); + BlockMoveData( data, &(buffer[bufferCount].extentData), sizeof( ExtentRecord ) ); +} + + +//-- Delete all extents in extent file that have the ID given. +static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) +{ + FCB * fcb; + ExtentKey * extentKeyPtr; + ExtentRecord extentData; + struct BTreeIterator *btIterator = NULL; + struct BTreeIterator *tmpIterator = NULL; + FSBufferDescriptor btRecord; + u_int16_t btRecordSize; + OSErr err; + + btIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + tmpIterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + fcb = GetFileControlBlock(vcb->extentsRefNum); + + (void) BTInvalidateHint(btIterator); + extentKeyPtr = (ExtentKey*) &btIterator->key; + btRecord.bufferAddress = &extentData; + btRecord.itemCount = 1; + + // The algorithm is to position the BTree just before any extent records for fileID. + // Then just keep getting successive records. If the record is still for fileID, + // then delete it. + + if (isHFSPlus) { + btRecord.itemSize = sizeof(HFSPlusExtentRecord); + + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; + extentKeyPtr->hfsPlus.forkType = forkType; + extentKeyPtr->hfsPlus.pad = 0; + extentKeyPtr->hfsPlus.fileID = fileID; + extentKeyPtr->hfsPlus.startBlock = 0; + } +#if CONFIG_HFS_STD + else { + btRecord.itemSize = sizeof(HFSExtentRecord); + + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; + extentKeyPtr->hfs.forkType = forkType; + extentKeyPtr->hfs.fileID = fileID; + extentKeyPtr->hfs.startBlock = 0; + } +#else + else { + err = cmBadNews; + goto exit; + } +#endif + + err = BTSearchRecord(fcb, btIterator, &btRecord, &btRecordSize, btIterator); + if ( err != btNotFound ) + { + if (err == noErr) { // Did we find a bogus extent record? + err = cmBadNews; // Yes, so indicate things are messed up. 
+ } + + goto exit; + } + + do + { + HFSCatalogNodeID foundFileID = 0; + + err = BTIterateRecord(fcb, kBTreeNextRecord, btIterator, &btRecord, &btRecordSize); + if ( err != noErr ) + { + if (err == btNotFound) // If we hit the end of the BTree + err = noErr; // then it's OK + + break; // We're done now. + } + if (isHFSPlus) { + foundFileID = extentKeyPtr->hfsPlus.fileID; + } +#if CONFIG_HFS_STD + else { + foundFileID = extentKeyPtr->hfs.fileID; + } +#endif + + if ( foundFileID != fileID ) { + break; // numbers don't match, we must be done + } + if (quitEarly && isHFSPlus) { + /* If we're only deleting one type of fork, then quit early if it doesn't match */ + if (extentKeyPtr->hfsPlus.forkType != forkType) { + break; + } + } + + *tmpIterator = *btIterator; + err = BTDeleteRecord( fcb, tmpIterator ); + if (err != noErr) + break; + } while ( true ); + +exit: + + hfs_free(tmpIterator, sizeof(*tmpIterator)); + hfs_free(btIterator, sizeof(*btIterator)); + + return( err ); +} + + +// Check if there are extents represented in the extents overflow file. +static u_int32_t CheckExtents( void *extents, u_int32_t totalBlocks, Boolean isHFSPlus ) +{ + u_int32_t extentAllocationBlocks; + u_int16_t i; + + + if ( totalBlocks == 0 ) + return( 0 ); + + extentAllocationBlocks = 0; + + if ( isHFSPlus ) + { + for ( i = 0 ; i < kHFSPlusExtentDensity ; i++ ) + { + extentAllocationBlocks += ((HFSPlusExtentDescriptor *)extents)[i].blockCount; + if ( extentAllocationBlocks >= totalBlocks ) // greater than or equal (extents can add past eof if 'Close" crashes w/o truncating new clump) + return( 0 ); + } + } +#if CONFIG_HFS_STD + else + { + for ( i = 0 ; i < kHFSExtentDensity ; i++ ) + { + extentAllocationBlocks += ((HFSExtentDescriptor *)extents)[i].blockCount; + if ( extentAllocationBlocks >= totalBlocks ) // greater than or equal (extents can add past eof if 'Close" crashes w/o truncating new clump) + return( 0 ); + } + } +#endif + + return( extentAllocationBlocks ); +} diff --git a/core/FileMgrInternal.h b/core/FileMgrInternal.h new file mode 100644 index 0000000..b54daf3 --- /dev/null +++ b/core/FileMgrInternal.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: FilesInternal.h + + Contains: IPI for File Manager (HFS Plus) + + Version: HFS Plus 1.0 + + Copyright: (c) 1996-2001 by Apple Inc., all rights reserved. + +*/ +#ifndef __FILEMGRINTERNAL__ +#define __FILEMGRINTERNAL__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + +#include +#include + +#if !HFS_ALLOC_TEST + +#include "hfs.h" +#include "hfs_macos_defs.h" +#include "hfs_format.h" +#include "hfs_cnode.h" + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* CatalogNodeID is used to track catalog objects */ +typedef u_int32_t HFSCatalogNodeID; + +/* internal error codes*/ + +#if TARGET_API_MACOS_X + #define ERR_BASE -32767 +#else + #define ERR_BASE 0 +#endif + +enum { + /* FXM errors*/ + fxRangeErr = ERR_BASE + 16, /* file position beyond mapped range*/ + fxOvFlErr = ERR_BASE + 17, /* extents file overflow*/ + /* Unicode errors*/ + uniTooLongErr = ERR_BASE + 24, /* Unicode string too long to convert to Str31*/ + uniBufferTooSmallErr = ERR_BASE + 25, /* Unicode output buffer too small*/ + uniNotMappableErr = ERR_BASE + 26, /* Unicode string can't be mapped to given script*/ + /* BTree Manager errors*/ + btNotFound = ERR_BASE + 32, /* record not found*/ + btExists = ERR_BASE + 33, /* record already exists*/ + btNoSpaceAvail = ERR_BASE + 34, /* no available space*/ + btNoFit = ERR_BASE + 35, /* record doesn't fit in node */ + btBadNode = ERR_BASE + 36, /* bad node detected*/ + btBadHdr = ERR_BASE + 37, /* bad BTree header record detected*/ + dsBadRotate = ERR_BASE + 64, /* bad BTree rotate*/ + /* Catalog Manager errors*/ + cmNotFound = ERR_BASE + 48, /* CNode not found*/ + cmExists = ERR_BASE + 49, /* CNode already exists*/ + cmNotEmpty = ERR_BASE + 50, /* directory CNode not empty (valence = 0)*/ + cmRootCN = ERR_BASE + 51, /* invalid reference to root CNode*/ + cmBadNews = ERR_BASE + 52, /* detected bad catalog structure*/ + cmFThdDirErr = ERR_BASE + 53, /* thread belongs to a directory not a file*/ + cmFThdGone = ERR_BASE + 54, /* file thread doesn't exist*/ + cmParentNotFound = ERR_BASE + 55, /* CNode for parent ID does not exist*/ + /* TFS internal errors*/ + fsDSIntErr = -127 /* Internal file system error*/ +}; + + +/* internal flags*/ + +enum { + kEFAllMask = 0x01, /* allocate all requested bytes or none */ + kEFContigMask = 0x02, /* force contiguous allocation */ + kEFReserveMask = 0x04, /* keep block reserve */ + kEFDeferMask = 0x08, /* defer file block allocations */ + kEFNoClumpMask = 0x10, /* don't round up to clump size */ + kEFMetadataMask = 0x20, /* metadata allocation */ + + kTFTrunExtBit = 0, /* truncate to the extent containing new PEOF*/ + kTFTrunExtMask = 1 +}; + +enum { + kUndefinedStrLen = 0, /* Unknown string length */ + kNoHint = 0, + + /* FileIDs variables*/ + kNumExtentsToCache = 4 /* just guessing for ExchangeFiles*/ +}; + + +/* Universal Extent Key */ + +union ExtentKey { + HFSExtentKey hfs; + HFSPlusExtentKey hfsPlus; +}; +typedef union ExtentKey ExtentKey; +/* Universal extent descriptor */ + +union ExtentDescriptor { + HFSExtentDescriptor hfs; + HFSPlusExtentDescriptor hfsPlus; +}; +typedef union ExtentDescriptor ExtentDescriptor; +/* Universal extent record */ + +union ExtentRecord { + HFSExtentRecord hfs; + HFSPlusExtentRecord hfsPlus; +}; +typedef union ExtentRecord ExtentRecord; + + +enum { + CMMaxCName = kHFSMaxFileNameChars +}; + + + +/* Universal catalog name*/ + +union CatalogName { + Str31 pstr; + HFSUniStr255 ustr; +}; +typedef union CatalogName CatalogName; + + +/* + * MacOS 
accessor routines + */ +#define GetFileControlBlock(fref) VTOF((fref)) +#define GetFileRefNumFromFCB(fcb) FTOV((fcb)) + +/* Test for error and return if error occurred*/ +EXTERN_API_C( void ) +ReturnIfError (OSErr result); + +#define ReturnIfError(result) do { if ( (result) != noErr ) return (result); } while(0) + +/* Exit function on error*/ +EXTERN_API_C( void ) +ExitOnError (OSErr result); + +#define ExitOnError( result ) do { if ( ( result ) != noErr ) goto ErrorExit; } while(0) + + + +/* Catalog Manager Routines (IPI)*/ + +EXTERN_API_C( OSErr ) +ExchangeFileIDs (ExtendedVCB * volume, + ConstUTF8Param srcName, + ConstUTF8Param destName, + HFSCatalogNodeID srcID, + HFSCatalogNodeID destID, + u_int32_t srcHint, + u_int32_t destHint ); + +EXTERN_API_C( OSErr ) +MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc); + +/* BTree Manager Routines*/ + +typedef CALLBACK_API_C( int32_t , KeyCompareProcPtr )(void *a, void *b); + + +EXTERN_API_C( OSErr ) +ReplaceBTreeRecord (FileReference refNum, + const void * key, + u_int32_t hint, + void * newData, + u_int16_t dataSize, + u_int32_t * newHint); + + +/* Prototypes for exported routines in VolumeAllocation.c*/ + +/* + * Flags for BlockAllocate(), BlockDeallocate() and hfs_block_alloc. + * Some of these are for internal use only. See the comment at the + * top of hfs_alloc_int for more details on the semantics of these + * flags. + */ +#define HFS_ALLOC_FORCECONTIG 0x001 //force contiguous block allocation; minblocks must be allocated +#define HFS_ALLOC_METAZONE 0x002 //can use metazone blocks +#define HFS_ALLOC_SKIPFREEBLKS 0x004 //skip checking/updating freeblocks during alloc/dealloc +#define HFS_ALLOC_FLUSHTXN 0x008 //pick best fit for allocation, even if a jnl flush is req'd +#define HFS_ALLOC_TENTATIVE 0x010 //reserved allocation that can be claimed back +#define HFS_ALLOC_LOCKED 0x020 //reserved allocation that can't be claimed back +#define HFS_ALLOC_IGNORE_TENTATIVE 0x040 //Steal tentative blocks if necessary +#define HFS_ALLOC_IGNORE_RESERVED 0x080 //Ignore tentative/committed blocks +#define HFS_ALLOC_USE_TENTATIVE 0x100 //Use the supplied tentative range (if possible) +#define HFS_ALLOC_COMMIT 0x200 //Commit the supplied extent to disk +#define HFS_ALLOC_TRY_HARD 0x400 //Search hard to try and get maxBlocks; implies HFS_ALLOC_FLUSHTXN +#define HFS_ALLOC_ROLL_BACK 0x800 //Reallocate blocks that were just deallocated +#define HFS_ALLOC_FAST_DEV 0x1000 //Prefer fast device for allocation + +typedef uint32_t hfs_block_alloc_flags_t; + +struct rl_entry; +EXTERN_API_C( OSErr ) +BlockAllocate (ExtendedVCB * vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + hfs_block_alloc_flags_t flags, + u_int32_t * startBlock, + u_int32_t * actualBlocks); + +typedef struct hfs_alloc_extra_args { + // Used with HFS_ALLOC_TRY_HARD and HFS_ALLOC_FORCECONTIG + uint32_t max_blocks; + + // Used with with HFS_ALLOC_USE_TENTATIVE & HFS_ALLOC_COMMIT + struct rl_entry **reservation_in; + + // Used with HFS_ALLOC_TENTATIVE & HFS_ALLOC_LOCKED + struct rl_entry **reservation_out; + + /* + * If the maximum cannot be returned, the allocation will be + * trimmed to the specified alignment after taking + * @alignment_offset into account. @alignment and + * @alignment_offset are both in terms of blocks, *not* bytes. + * The result will be such that: + * + * (block_count + @alignment_offset) % @alignment == 0 + * + * Alignment is *not* guaranteed. 
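+	 * For illustration (numbers are arbitrary): with @alignment == 8 and
+	 * @alignment_offset == 3, a returned count of 23 blocks would be
+	 * trimmed to 21, since (21 + 3) % 8 == 0 while (23 + 3) % 8 != 0.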
+ * + * One example where alignment might be useful is in the case + * where the page size is greater than the allocation block size + * and I/O is being performed in multiples of the page size. + */ + int alignment; + int alignment_offset; +} hfs_alloc_extra_args_t; + +/* + * Same as BlockAllocate but slightly different API. + * @extent.startBlock is a hint for where to start searching and + * @extent.blockCount is the minimum number of blocks acceptable. + * Additional arguments can be passed in @extra_args and use will + * depend on @flags. See comment at top of hfs_block_alloc_int for + * more information. + */ +errno_t hfs_block_alloc(hfsmount_t *hfsmp, + HFSPlusExtentDescriptor *extent, + hfs_block_alloc_flags_t flags, + hfs_alloc_extra_args_t *extra_args); + +EXTERN_API_C( OSErr ) +BlockDeallocate (ExtendedVCB * vcb, + u_int32_t firstBlock, + u_int32_t numBlocks, + hfs_block_alloc_flags_t flags); + +EXTERN_API_C ( void ) +ResetVCBFreeExtCache(struct hfsmount *hfsmp); + +EXTERN_API_C( OSErr ) +BlockMarkAllocated(ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); + +EXTERN_API_C( OSErr ) +BlockMarkFree( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); + +EXTERN_API_C( OSErr ) +BlockMarkFreeUnused( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); + +EXTERN_API_C( u_int32_t ) +MetaZoneFreeBlocks(ExtendedVCB *vcb); + +EXTERN_API_C( u_int32_t ) +UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block); + +EXTERN_API_C( u_int32_t ) +ScanUnmapBlocks(struct hfsmount *hfsmp); + +EXTERN_API_C( int ) +hfs_init_summary (struct hfsmount *hfsmp); + +errno_t hfs_find_free_extents(struct hfsmount *hfsmp, + void (*callback)(void *data, off_t), void *callback_arg); + +void hfs_free_tentative(hfsmount_t *hfsmp, struct rl_entry **reservation); +void hfs_free_locked(hfsmount_t *hfsmp, struct rl_entry **reservation); + +/* File Extent Mapping routines*/ +EXTERN_API_C( OSErr ) +FlushExtentFile (ExtendedVCB * vcb); + +#if CONFIG_HFS_STD +EXTERN_API_C( int32_t ) +CompareExtentKeys (const HFSExtentKey * searchKey, + const HFSExtentKey * trialKey); +#endif + +EXTERN_API_C( int32_t ) +CompareExtentKeysPlus (const HFSPlusExtentKey *searchKey, + const HFSPlusExtentKey *trialKey); + +OSErr SearchExtentFile(ExtendedVCB *vcb, + const FCB *fcb, + int64_t filePosition, + HFSPlusExtentKey *foundExtentKey, + HFSPlusExtentRecord foundExtentData, + u_int32_t *foundExtentDataIndex, + u_int32_t *extentBTreeHint, + u_int32_t *endingFABNPlusOne ); + +EXTERN_API_C( OSErr ) +TruncateFileC (ExtendedVCB *vcb, FCB *fcb, int64_t peof, int deleted, + int rsrc, uint32_t fileid, Boolean truncateToExtent); + +EXTERN_API_C( OSErr ) +ExtendFileC (ExtendedVCB * vcb, + FCB * fcb, + int64_t bytesToAdd, + u_int32_t blockHint, + u_int32_t flags, + int64_t * actualBytesAdded); + +EXTERN_API_C( OSErr ) +MapFileBlockC (ExtendedVCB * vcb, + FCB * fcb, + size_t numberOfBytes, + off_t offset, + daddr64_t * startBlock, + size_t * availableBytes); + +OSErr HeadTruncateFile(ExtendedVCB *vcb, FCB *fcb, u_int32_t headblks); + +EXTERN_API_C( int ) +AddFileExtent (ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount); + +#if TARGET_API_MACOS_X +EXTERN_API_C( Boolean ) +NodesAreContiguous (ExtendedVCB * vcb, + FCB * fcb, + u_int32_t nodeSize); +#endif + +/* Get the current time in UTC (GMT)*/ +EXTERN_API_C( u_int32_t ) +GetTimeUTC (void); + +EXTERN_API_C( u_int32_t ) +LocalToUTC (u_int32_t localTime); + +EXTERN_API_C( u_int32_t ) +UTCToLocal (u_int32_t utcTime); + + +#ifdef 
__cplusplus +} +#endif + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* __FILEMGRINTERNAL__ */ + diff --git a/core/HFSUnicodeWrappers.h b/core/HFSUnicodeWrappers.h new file mode 100644 index 0000000..35a394b --- /dev/null +++ b/core/HFSUnicodeWrappers.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2000-2003, 2005-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: HFSUnicodeWrappers.h + + Contains: IPI to Unicode routines used by File Manager. + + Version: HFS Plus 1.0 + + Written by: Mark Day + + Copyright: (c) 1996-1997 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: xxx put dri here xxx + + Other Contact: xxx put other contact here xxx + + Technology: xxx put technology here xxx + + Writers: + + (DSH) Deric Horn + (msd) Mark Day + (djb) Don Brady + + Change History (most recent first): + + 11/16/97 djb Change Unicode.h to UnicodeConverter.h. + 11/7/97 msd Remove prototype for CompareUnicodeNames(). Add prototype for + FastUnicodeCompare(). + 10/13/97 djb Add encoding/index macros and add prototypes for new Get/Set + encodding routines. + 9/15/97 djb InitUnicodeConverter now takes a boolean. + 9/10/97 msd Add prototype for InitializeEncodingContext. + 6/26/97 DSH Include "MockConverter" prototype for DFA usage. + 6/25/97 DSH Removed Prototype definitions, and checked in Unicode.h and + TextCommon.h from Julio Gonzales into InternalInterfaces. + 6/25/97 msd Add prototypes for some new Unicode routines that haven't + appeared in MasterInterfaces yet. + 6/18/97 djb Add more ConversionContexts routines. + 6/13/97 djb Switched to ConvertUnicodeToHFSName, ConvertHFSNameToUnicode, & + CompareUnicodeNames. + 4/28/97 djb first checked in + 12/12/96 msd first checked in + +*/ +#ifndef _HFSUNICODEWRAPPERS_ +#define _HFSUNICODEWRAPPERS_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + +#include "hfs_macos_defs.h" +#include "hfs_format.h" + + +extern OSErr ConvertUnicodeToUTF8Mangled ( ByteCount srcLen, + ConstUniCharArrayPtr srcStr, + ByteCount maxDstLen, + ByteCount *actualDstLen, + unsigned char* dstStr , + HFSCatalogNodeID cnid); + +/* + This routine compares two Unicode names based on an ordering defined by the HFS Plus B-tree. + This ordering must stay fixed for all time. 
+ + Output: + -n name1 < name2 (i.e. name 1 sorts before name 2) + 0 name1 = name2 + +n name1 > name2 + + NOTE: You should not depend on the magnitude of the result, just its sign. That is, when name1 < name2, then any + negative number may be returned. +*/ + +extern int32_t FastUnicodeCompare(register ConstUniCharArrayPtr str1, register ItemCount length1, + register ConstUniCharArrayPtr str2, register ItemCount length2); + +extern int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount length1, + register ConstUniCharArrayPtr str2, register ItemCount length2); + +extern int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 ); + + +extern HFSCatalogNodeID GetEmbeddedFileID( ConstStr31Param filename, u_int32_t length, u_int32_t *prefixLength ); +extern u_int32_t CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length ); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* _HFSUNICODEWRAPPERS_ */ diff --git a/core/MacOSStubs.c b/core/MacOSStubs.c new file mode 100644 index 0000000..abfd208 --- /dev/null +++ b/core/MacOSStubs.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_dbg.h" +#include "FileMgrInternal.h" + +/* + * gTimeZone should only be used for HFS volumes! + * It is initialized when an HFS volume is mounted. + */ +struct timezone gTimeZone = {8*60,1}; + +/* + * GetTimeUTC - get the GMT Mac OS time (in seconds since 1/1/1904) + * + * called by the Catalog Manager when creating/updating HFS Plus records + */ +u_int32_t GetTimeUTC(void) +{ + struct timeval tv; + + microtime(&tv); + + return (tv.tv_sec + MAC_GMT_FACTOR); +} + + +/* + * LocalToUTC - convert from Mac OS local time to Mac OS GMT time. + * This should only be called for HFS volumes (not for HFS Plus). + */ +u_int32_t LocalToUTC(u_int32_t localTime) +{ + u_int32_t gtime = localTime; + + if (gtime != 0) { + gtime += (gTimeZone.tz_minuteswest * 60); + /* + * We no longer do DST adjustments here since we don't + * know if time supplied needs adjustment! 
+ * + * if (gTimeZone.tz_dsttime) + * gtime -= 3600; + */ + } + return (gtime); +} + +/* + * UTCToLocal - convert from Mac OS GMT time to Mac OS local time. + * This should only be called for HFS volumes (not for HFS Plus). + */ +u_int32_t UTCToLocal(u_int32_t utcTime) +{ + u_int32_t ltime = utcTime; + + if (ltime != 0) { + ltime -= (gTimeZone.tz_minuteswest * 60); + /* + * We no longer do DST adjustments here since we don't + * know if time supplied needs adjustment! + * + * if (gTimeZone.tz_dsttime) + * ltime += 3600; + */ + } + return (ltime); +} + +/* + * to_bsd_time - convert from Mac OS time (seconds since 1/1/1904) + * to BSD time (seconds since 1/1/1970) + */ +time_t to_bsd_time(u_int32_t hfs_time) +{ + u_int32_t gmt = hfs_time; + + if (gmt > MAC_GMT_FACTOR) + gmt -= MAC_GMT_FACTOR; + else + gmt = 0; /* don't let date go negative! */ + + return (time_t)gmt; +} + +/* + * to_hfs_time - convert from BSD time (seconds since 1/1/1970) + * to Mac OS time (seconds since 1/1/1904) + */ +u_int32_t to_hfs_time(time_t bsd_time) +{ + u_int32_t hfs_time = (u_int32_t)bsd_time; + + /* don't adjust zero - treat as uninitialzed */ + if (hfs_time != 0) + hfs_time += MAC_GMT_FACTOR; + + return (hfs_time); +} + +void +DebugStr( + const char * debuggerMsg + ) +{ + kprintf ("*** Mac OS Debugging Message: %s\n", debuggerMsg); +#if DEBUG + Debugger(debuggerMsg); +#endif +} diff --git a/core/UCStringCompareData.h b/core/UCStringCompareData.h new file mode 100644 index 0000000..7322837 --- /dev/null +++ b/core/UCStringCompareData.h @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2000-2002, 2005 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: UCStringCompareData.h + + Contains: xxx put contents here xxx + + Version: HFS Plus 1.0 + + Copyright: (c) 1997-1999 by Apple Inc., all rights reserved. + + File Ownership: + + DRI: Mark Day + + Other Contact: xxx put other contact here xxx + + Technology: xxx put technology here xxx + + Writers: + + (djb) Don Brady + (msd) Mark Day + + Change History (most recent first): + + 11/16/97 djb msd. Updated lower case table with ignorable mappings and less + aggressive case folding. Added a trailing comma to make the + StreamEdit script work right. Removed Unicode decomposition + tables. 
Case folding tables convert u+0000 to 0xFFFF so that the + NUL character can appear in names, while still allowing a zero + value to be a sentinel. (From Andy Daniels, 11/10/97) + 8/26/97 djb Tweak gLowerCaseTable to make it faster. + 8/14/97 djb Add RelString compare table... + 4/24/97 djb first checked in + 2/27/97 msd first checked in +*/ + +#ifndef _UCSTRINGCOMPAREDATA_ +#define _UCSTRINGCOMPAREDATA_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +/* + * For better performance, the case folding table for basic latin + * is seperate from the others. This eliminates the extra lookup + * to get the offset to this table. + * + * Note: 0x0000 now maps to 0 so that it will be ignored + */ +u_int16_t gLatinCaseFold[] = { + /* 0 */ 0xFFFF, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, + /* 1 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, + /* 2 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, + /* 3 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, + /* 4 */ 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 5 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, + /* 6 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* 7 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, + /* 8 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + /* 9 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, + /* A */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, + /* B */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, + /* C */ 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00E6, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + /* D */ 0x00F0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, 0x00F8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00FE, 0x00DF, + /* E */ 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + /* F */ 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, +}; + +/* The lower case table consists of a 256-entry high-byte table followed by some number of + 256-entry subtables. The high-byte table contains either an offset to the subtable for + characters with that high byte or zero, which means that there are no case mappings or + ignored characters in that block. Ignored characters are mapped to zero. 
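Taken together with gLatinCaseFold above, the two-level lookup this layout implies can be sketched as follows (an illustrative helper with an assumed name, not a routine from this file; FastUnicodeCompare in UnicodeWrappers.c performs the equivalent steps inline):

    #include <stdint.h>

    extern uint16_t gLatinCaseFold[];    /* fast path for code points below 0x0100 */
    extern uint16_t gLowerCaseTable[];   /* 256-entry high-byte table plus sub-tables */

    /* Fold one UTF-16 code unit; a result of 0 means "ignore this character". */
    static uint16_t fold_char(uint16_t c)
    {
        if (c < 0x0100)
            return gLatinCaseFold[c];

        uint16_t subtable = gLowerCaseTable[c >> 8];     /* an index, not a byte offset */
        if (subtable == 0)
            return c;                                    /* no mappings in this block */

        return gLowerCaseTable[subtable + (c & 0x00FF)];
    }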
+ */ + +u_int16_t gLowerCaseTable[] = { + + /* High-byte indices ( == 0 iff no case mapping and no ignorables ) */ + + /* 0 */ 0x0000, 0x0100, 0x0000, 0x0200, 0x0300, 0x0400, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x0500, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 2 */ 0x0600, 0x0700, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 3 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 4 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 5 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 6 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 9 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* A */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* B */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* C */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* D */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* E */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* F */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0800, 0x0900, + + /* Table 1 (for high byte 0x01) */ + + /* 0 */ 0x0100, 0x0101, 0x0102, 0x0103, 0x0104, 0x0105, 0x0106, 0x0107, 0x0108, 0x0109, 0x010A, 0x010B, 0x010C, 0x010D, 0x010E, 0x010F, + /* 1 */ 0x0111, 0x0111, 0x0112, 0x0113, 0x0114, 0x0115, 0x0116, 0x0117, 0x0118, 0x0119, 0x011A, 0x011B, 0x011C, 0x011D, 0x011E, 0x011F, + /* 2 */ 0x0120, 0x0121, 0x0122, 0x0123, 0x0124, 0x0125, 0x0127, 0x0127, 0x0128, 0x0129, 0x012A, 0x012B, 0x012C, 0x012D, 0x012E, 0x012F, + /* 3 */ 0x0130, 0x0131, 0x0133, 0x0133, 0x0134, 0x0135, 0x0136, 0x0137, 0x0138, 0x0139, 0x013A, 0x013B, 0x013C, 0x013D, 0x013E, 0x0140, + /* 4 */ 0x0140, 0x0142, 0x0142, 0x0143, 0x0144, 0x0145, 0x0146, 0x0147, 0x0148, 0x0149, 0x014B, 0x014B, 0x014C, 0x014D, 0x014E, 0x014F, + /* 5 */ 0x0150, 0x0151, 0x0153, 0x0153, 0x0154, 0x0155, 0x0156, 0x0157, 0x0158, 0x0159, 0x015A, 0x015B, 0x015C, 0x015D, 0x015E, 0x015F, + /* 6 */ 0x0160, 0x0161, 0x0162, 0x0163, 0x0164, 0x0165, 0x0167, 0x0167, 0x0168, 0x0169, 0x016A, 0x016B, 0x016C, 0x016D, 0x016E, 0x016F, + /* 7 */ 0x0170, 0x0171, 0x0172, 0x0173, 0x0174, 0x0175, 0x0176, 0x0177, 0x0178, 0x0179, 0x017A, 0x017B, 0x017C, 0x017D, 0x017E, 0x017F, + /* 8 */ 0x0180, 0x0253, 0x0183, 0x0183, 0x0185, 0x0185, 0x0254, 0x0188, 0x0188, 
0x0256, 0x0257, 0x018C, 0x018C, 0x018D, 0x01DD, 0x0259, + /* 9 */ 0x025B, 0x0192, 0x0192, 0x0260, 0x0263, 0x0195, 0x0269, 0x0268, 0x0199, 0x0199, 0x019A, 0x019B, 0x026F, 0x0272, 0x019E, 0x0275, + /* A */ 0x01A0, 0x01A1, 0x01A3, 0x01A3, 0x01A5, 0x01A5, 0x01A6, 0x01A8, 0x01A8, 0x0283, 0x01AA, 0x01AB, 0x01AD, 0x01AD, 0x0288, 0x01AF, + /* B */ 0x01B0, 0x028A, 0x028B, 0x01B4, 0x01B4, 0x01B6, 0x01B6, 0x0292, 0x01B9, 0x01B9, 0x01BA, 0x01BB, 0x01BD, 0x01BD, 0x01BE, 0x01BF, + /* C */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x01C6, 0x01C6, 0x01C6, 0x01C9, 0x01C9, 0x01C9, 0x01CC, 0x01CC, 0x01CC, 0x01CD, 0x01CE, 0x01CF, + /* D */ 0x01D0, 0x01D1, 0x01D2, 0x01D3, 0x01D4, 0x01D5, 0x01D6, 0x01D7, 0x01D8, 0x01D9, 0x01DA, 0x01DB, 0x01DC, 0x01DD, 0x01DE, 0x01DF, + /* E */ 0x01E0, 0x01E1, 0x01E2, 0x01E3, 0x01E5, 0x01E5, 0x01E6, 0x01E7, 0x01E8, 0x01E9, 0x01EA, 0x01EB, 0x01EC, 0x01ED, 0x01EE, 0x01EF, + /* F */ 0x01F0, 0x01F3, 0x01F3, 0x01F3, 0x01F4, 0x01F5, 0x01F6, 0x01F7, 0x01F8, 0x01F9, 0x01FA, 0x01FB, 0x01FC, 0x01FD, 0x01FE, 0x01FF, + + /* Table 2 (for high byte 0x03) */ + + /* 0 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F, + /* 1 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, + /* 2 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, + /* 3 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, + /* 4 */ 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F, + /* 5 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, + /* 6 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, + /* 7 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, 0x0378, 0x0379, 0x037A, 0x037B, 0x037C, 0x037D, 0x037E, 0x037F, + /* 8 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, 0x0388, 0x0389, 0x038A, 0x038B, 0x038C, 0x038D, 0x038E, 0x038F, + /* 9 */ 0x0390, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* A */ 0x03C0, 0x03C1, 0x03A2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF, + /* B */ 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* C */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0x03CF, + /* D */ 0x03D0, 0x03D1, 0x03D2, 0x03D3, 0x03D4, 0x03D5, 0x03D6, 0x03D7, 0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF, + /* E */ 0x03E0, 0x03E1, 0x03E3, 0x03E3, 0x03E5, 0x03E5, 0x03E7, 0x03E7, 0x03E9, 0x03E9, 0x03EB, 0x03EB, 0x03ED, 0x03ED, 0x03EF, 0x03EF, + /* F */ 0x03F0, 0x03F1, 0x03F2, 0x03F3, 0x03F4, 0x03F5, 0x03F6, 0x03F7, 0x03F8, 0x03F9, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF, + + /* Table 3 (for high byte 0x04) */ + + /* 0 */ 0x0400, 0x0401, 0x0452, 0x0403, 0x0454, 0x0455, 0x0456, 0x0407, 0x0458, 0x0459, 0x045A, 0x045B, 0x040C, 0x040D, 0x040E, 0x045F, + /* 1 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0419, 0x043A, 
0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 2 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 3 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + /* 4 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + /* 5 */ 0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E, 0x045F, + /* 6 */ 0x0461, 0x0461, 0x0463, 0x0463, 0x0465, 0x0465, 0x0467, 0x0467, 0x0469, 0x0469, 0x046B, 0x046B, 0x046D, 0x046D, 0x046F, 0x046F, + /* 7 */ 0x0471, 0x0471, 0x0473, 0x0473, 0x0475, 0x0475, 0x0476, 0x0477, 0x0479, 0x0479, 0x047B, 0x047B, 0x047D, 0x047D, 0x047F, 0x047F, + /* 8 */ 0x0481, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F, + /* 9 */ 0x0491, 0x0491, 0x0493, 0x0493, 0x0495, 0x0495, 0x0497, 0x0497, 0x0499, 0x0499, 0x049B, 0x049B, 0x049D, 0x049D, 0x049F, 0x049F, + /* A */ 0x04A1, 0x04A1, 0x04A3, 0x04A3, 0x04A5, 0x04A5, 0x04A7, 0x04A7, 0x04A9, 0x04A9, 0x04AB, 0x04AB, 0x04AD, 0x04AD, 0x04AF, 0x04AF, + /* B */ 0x04B1, 0x04B1, 0x04B3, 0x04B3, 0x04B5, 0x04B5, 0x04B7, 0x04B7, 0x04B9, 0x04B9, 0x04BB, 0x04BB, 0x04BD, 0x04BD, 0x04BF, 0x04BF, + /* C */ 0x04C0, 0x04C1, 0x04C2, 0x04C4, 0x04C4, 0x04C5, 0x04C6, 0x04C8, 0x04C8, 0x04C9, 0x04CA, 0x04CC, 0x04CC, 0x04CD, 0x04CE, 0x04CF, + /* D */ 0x04D0, 0x04D1, 0x04D2, 0x04D3, 0x04D4, 0x04D5, 0x04D6, 0x04D7, 0x04D8, 0x04D9, 0x04DA, 0x04DB, 0x04DC, 0x04DD, 0x04DE, 0x04DF, + /* E */ 0x04E0, 0x04E1, 0x04E2, 0x04E3, 0x04E4, 0x04E5, 0x04E6, 0x04E7, 0x04E8, 0x04E9, 0x04EA, 0x04EB, 0x04EC, 0x04ED, 0x04EE, 0x04EF, + /* F */ 0x04F0, 0x04F1, 0x04F2, 0x04F3, 0x04F4, 0x04F5, 0x04F6, 0x04F7, 0x04F8, 0x04F9, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, + + /* Table 4 (for high byte 0x05) */ + + /* 0 */ 0x0500, 0x0501, 0x0502, 0x0503, 0x0504, 0x0505, 0x0506, 0x0507, 0x0508, 0x0509, 0x050A, 0x050B, 0x050C, 0x050D, 0x050E, 0x050F, + /* 1 */ 0x0510, 0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0516, 0x0517, 0x0518, 0x0519, 0x051A, 0x051B, 0x051C, 0x051D, 0x051E, 0x051F, + /* 2 */ 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, 0x0528, 0x0529, 0x052A, 0x052B, 0x052C, 0x052D, 0x052E, 0x052F, + /* 3 */ 0x0530, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 4 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 5 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0557, 0x0558, 0x0559, 0x055A, 0x055B, 0x055C, 0x055D, 0x055E, 0x055F, + /* 6 */ 0x0560, 0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, 0x0568, 0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, + /* 7 */ 0x0570, 0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, 0x0578, 0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, + /* 8 */ 0x0580, 0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586, 0x0587, 0x0588, 0x0589, 0x058A, 0x058B, 0x058C, 0x058D, 0x058E, 0x058F, + /* 9 */ 0x0590, 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, 0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, 0x059E, 0x059F, + /* A */ 0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7, 0x05A8, 0x05A9, 0x05AA, 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF, + /* B */ 0x05B0, 
0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF, + /* C */ 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05C4, 0x05C5, 0x05C6, 0x05C7, 0x05C8, 0x05C9, 0x05CA, 0x05CB, 0x05CC, 0x05CD, 0x05CE, 0x05CF, + /* D */ 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF, + /* E */ 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0x05EB, 0x05EC, 0x05ED, 0x05EE, 0x05EF, + /* F */ 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0x05F5, 0x05F6, 0x05F7, 0x05F8, 0x05F9, 0x05FA, 0x05FB, 0x05FC, 0x05FD, 0x05FE, 0x05FF, + + /* Table 5 (for high byte 0x10) */ + + /* 0 */ 0x1000, 0x1001, 0x1002, 0x1003, 0x1004, 0x1005, 0x1006, 0x1007, 0x1008, 0x1009, 0x100A, 0x100B, 0x100C, 0x100D, 0x100E, 0x100F, + /* 1 */ 0x1010, 0x1011, 0x1012, 0x1013, 0x1014, 0x1015, 0x1016, 0x1017, 0x1018, 0x1019, 0x101A, 0x101B, 0x101C, 0x101D, 0x101E, 0x101F, + /* 2 */ 0x1020, 0x1021, 0x1022, 0x1023, 0x1024, 0x1025, 0x1026, 0x1027, 0x1028, 0x1029, 0x102A, 0x102B, 0x102C, 0x102D, 0x102E, 0x102F, + /* 3 */ 0x1030, 0x1031, 0x1032, 0x1033, 0x1034, 0x1035, 0x1036, 0x1037, 0x1038, 0x1039, 0x103A, 0x103B, 0x103C, 0x103D, 0x103E, 0x103F, + /* 4 */ 0x1040, 0x1041, 0x1042, 0x1043, 0x1044, 0x1045, 0x1046, 0x1047, 0x1048, 0x1049, 0x104A, 0x104B, 0x104C, 0x104D, 0x104E, 0x104F, + /* 5 */ 0x1050, 0x1051, 0x1052, 0x1053, 0x1054, 0x1055, 0x1056, 0x1057, 0x1058, 0x1059, 0x105A, 0x105B, 0x105C, 0x105D, 0x105E, 0x105F, + /* 6 */ 0x1060, 0x1061, 0x1062, 0x1063, 0x1064, 0x1065, 0x1066, 0x1067, 0x1068, 0x1069, 0x106A, 0x106B, 0x106C, 0x106D, 0x106E, 0x106F, + /* 7 */ 0x1070, 0x1071, 0x1072, 0x1073, 0x1074, 0x1075, 0x1076, 0x1077, 0x1078, 0x1079, 0x107A, 0x107B, 0x107C, 0x107D, 0x107E, 0x107F, + /* 8 */ 0x1080, 0x1081, 0x1082, 0x1083, 0x1084, 0x1085, 0x1086, 0x1087, 0x1088, 0x1089, 0x108A, 0x108B, 0x108C, 0x108D, 0x108E, 0x108F, + /* 9 */ 0x1090, 0x1091, 0x1092, 0x1093, 0x1094, 0x1095, 0x1096, 0x1097, 0x1098, 0x1099, 0x109A, 0x109B, 0x109C, 0x109D, 0x109E, 0x109F, + /* A */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* B */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* C */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10C6, 0x10C7, 0x10C8, 0x10C9, 0x10CA, 0x10CB, 0x10CC, 0x10CD, 0x10CE, 0x10CF, + /* D */ 0x10D0, 0x10D1, 0x10D2, 0x10D3, 0x10D4, 0x10D5, 0x10D6, 0x10D7, 0x10D8, 0x10D9, 0x10DA, 0x10DB, 0x10DC, 0x10DD, 0x10DE, 0x10DF, + /* E */ 0x10E0, 0x10E1, 0x10E2, 0x10E3, 0x10E4, 0x10E5, 0x10E6, 0x10E7, 0x10E8, 0x10E9, 0x10EA, 0x10EB, 0x10EC, 0x10ED, 0x10EE, 0x10EF, + /* F */ 0x10F0, 0x10F1, 0x10F2, 0x10F3, 0x10F4, 0x10F5, 0x10F6, 0x10F7, 0x10F8, 0x10F9, 0x10FA, 0x10FB, 0x10FC, 0x10FD, 0x10FE, 0x10FF, + + /* Table 6 (for high byte 0x20) */ + + /* 0 */ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x0000, 0x0000, 0x0000, 0x0000, + /* 1 */ 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017, 0x2018, 0x2019, 0x201A, 0x201B, 0x201C, 0x201D, 0x201E, 0x201F, + /* 2 */ 0x2020, 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027, 0x2028, 0x2029, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x202F, + /* 3 */ 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037, 0x2038, 0x2039, 0x203A, 0x203B, 0x203C, 0x203D, 0x203E, 0x203F, + /* 4 */ 0x2040, 0x2041, 0x2042, 
0x2043, 0x2044, 0x2045, 0x2046, 0x2047, 0x2048, 0x2049, 0x204A, 0x204B, 0x204C, 0x204D, 0x204E, 0x204F, + /* 5 */ 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057, 0x2058, 0x2059, 0x205A, 0x205B, 0x205C, 0x205D, 0x205E, 0x205F, + /* 6 */ 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 7 */ 0x2070, 0x2071, 0x2072, 0x2073, 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x207A, 0x207B, 0x207C, 0x207D, 0x207E, 0x207F, + /* 8 */ 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x208A, 0x208B, 0x208C, 0x208D, 0x208E, 0x208F, + /* 9 */ 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209A, 0x209B, 0x209C, 0x209D, 0x209E, 0x209F, + /* A */ 0x20A0, 0x20A1, 0x20A2, 0x20A3, 0x20A4, 0x20A5, 0x20A6, 0x20A7, 0x20A8, 0x20A9, 0x20AA, 0x20AB, 0x20AC, 0x20AD, 0x20AE, 0x20AF, + /* B */ 0x20B0, 0x20B1, 0x20B2, 0x20B3, 0x20B4, 0x20B5, 0x20B6, 0x20B7, 0x20B8, 0x20B9, 0x20BA, 0x20BB, 0x20BC, 0x20BD, 0x20BE, 0x20BF, + /* C */ 0x20C0, 0x20C1, 0x20C2, 0x20C3, 0x20C4, 0x20C5, 0x20C6, 0x20C7, 0x20C8, 0x20C9, 0x20CA, 0x20CB, 0x20CC, 0x20CD, 0x20CE, 0x20CF, + /* D */ 0x20D0, 0x20D1, 0x20D2, 0x20D3, 0x20D4, 0x20D5, 0x20D6, 0x20D7, 0x20D8, 0x20D9, 0x20DA, 0x20DB, 0x20DC, 0x20DD, 0x20DE, 0x20DF, + /* E */ 0x20E0, 0x20E1, 0x20E2, 0x20E3, 0x20E4, 0x20E5, 0x20E6, 0x20E7, 0x20E8, 0x20E9, 0x20EA, 0x20EB, 0x20EC, 0x20ED, 0x20EE, 0x20EF, + /* F */ 0x20F0, 0x20F1, 0x20F2, 0x20F3, 0x20F4, 0x20F5, 0x20F6, 0x20F7, 0x20F8, 0x20F9, 0x20FA, 0x20FB, 0x20FC, 0x20FD, 0x20FE, 0x20FF, + + /* Table 7 (for high byte 0x21) */ + + /* 0 */ 0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107, 0x2108, 0x2109, 0x210A, 0x210B, 0x210C, 0x210D, 0x210E, 0x210F, + /* 1 */ 0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117, 0x2118, 0x2119, 0x211A, 0x211B, 0x211C, 0x211D, 0x211E, 0x211F, + /* 2 */ 0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127, 0x2128, 0x2129, 0x212A, 0x212B, 0x212C, 0x212D, 0x212E, 0x212F, + /* 3 */ 0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137, 0x2138, 0x2139, 0x213A, 0x213B, 0x213C, 0x213D, 0x213E, 0x213F, + /* 4 */ 0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147, 0x2148, 0x2149, 0x214A, 0x214B, 0x214C, 0x214D, 0x214E, 0x214F, + /* 5 */ 0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157, 0x2158, 0x2159, 0x215A, 0x215B, 0x215C, 0x215D, 0x215E, 0x215F, + /* 6 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 7 */ 0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177, 0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, + /* 8 */ 0x2180, 0x2181, 0x2182, 0x2183, 0x2184, 0x2185, 0x2186, 0x2187, 0x2188, 0x2189, 0x218A, 0x218B, 0x218C, 0x218D, 0x218E, 0x218F, + /* 9 */ 0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x2196, 0x2197, 0x2198, 0x2199, 0x219A, 0x219B, 0x219C, 0x219D, 0x219E, 0x219F, + /* A */ 0x21A0, 0x21A1, 0x21A2, 0x21A3, 0x21A4, 0x21A5, 0x21A6, 0x21A7, 0x21A8, 0x21A9, 0x21AA, 0x21AB, 0x21AC, 0x21AD, 0x21AE, 0x21AF, + /* B */ 0x21B0, 0x21B1, 0x21B2, 0x21B3, 0x21B4, 0x21B5, 0x21B6, 0x21B7, 0x21B8, 0x21B9, 0x21BA, 0x21BB, 0x21BC, 0x21BD, 0x21BE, 0x21BF, + /* C */ 0x21C0, 0x21C1, 0x21C2, 0x21C3, 0x21C4, 0x21C5, 0x21C6, 0x21C7, 0x21C8, 0x21C9, 0x21CA, 0x21CB, 0x21CC, 0x21CD, 0x21CE, 0x21CF, + /* D */ 0x21D0, 0x21D1, 0x21D2, 0x21D3, 0x21D4, 0x21D5, 0x21D6, 0x21D7, 0x21D8, 0x21D9, 0x21DA, 
0x21DB, 0x21DC, 0x21DD, 0x21DE, 0x21DF, + /* E */ 0x21E0, 0x21E1, 0x21E2, 0x21E3, 0x21E4, 0x21E5, 0x21E6, 0x21E7, 0x21E8, 0x21E9, 0x21EA, 0x21EB, 0x21EC, 0x21ED, 0x21EE, 0x21EF, + /* F */ 0x21F0, 0x21F1, 0x21F2, 0x21F3, 0x21F4, 0x21F5, 0x21F6, 0x21F7, 0x21F8, 0x21F9, 0x21FA, 0x21FB, 0x21FC, 0x21FD, 0x21FE, 0x21FF, + + /* Table 8 (for high byte 0xFE) */ + + /* 0 */ 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, 0xFE06, 0xFE07, 0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F, + /* 1 */ 0xFE10, 0xFE11, 0xFE12, 0xFE13, 0xFE14, 0xFE15, 0xFE16, 0xFE17, 0xFE18, 0xFE19, 0xFE1A, 0xFE1B, 0xFE1C, 0xFE1D, 0xFE1E, 0xFE1F, + /* 2 */ 0xFE20, 0xFE21, 0xFE22, 0xFE23, 0xFE24, 0xFE25, 0xFE26, 0xFE27, 0xFE28, 0xFE29, 0xFE2A, 0xFE2B, 0xFE2C, 0xFE2D, 0xFE2E, 0xFE2F, + /* 3 */ 0xFE30, 0xFE31, 0xFE32, 0xFE33, 0xFE34, 0xFE35, 0xFE36, 0xFE37, 0xFE38, 0xFE39, 0xFE3A, 0xFE3B, 0xFE3C, 0xFE3D, 0xFE3E, 0xFE3F, + /* 4 */ 0xFE40, 0xFE41, 0xFE42, 0xFE43, 0xFE44, 0xFE45, 0xFE46, 0xFE47, 0xFE48, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C, 0xFE4D, 0xFE4E, 0xFE4F, + /* 5 */ 0xFE50, 0xFE51, 0xFE52, 0xFE53, 0xFE54, 0xFE55, 0xFE56, 0xFE57, 0xFE58, 0xFE59, 0xFE5A, 0xFE5B, 0xFE5C, 0xFE5D, 0xFE5E, 0xFE5F, + /* 6 */ 0xFE60, 0xFE61, 0xFE62, 0xFE63, 0xFE64, 0xFE65, 0xFE66, 0xFE67, 0xFE68, 0xFE69, 0xFE6A, 0xFE6B, 0xFE6C, 0xFE6D, 0xFE6E, 0xFE6F, + /* 7 */ 0xFE70, 0xFE71, 0xFE72, 0xFE73, 0xFE74, 0xFE75, 0xFE76, 0xFE77, 0xFE78, 0xFE79, 0xFE7A, 0xFE7B, 0xFE7C, 0xFE7D, 0xFE7E, 0xFE7F, + /* 8 */ 0xFE80, 0xFE81, 0xFE82, 0xFE83, 0xFE84, 0xFE85, 0xFE86, 0xFE87, 0xFE88, 0xFE89, 0xFE8A, 0xFE8B, 0xFE8C, 0xFE8D, 0xFE8E, 0xFE8F, + /* 9 */ 0xFE90, 0xFE91, 0xFE92, 0xFE93, 0xFE94, 0xFE95, 0xFE96, 0xFE97, 0xFE98, 0xFE99, 0xFE9A, 0xFE9B, 0xFE9C, 0xFE9D, 0xFE9E, 0xFE9F, + /* A */ 0xFEA0, 0xFEA1, 0xFEA2, 0xFEA3, 0xFEA4, 0xFEA5, 0xFEA6, 0xFEA7, 0xFEA8, 0xFEA9, 0xFEAA, 0xFEAB, 0xFEAC, 0xFEAD, 0xFEAE, 0xFEAF, + /* B */ 0xFEB0, 0xFEB1, 0xFEB2, 0xFEB3, 0xFEB4, 0xFEB5, 0xFEB6, 0xFEB7, 0xFEB8, 0xFEB9, 0xFEBA, 0xFEBB, 0xFEBC, 0xFEBD, 0xFEBE, 0xFEBF, + /* C */ 0xFEC0, 0xFEC1, 0xFEC2, 0xFEC3, 0xFEC4, 0xFEC5, 0xFEC6, 0xFEC7, 0xFEC8, 0xFEC9, 0xFECA, 0xFECB, 0xFECC, 0xFECD, 0xFECE, 0xFECF, + /* D */ 0xFED0, 0xFED1, 0xFED2, 0xFED3, 0xFED4, 0xFED5, 0xFED6, 0xFED7, 0xFED8, 0xFED9, 0xFEDA, 0xFEDB, 0xFEDC, 0xFEDD, 0xFEDE, 0xFEDF, + /* E */ 0xFEE0, 0xFEE1, 0xFEE2, 0xFEE3, 0xFEE4, 0xFEE5, 0xFEE6, 0xFEE7, 0xFEE8, 0xFEE9, 0xFEEA, 0xFEEB, 0xFEEC, 0xFEED, 0xFEEE, 0xFEEF, + /* F */ 0xFEF0, 0xFEF1, 0xFEF2, 0xFEF3, 0xFEF4, 0xFEF5, 0xFEF6, 0xFEF7, 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC, 0xFEFD, 0xFEFE, 0x0000, + + /* Table 9 (for high byte 0xFF) */ + + /* 0 */ 0xFF00, 0xFF01, 0xFF02, 0xFF03, 0xFF04, 0xFF05, 0xFF06, 0xFF07, 0xFF08, 0xFF09, 0xFF0A, 0xFF0B, 0xFF0C, 0xFF0D, 0xFF0E, 0xFF0F, + /* 1 */ 0xFF10, 0xFF11, 0xFF12, 0xFF13, 0xFF14, 0xFF15, 0xFF16, 0xFF17, 0xFF18, 0xFF19, 0xFF1A, 0xFF1B, 0xFF1C, 0xFF1D, 0xFF1E, 0xFF1F, + /* 2 */ 0xFF20, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 3 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, 0xFF58, 0xFF59, 0xFF5A, 0xFF3B, 0xFF3C, 0xFF3D, 0xFF3E, 0xFF3F, + /* 4 */ 0xFF40, 0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48, 0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, + /* 5 */ 0xFF50, 0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, 0xFF58, 0xFF59, 0xFF5A, 0xFF5B, 0xFF5C, 0xFF5D, 0xFF5E, 0xFF5F, + /* 6 */ 0xFF60, 0xFF61, 0xFF62, 0xFF63, 0xFF64, 0xFF65, 0xFF66, 0xFF67, 0xFF68, 0xFF69, 0xFF6A, 0xFF6B, 0xFF6C, 
0xFF6D, 0xFF6E, 0xFF6F, + /* 7 */ 0xFF70, 0xFF71, 0xFF72, 0xFF73, 0xFF74, 0xFF75, 0xFF76, 0xFF77, 0xFF78, 0xFF79, 0xFF7A, 0xFF7B, 0xFF7C, 0xFF7D, 0xFF7E, 0xFF7F, + /* 8 */ 0xFF80, 0xFF81, 0xFF82, 0xFF83, 0xFF84, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8B, 0xFF8C, 0xFF8D, 0xFF8E, 0xFF8F, + /* 9 */ 0xFF90, 0xFF91, 0xFF92, 0xFF93, 0xFF94, 0xFF95, 0xFF96, 0xFF97, 0xFF98, 0xFF99, 0xFF9A, 0xFF9B, 0xFF9C, 0xFF9D, 0xFF9E, 0xFF9F, + /* A */ 0xFFA0, 0xFFA1, 0xFFA2, 0xFFA3, 0xFFA4, 0xFFA5, 0xFFA6, 0xFFA7, 0xFFA8, 0xFFA9, 0xFFAA, 0xFFAB, 0xFFAC, 0xFFAD, 0xFFAE, 0xFFAF, + /* B */ 0xFFB0, 0xFFB1, 0xFFB2, 0xFFB3, 0xFFB4, 0xFFB5, 0xFFB6, 0xFFB7, 0xFFB8, 0xFFB9, 0xFFBA, 0xFFBB, 0xFFBC, 0xFFBD, 0xFFBE, 0xFFBF, + /* C */ 0xFFC0, 0xFFC1, 0xFFC2, 0xFFC3, 0xFFC4, 0xFFC5, 0xFFC6, 0xFFC7, 0xFFC8, 0xFFC9, 0xFFCA, 0xFFCB, 0xFFCC, 0xFFCD, 0xFFCE, 0xFFCF, + /* D */ 0xFFD0, 0xFFD1, 0xFFD2, 0xFFD3, 0xFFD4, 0xFFD5, 0xFFD6, 0xFFD7, 0xFFD8, 0xFFD9, 0xFFDA, 0xFFDB, 0xFFDC, 0xFFDD, 0xFFDE, 0xFFDF, + /* E */ 0xFFE0, 0xFFE1, 0xFFE2, 0xFFE3, 0xFFE4, 0xFFE5, 0xFFE6, 0xFFE7, 0xFFE8, 0xFFE9, 0xFFEA, 0xFFEB, 0xFFEC, 0xFFED, 0xFFEE, 0xFFEF, + /* F */ 0xFFF0, 0xFFF1, 0xFFF2, 0xFFF3, 0xFFF4, 0xFFF5, 0xFFF6, 0xFFF7, 0xFFF8, 0xFFF9, 0xFFFA, 0xFFFB, 0xFFFC, 0xFFFD, 0xFFFE, 0xFFFF, +}; + + +/* RelString case folding table */ + +unsigned short gCompareTable[] = { + + /* 0 */ 0x0000, 0x0100, 0x0200, 0x0300, 0x0400, 0x0500, 0x0600, 0x0700, 0x0800, 0x0900, 0x0A00, 0x0B00, 0x0C00, 0x0D00, 0x0E00, 0x0F00, + /* 1 */ 0x1000, 0x1100, 0x1200, 0x1300, 0x1400, 0x1500, 0x1600, 0x1700, 0x1800, 0x1900, 0x1A00, 0x1B00, 0x1C00, 0x1D00, 0x1E00, 0x1F00, + /* 2 */ 0x2000, 0x2100, 0x2200, 0x2300, 0x2400, 0x2500, 0x2600, 0x2700, 0x2800, 0x2900, 0x2A00, 0x2B00, 0x2C00, 0x2D00, 0x2E00, 0x2F00, + /* 3 */ 0x3000, 0x3100, 0x3200, 0x3300, 0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3A00, 0x3B00, 0x3C00, 0x3D00, 0x3E00, 0x3F00, + /* 4 */ 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00, 0x4B00, 0x4C00, 0x4D00, 0x4E00, 0x4F00, + /* 5 */ 0x5000, 0x5100, 0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x5C00, 0x5D00, 0x5E00, 0x5F00, + + // 0x60 maps to 'a' + // range 0x61 to 0x7a ('a' to 'z') map to upper case + + /* 6 */ 0x4180, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00, 0x4B00, 0x4C00, 0x4D00, 0x4E00, 0x4F00, + /* 7 */ 0x5000, 0x5100, 0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x7B00, 0x7C00, 0x7D00, 0x7E00, 0x7F00, + + // range 0x80 to 0xd8 gets mapped... 
+ + /* 8 */ 0x4108, 0x410C, 0x4310, 0x4502, 0x4E0A, 0x4F08, 0x5508, 0x4182, 0x4104, 0x4186, 0x4108, 0x410A, 0x410C, 0x4310, 0x4502, 0x4584, + /* 9 */ 0x4586, 0x4588, 0x4982, 0x4984, 0x4986, 0x4988, 0x4E0A, 0x4F82, 0x4F84, 0x4F86, 0x4F08, 0x4F0A, 0x5582, 0x5584, 0x5586, 0x5508, + /* A */ 0xA000, 0xA100, 0xA200, 0xA300, 0xA400, 0xA500, 0xA600, 0x5382, 0xA800, 0xA900, 0xAA00, 0xAB00, 0xAC00, 0xAD00, 0x4114, 0x4F0E, + /* B */ 0xB000, 0xB100, 0xB200, 0xB300, 0xB400, 0xB500, 0xB600, 0xB700, 0xB800, 0xB900, 0xBA00, 0x4192, 0x4F92, 0xBD00, 0x4114, 0x4F0E, + /* C */ 0xC000, 0xC100, 0xC200, 0xC300, 0xC400, 0xC500, 0xC600, 0x2206, 0x2208, 0xC900, 0x2000, 0x4104, 0x410A, 0x4F0A, 0x4F14, 0x4F14, + /* D */ 0xD000, 0xD100, 0x2202, 0x2204, 0x2702, 0x2704, 0xD600, 0xD700, 0x5988, 0xD900, 0xDA00, 0xDB00, 0xDC00, 0xDD00, 0xDE00, 0xDF00, + + /* E */ 0xE000, 0xE100, 0xE200, 0xE300, 0xE400, 0xE500, 0xE600, 0xE700, 0xE800, 0xE900, 0xEA00, 0xEB00, 0xEC00, 0xED00, 0xEE00, 0xEF00, + /* F */ 0xF000, 0xF100, 0xF200, 0xF300, 0xF400, 0xF500, 0xF600, 0xF700, 0xF800, 0xF900, 0xFA00, 0xFB00, 0xFC00, 0xFD00, 0xFE00, 0xFF00, + +}; +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* _UCSTRINGCOMPAREDATA_ */ diff --git a/core/UnicodeWrappers.c b/core/UnicodeWrappers.c new file mode 100644 index 0000000..8e5b6e6 --- /dev/null +++ b/core/UnicodeWrappers.c @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: UnicodeWrappers.c + + Contains: Wrapper routines for Unicode conversion and comparison. 
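Two of the wrappers declared in HFSUnicodeWrappers.h and defined below, ConvertUnicodeToUTF8Mangled() and GetEmbeddedFileID(), cooperate on the "#<hex file ID>" name-mangling scheme. As a worked example (the name is hypothetical): a long Unicode name that cannot be represented verbatim might be emitted as "SomeVeryLongPrefix#2A53F.txt", where 0x2A53F is the file's catalog node ID formatted with "#%X" and ".txt" is the preserved extension. GetEmbeddedFileID() on that string skips the extension, scans backward over the hex digits to the '#', sets *prefixLength to 18 (the length of "SomeVeryLongPrefix"), and returns 0x2A53F. Names shorter than 28 characters are never treated as mangled; this example is exactly 28 characters long.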
+ +*/ + +#include +#include + +#include "hfs_macos_defs.h" +#include "UCStringCompareData.h" + +#include "FileMgrInternal.h" +#include "HFSUnicodeWrappers.h" + +enum { + kMinFileExtensionChars = 1, /* does not include dot */ + kMaxFileExtensionChars = 5 /* does not include dot */ +}; + + +#define EXTENSIONCHAR(c) (((c) >= 0x61 && (c) <= 0x7A) || \ + ((c) >= 0x41 && (c) <= 0x5A) || \ + ((c) >= 0x30 && (c) <= 0x39)) + + +#define IsHexDigit(c) (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \ + ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F')) + + +static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr ); + + +static u_int32_t HexStringToInteger( u_int32_t length, const u_int8_t *hexStr ); + + +/* + * Get filename extension (if any) as a C string + */ +static void +GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr) +{ + u_int32_t i; + UniChar c; + u_int16_t extChars; /* number of extension chars (excluding dot) */ + u_int16_t maxExtChars; + Boolean foundExtension; + + extStr[0] = '\0'; /* assume there's no extension */ + + if ( length < 3 ) + return; /* "x.y" is smallest possible extension */ + + if ( length < (kMaxFileExtensionChars + 2) ) + maxExtChars = length - 2; /* save room for prefix + dot */ + else + maxExtChars = kMaxFileExtensionChars; + + i = length; + extChars = 0; + foundExtension = false; + + while ( extChars <= maxExtChars ) { + c = unicodeStr[--i]; + + /* look for leading dot */ + if ( c == (UniChar) '.' ) { + if ( extChars > 0 ) /* cannot end with a dot */ + foundExtension = true; + break; + } + + if ( EXTENSIONCHAR(c) ) + ++extChars; + else + break; + } + + /* if we found one then copy it */ + if ( foundExtension ) { + u_int8_t *extStrPtr = (u_int8_t *)extStr; + const UniChar *unicodeStrPtr = &unicodeStr[i]; + + for ( i = 0; i <= extChars; ++i ) + *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++); + extStr[extChars + 1] = '\0'; /* terminate extension + dot */ + } +} + + + +/* + * Count filename extension characters (if any) + */ +u_int32_t +CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length ) +{ + u_int32_t i; + UniChar c; + u_int32_t extChars; /* number of extension chars (excluding dot) */ + u_int16_t maxExtChars; + Boolean foundExtension; + + if ( length < 3 ) + return 0; /* "x.y" is smallest possible extension */ + + if ( length < (kMaxFileExtensionChars + 2) ) + maxExtChars = length - 2; /* save room for prefix + dot */ + else + maxExtChars = kMaxFileExtensionChars; + + extChars = 0; /* assume there's no extension */ + i = length - 1; /* index to last ascii character */ + foundExtension = false; + + while ( extChars <= maxExtChars ) { + c = filename[i--]; + + /* look for leading dot */ + if ( c == (u_int8_t) '.' ) { + if ( extChars > 0 ) /* cannot end with a dot */ + return (extChars); + + break; + } + + if ( EXTENSIONCHAR(c) ) + ++extChars; + else + break; + } + + return 0; +} + + +/* + * extract the file id from a mangled name + */ +HFSCatalogNodeID +GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength) +{ + short extChars; + short i; + u_int8_t c; + + *prefixLength = 0; + + if ( filename == NULL ) + return 0; + + if ( length < 28 ) + return 0; /* too small to have been mangled */ + + /* big enough for a file ID (#10) and an extension (.x) ? 
*/ + if ( length > 5 ) + extChars = CountFilenameExtensionChars(filename, length); + else + extChars = 0; + + /* skip over dot plus extension characters */ + if ( extChars > 0 ) + length -= (extChars + 1); + + /* scan for file id digits */ + for ( i = length - 1; i >= 0; --i) { + c = filename[i]; + + /* look for file ID marker */ + if ( c == '#' ) { + if ( (length - i) < 3 ) + break; /* too small to be a file ID */ + + *prefixLength = i; + return HexStringToInteger(length - i - 1, &filename[i+1]); + } + + if ( !IsHexDigit(c) ) + break; /* file ID string must have hex digits */ + } + + return 0; +} + + + +static u_int32_t +HexStringToInteger(u_int32_t length, const u_int8_t *hexStr) +{ + u_int32_t value; + u_int32_t i; + u_int8_t c; + const u_int8_t *p; + + value = 0; + p = hexStr; + + for ( i = 0; i < length; ++i ) { + c = *p++; + + if (c >= '0' && c <= '9') { + value = value << 4; + value += (u_int32_t) c - (u_int32_t) '0'; + } else if (c >= 'A' && c <= 'F') { + value = value << 4; + value += 10 + ((unsigned int) c - (unsigned int) 'A'); + } else { + return 0; /* bad character */ + } + } + + return value; +} + + +/* + * Routine: FastRelString + * + * Output: returns -1 if str1 < str2 + * returns 1 if str1 > str2 + * return 0 if equal + * + */ +int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 ) +{ + u_int16_t* compareTable; + int32_t bestGuess; + u_int8_t length, length2; + u_int8_t delta; + + delta = 0; + length = *(str1++); + length2 = *(str2++); + + if (length == length2) + bestGuess = 0; + else if (length < length2) + { + bestGuess = -1; + delta = length2 - length; + } + else + { + bestGuess = 1; + length = length2; + } + + compareTable = (u_int16_t*) gCompareTable; + + while (length--) + { + u_int8_t aChar, bChar; + + aChar = *(str1++); + bChar = *(str2++); + + if (aChar != bChar) // If they don't match exacly, do case conversion + { + u_int16_t aSortWord, bSortWord; + + aSortWord = compareTable[aChar]; + bSortWord = compareTable[bChar]; + + if (aSortWord > bSortWord) + return 1; + + if (aSortWord < bSortWord) + return -1; + } + + // If characters match exactly, then go on to next character immediately without + // doing any extra work. + } + + // if you got to here, then return bestGuess + return bestGuess; +} + + + +// +// FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering +// +// IF RESULT +// -------------------------- +// str1 < str2 => -1 +// str1 = str2 => 0 +// str1 > str2 => +1 +// +// The lower case table starts with 256 entries (one for each of the upper bytes +// of the original Unicode char). If that entry is zero, then all characters with +// that upper byte are already case folded. If the entry is non-zero, then it is +// the _index_ (not byte offset) of the start of the sub-table for the characters +// with that upper byte. All ignorable characters are folded to the value zero. +// +// In pseudocode: +// +// Let c = source Unicode character +// Let table[] = lower case table +// +// lower = table[highbyte(c)] +// if (lower == 0) +// lower = c +// else +// lower = table[lower+lowbyte(c)] +// +// if (lower == 0) +// ignore this character +// +// To handle ignorable characters, we now need a loop to find the next valid character. +// Also, we can't pre-compute the number of characters to compare; the string length might +// be larger than the number of non-ignorable characters. Further, we must be able to handle +// ignorable characters at any point in the string, including as the first or last characters. 
+// We use a zero value as a sentinel to detect both end-of-string and ignorable characters. +// Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename, +// the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is +// an invalid Unicode character). +// +// Pseudocode: +// +// while (1) { +// c1 = GetNextValidChar(str1) // returns zero if at end of string +// c2 = GetNextValidChar(str2) +// +// if (c1 != c2) break // found a difference +// +// if (c1 == 0) // reached end of string on both strings at once? +// return 0; // yes, so strings are equal +// } +// +// // When we get here, c1 != c2. So, we just need to determine which one is less. +// if (c1 < c2) +// return -1; +// else +// return 1; +// + +int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1, + register ConstUniCharArrayPtr str2, register ItemCount length2) +{ + register u_int16_t c1,c2; + register u_int16_t temp; + register u_int16_t* lowerCaseTable; + + lowerCaseTable = (u_int16_t*) gLowerCaseTable; + + while (1) { + /* Set default values for c1, c2 in case there are no more valid chars */ + c1 = 0; + c2 = 0; + + /* Find next non-ignorable char from str1, or zero if no more */ + while (length1 && c1 == 0) { + c1 = *(str1++); + --length1; + /* check for basic latin first */ + if (c1 < 0x0100) { + c1 = gLatinCaseFold[c1]; + break; + } + /* case fold if neccessary */ + if ((temp = lowerCaseTable[c1>>8]) != 0) + c1 = lowerCaseTable[temp + (c1 & 0x00FF)]; + } + + + /* Find next non-ignorable char from str2, or zero if no more */ + while (length2 && c2 == 0) { + c2 = *(str2++); + --length2; + /* check for basic latin first */ + if (c2 < 0x0100) { + c2 = gLatinCaseFold[c2]; + break; + } + /* case fold if neccessary */ + if ((temp = lowerCaseTable[c2>>8]) != 0) + c2 = lowerCaseTable[temp + (c2 & 0x00FF)]; + } + + if (c1 != c2) // found a difference, so stop looping + break; + + if (c1 == 0) // did we reach the end of both strings at the same time? + return 0; // yes, so strings are equal + } + + if (c1 < c2) + return -1; + else + return 1; +} + +/* + * UnicodeBinaryCompare + * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them. + * + * Results are emitted like FastUnicodeCompare: + * + * + * IF RESULT + * -------------------------- + * str1 < str2 => -1 + * str1 = str2 => 0 + * str1 > str2 => +1 + * + * The case matching source code is greatly simplified due to the lack of case-folding + * in this comparison routine. We compare, in order: the lengths, then do character-by- + * character comparisons. 
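To make the two comparators' contracts concrete, a minimal sketch (assuming the declarations from HFSUnicodeWrappers.h are in scope; UniChar is a 16-bit UTF-16 code unit):

    #include <stdio.h>

    /* "README" and "readme" compare equal under the case-folding ordering,
       but differ under binary comparison ('R' 0x0052 < 'r' 0x0072). */
    static const UniChar kUpper[6] = { 'R', 'E', 'A', 'D', 'M', 'E' };
    static const UniChar kLower[6] = { 'r', 'e', 'a', 'd', 'm', 'e' };

    static void compare_demo(void)
    {
        printf("folded = %d\n", (int)FastUnicodeCompare(kUpper, 6, kLower, 6));   /* 0  */
        printf("binary = %d\n", (int)UnicodeBinaryCompare(kUpper, 6, kLower, 6)); /* -1 */
    }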
+ * + */ +int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1, + register ConstUniCharArrayPtr str2, register ItemCount len2) { + uint16_t c1; + uint16_t c2; + int string_length; + int32_t result = 0; + + /* Set default values for the two character pointers */ + c1 = 0; + c2 = 0; + + /* First generate the string length (for comparison purposes) */ + if (len1 < len2) { + string_length = len1; + --result; + } + else if (len1 > len2) { + string_length = len2; + ++result; + } + else { + string_length = len1; + } + + /* now compare the two string pointers */ + while (string_length--) { + c1 = *(str1++); + c2 = *(str2++); + + if (c1 > c2) { + result = 1; + break; + } + + if (c1 < c2) { + result = -1; + break; + } + /* If equal, iterate to the next two respective chars */ + } + + return result; +} + + +OSErr +ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen, + ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid) +{ + ByteCount subMaxLen; + size_t utf8len; + char fileIDStr[15]; + char extStr[15]; + + snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid); + GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr); + + /* remove extension chars from source */ + srcLen -= strlen(extStr) * sizeof(UniChar); + subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr)); + + (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0); + + strlcat((char *)dstStr, fileIDStr, maxDstLen); + strlcat((char *)dstStr, extStr, maxDstLen); + *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr)); + + return noErr; +} diff --git a/core/VolumeAllocation.c b/core/VolumeAllocation.c new file mode 100644 index 0000000..f26811c --- /dev/null +++ b/core/VolumeAllocation.c @@ -0,0 +1,6198 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: VolumeAllocation.c + + Contains: Routines for accessing and modifying the volume bitmap. + + Version: HFS Plus 1.0 + + Copyright: (c) 1996-2009 by Apple Inc., all rights reserved. + +*/ + +/* +Public routines: + BlockAllocate / hfs_block_alloc + Allocate space on a volume. Can allocate space contiguously. 
+ If not contiguous, then allocation may be less than what was + asked for. Returns the starting block number, and number of + blocks. It will only return a single extent. + + BlockDeallocate + Deallocate a contiguous run of allocation blocks. + + BlockMarkAllocated + Exported wrapper to mark blocks as in-use. This will correctly determine + whether or not the red-black tree is enabled and call the appropriate function + if applicable. + BlockMarkFree + Exported wrapper to mark blocks as freed. This will correctly determine whether or + not the red-black tree is enabled and call the appropriate function if applicable. + + + ResetVCBFreeExtCache + Since the red-black tree obviates the need to maintain the free extent cache, we do + not update it if the tree is also live. As a result, if we ever need to destroy the trees + we should reset the free extent cache so it doesn't confuse us when we need to fall back to the + bitmap scanning allocator. + We also reset and disable the free extent cache when volume resizing is + in flight. + + UpdateAllocLimit + Adjusts the AllocLimit field in the hfs mount point. This is used when we need to prevent + allocations from occupying space in the region we are modifying during a filesystem resize. + At other times, it should be consistent with the total number of allocation blocks in the + filesystem. It is also used to shrink or grow the number of blocks that the red-black tree should + know about. If growing, scan the new range of bitmap, and if shrinking, reduce the + number of items in the tree that we can allocate from. + + ScanUnmapBlocks + Traverse the entire allocation bitmap. Potentially issue DKIOCUNMAPs to the device as it + tracks unallocated ranges when iterating the volume bitmap. Additionally, build up the in-core + summary table of the allocation bitmap. + +Internal routines: + BlockMarkFreeInternal + Mark a contiguous range of blocks as free. The corresponding + bits in the volume bitmap will be cleared. This will actually do the work + of modifying the bitmap for us. + + BlockMarkAllocatedInternal + Mark a contiguous range of blocks as allocated. The cor- + responding bits in the volume bitmap are set. Also tests to see + if any of the blocks were previously unallocated. + BlockFindContiguous + Find a contiguous range of blocks of a given size. The caller + specifies where to begin the search (by block number). The + block number of the first block in the range is returned. This is only + called by the bitmap scanning logic as the red-black tree should be able + to do this internally by searching its tree. + BlockFindAny + Find and allocate a contiguous range of blocks up to a given size. The + first range of contiguous free blocks found are allocated, even if there + are fewer blocks than requested (and even if a contiguous range of blocks + of the given size exists elsewhere). + BlockFindAnyBitmap + Finds a range of blocks per the above requirements without using the + Allocation RB Tree. This relies on the bitmap-scanning logic in order to find + any valid range of free space needed. + BlockFindContig + Find a contiguous range of blocks of a given size. + If the minimum cannot be satisfied, nothing is + returned. + BlockFindKnown + Try to allocate space from known free space in the volume's + free extent cache. + ReadBitmapBlock + Given an allocation block number, read the bitmap block that + contains that allocation block into a caller-supplied buffer. + + ReleaseBitmapBlock + Release a bitmap block back into the buffer cache. 
+
+ ReadBitmapRange
+ Given an allocation block number, read a range of bitmap that
+ must begin at that allocation block into a caller-supplied buffer.
+
+ ReleaseBitmapRange
+ Release and invalidate a buf_t corresponding to the bitmap
+ back into the UBC in order to prevent coherency issues.
+
+ remove_free_extent_cache
+ Remove an extent from the free extent cache. Handles overlaps
+ with multiple extents in the cache, and handles splitting an
+ extent in the cache if the extent to be removed is in the middle
+ of a cached extent.
+
+ add_free_extent_cache
+ Add an extent to the free extent cache. It will merge the
+ input extent with extents already in the cache.
+ CheckUnmappedBytes
+ Check whether or not the current transaction
+ has allocated blocks that were recently freed. This may have data safety implications.
+
+
+
+Debug/Test Routines
+ hfs_isallocated
+ Test to see if any blocks in a range are allocated. Journal or
+ allocation file lock must be held.
+
+ hfs_isallocated_scan
+ Test to see if any blocks in a range are allocated. Releases and
+ invalidates the block used when finished.
+
+Optimization Routines
+ hfs_alloc_scan_block
+ Given a starting allocation block number, figures out which physical block contains that
+ allocation block's bit, and scans it from the starting bit until either the ending bit or
+ the end of the block. Free space extents are inserted into the appropriate red-black tree.
+
+*/
+
+
+#include <sys/types.h>
+#include <sys/buf.h>
+
+#if !HFS_ALLOC_TEST
+
+#include "hfs_macos_defs.h"
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+/* For VM Page size */
+#include <libkern/libkern.h>
+#include "hfs_journal.h"
+#include "hfs.h"
+#include "hfs_endian.h"
+#include "FileMgrInternal.h"
+
+#endif // !HFS_ALLOC_TEST
+
+#include <sys/sysctl.h>
+#include <sys/disk.h>
+#include <sys/uio.h>
+#include <uuid/uuid.h>
+
+#include "hfs_dbg.h"
+#include "hfs_format.h"
+#include "hfs_kdebug.h"
+#include "rangelist.h"
+#include "hfs_extents.h"
+
+/* Headers for unmap-on-mount support */
+#include <sys/disk.h>
+
+/*
+ * Use sysctl vfs.generic.hfs.kdebug.allocation to control which
+ * KERNEL_DEBUG_CONSTANT events are enabled at runtime. (They're
+ * disabled by default because there can be a lot of these events,
+ * and we don't want to overwhelm the kernel debug buffer. If you
+ * want to watch these events in particular, just set the sysctl.)
+ */
+static int hfs_kdebug_allocation = 0;
+SYSCTL_DECL(_vfs_generic);
+HFS_SYSCTL(NODE, _vfs_generic, OID_AUTO, hfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS file system")
+HFS_SYSCTL(NODE, _vfs_generic_hfs, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS kdebug")
+HFS_SYSCTL(INT, _vfs_generic_hfs_kdebug, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED, &hfs_kdebug_allocation, 0, "Enable kdebug logging for HFS allocations")
+enum {
+ /*
+ * HFSDBG_ALLOC_ENABLED: Log calls to BlockAllocate and
+ * BlockDeallocate, including the internal BlockAllocateXxx
+ * routines so we can see how an allocation was satisfied.
+ *
+ * HFSDBG_EXT_CACHE_ENABLED: Log routines that read or write the
+ * free extent cache.
+ *
+ * HFSDBG_UNMAP_ENABLED: Log events involving the trim list.
+ *
+ * HFSDBG_BITMAP_ENABLED: Log accesses to the volume bitmap (setting
+ * or clearing bits, scanning the bitmap).
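+ *
+ * (Editorial illustration, not part of the original source: the sysctl is a
+ * plain integer bitmask built from the HFSDBG_*_ENABLED values defined just
+ * below, so enabling allocation and trim-list logging at runtime would look
+ * roughly like
+ *
+ *     sysctl vfs.generic.hfs.kdebug.allocation=5
+ *
+ * run as root, where 5 == HFSDBG_ALLOC_ENABLED | HFSDBG_UNMAP_ENABLED;
+ * setting the value back to 0 turns the extra kdebug events off again.)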
+ */ + HFSDBG_ALLOC_ENABLED = 1, + HFSDBG_EXT_CACHE_ENABLED = 2, + HFSDBG_UNMAP_ENABLED = 4, + HFSDBG_BITMAP_ENABLED = 8 +}; + +enum { + kBytesPerWord = 4, + kBitsPerByte = 8, + kBitsPerWord = 32, + + kBitsWithinWordMask = kBitsPerWord-1 +}; + +#define kLowBitInWordMask 0x00000001ul +#define kHighBitInWordMask 0x80000000ul +#define kAllBitsSetInWord 0xFFFFFFFFul + +#define HFS_MIN_SUMMARY_BLOCKSIZE 4096 + +#define ALLOC_DEBUG 0 + +static OSErr ReadBitmapBlock( + ExtendedVCB *vcb, + u_int32_t bit, + u_int32_t **buffer, + uintptr_t *blockRef, + hfs_block_alloc_flags_t flags); + +static OSErr ReleaseBitmapBlock( + ExtendedVCB *vcb, + uintptr_t blockRef, + Boolean dirty); + +static OSErr hfs_block_alloc_int(hfsmount_t *hfsmp, + HFSPlusExtentDescriptor *extent, + hfs_block_alloc_flags_t flags, + hfs_alloc_extra_args_t *ap); + +static OSErr BlockFindAny( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t endingBlock, + u_int32_t maxBlocks, + hfs_block_alloc_flags_t flags, + Boolean trustSummary, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + +static OSErr BlockFindAnyBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t endingBlock, + u_int32_t maxBlocks, + hfs_block_alloc_flags_t flags, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + +static OSErr BlockFindContig( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + hfs_block_alloc_flags_t flags, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + +static OSErr BlockFindContiguous( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t endingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + Boolean trustSummary, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks, + hfs_block_alloc_flags_t flags); + +static OSErr BlockFindKnown( + ExtendedVCB *vcb, + u_int32_t maxBlocks, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + +static OSErr hfs_alloc_try_hard(hfsmount_t *hfsmp, + HFSPlusExtentDescriptor *extent, + uint32_t max_blocks, + hfs_block_alloc_flags_t flags); + +static OSErr BlockMarkAllocatedInternal ( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks, + hfs_block_alloc_flags_t flags); + +static OSErr BlockMarkFreeInternal( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks, + Boolean do_validate); + + +static OSErr ReadBitmapRange (struct hfsmount *hfsmp, uint32_t offset, uint32_t iosize, + uint32_t **buffer, struct buf **blockRef); + +static OSErr ReleaseScanBitmapRange( struct buf *bp ); + +static int hfs_track_unmap_blocks (struct hfsmount *hfsmp, u_int32_t offset, + u_int32_t numBlocks, struct jnl_trim_list *list); + +static int hfs_issue_unmap (struct hfsmount *hfsmp, struct jnl_trim_list *list); + +static int hfs_alloc_scan_range(struct hfsmount *hfsmp, + u_int32_t startbit, + u_int32_t *bitToScan, + struct jnl_trim_list *list); + +static int hfs_scan_range_size (struct hfsmount* hfsmp, uint32_t start, uint32_t *iosize); +static uint32_t CheckUnmappedBytes (struct hfsmount *hfsmp, uint64_t blockno, uint64_t numblocks, int *recent, uint32_t *next); + +/* Bitmap Re-use Detection */ +static inline int extents_overlap (uint32_t start1, uint32_t len1, + uint32_t start2, uint32_t len2) { + return !( ((start1 + len1) <= start2) || ((start2 + len2) <= start1) ); +} + + +int hfs_isallocated_scan (struct hfsmount *hfsmp, + u_int32_t startingBlock, + u_int32_t *bp_buf); + +/* Summary Table Functions */ +static int hfs_set_summary (struct hfsmount 
*hfsmp, uint32_t summarybit, uint32_t inuse); +static int hfs_get_summary_index (struct hfsmount *hfsmp, uint32_t block, uint32_t *index); +static int hfs_find_summary_free (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock); +static int hfs_get_summary_allocblock (struct hfsmount *hfsmp, uint32_t summarybit, uint32_t *alloc); +static int hfs_release_summary (struct hfsmount *hfsmp, uint32_t start, uint32_t length); +static int hfs_check_summary (struct hfsmount *hfsmp, uint32_t start, uint32_t *freeblocks); +static int hfs_rebuild_summary (struct hfsmount *hfsmp); + +#if 0 +static int hfs_get_next_summary (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock); +#endif + +/* Used in external mount code to initialize the summary table */ +int hfs_init_summary (struct hfsmount *hfsmp); + +#if ALLOC_DEBUG +void hfs_validate_summary (struct hfsmount *hfsmp); +#endif + + +/* Functions for manipulating free extent cache */ +static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); +static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); +static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated); + +static void hfs_release_reserved(hfsmount_t *hfsmp, struct rl_entry *range, int list); + +/* Functions for getting free exents */ + +typedef struct bitmap_context { + void *bitmap; // current bitmap chunk + uint32_t run_offset; // offset (in bits) from start of bitmap to start of current run + uint32_t chunk_current; // next bit to scan in the chunk + uint32_t chunk_end; // number of valid bits in this chunk + struct hfsmount *hfsmp; + struct buf *bp; + uint32_t last_free_summary_bit; // last marked free summary bit + int lockflags; + uint64_t lock_start; +} bitmap_context_t; + + +static errno_t get_more_bits(bitmap_context_t *bitmap_ctx); +static int bit_count_set(void *bitmap, int start, int end); +static int bit_count_clr(void *bitmap, int start, int end); +static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count); +static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count); +static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count); +static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set); +static int clzll(uint64_t x); + +#if ALLOC_DEBUG +/* + * Validation Routine to verify that the TRIM list maintained by the journal + * is in good shape relative to what we think the bitmap should have. We should + * never encounter allocated blocks in the TRIM list, so if we ever encounter them, + * we panic. 
+ */ +int trim_validate_bitmap (struct hfsmount *hfsmp); +int trim_validate_bitmap (struct hfsmount *hfsmp) { + u_int64_t blockno_offset; + u_int64_t numblocks; + int i; + int count; + u_int32_t startblk; + u_int32_t blks; + int err = 0; + uint32_t alloccount = 0; + + if (hfsmp->jnl) { + struct journal *jnl = (struct journal*)hfsmp->jnl; + if (jnl->active_tr) { + struct jnl_trim_list *trim = &(jnl->active_tr->trim); + count = trim->extent_count; + for (i = 0; i < count; i++) { + blockno_offset = trim->extents[i].offset; + blockno_offset = blockno_offset - (uint64_t)hfsmp->hfsPlusIOPosOffset; + blockno_offset = blockno_offset / hfsmp->blockSize; + numblocks = trim->extents[i].length / hfsmp->blockSize; + + startblk = (u_int32_t)blockno_offset; + blks = (u_int32_t) numblocks; + err = hfs_count_allocated (hfsmp, startblk, blks, &alloccount); + + if (err == 0 && alloccount != 0) { + panic ("trim_validate_bitmap: %d blocks @ ABN %d are allocated!", alloccount, startblk); + } + } + } + } + return 0; +} + +#endif + + +/* + ;________________________________________________________________________________ + ; + ; Routine: hfs_unmap_free_extent + ; + ; Function: Make note of a range of allocation blocks that should be + ; unmapped (trimmed). That is, the given range of blocks no + ; longer have useful content, and the device can unmap the + ; previous contents. For example, a solid state disk may reuse + ; the underlying storage for other blocks. + ; + ; This routine is only supported for journaled volumes. The extent + ; being freed is passed to the journal code, and the extent will + ; be unmapped after the current transaction is written to disk. + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; startingBlock - The first allocation block of the extent being freed. + ; numBlocks - The number of allocation blocks of the extent being freed. + ;________________________________________________________________________________ + */ +static void hfs_unmap_free_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) +{ + u_int64_t offset; + u_int64_t length; + u_int64_t device_sz; + int err = 0; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + if (ALLOC_DEBUG) { + if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic("hfs: %p: (%u,%u) unmapping allocated blocks", hfsmp, startingBlock, numBlocks); + } + } + + if (hfsmp->jnl != NULL) { + device_sz = hfsmp->hfs_logical_bytes; + offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; + + /* Validate that the trim is in a valid range of bytes */ + if ((offset >= device_sz) || ((offset + length) > device_sz)) { + printf("hfs_unmap_free_ext: ignoring trim vol=%s @ off %lld len %lld \n", hfsmp->vcbVN, offset, length); + err = EINVAL; + } + + if (err == 0) { + err = journal_trim_add_extent(hfsmp->jnl, offset, length); + if (err) { + printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent for vol=%s", err, hfsmp->vcbVN); + } + } + } + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_END, err, 0, 0, 0, 0); +} + +/* + ;________________________________________________________________________________ + ; + ; Routine: hfs_track_unmap_blocks + ; + ; Function: Make note of a range of allocation blocks that should be + ; unmapped (trimmed). 
That is, the given range of blocks no + ; longer have useful content, and the device can unmap the + ; previous contents. For example, a solid state disk may reuse + ; the underlying storage for other blocks. + ; + ; This routine is only supported for journaled volumes. + ; + ; *****NOTE*****: + ; This function should *NOT* be used when the volume is fully + ; mounted. This function is intended to support a bitmap iteration + ; at mount time to fully inform the SSD driver of the state of all blocks + ; at mount time, and assumes that there is no allocation/deallocation + ; interference during its iteration., + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; offset - The first allocation block of the extent being freed. + ; numBlocks - The number of allocation blocks of the extent being freed. + ; list - The list of currently tracked trim ranges. + ;________________________________________________________________________________ + */ +static int hfs_track_unmap_blocks (struct hfsmount *hfsmp, u_int32_t start, + u_int32_t numBlocks, struct jnl_trim_list *list) { + + u_int64_t offset; + u_int64_t length; + int error = 0; + + if ((hfsmp->hfs_flags & HFS_UNMAP) && (hfsmp->jnl != NULL) && list->allocated_count && list->extents != NULL) { + int extent_no = list->extent_count; + offset = (u_int64_t) start * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; + + + list->extents[extent_no].offset = offset; + list->extents[extent_no].length = length; + list->extent_count++; + if (list->extent_count == list->allocated_count) { + error = hfs_issue_unmap (hfsmp, list); + } + } + + return error; +} + +/* + ;________________________________________________________________________________ + ; + ; Routine: hfs_issue_unmap + ; + ; Function: Issue a DKIOCUNMAP for all blocks currently tracked by the jnl_trim_list + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; list - The list of currently tracked trim ranges. + ;________________________________________________________________________________ + */ + +static int hfs_issue_unmap (struct hfsmount *hfsmp, struct jnl_trim_list *list) +{ + dk_unmap_t unmap; + int error = 0; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN_TRIM | DBG_FUNC_START, hfsmp->hfs_raw_dev, 0, 0, 0, 0); + } + + if (list->extent_count > 0 && list->extents != NULL) { + bzero(&unmap, sizeof(unmap)); + unmap.extents = list->extents; + unmap.extentsCount = list->extent_count; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN_TRIM | DBG_FUNC_NONE, hfsmp->hfs_raw_dev, unmap.extentsCount, 0, 0, 0); + } + +#if CONFIG_PROTECT + /* + * If we have not yet completed the first scan through the bitmap, then + * optionally inform the block driver below us that this is an initialization + * TRIM scan, if it can deal with this information. 
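+ *
+ * (Editorial note, not part of the original source: each extent in the list
+ * handed to DKIOCUNMAP below is expressed in device-relative bytes, computed
+ * in hfs_track_unmap_blocks() above as
+ *
+ *     offset = (u_int64_t)start * blockSize + hfsPlusIOPosOffset;
+ *     length = (u_int64_t)numBlocks * blockSize;
+ *
+ * Purely as a worked example, with a 4096-byte allocation block and an
+ * hfsPlusIOPosOffset of 1024 (illustrative values), freeing blocks 100..103
+ * queues the extent { .offset = 410624, .length = 16384 }.)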
+ */ + if ((hfsmp->scan_var & HFS_ALLOCATOR_SCAN_COMPLETED) == 0) { + unmap.options |= _DK_UNMAP_INITIALIZE; + } +#endif + /* Issue a TRIM and flush them out */ + error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel()); + + bzero (list->extents, (list->allocated_count * sizeof(dk_extent_t))); + bzero (&unmap, sizeof(unmap)); + list->extent_count = 0; + } + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN_TRIM | DBG_FUNC_END, error, hfsmp->hfs_raw_dev, 0, 0, 0); + } + + return error; +} + +/* + ;________________________________________________________________________________ + ; + ; Routine: hfs_unmap_alloc_extent + ; + ; Function: Make note of a range of allocation blocks, some of + ; which may have previously been passed to hfs_unmap_free_extent, + ; is now in use on the volume. The given blocks will be removed + ; from any pending DKIOCUNMAP. + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ; startingBlock - The first allocation block of the extent being allocated. + ; numBlocks - The number of allocation blocks being allocated. + ;________________________________________________________________________________ + */ + +static void hfs_unmap_alloc_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) +{ + u_int64_t offset; + u_int64_t length; + int err = 0; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + if (hfsmp->jnl != NULL) { + offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; + + err = journal_trim_remove_extent(hfsmp->jnl, offset, length); + if (err) { + printf("hfs_unmap_alloc_extent: error %d from journal_trim_remove_extent for vol=%s", err, hfsmp->vcbVN); + } + } + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_END, err, 0, 0, 0, 0); +} + + +/* +;________________________________________________________________________________ +; +; Routine: hfs_trim_callback +; +; Function: This function is called when a transaction that freed extents +; (via hfs_unmap_free_extent/journal_trim_add_extent) has been +; written to the on-disk journal. This routine will add those +; extents to the free extent cache so that they can be reused. +; +; CAUTION: This routine is called while the journal's trim lock +; is held shared, so that no other thread can reuse any portion +; of those extents. We must be very careful about which locks +; we take from within this callback, to avoid deadlock. The +; call to add_free_extent_cache will end up taking the cache's +; lock (just long enough to add these extents to the cache). +; +; CAUTION: If the journal becomes invalid (eg., due to an I/O +; error when trying to write to the journal), this callback +; will stop getting called, even if extents got freed before +; the journal became invalid! +; +; Input Arguments: +; arg - The hfsmount of the volume containing the extents. +; extent_count - The number of extents freed in the transaction. +; extents - An array of extents (byte ranges) that were freed. 
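+;
+; (Editorial note, not part of the original source: this callback is handed
+; to the journal layer when the volume is mounted. Assuming the
+; journal_trim_set_callback() hook declared in hfs_journal.h, the
+; registration made elsewhere in this patch is shaped roughly like
+;
+;     journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp);
+;
+; The exact call site and signature are stated here as an assumption, for
+; orientation only.)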
+;________________________________________________________________________________
+*/
+
+void
+hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents)
+{
+ uint32_t i;
+ uint32_t startBlock, numBlocks;
+ struct hfsmount *hfsmp = arg;
+
+ if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED)
+ KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_START, 0, extent_count, 0, 0, 0);
+
+ for (i=0; i<extent_count; i++) {
+ startBlock = (extents[i].offset - hfsmp->hfsPlusIOPosOffset) / hfsmp->blockSize;
+ numBlocks = extents[i].length / hfsmp->blockSize;
+ (void) add_free_extent_cache(hfsmp, startBlock, numBlocks);
+ }
+
+ if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED)
+ KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_END, 0, 0, 0, 0, 0);
+}
+
+
+/*
+ ;________________________________________________________________________________
+ ;
+ ; Routine: CheckUnmappedBytes
+ ;
+ ; Function: From the specified inputs, determine if the extent in question overlaps
+ ; space that was recently freed, where the recently freed space may still be
+ ; lingering in an uncommitted journal transaction. This may have data safety
+ ; implications. The intended use is to decide whether or not to force a journal flush
+ ; before allowing file data I/O to be issued. If we did not do this
+ ; then it would be possible to issue the file I/O ahead of the
+ ; journal, resulting in data being overwritten if the transaction either
+ ; is not committed or cannot be replayed.
+ ;
+ ; NOTE: This function assumes that the journal and catalog/extent locks are held.
+ ;
+ ; Input Arguments:
+ ; hfsmp - The volume containing the allocation blocks.
+ ; blockno - start of the extent in question (in allocation blocks)
+ ; numblocks - number of blocks in the extent
+ ; recently_freed - output pointer containing whether or not the blocks were freed recently
+ ; overlap_end - end of the overlap between the argument extent and the trim list (in allocation blocks)
+ ;
+ ; Output:
+ ;
+ ; Returns 0 if we could determine extent validity for this (or a previous transaction)
+ ; Returns errno if there was an error
+ ;
+ ; If 0 is returned, then *recently_freed contains a boolean indicating whether
+ ; the blocks were recently freed.
+ ;________________________________________________________________________________
+ */
+
+u_int32_t
+CheckUnmappedBytes (struct hfsmount *hfsmp, uint64_t blockno, uint64_t numblocks, int *recently_freed, uint32_t *overlap_end) {
+ uint64_t device_offset;
+ uint64_t numbytes;
+ uint32_t err = 0;
+ uint64_t lba_overlap_end;
+
+ if (hfsmp->jnl != NULL) {
+ /*
+ * Convert the allocation block # and the number of blocks into device-relative
+ * offsets so that they can be compared using the TRIM list.
+ */
+ uint64_t device_sz = hfsmp->hfs_logical_bytes;
+ device_offset = blockno * ((uint64_t)hfsmp->blockSize);
+ device_offset += hfsmp->hfsPlusIOPosOffset;
+ numbytes = (((uint64_t)hfsmp->blockSize) * numblocks);
+
+ /*
+ * Since we check that the device_offset isn't too large, it's safe to subtract it
+ * from the size in the second check.
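+ *
+ * (Editorial sketch, not part of the original source: per the header comment
+ * above, a typical caller is file-I/O setup code deciding whether to force a
+ * journal flush before issuing the I/O. Assuming hfs_flush() with
+ * HFS_FLUSH_JOURNAL is the flush entry point, that check would look roughly
+ * like
+ *
+ *     int freed = 0;
+ *     uint32_t overlap_end;
+ *     if (CheckUnmappedBytes(hfsmp, blkno, nblks, &freed, &overlap_end) == 0 && freed)
+ *         hfs_flush(hfsmp, HFS_FLUSH_JOURNAL);
+ *
+ * i.e. the flush is only paid for when the extent overlaps a pending trim.)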
+ */ + if ((device_offset >= device_sz) || (numbytes > (device_sz - device_offset))) { + return EINVAL; + } + + /* Ask the journal if this extent overlaps with any pending TRIMs */ + if (journal_trim_extent_overlap (hfsmp->jnl, device_offset, numbytes, &lba_overlap_end)) { + *recently_freed = 1; + + /* Convert lba_overlap_end back into allocation blocks */ + uint64_t end_offset = lba_overlap_end - hfsmp->hfsPlusIOPosOffset; + end_offset = end_offset / ((uint64_t) hfsmp->blockSize); + *overlap_end = (uint32_t) end_offset; + } + else { + *recently_freed = 0; + } + err = 0; + } + else { + /* There may not be a journal. In that case, always return success. */ + *recently_freed = 0; + } + return err; + +} + + +/* + ;________________________________________________________________________________ + ; + ; Routine: ScanUnmapBlocks + ; + ; Function: Traverse the bitmap, and potentially issue DKIOCUNMAPs to the underlying + ; device as needed so that the underlying disk device is as + ; up-to-date as possible with which blocks are unmapped. + ; Additionally build up the summary table as needed. + ; + ; This function reads the bitmap in large block size + ; (up to 1MB) unlike the runtime which reads the bitmap + ; in 4K block size. So if this function is being called + ; after the volume is mounted and actively modified, the + ; caller needs to invalidate all of the existing buffers + ; associated with the bitmap vnode before calling this + ; function. If the buffers are not invalidated, it can + ; cause buf_t collision and potential data corruption. + ; + ; Input Arguments: + ; hfsmp - The volume containing the allocation blocks. + ;________________________________________________________________________________ + */ + +u_int32_t ScanUnmapBlocks (struct hfsmount *hfsmp) +{ + u_int32_t blocks_scanned = 0; + int error = 0; + struct jnl_trim_list trimlist; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN | DBG_FUNC_START, hfsmp->hfs_raw_dev, 0, 0, 0, 0); + } + + /* + *struct jnl_trim_list { + uint32_t allocated_count; + uint32_t extent_count; + dk_extent_t *extents; + }; + */ + bzero (&trimlist, sizeof(trimlist)); + + /* + * Any trim related work should be tied to whether the underlying + * storage media supports UNMAP, as any solid state device would + * on desktop or embedded. + * + * We do this because we may want to scan the full bitmap on + * desktop for spinning media for the purposes of building up the + * summary table. + * + * We also avoid sending TRIMs down to the underlying media if the + * mount is read-only. 
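+ *
+ * (Editorial note, not part of the original source: the trim list allocated
+ * just below is sized to one page of dk_extent_t records. dk_extent_t is a
+ * pair of 64-bit fields (offset, length), so with a 4 KiB page that is
+ * 4096 / 16 = 256 extents batched per DKIOCUNMAP before hfs_issue_unmap()
+ * flushes and resets the list.)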
+ */ + + if ((hfsmp->hfs_flags & HFS_UNMAP) && + ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) { + /* If the underlying device supports unmap and the mount is read-write, initialize */ + int alloc_count = PAGE_SIZE / sizeof(dk_extent_t); + void *extents = hfs_malloc(alloc_count * sizeof(dk_extent_t)); + trimlist.extents = (dk_extent_t*)extents; + trimlist.allocated_count = alloc_count; + trimlist.extent_count = 0; + } + + while ((blocks_scanned < hfsmp->totalBlocks) && (error == 0)){ + + error = hfs_alloc_scan_range (hfsmp, blocks_scanned, &blocks_scanned, &trimlist); + + if (error) { + printf("HFS: bitmap scan range error: %d on vol=%s\n", error, hfsmp->vcbVN); + break; + } + } + + if ((hfsmp->hfs_flags & HFS_UNMAP) && + ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0)) { + if (error == 0) { + hfs_issue_unmap(hfsmp, &trimlist); + } + if (trimlist.extents) { + hfs_free(trimlist.extents, trimlist.allocated_count * sizeof(dk_extent_t)); + } + } + + /* + * This is in an #if block because hfs_validate_summary prototype and function body + * will only show up if ALLOC_DEBUG is on, to save wired memory ever so slightly. + */ +#if ALLOC_DEBUG + sanity_check_free_ext(hfsmp, 1); + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + /* Validate the summary table too! */ + hfs_validate_summary(hfsmp); + printf("HFS: Summary validation complete on %s\n", hfsmp->vcbVN); + } +#endif + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_SCAN | DBG_FUNC_END, error, hfsmp->hfs_raw_dev, 0, 0, 0); + } + + return error; +} + +static void add_to_reserved_list(hfsmount_t *hfsmp, uint32_t start, + uint32_t count, int list, + struct rl_entry **reservation) +{ + struct rl_entry *range, *next_range; + + if (list == HFS_TENTATIVE_BLOCKS) { + int nranges = 0; + // Don't allow more than 4 tentative reservations + TAILQ_FOREACH_SAFE(range, &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS], + rl_link, next_range) { + if (++nranges > 3) + hfs_release_reserved(hfsmp, range, HFS_TENTATIVE_BLOCKS); + } + } + + range = hfs_malloc(sizeof(*range)); + range->rl_start = start; + range->rl_end = start + count - 1; + TAILQ_INSERT_HEAD(&hfsmp->hfs_reserved_ranges[list], range, rl_link); + *reservation = range; +} + +static void hfs_release_reserved(hfsmount_t *hfsmp, + struct rl_entry *range, + int list) +{ + if (range->rl_start == -1) + return; + + TAILQ_REMOVE(&hfsmp->hfs_reserved_ranges[list], range, rl_link); + + if (rl_len(range) > 0) { + if (list == HFS_TENTATIVE_BLOCKS) + hfsmp->tentativeBlocks -= rl_len(range); + else { + /* + * We don't need to unmap tentative blocks because we won't have + * written to them, but we might have written to reserved blocks. + * Nothing can refer to those blocks so this doesn't have to be + * via the journal. If this proves to be too expensive, we could + * consider not sending down the unmap or we could require this + * to always be called within a transaction and then we can use + * the journal. 
+ */ + dk_extent_t extent = { + .offset = (hfs_blk_to_bytes(range->rl_start, hfsmp->blockSize) + + hfsmp->hfsPlusIOPosOffset), + .length = hfs_blk_to_bytes(rl_len(range), hfsmp->blockSize) + }; + dk_unmap_t unmap = { + .extents = &extent, + .extentsCount = 1, + }; + VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCUNMAP, (caddr_t)&unmap, + 0, vfs_context_kernel()); + hfs_assert(hfsmp->lockedBlocks >= rl_len(range)); + hfsmp->lockedBlocks -= rl_len(range); + } + hfs_release_summary(hfsmp, range->rl_start, rl_len(range)); + add_free_extent_cache(hfsmp, range->rl_start, rl_len(range)); + } + + range->rl_start = -1; + range->rl_end = -2; +} + +static void hfs_free_locked_internal(hfsmount_t *hfsmp, + struct rl_entry **reservation, + int list) +{ + if (*reservation) { + hfs_release_reserved(hfsmp, *reservation, list); + hfs_free(*reservation, sizeof(**reservation)); + *reservation = NULL; + } +} + +void hfs_free_tentative(hfsmount_t *hfsmp, struct rl_entry **reservation) +{ + hfs_free_locked_internal(hfsmp, reservation, HFS_TENTATIVE_BLOCKS); +} + +void hfs_free_locked(hfsmount_t *hfsmp, struct rl_entry **reservation) +{ + hfs_free_locked_internal(hfsmp, reservation, HFS_LOCKED_BLOCKS); +} + +OSErr BlockAllocate ( + hfsmount_t *hfsmp, /* which volume to allocate space on */ + u_int32_t startingBlock, /* preferred starting block, or 0 for no preference */ + u_int32_t minBlocks, /* desired number of blocks to allocate */ + u_int32_t maxBlocks, /* maximum number of blocks to allocate */ + hfs_block_alloc_flags_t flags, /* option flags */ + u_int32_t *actualStartBlock, /* actual first block of allocation */ + u_int32_t *actualNumBlocks) +{ + hfs_alloc_extra_args_t extra_args = { + .max_blocks = maxBlocks + }; + + HFSPlusExtentDescriptor extent = { startingBlock, minBlocks }; + + OSErr err = hfs_block_alloc_int(hfsmp, &extent, flags, &extra_args); + + *actualStartBlock = extent.startBlock; + *actualNumBlocks = extent.blockCount; + + return err; +} + +errno_t hfs_block_alloc(hfsmount_t *hfsmp, + HFSPlusExtentDescriptor *extent, + hfs_block_alloc_flags_t flags, + hfs_alloc_extra_args_t *ap) +{ + return MacToVFSError(hfs_block_alloc_int(hfsmp, extent, flags, ap)); +} + +/* + ;________________________________________________________________________________ + ; + ; Routine: hfs_block_alloc_int + ; + ; Function: Allocate space on a volume. If contiguous allocation is requested, + ; at least the requested number of bytes will be allocated or an + ; error will be returned. If contiguous allocation is not forced, + ; the space will be allocated with the first largest extent available + ; at the requested starting allocation block. If there is not enough + ; room there, a block allocation of less than the requested size will be + ; allocated. + ; + ; If the requested starting block is 0 (for new file allocations), + ; the volume's allocation block pointer will be used as a starting + ; point. + ; + ; Input Arguments: + ; hfsmp - Pointer to the HFS mount structure. + ; extent - startBlock indicates the block to start + ; searching from and blockCount is the number of + ; blocks required. Depending on the flags used, + ; more or less blocks may be returned. The + ; allocated extent is returned via this + ; parameter. + ; flags - Flags to specify options like contiguous, use + ; metadata zone, skip free block check, etc. + ; ap - Additional arguments used depending on flags. + ; See hfs_alloc_extra_args_t and below. 
+ ; + ; Output: + ; (result) - Error code, zero for successful allocation + ; extent - If successful, the allocated extent. + ; + ; Side effects: + ; The volume bitmap is read and updated; the volume bitmap cache may be changed. + ; + ; HFS_ALLOC_TENTATIVE + ; Blocks will be reserved but not marked allocated. They can be + ; stolen if free space is limited. Tentative blocks can be used by + ; passing HFS_ALLOC_USE_TENTATIVE and passing in the resevation. + ; @ap->reservation_out is used to store the reservation. + ; + ; HFS_ALLOC_USE_TENTATIVE + ; Use blocks previously returned with HFS_ALLOC_TENTATIVE. + ; @ap->reservation_in should be set to whatever @ap->reservation_out + ; was set to when HFS_ALLOC_TENTATIVE was used. If the tentative + ; reservation was stolen, a normal allocation will take place. + ; + ; HFS_ALLOC_LOCKED + ; Blocks will be reserved but not marked allocated. Unlike tentative + ; reservations they cannot be stolen. It is safe to write to these + ; blocks. @ap->reservation_out is used to store the reservation. + ; + ; HFS_ALLOC_COMMIT + ; This will take blocks previously returned with HFS_ALLOC_LOCKED and + ; mark them allocated on disk. @ap->reservation_in is used. + ; + ; HFS_ALLOC_ROLL_BACK + ; Take blocks that were just recently deallocated and mark them + ; allocated. This is for roll back situations. Blocks got + ; deallocated and then something went wrong and we need to roll back + ; by marking the blocks allocated. + ; + ; HFS_ALLOC_FORCECONTIG + ; It will not return fewer than @min_blocks. + ; + ; HFS_ALLOC_TRY_HARD + ; We will perform an exhaustive search to try and find @max_blocks. + ; It will not return fewer than @min_blocks. + ; + ;________________________________________________________________________________ + */ +OSErr hfs_block_alloc_int(hfsmount_t *hfsmp, + HFSPlusExtentDescriptor *extent, + hfs_block_alloc_flags_t flags, + hfs_alloc_extra_args_t *ap) +{ + u_int32_t freeBlocks; + OSErr err = 0; + Boolean updateAllocPtr = false; // true if nextAllocation needs to be updated + Boolean forceContiguous = false; + Boolean forceFlush; + + uint32_t startingBlock = extent->startBlock; + uint32_t minBlocks = extent->blockCount; + uint32_t maxBlocks = (ap && ap->max_blocks) ? ap->max_blocks : minBlocks; + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, flags, 0); + + if (ISSET(flags, HFS_ALLOC_COMMIT)) { + extent->startBlock = (*ap->reservation_in)->rl_start; + extent->blockCount = rl_len(*ap->reservation_in); + goto mark_allocated; + } + + if (ISSET(flags, HFS_ALLOC_ROLL_BACK)) + goto mark_allocated; + + freeBlocks = hfs_freeblks(hfsmp, 0); + + if (ISSET(flags, HFS_ALLOC_USE_TENTATIVE)) { + struct rl_entry *range = *ap->reservation_in; + + if (range && range->rl_start != -1) { + /* + * It's possible that we have a tentative reservation + * but there aren't enough free blocks due to loaned blocks + * or insufficient space in the backing store. + */ + uint32_t count = min(min(maxBlocks, rl_len(range)), freeBlocks); + + if (count >= minBlocks) { + extent->startBlock = range->rl_start; + extent->blockCount = count; + + // Should we go straight to commit? + if (!ISSET(flags, HFS_ALLOC_LOCKED)) + SET(flags, HFS_ALLOC_COMMIT); + + goto mark_allocated; + } + } + + /* + * We can't use the tentative reservation so free it and allocate + * normally. 
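+ *
+ * (Editorial sketch, not part of the original source: the intended life
+ * cycle of a tentative reservation, per the flag descriptions in the header
+ * comment above, is roughly
+ *
+ *     struct rl_entry *res = NULL;
+ *     hfs_alloc_extra_args_t args = { .max_blocks = want,
+ *                                     .reservation_out = &res };
+ *     HFSPlusExtentDescriptor ext = { 0, min_want };
+ *     hfs_block_alloc(hfsmp, &ext, HFS_ALLOC_TENTATIVE, &args);     // reserve only
+ *     ...
+ *     args.reservation_in = &res;
+ *     hfs_block_alloc(hfsmp, &ext, HFS_ALLOC_USE_TENTATIVE, &args); // consume it
+ *
+ * where `want' and `min_want' are illustrative placeholders; only the flag,
+ * type and function names appear in this file.)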
+ */ + hfs_free_tentative(hfsmp, ap->reservation_in); + CLR(flags, HFS_ALLOC_USE_TENTATIVE); + } + + if (ISSET(flags, HFS_ALLOC_FORCECONTIG | HFS_ALLOC_TRY_HARD)) + forceContiguous = true; + + if (flags & HFS_ALLOC_FLUSHTXN) { + forceFlush = true; + } + else { + forceFlush = false; + } + + hfs_assert(hfsmp->freeBlocks >= hfsmp->tentativeBlocks); + + // See if we have to steal tentative blocks + if (freeBlocks < hfsmp->tentativeBlocks + minBlocks) + SET(flags, HFS_ALLOC_IGNORE_TENTATIVE); + + /* Skip free block check if blocks are being allocated for relocating + * data during truncating a volume. + * + * During hfs_truncatefs(), the volume free block count is updated + * before relocating data to reflect the total number of free blocks + * that will exist on the volume after resize is successful. This + * means that we have reserved allocation blocks required for relocating + * the data and hence there is no need to check the free blocks. + * It will also prevent resize failure when the number of blocks in + * an extent being relocated is more than the free blocks that will + * exist after the volume is resized. + */ + if ((flags & HFS_ALLOC_SKIPFREEBLKS) == 0) { + // If the disk is already full, don't bother. + if (freeBlocks == 0) { + err = dskFulErr; + goto exit; + } + if (forceContiguous && freeBlocks < minBlocks) { + err = dskFulErr; + goto exit; + } + + /* + * Clip if necessary so we don't over-subscribe the free blocks. + */ + if (minBlocks > freeBlocks) { + minBlocks = freeBlocks; + } + if (maxBlocks > freeBlocks) { + maxBlocks = freeBlocks; + } + } + + if (ISSET(flags, HFS_ALLOC_TRY_HARD)) { + err = hfs_alloc_try_hard(hfsmp, extent, maxBlocks, flags); + if (err) + goto exit; + + goto mark_allocated; + } + + // + // If caller didn't specify a starting block number, then use the volume's + // next block to allocate from. + // + if (startingBlock == 0) { + hfs_lock_mount (hfsmp); + + /* Sparse Allocation and nextAllocation are both used even if the R/B Tree is on */ + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + startingBlock = hfsmp->sparseAllocation; + } + else { + startingBlock = hfsmp->nextAllocation; + } + hfs_unlock_mount(hfsmp); + updateAllocPtr = true; + } + + + if (startingBlock >= hfsmp->allocLimit) { + startingBlock = 0; /* overflow so start at beginning */ + } + + // + // If the request must be contiguous, then find a sequence of free blocks + // that is long enough. Otherwise, find the first free block. + // + if (forceContiguous) { + err = BlockFindContig(hfsmp, startingBlock, minBlocks, maxBlocks, + flags, &extent->startBlock, &extent->blockCount); + /* + * If we allocated from a new position then also update the roving allocator. + * This will keep the roving allocation pointer up-to-date even + * if we are using the new R/B tree allocator, since + * it doesn't matter to us here, how the underlying allocator found + * the block to vend out. + */ + if ((err == noErr) && + (extent->startBlock > startingBlock) && + ((extent->startBlock < hfsmp->hfs_metazone_start) || + (extent->startBlock > hfsmp->hfs_metazone_end))) { + updateAllocPtr = true; + } + } else { + /* + * Scan the bitmap once, gather the N largest free extents, then + * allocate from these largest extents. Repeat as needed until + * we get all the space we needed. We could probably build up + * that list when the higher level caller tried (and failed) a + * contiguous allocation first. 
+ * + * Note that the free-extent cache will be cease to be updated if + * we are using the red-black tree for allocations. If we jettison + * the tree, then we will reset the free-extent cache and start over. + */ + + /* Disable HFS_ALLOC_FLUSHTXN if needed */ + if (forceFlush) { + flags &= ~HFS_ALLOC_FLUSHTXN; + } + + /* + * BlockFindKnown only examines the free extent cache; anything in there will + * have been committed to stable storage already. + */ + err = BlockFindKnown(hfsmp, maxBlocks, &extent->startBlock, + &extent->blockCount); + + /* dskFulErr out of BlockFindKnown indicates an empty Free Extent Cache */ + + if (err == dskFulErr) { + /* + * Now we have to do a bigger scan. Start at startingBlock and go up until the + * allocation limit. We 'trust' the summary bitmap in this call, if it tells us + * that it could not find any free space. + */ + err = BlockFindAny(hfsmp, startingBlock, hfsmp->allocLimit, + maxBlocks, flags, true, + &extent->startBlock, &extent->blockCount); + } + if (err == dskFulErr) { + /* + * Vary the behavior here if the summary table is on or off. + * If it is on, then we don't trust it it if we get into this case and + * basically do a full scan for maximum coverage. + * If it is off, then we trust the above and go up until the startingBlock. + */ + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + err = BlockFindAny(hfsmp, 1, hfsmp->allocLimit, maxBlocks, + flags, false, + &extent->startBlock, &extent->blockCount); + } + else { + err = BlockFindAny(hfsmp, 1, startingBlock, maxBlocks, + flags, false, + &extent->startBlock, &extent->blockCount); + } + + /* + * Last Resort: Find/use blocks that may require a journal flush. + */ + if (err == dskFulErr && forceFlush) { + flags |= HFS_ALLOC_FLUSHTXN; + err = BlockFindAny(hfsmp, 1, hfsmp->allocLimit, maxBlocks, + flags, false, + &extent->startBlock, &extent->blockCount); + } + } + } + + if (err) + goto exit; + +mark_allocated: + + // Handle alignment + if (ap && ap->alignment && extent->blockCount < ap->max_blocks) { + /* + * See the comment in FileMgrInternal.h for alignment + * semantics. + */ + uint32_t rounding = ((extent->blockCount + ap->alignment_offset) + % ap->alignment); + + // @minBlocks is still the minimum + if (extent->blockCount >= minBlocks + rounding) + extent->blockCount -= rounding; + } + + err = BlockMarkAllocatedInternal(hfsmp, extent->startBlock, + extent->blockCount, flags); + + if (err) + goto exit; + + if (ISSET(hfsmp->hfs_flags, HFS_CS) && extent->blockCount != 0 + && !ISSET(flags, HFS_ALLOC_TENTATIVE)) { + if (ISSET(flags, HFS_ALLOC_FAST_DEV)) { +#if !HFS_ALLOC_TEST /* need this guard because this file is compiled outside of the kernel */ + hfs_pin_block_range(hfsmp, HFS_PIN_IT, + extent->startBlock, extent->blockCount); +#endif + } else { + _dk_cs_map_t cm = { + .cm_extent = { + (hfs_blk_to_bytes(extent->startBlock, hfsmp->blockSize) + + hfsmp->hfsPlusIOPosOffset), + hfs_blk_to_bytes(extent->blockCount, hfsmp->blockSize) + } + }; + + errno_t err2 = VNOP_IOCTL(hfsmp->hfs_devvp, _DKIOCCSMAP, + (caddr_t)&cm, 0, vfs_context_current()); + + /* + * Ignore errors for now; we are fully provisioned so in + * theory CoreStorage should be able to handle this + * allocation. Should we want to change this in future, then + * we should think carefully how we handle errors. Allowing + * CoreStorage to truncate our allocation is problematic + * because we might have minimum and alignment requirements + * and backing out changes we have already made is + * non-trivial. 
+ */ + + if (err2 || cm.cm_bytes_mapped < cm.cm_extent.length) { + printf("hfs: _DKIOCCSMAP error: %d, bytes_mapped: %llu\n", + err2, cm.cm_bytes_mapped); + } + } + } + + // if we actually allocated something then go update the + // various bits of state that we maintain regardless of + // whether there was an error (i.e. partial allocations + // still need to update things like the free block count). + // + if (extent->blockCount != 0) { + // + // If we used the volume's roving allocation pointer, then we need to update it. + // Adding in the length of the current allocation might reduce the next allocate + // call by avoiding a re-scan of the already allocated space. However, the clump + // just allocated can quite conceivably end up being truncated or released when + // the file is closed or its EOF changed. Leaving the allocation pointer at the + // start of the last allocation will avoid unnecessary fragmentation in this case. + // + hfs_lock_mount (hfsmp); + + if (!ISSET(flags, HFS_ALLOC_USE_TENTATIVE | HFS_ALLOC_COMMIT)) { + lck_spin_lock(&hfsmp->vcbFreeExtLock); + if (hfsmp->vcbFreeExtCnt == 0 && hfsmp->hfs_freed_block_count == 0) { + hfsmp->sparseAllocation = extent->startBlock; + } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + if (extent->blockCount < hfsmp->hfs_freed_block_count) { + hfsmp->hfs_freed_block_count -= extent->blockCount; + } else { + hfsmp->hfs_freed_block_count = 0; + } + + if (updateAllocPtr && + ((extent->startBlock < hfsmp->hfs_metazone_start) || + (extent->startBlock > hfsmp->hfs_metazone_end))) { + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, extent->startBlock); + } + + (void) remove_free_extent_cache(hfsmp, extent->startBlock, extent->blockCount); + } + + if (ISSET(flags, HFS_ALLOC_USE_TENTATIVE)) { + (*ap->reservation_in)->rl_start += extent->blockCount; + hfsmp->tentativeBlocks -= extent->blockCount; + if (rl_len(*ap->reservation_in) <= 0) + hfs_free_tentative(hfsmp, ap->reservation_in); + } else if (ISSET(flags, HFS_ALLOC_COMMIT)) { + // Handle committing locked extents + hfs_assert(hfsmp->lockedBlocks >= extent->blockCount); + (*ap->reservation_in)->rl_start += extent->blockCount; + hfsmp->lockedBlocks -= extent->blockCount; + hfs_free_locked(hfsmp, ap->reservation_in); + } + + /* + * Update the number of free blocks on the volume + * + * Skip updating the free blocks count if the block are + * being allocated to relocate data as part of hfs_truncatefs() + */ + + if (ISSET(flags, HFS_ALLOC_TENTATIVE)) { + hfsmp->tentativeBlocks += extent->blockCount; + } else if (ISSET(flags, HFS_ALLOC_LOCKED)) { + hfsmp->lockedBlocks += extent->blockCount; + } else if ((flags & HFS_ALLOC_SKIPFREEBLKS) == 0) { + hfsmp->freeBlocks -= extent->blockCount; + } + MarkVCBDirty(hfsmp); + hfs_unlock_mount(hfsmp); + + hfs_generate_volume_notifications(hfsmp); + + if (ISSET(flags, HFS_ALLOC_TENTATIVE)) { + hfs_assert(ap); + add_to_reserved_list(hfsmp, extent->startBlock, extent->blockCount, + 0, ap->reservation_out); + } else if (ISSET(flags, HFS_ALLOC_LOCKED)) { + hfs_assert(ap); + add_to_reserved_list(hfsmp, extent->startBlock, extent->blockCount, + 1, ap->reservation_out); + } + + if (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE)) { + /* + * See if we used tentative blocks. Note that we cannot + * free the reservations here because we don't have access + * to the external pointers. All we can do is update the + * reservations and they'll be cleaned up when whatever is + * holding the pointers calls us back. 
+ * + * We use the rangelist code to detect overlaps and + * constrain the tentative block allocation. Note that + * @end is inclusive so that our rangelist code will + * resolve the various cases for us. As a result, we need + * to ensure that we account for it properly when removing + * the blocks from the tentative count in the mount point + * and re-inserting the remainder (either head or tail) + */ + struct rl_entry *range, *next_range; + struct rl_head *ranges = &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS]; + const uint32_t start = extent->startBlock; + const uint32_t end = start + extent->blockCount - 1; + TAILQ_FOREACH_SAFE(range, ranges, rl_link, next_range) { + switch (rl_overlap(range, start, end)) { + case RL_OVERLAPCONTAINSRANGE: + // Keep the bigger part + if (start - range->rl_start > range->rl_end - end) { + // Discard the tail + hfsmp->tentativeBlocks -= range->rl_end + 1 - start; + hfs_release_summary(hfsmp, end + 1, range->rl_end - end); + const uint32_t old_end = range->rl_end; + range->rl_end = start - 1; + add_free_extent_cache(hfsmp, end + 1, old_end - end); + } else { + // Discard the head + hfsmp->tentativeBlocks -= end + 1 - range->rl_start; + hfs_release_summary(hfsmp, range->rl_start, + start - range->rl_start); + const uint32_t old_start = range->rl_start; + range->rl_start = end + 1; + add_free_extent_cache(hfsmp, old_start, + start - old_start); + } + hfs_assert(range->rl_end >= range->rl_start); + break; + case RL_MATCHINGOVERLAP: + case RL_OVERLAPISCONTAINED: + hfsmp->tentativeBlocks -= rl_len(range); + range->rl_end = range->rl_start - 1; + hfs_release_reserved(hfsmp, range, HFS_TENTATIVE_BLOCKS); + break; + case RL_OVERLAPSTARTSBEFORE: + hfsmp->tentativeBlocks -= range->rl_end + 1 - start; + range->rl_end = start - 1; + hfs_assert(range->rl_end >= range->rl_start); + break; + case RL_OVERLAPENDSAFTER: + hfsmp->tentativeBlocks -= end + 1 - range->rl_start; + range->rl_start = end + 1; + hfs_assert(range->rl_end >= range->rl_start); + break; + case RL_NOOVERLAP: + break; + } + } + } + } + +exit: + + if (ALLOC_DEBUG) { + if (err == noErr) { + if (extent->startBlock >= hfsmp->totalBlocks) { + panic ("BlockAllocate: vending invalid blocks!"); + } + if (extent->startBlock >= hfsmp->allocLimit) { + panic ("BlockAllocate: vending block past allocLimit!"); + } + + if ((extent->startBlock + extent->blockCount) >= hfsmp->totalBlocks) { + panic ("BlockAllocate: vending too many invalid blocks!"); + } + + if ((extent->startBlock + extent->blockCount) >= hfsmp->allocLimit) { + panic ("BlockAllocate: vending too many invalid blocks past allocLimit!"); + } + } + } + + if (err) { + // Just to be safe... + extent->startBlock = 0; + extent->blockCount = 0; + } + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_END, err, extent->startBlock, extent->blockCount, 0, 0); + + return err; +} + + +/* +;________________________________________________________________________________ +; +; Routine: BlockDeallocate +; +; Function: Update the bitmap to deallocate a run of disk allocation blocks +; +; Input Arguments: +; vcb - Pointer to ExtendedVCB for the volume to free space on +; firstBlock - First allocation block to be freed +; numBlocks - Number of allocation blocks to free up (must be > 0!) +; +; Output: +; (result) - Result code +; +; Side effects: +; The volume bitmap is read and updated; the volume bitmap cache may be changed. +; The Allocator's red-black trees may also be modified as a result. 
+; +;________________________________________________________________________________ +*/ + +OSErr BlockDeallocate ( + ExtendedVCB *vcb, // Which volume to deallocate space on + u_int32_t firstBlock, // First block in range to deallocate + u_int32_t numBlocks, // Number of contiguous blocks to deallocate + hfs_block_alloc_flags_t flags) +{ + if (ISSET(flags, HFS_ALLOC_TENTATIVE | HFS_ALLOC_LOCKED)) + return 0; + + OSErr err; + struct hfsmount *hfsmp; + hfsmp = VCBTOHFS(vcb); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_START, firstBlock, numBlocks, flags, 0, 0); + + // + // If no blocks to deallocate, then exit early + // + if (numBlocks == 0) { + err = noErr; + goto Exit; + } + + + if (ALLOC_DEBUG) { + if (firstBlock >= hfsmp->totalBlocks) { + panic ("BlockDeallocate: freeing invalid blocks!"); + } + + if ((firstBlock + numBlocks) >= hfsmp->totalBlocks) { + panic ("BlockDeallocate: freeing too many invalid blocks!"); + } + } + + /* + * If we're using the summary bitmap, then try to mark the bits + * as potentially usable/free before actually deallocating them. + * It is better to be slightly speculative here for correctness. + */ + + (void) hfs_release_summary (hfsmp, firstBlock, numBlocks); + + err = BlockMarkFreeInternal(vcb, firstBlock, numBlocks, true); + + if (err) { + goto Exit; + } + + // + // Update the volume's free block count, and mark the VCB as dirty. + // + hfs_lock_mount(hfsmp); + /* + * Do not update the free block count. This flags is specified + * when a volume is being truncated. + */ + if ((flags & HFS_ALLOC_SKIPFREEBLKS) == 0) { + vcb->freeBlocks += numBlocks; + } + + vcb->hfs_freed_block_count += numBlocks; + + if (vcb->nextAllocation == (firstBlock + numBlocks)) { + HFS_UPDATE_NEXT_ALLOCATION(vcb, (vcb->nextAllocation - numBlocks)); + } + + if (hfsmp->jnl == NULL) { + /* + * In the journal case, we'll add the free extent once the journal + * calls us back to tell us it wrote the transaction to disk. + */ + (void) add_free_extent_cache(vcb, firstBlock, numBlocks); + + /* + * If the journal case, we'll only update sparseAllocation once the + * free extent cache becomes empty (when we remove the last entry + * from the cache). Skipping it here means we're less likely to + * find a recently freed extent via the bitmap before it gets added + * to the free extent cache. + */ + if (firstBlock < vcb->sparseAllocation) { + vcb->sparseAllocation = firstBlock; + } + } + + MarkVCBDirty(vcb); + hfs_unlock_mount(hfsmp); + + hfs_generate_volume_notifications(VCBTOHFS(vcb)); +Exit: + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_END, err, 0, 0, 0, 0); + + return err; +} + + +u_int8_t freebitcount[16] = { + 4, 3, 3, 2, 3, 2, 2, 1, /* 0 1 2 3 4 5 6 7 */ + 3, 2, 2, 1, 2, 1, 1, 0, /* 8 9 A B C D E F */ +}; + +u_int32_t +MetaZoneFreeBlocks(ExtendedVCB *vcb) +{ + u_int32_t freeblocks; + u_int32_t *currCache; + uintptr_t blockRef; + u_int32_t bit; + u_int32_t lastbit; + int bytesleft; + int bytesperblock; + u_int8_t byte; + u_int8_t *buffer; + + blockRef = 0; + bytesleft = freeblocks = 0; + buffer = NULL; + bit = VCBTOHFS(vcb)->hfs_metazone_start; + if (bit == 1) + bit = 0; + + lastbit = VCBTOHFS(vcb)->hfs_metazone_end; + bytesperblock = vcb->vcbVBMIOSize; + + /* + * Count all the bits from bit to lastbit. + */ + while (bit < lastbit) { + /* + * Get next bitmap block. 
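+ *
+ * (Editorial note, not part of the original source: the freebitcount[] table
+ * above maps one bitmap nibble to its number of zero, i.e. free, bits. For
+ * example a bitmap byte of 0xA5: the low nibble 0x5 (0101) has two clear
+ * bits (freebitcount[0x5] == 2) and the high nibble 0xA (1010) has two as
+ * well (freebitcount[0xA] == 2), so that byte contributes 4 free blocks to
+ * the count below.)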
+ */ + if (bytesleft == 0) { + if (blockRef) { + (void) ReleaseBitmapBlock(vcb, blockRef, false); + blockRef = 0; + } + if (ReadBitmapBlock(vcb, bit, &currCache, &blockRef, + HFS_ALLOC_IGNORE_TENTATIVE) != 0) { + return (0); + } + buffer = (u_int8_t *)currCache; + bytesleft = bytesperblock; + } + byte = *buffer++; + freeblocks += freebitcount[byte & 0x0F]; + freeblocks += freebitcount[(byte >> 4) & 0x0F]; + bit += kBitsPerByte; + --bytesleft; + } + if (blockRef) + (void) ReleaseBitmapBlock(vcb, blockRef, false); + + return (freeblocks); +} + + +/* + * Obtain the next allocation block (bit) that's + * outside the metadata allocation zone. + */ +static u_int32_t NextBitmapBlock( + ExtendedVCB *vcb, + u_int32_t bit) +{ + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) + return (bit); + /* + * Skip over metadata allocation zone. + */ + if ((bit >= hfsmp->hfs_metazone_start) && + (bit <= hfsmp->hfs_metazone_end)) { + bit = hfsmp->hfs_metazone_end + 1; + } + return (bit); +} + + +// Assumes @bitmap is aligned to 8 bytes and multiple of 8 bytes. +static void bits_set(void *bitmap, int start, int end) +{ + const int start_bit = start & 63; + const int end_bit = end & 63; + +#define LEFT_MASK(bit) OSSwapHostToBigInt64(0xffffffffffffffffull << (64 - bit)) +#define RIGHT_MASK(bit) OSSwapHostToBigInt64(0xffffffffffffffffull >> bit) + + uint64_t *p = (uint64_t *)bitmap + start / 64; + + if ((start & ~63) == (end & ~63)) { + // Start and end in same 64 bits + *p |= RIGHT_MASK(start_bit) & LEFT_MASK(end_bit); + } else { + *p++ |= RIGHT_MASK(start_bit); + + int nquads = (end - end_bit - start - 1) / 64; + + while (nquads--) + *p++ = 0xffffffffffffffffull; + + if (end_bit) + *p |= LEFT_MASK(end_bit); + } +} + +// Modifies the buffer and applies any reservations that we might have +static buf_t process_reservations(hfsmount_t *hfsmp, buf_t bp, off_t offset, + hfs_block_alloc_flags_t flags, + bool always_copy) +{ + bool taken_copy = false; + void *buffer = (void *)buf_dataptr(bp); + const uint32_t nbytes = buf_count(bp); + const off_t end = offset + nbytes * 8 - 1; + + for (int i = (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE) + ? HFS_LOCKED_BLOCKS : HFS_TENTATIVE_BLOCKS); i < 2; ++i) { + struct rl_entry *entry; + TAILQ_FOREACH(entry, &hfsmp->hfs_reserved_ranges[i], rl_link) { + uint32_t a, b; + + enum rl_overlaptype overlap_type = rl_overlap(entry, offset, end); + + if (overlap_type == RL_NOOVERLAP) + continue; + + /* + * If always_copy is false, we only take a copy if B_LOCKED is + * set because ReleaseScanBitmapRange doesn't invalidate the + * buffer in that case. 
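+ *
+ * (Editorial note, not part of the original source: bits_set() above works
+ * on the half-open range [start, end) in the big-endian bit order of the
+ * on-disk bitmap, where bit 0 is the most significant bit of the first byte.
+ * For example, bits_set(buffer, 3, 11) ORs in RIGHT_MASK(3) & LEFT_MASK(11)
+ * and so sets bits 3..10 of the first 64-bit word; that is why the caller
+ * below passes b + 1 for an inclusive b.)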
+ */ + if (!taken_copy && (always_copy || ISSET(buf_flags(bp), B_LOCKED))) { + buf_t new_bp = buf_create_shadow(bp, true, 0, NULL, NULL); + buf_brelse(bp); + bp = new_bp; + buf_setflags(bp, B_NOCACHE); + buffer = (void *)buf_dataptr(bp); + taken_copy = true; + } + + switch (overlap_type) { + case RL_OVERLAPCONTAINSRANGE: + case RL_MATCHINGOVERLAP: + memset(buffer, 0xff, nbytes); + return bp; + case RL_OVERLAPISCONTAINED: + a = entry->rl_start; + b = entry->rl_end; + break; + case RL_OVERLAPSTARTSBEFORE: + a = offset; + b = entry->rl_end; + break; + case RL_OVERLAPENDSAFTER: + a = entry->rl_start; + b = end; + break; + case RL_NOOVERLAP: + __builtin_unreachable(); + } + + a -= offset; + b -= offset; + + hfs_assert(a < buf_count(bp) * 8); + hfs_assert(b < buf_count(bp) * 8); + hfs_assert(b >= a); + + // b is inclusive + bits_set(buffer, a, b + 1); + } + } // for (;;) + + return bp; +} + +/* +;_______________________________________________________________________ +; +; Routine: ReadBitmapBlock +; +; Function: Read in a bitmap block corresponding to a given allocation +; block (bit). Return a pointer to the bitmap block. +; +; Inputs: +; vcb -- Pointer to ExtendedVCB +; bit -- Allocation block whose bitmap block is desired +; +; Outputs: +; buffer -- Pointer to bitmap block corresonding to "block" +; blockRef +;_______________________________________________________________________ +*/ +static OSErr ReadBitmapBlock(ExtendedVCB *vcb, + u_int32_t bit, + u_int32_t **buffer, + uintptr_t *blockRef, + hfs_block_alloc_flags_t flags) +{ + OSErr err; + struct buf *bp = NULL; + struct vnode *vp = NULL; + daddr64_t block; + u_int32_t blockSize; + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_START, bit, 0, 0, 0, 0); + + /* + * volume bitmap blocks are protected by the allocation file lock + */ + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + + blockSize = (u_int32_t)vcb->vcbVBMIOSize; + block = (daddr64_t)(bit / (blockSize * kBitsPerByte)); + + /* HFS+ / HFSX */ + if (vcb->vcbSigWord != kHFSSigWord) { + vp = vcb->hfs_allocation_vp; /* use allocation file vnode */ + } +#if CONFIG_HFS_STD + else { + /* HFS Standard */ + vp = VCBTOHFS(vcb)->hfs_devvp; /* use device I/O vnode */ + block += vcb->vcbVBMSt; /* map to physical block */ + } +#endif + + err = (int)buf_meta_bread(vp, block, blockSize, NOCRED, &bp); + + if (bp) { + if (err) { + buf_brelse(bp); + *blockRef = 0; + *buffer = NULL; + } else { + if (!ISSET(flags, HFS_ALLOC_IGNORE_RESERVED)) { + bp = process_reservations(vcb, bp, block * blockSize * 8, + flags, /* always_copy: */ true); + } + + buf_setfsprivate(bp, (void *)(uintptr_t)flags); + + *blockRef = (uintptr_t)bp; + *buffer = (u_int32_t *)buf_dataptr(bp); + } + } else + hfs_assert(err); + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_END, err, 0, 0, 0, 0); + + return err; +} + + +/* +;_______________________________________________________________________ +; +; Routine: ReadBitmapRange +; +; Function: Read in a range of the bitmap starting at the given offset. +; Use the supplied size to determine the amount of I/O to generate +; against the bitmap file. Return a pointer to the bitmap block. +; +; Inputs: +; hfsmp -- Pointer to hfs mount +; offset -- byte offset into the bitmap file +; size -- How much I/O to generate against the bitmap file. 
+; +; Outputs: +; buffer -- Pointer to bitmap block data corresonding to "block" +; blockRef -- struct 'buf' pointer which MUST be released in a subsequent call. +;_______________________________________________________________________ +*/ +static OSErr ReadBitmapRange(struct hfsmount *hfsmp, uint32_t offset, + uint32_t iosize, uint32_t **buffer, struct buf **blockRef) +{ + + OSErr err; + struct buf *bp = NULL; + struct vnode *vp = NULL; + daddr64_t block; + + /* This function isn't supported for HFS standard */ + if (hfsmp->vcbSigWord != kHFSPlusSigWord) { + return EINVAL; + } + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_RANGE | DBG_FUNC_START, offset, iosize, 0, 0, 0); + } + + /* + * volume bitmap blocks are protected by the allocation file lock + */ + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + + vp = hfsmp->hfs_allocation_vp; /* use allocation file vnode */ + + /* + * The byte offset argument must be converted into bitmap-relative logical + * block numbers before using it in buf_meta_bread. + * + * buf_meta_bread (and the things it calls) will eventually try to + * reconstruct the byte offset into the file by multiplying the logical + * block number passed in below by the vcbVBMIOSize field in the mount + * point. So we prepare for that by converting the byte offset back into + * logical blocks in terms of VBMIOSize units. + * + * The amount of I/O requested and the byte offset should be computed + * based on the helper function in the frame that called us, so we can + * get away with just doing a simple divide here. + */ + block = (daddr64_t)(offset / hfsmp->vcbVBMIOSize); + + err = (int) buf_meta_bread(vp, block, iosize, NOCRED, &bp); + + if (bp) { + if (err) { + buf_brelse(bp); + *blockRef = 0; + *buffer = NULL; + } else { + bp = process_reservations(hfsmp, bp, (offset * 8), 0, + /* always_copy: */ false); + + *blockRef = bp; + *buffer = (u_int32_t *)buf_dataptr(bp); + } + } + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_RANGE | DBG_FUNC_END, err, 0, 0, 0, 0); + } + + return err; +} + + +/* +;_______________________________________________________________________ +; +; Routine: ReleaseBitmapBlock +; +; Function: Relase a bitmap block. +; +; Inputs: +; vcb +; blockRef +; dirty +;_______________________________________________________________________ +*/ +static OSErr ReleaseBitmapBlock( + ExtendedVCB *vcb, + uintptr_t blockRef, + Boolean dirty) +{ + struct buf *bp = (struct buf *)blockRef; + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_START, dirty, 0, 0, 0, 0); + + if (blockRef == 0) { + if (dirty) + panic("hfs: ReleaseBitmapBlock: missing bp"); + return (0); + } + + if (bp) { + if (dirty) { + hfs_block_alloc_flags_t flags = (uintptr_t)buf_fsprivate(bp); + + if (!ISSET(flags, HFS_ALLOC_IGNORE_RESERVED)) + panic("Modified read-only bitmap buffer!"); + + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); + } else { + buf_bdwrite(bp); + } + } else { + buf_brelse(bp); + } + } + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_END, 0, 0, 0, 0, 0); + + return (0); +} + +/* + * ReleaseScanBitmapRange + * + * This is used to release struct bufs that were created for use by + * bitmap scanning code. 
Because they may be of sizes different than the + * typical runtime manipulation code, we want to force them to be purged out + * of the buffer cache ASAP, so we'll release them differently than in the + * ReleaseBitmapBlock case. + * + * Additionally, because we know that we're only reading the blocks and that they + * should have been clean prior to reading them, we will never + * issue a write to them (thus dirtying them). + */ + +static OSErr ReleaseScanBitmapRange(struct buf *bp ) { + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_START, 0, 0, 0, 0, 0); + } + + if (bp) { + /* Mark the buffer invalid if it isn't locked, then release it */ + if ((buf_flags(bp) & B_LOCKED) == 0) { + buf_markinvalid(bp); + } + buf_brelse(bp); + } + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) { + KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_SCAN_BITMAP | DBG_FUNC_END, 0, 0, 0, 0, 0); + } + + return (0); +} + +/* + * @extent.startBlock, on input, contains a preferred block for the + * allocation. @extent.blockCount, on input, contains the minimum + * number of blocks acceptable. Upon success, the result is conveyed + * in @extent. + */ +static OSErr hfs_alloc_try_hard(hfsmount_t *hfsmp, + HFSPlusExtentDescriptor *extent, + uint32_t max_blocks, + hfs_block_alloc_flags_t flags) +{ + OSErr err = dskFulErr; + + const uint32_t min_blocks = extent->blockCount; + + // It's > rather than >= because the last block is always reserved + if (extent->startBlock > 0 && extent->startBlock < hfsmp->allocLimit + && hfsmp->allocLimit - extent->startBlock > max_blocks) { + /* + * This is just checking to see if there's an extent starting + * at extent->startBlock that will suit. We only check for + * @max_blocks here; @min_blocks is ignored. + */ + + err = BlockFindContiguous(hfsmp, extent->startBlock, extent->startBlock + max_blocks, + max_blocks, max_blocks, true, true, + &extent->startBlock, &extent->blockCount, flags); + + if (err != dskFulErr) + return err; + } + + err = BlockFindKnown(hfsmp, max_blocks, &extent->startBlock, + &extent->blockCount); + + if (!err) { + if (extent->blockCount >= max_blocks) + return 0; + } else if (err != dskFulErr) + return err; + + // Try a more exhaustive search + return BlockFindContiguous(hfsmp, 1, hfsmp->allocLimit, + min_blocks, max_blocks, + /* useMetaZone: */ true, + /* trustSummary: */ true, + &extent->startBlock, &extent->blockCount, flags); +} + +/* +_______________________________________________________________________ + +Routine: BlockFindContig + +Function: Find a contiguous group of allocation blocks. If the + minimum cannot be satisfied, nothing is returned. The + caller guarantees that there are enough free blocks + (though they may not be contiguous, in which case this + call will fail). 
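+
+		For example, a caller passing minBlocks == 8 and maxBlocks == 32
+		succeeds only if at least 8 contiguous free blocks can be found,
+		and is handed back at most 32 of them.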
+ +Inputs: + vcb Pointer to volume where space is to be allocated + startingBlock Preferred first block for allocation + minBlocks Minimum number of contiguous blocks to allocate + maxBlocks Maximum number of contiguous blocks to allocate + flags + +Outputs: + actualStartBlock First block of range allocated, or 0 if error + actualNumBlocks Number of blocks allocated, or 0 if error +_______________________________________________________________________ +*/ +static OSErr BlockFindContig( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + hfs_block_alloc_flags_t flags, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ + OSErr retval = noErr; + uint32_t currentStart = startingBlock; + + uint32_t foundStart = 0; // values to emit to caller + uint32_t foundCount = 0; + + uint32_t collision_start = 0; // if we have to re-allocate a recently deleted extent, use this + uint32_t collision_count = 0; + + int err; + int allowReuse = (flags & HFS_ALLOC_FLUSHTXN); + Boolean useMetaZone = (flags & HFS_ALLOC_METAZONE); + + int recently_deleted = 0; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_FIND_CONTIG_BITMAP | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, useMetaZone, 0); + + while ((retval == noErr) && (foundStart == 0) && (foundCount == 0)) { + + /* Try and find something that works. */ + + /* + * NOTE: If the only contiguous free extent of at least minBlocks + * crosses startingBlock (i.e. starts before, ends after), then we + * won't find it. Earlier versions *did* find this case by letting + * the second search look past startingBlock by minBlocks. But + * with the free extent cache, this can lead to duplicate entries + * in the cache, causing the same blocks to be allocated twice. + */ + retval = BlockFindContiguous(vcb, currentStart, vcb->allocLimit, minBlocks, + maxBlocks, useMetaZone, true, &foundStart, &foundCount, flags); + + if (retval == dskFulErr && currentStart != 0) { + /* + * We constrain the endingBlock so we don't bother looking for ranges + * that would overlap those found in the previous call, if the summary bitmap + * is not on for this volume. If it is, then we assume that it was not trust + * -worthy and do a full scan. + */ + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + retval = BlockFindContiguous(vcb, 1, vcb->allocLimit, minBlocks, + maxBlocks, useMetaZone, false, &foundStart, &foundCount, flags); + } + else { + retval = BlockFindContiguous(vcb, 1, currentStart, minBlocks, + maxBlocks, useMetaZone, false, &foundStart, &foundCount, flags); + } + } + + if (retval != noErr) { + goto bailout; + } + + /* Do we overlap with the recently found collision extent? */ + if (collision_start) { + if (extents_overlap (foundStart, foundCount, collision_start, collision_count)) { + /* + * We've looped around, and the only thing we could use was the collision extent. + * Since we are allowed to use it, go ahead and do so now. + */ + if(allowReuse) { + /* + * then we couldn't find anything except values which might have been + * recently deallocated. just return our cached value if we are allowed to. + */ + foundStart = collision_start; + foundCount = collision_count; + goto bailout; + } + else { + /* Otherwise, we looped around and couldn't find anything that wouldn't require a journal flush. */ + retval = dskFulErr; + goto bailout; + } + } + } + + /* OK, we know we must not have collided . 
See if this one is recently deleted */ + if (hfsmp->jnl) { + recently_deleted = 0; + uint32_t nextStart; + err = CheckUnmappedBytes (hfsmp, (uint64_t)foundStart, + (uint64_t) foundCount, &recently_deleted, &nextStart); + if (err == 0) { + if(recently_deleted != 0) { + /* + * these blocks were recently deleted/deallocated. Cache the extent, but + * but keep searching to see if we can find one that won't collide here. + */ + if (collision_start == 0) { + collision_start = foundStart; + collision_count = foundCount; + } + recently_deleted = 0; + + /* + * advance currentStart to the point just past the overlap we just found. Note that + * we will automatically loop around to start of the bitmap as needed. + */ + currentStart = nextStart; + /* Unset foundStart/Count to allow it to loop around again. */ + foundStart = 0; + foundCount = 0; + } + } + } // end jnl/deleted case + + /* + * If we found something good, we'd break out of the loop at the top; foundCount + * and foundStart should be set. + */ + + } // end while loop. + +bailout: + + if (retval == noErr) { + *actualStartBlock = foundStart; + *actualNumBlocks = foundCount; + } + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_FIND_CONTIG_BITMAP | DBG_FUNC_END, foundStart, foundCount, retval, 0, 0); + + return retval; + +} + + +/* +_______________________________________________________________________ + +Routine: BlockFindAny + +Function: Find one or more allocation blocks and may return fewer than + requested. The caller guarantees that there is at least one + free block. + +Inputs: + vcb Pointer to volume where space is to be allocated + startingBlock Preferred first block for allocation + endingBlock Last block to check + 1 + maxBlocks Maximum number of contiguous blocks to allocate + useMetaZone + +Outputs: + actualStartBlock First block of range allocated, or 0 if error + actualNumBlocks Number of blocks allocated, or 0 if error +_______________________________________________________________________ +*/ + +static OSErr BlockFindAny( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t endingBlock, + u_int32_t maxBlocks, + hfs_block_alloc_flags_t flags, + Boolean trustSummary, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ + + /* + * If it is enabled, scan through the summary table to find the first free block. + * + * If it reports that there are not any free blocks, we could have a false + * positive, so in that case, use the input arguments as a pass through. + */ + uint32_t start_blk = startingBlock; + uint32_t end_blk = endingBlock; + struct hfsmount *hfsmp; + OSErr err; + + hfsmp = (struct hfsmount*)vcb; + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + uint32_t suggested_start; + + /* + * If the summary table is enabled, scan through it to find the first free + * block. If there was an error, or we couldn't find anything free in the + * summary table, then just leave the start_blk fields unmodified. We wouldn't + * have gotten to this point if the mount point made it look like there was possibly + * free space in the FS. + */ + err = hfs_find_summary_free (hfsmp, startingBlock, &suggested_start); + if (err == 0) { + start_blk = suggested_start; + } + else { + /* Differentiate between ENOSPC and a more esoteric error in the above call. */ + if ((err == ENOSPC) && (trustSummary)) { + /* + * The 'trustSummary' argument is for doing a full scan if we really + * really, need the space and we think it's somewhere but can't find it in the + * summary table. 
If it's true, then we trust the summary table and return + * dskFulErr if we couldn't find it above. + */ + return dskFulErr; + } + /* + * If either trustSummary was false or we got a different errno, then we + * want to fall through to the real bitmap single i/o code... + */ + } + } + + err = BlockFindAnyBitmap(vcb, start_blk, end_blk, maxBlocks, + flags, actualStartBlock, actualNumBlocks); + + return err; +} + + +/* + * BlockFindAnyBitmap finds free ranges by scanning the bitmap to + * figure out where the free allocation blocks are. Inputs and + * outputs are the same as for BlockFindAny. + */ + +static OSErr BlockFindAnyBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t endingBlock, + u_int32_t maxBlocks, + hfs_block_alloc_flags_t flags, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ + OSErr err; + register u_int32_t block = 0; // current block number + register u_int32_t currentWord; // Pointer to current word within bitmap block + register u_int32_t bitMask; // Word with given bits already set (ready to OR in) + register u_int32_t wordsLeft; // Number of words left in this bitmap block + u_int32_t *buffer = NULL; + u_int32_t *currCache = NULL; + uintptr_t blockRef = 0; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + Boolean useMetaZone = (flags & HFS_ALLOC_METAZONE); + Boolean forceFlush = (flags & HFS_ALLOC_FLUSHTXN); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_START, startingBlock, endingBlock, maxBlocks, useMetaZone, 0); + +restartSearchAny: + + /* + * When we're skipping the metadata zone and the start/end + * range overlaps with the metadata zone then adjust the + * start to be outside of the metadata zone. If the range + * is entirely inside the metadata zone then we can deny the + * request (dskFulErr). 
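+	 *
+	 * For example, with a metadata zone ending at block 4159, a request
+	 * with startingBlock 100 and endingBlock 8192 is bumped to start at
+	 * block 4160, while the same request with endingBlock 4161 or less
+	 * is denied with dskFulErr.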
+ */ + if (!useMetaZone && (vcb->hfs_flags & HFS_METADATA_ZONE)) { + if (startingBlock <= vcb->hfs_metazone_end) { + if (endingBlock > (vcb->hfs_metazone_end + 2)) + startingBlock = vcb->hfs_metazone_end + 1; + else { + err = dskFulErr; + goto Exit; + } + } + } + + // Since this routine doesn't wrap around + if (maxBlocks > (endingBlock - startingBlock)) { + maxBlocks = endingBlock - startingBlock; + } + + // + // Pre-read the first bitmap block + // + err = ReadBitmapBlock(vcb, startingBlock, &currCache, &blockRef, flags); + if (err != noErr) goto Exit; + buffer = currCache; + + // + // Set up the current position within the block + // + { + u_int32_t wordIndexInBlock; + + bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + buffer += wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + currentWord = SWAP_BE32 (*buffer); + bitMask = kHighBitInWordMask >> (startingBlock & kBitsWithinWordMask); + } + + /* + * While loop 1: + * Find the first unallocated block starting at 'block' + */ + uint32_t summary_block_scan = 0; + + block=startingBlock; + while (block < endingBlock) { + if ((currentWord & bitMask) == 0) + break; + + // Next bit + ++block; + bitMask >>= 1; + if (bitMask == 0) { + // Next word + bitMask = kHighBitInWordMask; + ++buffer; + + if (--wordsLeft == 0) { + // Next block + buffer = currCache = NULL; + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + /* + * If summary_block_scan is non-zero, then we must have + * pulled a bitmap file block into core, and scanned through + * the entire thing. Because we're in this loop, we are + * implicitly trusting that the bitmap didn't have any knowledge + * about this particular block. As a result, update the bitmap + * (lazily, now that we've scanned it) with our findings that + * this particular block is completely used up. + */ + if (summary_block_scan != 0) { + uint32_t summary_bit; + (void) hfs_get_summary_index (hfsmp, summary_block_scan, &summary_bit); + hfs_set_summary (hfsmp, summary_bit, 1); + } + } + + err = ReleaseBitmapBlock(vcb, blockRef, false); + if (err != noErr) goto Exit; + + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) { + block = NextBitmapBlock(vcb, block); + } + if (block >= endingBlock) { + err = dskFulErr; + goto Exit; + } + + err = ReadBitmapBlock(vcb, block, &currCache, &blockRef, flags); + if (err != noErr) goto Exit; + buffer = currCache; + summary_block_scan = block; + wordsLeft = wordsPerBlock; + } + currentWord = SWAP_BE32 (*buffer); + } + } + + // Did we get to the end of the bitmap before finding a free block? + // If so, then couldn't allocate anything. + if (block >= endingBlock) { + err = dskFulErr; + goto Exit; + } + + + /* + * Don't move forward just yet. Verify that either one of the following + * two conditions is true: + * 1) journaling is not enabled + * 2) block is not currently on any pending TRIM list. + */ + if (hfsmp->jnl != NULL && (forceFlush == false)) { + int recently_deleted = 0; + uint32_t nextblk; + err = CheckUnmappedBytes (hfsmp, (uint64_t) block, 1, &recently_deleted, &nextblk); + if ((err == 0) && (recently_deleted)) { + + /* release the bitmap block & unset currCache. we may jump past it. */ + err = ReleaseBitmapBlock(vcb, blockRef, false); + currCache = NULL; + if (err != noErr) { + goto Exit; + } + /* set our start to nextblk, and re-do the search. 
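+			 * For example, if block 5000 is still on a pending TRIM list
+			 * and CheckUnmappedBytes reports that the deleted run ends
+			 * just before block 5128, we resume the scan at block 5128
+			 * rather than forcing a journal flush.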
*/ + startingBlock = nextblk; + goto restartSearchAny; + } + } + + + // Return the first block in the allocated range + *actualStartBlock = block; + + // If we could get the desired number of blocks before hitting endingBlock, + // then adjust endingBlock so we won't keep looking. Ideally, the comparison + // would be (block + maxBlocks) < endingBlock, but that could overflow. The + // comparison below yields identical results, but without overflow. + if (block < (endingBlock-maxBlocks)) { + endingBlock = block + maxBlocks; // if we get this far, we've found enough + } + + /* + * While loop 2: + * Scan the bitmap, starting at 'currentWord' in the current + * bitmap block. Continue iterating through the bitmap until + * either we hit an allocated block, or until we have accumuluated + * maxBlocks worth of bitmap. + */ + + /* Continue until we see an allocated block */ + while ((currentWord & bitMask) == 0) { + // Move to the next block. If no more, then exit. + ++block; + if (block == endingBlock) { + break; + } + + // Next bit + bitMask >>= 1; + if (bitMask == 0) { + // Next word + bitMask = kHighBitInWordMask; + ++buffer; + + if (--wordsLeft == 0) { + // Next block + buffer = currCache = NULL; + + /* We're only reading the bitmap here, so mark it as clean */ + err = ReleaseBitmapBlock(vcb, blockRef, false); + if (err != noErr) { + goto Exit; + } + + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) { + u_int32_t nextBlock; + nextBlock = NextBitmapBlock(vcb, block); + if (nextBlock != block) { + goto Exit; /* allocation gap, so stop */ + } + } + + if (block >= endingBlock) { + goto Exit; + } + + err = ReadBitmapBlock(vcb, block, &currCache, &blockRef, flags); + if (err != noErr) { + goto Exit; + } + buffer = currCache; + wordsLeft = wordsPerBlock; + } + currentWord = SWAP_BE32 (*buffer); + } + } + +Exit: + if (currCache) { + /* Release the bitmap reference prior to marking bits in-use */ + (void) ReleaseBitmapBlock(vcb, blockRef, false); + currCache = NULL; + } + + if (err == noErr) { + *actualNumBlocks = block - *actualStartBlock; + + // sanity check + if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) { + panic("hfs: BlockFindAnyBitmap: allocation overflow on \"%s\"", vcb->vcbVN); + } + } + else { + *actualStartBlock = 0; + *actualNumBlocks = 0; + } + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + + return err; +} + + +/* +_______________________________________________________________________ + +Routine: BlockFindKnown + +Function: Return a potential extent from the free extent cache. The + returned extent *must* be marked allocated and removed + from the cache by the *caller*. 
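+
+		For example, if the largest cached extent is <startBlock 2000,
+		blockCount 64> and maxBlocks is 16, this returns <2000, 16> and
+		leaves the cache entry itself untouched.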
+ +Inputs: + vcb Pointer to volume where space is to be allocated + maxBlocks Maximum number of contiguous blocks to allocate + +Outputs: + actualStartBlock First block of range allocated, or 0 if error + actualNumBlocks Number of blocks allocated, or 0 if error + +Returns: + dskFulErr Free extent cache is empty +_______________________________________________________________________ +*/ + +static OSErr BlockFindKnown( + ExtendedVCB *vcb, + u_int32_t maxBlocks, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ + OSErr err; + u_int32_t foundBlocks; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_FIND_KNOWN | DBG_FUNC_START, 0, 0, maxBlocks, 0, 0); + + hfs_lock_mount (hfsmp); + lck_spin_lock(&vcb->vcbFreeExtLock); + if ( vcb->vcbFreeExtCnt == 0 || + vcb->vcbFreeExt[0].blockCount == 0) { + lck_spin_unlock(&vcb->vcbFreeExtLock); + hfs_unlock_mount(hfsmp); + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_FIND_KNOWN | DBG_FUNC_END, dskFulErr, *actualStartBlock, *actualNumBlocks, 0, 0); + return dskFulErr; + } + lck_spin_unlock(&vcb->vcbFreeExtLock); + hfs_unlock_mount(hfsmp); + + lck_spin_lock(&vcb->vcbFreeExtLock); + + // Just grab up to maxBlocks of the first (largest) free exent. + *actualStartBlock = vcb->vcbFreeExt[0].startBlock; + foundBlocks = vcb->vcbFreeExt[0].blockCount; + if (foundBlocks > maxBlocks) + foundBlocks = maxBlocks; + *actualNumBlocks = foundBlocks; + + lck_spin_unlock(&vcb->vcbFreeExtLock); + + // sanity check + if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) + { + printf ("hfs: BlockAllocateKnown() found allocation overflow on \"%s\"", vcb->vcbVN); + hfs_mark_inconsistent(vcb, HFS_INCONSISTENCY_DETECTED); + err = EIO; + } else + err = 0; + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_FIND_KNOWN | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + + return err; +} + +/* + * BlockMarkAllocated + * + * This is a wrapper function around the internal calls which will actually mark the blocks + * as in-use. It will mark the blocks in the red-black tree if appropriate. We need to do + * this logic here to avoid callers having to deal with whether or not the red-black tree + * is enabled. + */ + +OSErr BlockMarkAllocated( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks) +{ + return BlockMarkAllocatedInternal(vcb, startingBlock, numBlocks, 0); +} + + +/* +_______________________________________________________________________ + +Routine: BlockMarkAllocatedInternal + +Function: Mark a contiguous group of blocks as allocated (set in the + bitmap). It assumes those bits are currently marked + deallocated (clear in the bitmap). Note that this function + must be called regardless of whether or not the bitmap or + tree-based allocator is used, as all allocations must correctly + be marked on-disk. If the tree-based approach is running, then + this will be done before the node is removed from the tree. 
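+
+			For example, marking blocks 30 through 97 (68 blocks) sets the
+			last two bits of the word containing block 30, then two whole
+			32-bit words, and finally the first two bits of the next word.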
+ +Inputs: + vcb Pointer to volume where space is to be allocated + startingBlock First block number to mark as allocated + numBlocks Number of blocks to mark as allocated +_______________________________________________________________________ +*/ +static +OSErr BlockMarkAllocatedInternal ( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks, + hfs_block_alloc_flags_t flags) +{ + OSErr err; + register u_int32_t *currentWord; // Pointer to current word within bitmap block + register u_int32_t wordsLeft; // Number of words left in this bitmap block + register u_int32_t bitMask; // Word with given bits already set (ready to OR in) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef = 0; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_START, startingBlock, numBlocks, flags, 0, 0); + +#if DEBUG + + if (!ISSET(flags, HFS_ALLOC_COMMIT) + || ISSET(flags, HFS_ALLOC_USE_TENTATIVE)) { + struct rl_entry *range; + TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[HFS_LOCKED_BLOCKS], rl_link) { + hfs_assert(rl_overlap(range, startingBlock, + startingBlock + numBlocks - 1) == RL_NOOVERLAP); + } + } + +#endif + + int force_flush = 0; + /* + * Since we are about to mark these bits as in-use + * in the bitmap, decide if we need to alert the caller + * that a journal flush might be appropriate. It's safe to + * poke at the journal pointer here since we MUST have + * called start_transaction by the time this function is invoked. + * If the journal is enabled, then it will have taken the requisite + * journal locks. If it is not enabled, then we have taken + * a shared lock on the global lock. + */ + if (hfsmp->jnl) { + uint32_t ignore; + err = CheckUnmappedBytes (hfsmp, (uint64_t) startingBlock, (uint64_t)numBlocks, &force_flush, &ignore); + if ((err == 0) && (force_flush)) { + journal_request_immediate_flush (hfsmp->jnl); + } + } + + hfs_unmap_alloc_extent(vcb, startingBlock, numBlocks); + + /* + * Don't make changes to the disk if we're just reserving. Note that + * we could do better in the tentative case because we could, in theory, + * avoid the journal flush above. However, that would mean that we would + * need to catch the callback to stop it incorrectly addding the extent + * to our free cache. + */ + if (ISSET(flags, HFS_ALLOC_LOCKED | HFS_ALLOC_TENTATIVE)) { + err = 0; + goto Exit; + } + + // + // Pre-read the bitmap block containing the first word of allocation + // + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_RESERVED); + if (err != noErr) goto Exit; + // + // Initialize currentWord, and wordsLeft. + // + { + u_int32_t wordIndexInBlock; + + bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + } + + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + // + // If the first block to allocate doesn't start on a word + // boundary in the bitmap, then treat that first word + // specially. 
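+	//
+	// For example, with startingBlock % 32 == 5 and numBlocks == 3:
+	//
+	//	bitMask  = kAllBitsSetInWord >> 5;       // 0x07FFFFFF
+	//	bitMask &= ~(kAllBitsSetInWord >> 8);    // leaves 0x07000000, i.e. bits 5..7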
+ // + + firstBit = startingBlock % kBitsPerWord; + if (firstBit != 0) { + bitMask = kAllBitsSetInWord >> firstBit; // turn off all bits before firstBit + numBits = kBitsPerWord - firstBit; // number of remaining bits in this word + if (numBits > numBlocks) { + numBits = numBlocks; // entire allocation is inside this one word + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); // turn off bits after last + } +#if DEBUG + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); + } +#endif + *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap + numBlocks -= numBits; // adjust number of blocks left to allocate + + ++currentWord; // move to next word + --wordsLeft; // one less word left in this block + } + + // + // Allocate whole words (32 blocks) at a time. + // + + bitMask = kAllBitsSetInWord; // put this in a register for 68K + while (numBlocks >= kBitsPerWord) { + if (wordsLeft == 0) { + // Read in the next bitmap block + startingBlock += bitsPerBlock; // generate a block number in the next bitmap block + + buffer = NULL; + err = ReleaseBitmapBlock(vcb, blockRef, true); + if (err != noErr) goto Exit; + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_RESERVED); + if (err != noErr) goto Exit; + + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + // Readjust currentWord and wordsLeft + currentWord = buffer; + wordsLeft = wordsPerBlock; + } +#if DEBUG + if (*currentWord != 0) { + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); + } +#endif + *currentWord = SWAP_BE32 (bitMask); + numBlocks -= kBitsPerWord; + + ++currentWord; // move to next word + --wordsLeft; // one less word left in this block + } + + // + // Allocate any remaining blocks. + // + + if (numBlocks != 0) { + bitMask = ~(kAllBitsSetInWord >> numBlocks); // set first numBlocks bits + if (wordsLeft == 0) { + // Read in the next bitmap block + startingBlock += bitsPerBlock; // generate a block number in the next bitmap block + + buffer = NULL; + err = ReleaseBitmapBlock(vcb, blockRef, true); + if (err != noErr) goto Exit; + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_RESERVED); + if (err != noErr) goto Exit; + + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + currentWord = buffer; + } +#if DEBUG + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); + } +#endif + *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap + + // No need to update currentWord or wordsLeft + } + +Exit: + + if (buffer) + (void)ReleaseBitmapBlock(vcb, blockRef, true); + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); + + return err; +} + + +/* + * BlockMarkFree + * + * This is a wrapper function around the internal calls which will actually mark the blocks + * as freed. It will mark the blocks in the red-black tree if appropriate. We need to do + * this logic here to avoid callers having to deal with whether or not the red-black tree + * is enabled. 
+ * + */ +OSErr BlockMarkFree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks) +{ + return BlockMarkFreeInternal(vcb, startingBlock, numBlocks, true); +} + + +/* + * BlockMarkFreeUnused + * + * Scan the bitmap block beyond end of current file system for bits + * that are marked as used. If any of the bits are marked as used, + * this function marks them free. + * + * Note: This was specifically written to mark all bits beyond + * end of current file system during hfs_extendfs(), which makes + * sure that all the new blocks added to the file system are + * marked as free. We expect that all the blocks beyond end of + * current file system are always marked as free, but there might + * be cases where are marked as used. This function assumes that + * the number of blocks marked as used incorrectly are relatively + * small, otherwise this can overflow journal transaction size + * on certain file system configurations (example, large unused + * bitmap with relatively small journal). + * + * Input: + * startingBlock: First block of the range to mark unused + * numBlocks: Number of blocks in the range to mark unused + * + * Returns: zero on success, non-zero on error. + */ +OSErr BlockMarkFreeUnused(ExtendedVCB *vcb, u_int32_t startingBlock, register u_int32_t numBlocks) +{ + int error = 0; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + u_int32_t curNumBlocks; + u_int32_t bitsPerBlock; + u_int32_t lastBit; + + /* Use the optimal bitmap I/O size instead of bitmap block size */ + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + + /* + * First clear any non bitmap allocation block aligned bits + * + * Calculate the first bit in the bitmap block next to + * the bitmap block containing the bit for startingBlock. + * Using this value, we calculate the total number of + * bits to be marked unused from startingBlock to the + * end of bitmap block containing startingBlock. + */ + lastBit = ((startingBlock + (bitsPerBlock - 1))/bitsPerBlock) * bitsPerBlock; + curNumBlocks = lastBit - startingBlock; + if (curNumBlocks > numBlocks) { + curNumBlocks = numBlocks; + } + error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false); + if (error) { + return error; + } + startingBlock += curNumBlocks; + numBlocks -= curNumBlocks; + + /* + * Check a full bitmap block for any 'used' bit. If any bit is used, + * mark all the bits only in that bitmap block as free. This ensures + * that we do not write unmodified bitmap blocks and do not + * overwhelm the journal. + * + * The code starts by checking full bitmap block at a time, and + * marks entire bitmap block as free only if any bit in that bitmap + * block is marked as used. In the end, it handles the last bitmap + * block which might be partially full by only checking till the + * caller-specified last bit and if any bit is set, only mark that + * range as free. + */ + while (numBlocks) { + if (numBlocks >= bitsPerBlock) { + curNumBlocks = bitsPerBlock; + } else { + curNumBlocks = numBlocks; + } + if (hfs_isallocated(hfsmp, startingBlock, curNumBlocks) == true) { + error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false); + if (error) { + return error; + } + } + startingBlock += curNumBlocks; + numBlocks -= curNumBlocks; + } + + return error; +} + +/* +_______________________________________________________________________ + +Routine: BlockMarkFreeInternal + +Function: Mark a contiguous group of blocks as free (clear in the + bitmap). It assumes those bits are currently marked + allocated (set in the bitmap). 
+ +Inputs: + vcb Pointer to volume where space is to be freed + startingBlock First block number to mark as freed + numBlocks Number of blocks to mark as freed + do_validate If true, validate that the blocks being + deallocated to check if they are within totalBlocks + for current volume and whether they were allocated + before they are marked free. +_______________________________________________________________________ +*/ +static +OSErr BlockMarkFreeInternal( + ExtendedVCB *vcb, + u_int32_t startingBlock_in, + register u_int32_t numBlocks_in, + Boolean do_validate) +{ + OSErr err; + u_int32_t startingBlock = startingBlock_in; + u_int32_t numBlocks = numBlocks_in; + uint32_t unmapStart = startingBlock_in; + uint32_t unmapCount = numBlocks_in; + uint32_t wordIndexInBlock; + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t wordsLeft; // Number of words left in this bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to OR in) + u_int32_t currentBit; // Bit index within word of current bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef = 0; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_START, startingBlock_in, numBlocks_in, do_validate, 0, 0); + + /* + * NOTE: We use vcb->totalBlocks instead of vcb->allocLimit because we + * need to be able to free blocks being relocated during hfs_truncatefs. + */ + if ((do_validate == true) && + (startingBlock + numBlocks > vcb->totalBlocks)) { +#if ALLOC_DEBUG || DEBUG + panic ("BlockMarkFreeInternal() free non-existent blocks at %u (numBlock=%u) on vol %s\n", startingBlock, numBlocks, vcb->vcbVN); + __builtin_unreachable(); +#else + printf ("hfs: BlockMarkFreeInternal() trying to free non-existent blocks starting at %u (numBlock=%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); + hfs_mark_inconsistent(vcb, HFS_INCONSISTENCY_DETECTED); + err = EIO; + goto Exit; +#endif + } + + // + // Pre-read the bitmap block containing the first word of allocation + // + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_RESERVED); + if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + uint32_t min_unmap = 0, max_unmap = UINT32_MAX; + + // Work out the bounds of any unmap we can send down + struct rl_entry *range; + for (int i = 0; i < 2; ++i) { + TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[i], rl_link) { + if (range->rl_start < startingBlock + && range->rl_end >= min_unmap) { + min_unmap = range->rl_end + 1; + } + if (range->rl_end >= startingBlock + numBlocks + && range->rl_start < max_unmap) { + max_unmap = range->rl_start; + } + } + } + + // + // Figure out how many bits and words per bitmap block. + // + bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + + // + // Look for a range of free blocks immediately before startingBlock + // (up to the start of the current bitmap block). Set unmapStart to + // the first free block. 
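+	//
+	// For example, freeing blocks 1000-1015 while blocks 996-999 are
+	// already free moves unmapStart back to 996, so the unmap issued at
+	// the end covers at least 996-1015 (more, if the forward scan further
+	// down finds free blocks after 1015).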
+ // + currentWord = buffer + wordIndexInBlock; + currentBit = startingBlock % kBitsPerWord; + bitMask = kHighBitInWordMask >> currentBit; + while (unmapStart > min_unmap) { + // Move currentWord/bitMask back by one bit + bitMask <<= 1; + if (bitMask == 0) { + if (--currentWord < buffer) + break; + bitMask = kLowBitInWordMask; + } + + if (*currentWord & SWAP_BE32(bitMask)) + break; // Found an allocated block. Stop searching. + --unmapStart; + ++unmapCount; + } + + // + // If the first block to free doesn't start on a word + // boundary in the bitmap, then treat that first word + // specially. + // + + currentWord = buffer + wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + currentBit = startingBlock % kBitsPerWord; + if (currentBit != 0) { + bitMask = kAllBitsSetInWord >> currentBit; // turn off all bits before currentBit + numBits = kBitsPerWord - currentBit; // number of remaining bits in this word + if (numBits > numBlocks) { + numBits = numBlocks; // entire allocation is inside this one word + bitMask &= ~(kAllBitsSetInWord >> (currentBit + numBits)); // turn off bits after last + } + if ((do_validate == true) && + (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { + goto Corruption; + } + *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap + numBlocks -= numBits; // adjust number of blocks left to free + + ++currentWord; // move to next word + --wordsLeft; // one less word left in this block + } + + // + // Free whole words (32 blocks) at a time. + // + + while (numBlocks >= kBitsPerWord) { + if (wordsLeft == 0) { + // Read in the next bitmap block + startingBlock += bitsPerBlock; // generate a block number in the next bitmap block + + buffer = NULL; + err = ReleaseBitmapBlock(vcb, blockRef, true); + if (err != noErr) goto Exit; + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_RESERVED); + if (err != noErr) goto Exit; + + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + // Readjust currentWord and wordsLeft + currentWord = buffer; + wordsLeft = wordsPerBlock; + } + if ((do_validate == true) && + (*currentWord != SWAP_BE32 (kAllBitsSetInWord))) { + goto Corruption; + } + *currentWord = 0; // clear the entire word + numBlocks -= kBitsPerWord; + + ++currentWord; // move to next word + --wordsLeft; // one less word left in this block + } + + // + // Free any remaining blocks. + // + + if (numBlocks != 0) { + bitMask = ~(kAllBitsSetInWord >> numBlocks); // set first numBlocks bits + if (wordsLeft == 0) { + // Read in the next bitmap block + startingBlock += bitsPerBlock; // generate a block number in the next bitmap block + + buffer = NULL; + err = ReleaseBitmapBlock(vcb, blockRef, true); + if (err != noErr) goto Exit; + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_RESERVED); + if (err != noErr) goto Exit; + + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + currentWord = buffer; + } + if ((do_validate == true) && + (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { + goto Corruption; + } + *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap + + // No need to update currentWord or wordsLeft + } + + // + // Look for a range of free blocks immediately after the range we just freed + // (up to the end of the current bitmap block). 
+ // + wordIndexInBlock = ((startingBlock_in + numBlocks_in - 1) & (bitsPerBlock-1)) / kBitsPerWord; + wordsLeft = wordsPerBlock - wordIndexInBlock; + currentWord = buffer + wordIndexInBlock; + currentBit = (startingBlock_in + numBlocks_in - 1) % kBitsPerWord; + bitMask = kHighBitInWordMask >> currentBit; + while (unmapStart + unmapCount < max_unmap) { + // Move currentWord/bitMask/wordsLeft forward one bit + bitMask >>= 1; + if (bitMask == 0) { + if (--wordsLeft == 0) + break; + ++currentWord; + bitMask = kHighBitInWordMask; + } + + if (*currentWord & SWAP_BE32(bitMask)) + break; // Found an allocated block. Stop searching. + ++unmapCount; + } + +Exit: + + if (buffer) + (void)ReleaseBitmapBlock(vcb, blockRef, true); + + if (err == noErr) { + hfs_unmap_free_extent(vcb, unmapStart, unmapCount); + } + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); + + return err; + +Corruption: +#if DEBUG + panic("hfs: BlockMarkFreeInternal: blocks not allocated!"); + __builtin_unreachable(); +#else + printf ("hfs: BlockMarkFreeInternal() trying to free unallocated blocks on volume %s <%u, %u>\n", + vcb->vcbVN, startingBlock_in, numBlocks_in); + hfs_mark_inconsistent(vcb, HFS_INCONSISTENCY_DETECTED); + err = EIO; + goto Exit; +#endif +} + + +/* +_______________________________________________________________________ + +Routine: BlockFindContiguous + +Function: Find a contiguous range of blocks that are free (bits + clear in the bitmap). If a contiguous range of the + minimum size can't be found, an error will be returned. + This is only needed to support the bitmap-scanning logic, + as the red-black tree should be able to do this by internally + searching its tree. + +Inputs: + vcb Pointer to volume where space is to be allocated + startingBlock Preferred first block of range + endingBlock Last possible block in range + 1 + minBlocks Minimum number of blocks needed. Must be > 0. + maxBlocks Maximum (ideal) number of blocks desired + useMetaZone OK to dip into metadata allocation zone + +Outputs: + actualStartBlock First block of range found, or 0 if error + actualNumBlocks Number of blocks found, or 0 if error + +Returns: + noErr Found at least minBlocks contiguous + dskFulErr No contiguous space found, or all less than minBlocks +_______________________________________________________________________ +*/ + +static OSErr BlockFindContiguous( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t endingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + Boolean trustSummary, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks, + hfs_block_alloc_flags_t flags) +{ + OSErr err; + register u_int32_t currentBlock; // Block we're currently looking at. + u_int32_t firstBlock; // First free block in current extent. + u_int32_t stopBlock; // If we get to this block, stop searching for first free block. + u_int32_t foundBlocks; // Number of contiguous free blocks in current extent. 
+ u_int32_t *buffer = NULL; + register u_int32_t *currentWord; + register u_int32_t bitMask; + register u_int32_t wordsLeft; + register u_int32_t tempWord; + uintptr_t blockRef = 0; + u_int32_t wordsPerBlock; + u_int32_t updated_free_extent = 0; + struct hfsmount *hfsmp = (struct hfsmount*) vcb; + HFSPlusExtentDescriptor best = { 0, 0 }; + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_START, startingBlock, endingBlock, minBlocks, maxBlocks, 0); + + /* + * When we're skipping the metadata zone and the start/end + * range overlaps with the metadata zone then adjust the + * start to be outside of the metadata zone. If the range + * is entirely inside the metadata zone then we can deny the + * request (dskFulErr). + */ + if (!useMetaZone && (vcb->hfs_flags & HFS_METADATA_ZONE)) { + if (startingBlock <= vcb->hfs_metazone_end) { + if (endingBlock > (vcb->hfs_metazone_end + 2)) + startingBlock = vcb->hfs_metazone_end + 1; + else + goto DiskFull; + } + } + + if ((endingBlock - startingBlock) < minBlocks) + { + // The set of blocks we're checking is smaller than the minimum number + // of blocks, so we couldn't possibly find a good range. + goto DiskFull; + } + + stopBlock = endingBlock - minBlocks + 1; + currentBlock = startingBlock; + firstBlock = 0; + + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) + currentBlock = NextBitmapBlock(vcb, currentBlock); + + /* + * Use the summary table if we can. Skip over any totally + * allocated blocks. currentBlock should now point to the first + * block beyond the metadata zone if the metazone allocations are not + * allowed in this invocation. + */ + if ((trustSummary) && (hfsmp->hfs_flags & HFS_SUMMARY_TABLE)) { + uint32_t suggestion; + err = hfs_find_summary_free (hfsmp, currentBlock, &suggestion); + if (err && err != ENOSPC) + goto ErrorExit; + if (err == ENOSPC || suggestion >= stopBlock) + goto DiskFull; + currentBlock = suggestion; + } + + + // + // Pre-read the first bitmap block. + // + err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef, flags); + if ( err != noErr ) goto ErrorExit; + + // + // Figure out where currentBlock is within the buffer. + // + wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; + + wordsLeft = (currentBlock / kBitsPerWord) & (wordsPerBlock-1); // Current index into buffer + currentWord = buffer + wordsLeft; + wordsLeft = wordsPerBlock - wordsLeft; + + uint32_t remaining = (hfsmp->freeBlocks - hfsmp->lockedBlocks + - (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE) + ? 0 : hfsmp->tentativeBlocks)); + + /* + * This outer do-while loop is the main body of this function. Its job is + * to search through the blocks (until we hit 'stopBlock'), and iterate + * through swaths of allocated bitmap until it finds free regions. + */ + + do + { + foundBlocks = 0; + /* + * We will try and update the summary table as we search + * below. Note that we will never update the summary table + * for the first and last blocks that the summary table + * covers. Ideally, we should, but the benefits probably + * aren't that significant so we leave things alone for now. + */ + uint32_t summary_block_scan = 0; + /* + * Inner while loop 1: + * Look for free blocks, skipping over allocated ones. + * + * Initialization starts with checking the initial partial word + * if applicable. 
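+		 *
+		 * For example, if currentBlock % 32 == 7, only bits 7..31 of the
+		 * current word are tested here; if they are all set we fall
+		 * through to the whole-word scan below.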
+ */ + bitMask = currentBlock & kBitsWithinWordMask; + if (bitMask) + { + tempWord = SWAP_BE32(*currentWord); // Fetch the current word only once + bitMask = kHighBitInWordMask >> bitMask; + while (tempWord & bitMask) + { + bitMask >>= 1; + ++currentBlock; + } + + // Did we find an unused bit (bitMask != 0), or run out of bits (bitMask == 0)? + if (bitMask) + goto FoundUnused; + + // Didn't find any unused bits, so we're done with this word. + ++currentWord; + --wordsLeft; + } + + // + // Check whole words + // + while (currentBlock < stopBlock) + { + // See if it's time to read another block. + if (wordsLeft == 0) + { + buffer = NULL; + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + /* + * If summary_block_scan is non-zero, then we must have + * pulled a bitmap file block into core, and scanned through + * the entire thing. Because we're in this loop, we are + * implicitly trusting that the bitmap didn't have any knowledge + * about this particular block. As a result, update the bitmap + * (lazily, now that we've scanned it) with our findings that + * this particular block is completely used up. + */ + if (summary_block_scan != 0) { + uint32_t summary_bit; + (void) hfs_get_summary_index (hfsmp, summary_block_scan, &summary_bit); + hfs_set_summary (hfsmp, summary_bit, 1); + } + } + err = ReleaseBitmapBlock(vcb, blockRef, false); + if (err != noErr) goto ErrorExit; + + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) { + currentBlock = NextBitmapBlock(vcb, currentBlock); + if (currentBlock >= stopBlock) { + goto LoopExit; + } + } + + /* Skip over fully allocated bitmap blocks if we can */ + if ((trustSummary) && (hfsmp->hfs_flags & HFS_SUMMARY_TABLE)) { + uint32_t suggestion; + err = hfs_find_summary_free (hfsmp, currentBlock, &suggestion); + if (err && err != ENOSPC) + goto ErrorExit; + if (err == ENOSPC || suggestion >= stopBlock) + goto LoopExit; + currentBlock = suggestion; + } + + err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef, flags); + if ( err != noErr ) goto ErrorExit; + + /* + * Set summary_block_scan to be the block we just read into the block cache. + * + * At this point, we've just read an allocation block worth of bitmap file + * into the buffer above, but we don't know if it is completely allocated or not. + * If we find that it is completely allocated/full then we will jump + * through this loop again and set the appropriate summary bit as fully allocated. + */ + summary_block_scan = currentBlock; + currentWord = buffer; + wordsLeft = wordsPerBlock; + } + + // See if any of the bits are clear + if ((tempWord = SWAP_BE32(*currentWord)) + 1) // non-zero if any bits were clear + { + // Figure out which bit is clear + bitMask = kHighBitInWordMask; + while (tempWord & bitMask) + { + bitMask >>= 1; + ++currentBlock; + } + + break; // Found the free bit; break out to FoundUnused. + } + + // Keep looking at the next word + currentBlock += kBitsPerWord; + ++currentWord; + --wordsLeft; + } + +FoundUnused: + // Make sure the unused bit is early enough to use + if (currentBlock >= stopBlock) + { + break; + } + + // Remember the start of the extent + firstBlock = currentBlock; + + + /* + * Inner while loop 2: + * We get here if we find a free block. Count the number + * of contiguous free blocks observed. + * + * Initialization starts with checking the initial partial word + * if applicable. 
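+		 *
+		 * The count is capped at maxBlocks: once (currentBlock - firstBlock)
+		 * reaches maxBlocks there is no point scanning this extent any
+		 * further.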
+ */ + bitMask = currentBlock & kBitsWithinWordMask; + if (bitMask) + { + tempWord = SWAP_BE32(*currentWord); // Fetch the current word only once + bitMask = kHighBitInWordMask >> bitMask; + while (bitMask && !(tempWord & bitMask)) + { + bitMask >>= 1; + ++currentBlock; + } + + // Did we find a used bit (bitMask != 0), or run out of bits (bitMask == 0)? + if (bitMask) + goto FoundUsed; + + // Didn't find any used bits, so we're done with this word. + ++currentWord; + --wordsLeft; + } + + // + // Check whole words + // + while (currentBlock < endingBlock) + { + // See if it's time to read another block. + if (wordsLeft == 0) + { + buffer = NULL; + err = ReleaseBitmapBlock(vcb, blockRef, false); + if (err != noErr) goto ErrorExit; + + /* + * Skip over metadata blocks. + */ + if (!useMetaZone) { + u_int32_t nextBlock; + + nextBlock = NextBitmapBlock(vcb, currentBlock); + if (nextBlock != currentBlock) { + goto LoopExit; /* allocation gap, so stop */ + } + } + + err = ReadBitmapBlock(vcb, currentBlock, &buffer, &blockRef, flags); + if ( err != noErr ) goto ErrorExit; + + currentWord = buffer; + wordsLeft = wordsPerBlock; + } + + // See if any of the bits are set + if ((tempWord = SWAP_BE32(*currentWord)) != 0) + { + // Figure out which bit is set + bitMask = kHighBitInWordMask; + while (!(tempWord & bitMask)) + { + bitMask >>= 1; + ++currentBlock; + } + + break; // Found the used bit; break out to FoundUsed. + } + + // Keep looking at the next word + currentBlock += kBitsPerWord; + ++currentWord; + --wordsLeft; + + // If we found at least maxBlocks, we can quit early. + if ((currentBlock - firstBlock) >= maxBlocks) + break; + } + +FoundUsed: + // Make sure we didn't run out of bitmap looking for a used block. + // If so, pin to the end of the bitmap. + if (currentBlock > endingBlock) + currentBlock = endingBlock; + + // Figure out how many contiguous free blocks there were. + // Pin the answer to maxBlocks. + foundBlocks = currentBlock - firstBlock; + if (foundBlocks > maxBlocks) + foundBlocks = maxBlocks; + + if (remaining) { + if (foundBlocks > remaining) { + hfs_debug("hfs: found more blocks than are indicated free!\n"); + remaining = UINT32_MAX; + } else + remaining -= foundBlocks; + } + + if (ISSET(flags, HFS_ALLOC_TRY_HARD)) { + if (foundBlocks > best.blockCount) { + best.startBlock = firstBlock; + best.blockCount = foundBlocks; + } + + if (foundBlocks >= maxBlocks || best.blockCount >= remaining) + break; + + /* + * Note that we will go ahead and add this free extent to our + * cache below but that's OK because we'll remove it again if we + * decide to use this extent. + */ + } else if (foundBlocks >= minBlocks) + break; // Found what we needed! + + /* + * We did not find the total blocks we were looking for, but + * add this free block run to our free extent cache list, if possible. + */ + + // If we're ignoring tentative ranges, we need to account for them here + if (ISSET(flags, HFS_ALLOC_IGNORE_TENTATIVE)) { + struct rl_entry free_extent = rl_make(firstBlock, firstBlock + foundBlocks - 1); + struct rl_entry *range;; + TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS], rl_link) { + rl_subtract(&free_extent, range); + if (rl_len(range) == 0) + break; + } + firstBlock = free_extent.rl_start; + foundBlocks = rl_len(&free_extent); + } + + if (foundBlocks) { + if (hfsmp->jnl == NULL) { + /* If there is no journal, go ahead and add to the free ext cache. 
*/ + updated_free_extent = add_free_extent_cache(vcb, firstBlock, foundBlocks); + } + else { + /* + * If journaled, only add to the free extent cache if this block is not + * waiting for a TRIM to complete; that implies that the transaction that freed it + * has not yet been committed to stable storage. + */ + int recently_deleted = 0; + uint32_t nextblock; + err = CheckUnmappedBytes(hfsmp, (uint64_t)firstBlock, + (uint64_t)foundBlocks, &recently_deleted, &nextblock); + if ((err) || (recently_deleted == 0)) { + /* if we hit an error, or the blocks not recently freed, go ahead and insert it */ + updated_free_extent = add_free_extent_cache(vcb, firstBlock, foundBlocks); + } + } + } + } while (currentBlock < stopBlock); +LoopExit: + + if (ISSET(flags, HFS_ALLOC_TRY_HARD)) { + firstBlock = best.startBlock; + foundBlocks = best.blockCount; + } + + // Return the outputs. + if (foundBlocks < minBlocks) + { +DiskFull: + err = dskFulErr; +ErrorExit: + *actualStartBlock = 0; + *actualNumBlocks = 0; + } + else + { + err = noErr; + *actualStartBlock = firstBlock; + *actualNumBlocks = foundBlocks; + /* + * Sanity check for overflow + */ + if ((firstBlock + foundBlocks) > vcb->allocLimit) { + panic("hfs: blk allocation overflow on \"%s\" sb:0x%08x eb:0x%08x cb:0x%08x fb:0x%08x stop:0x%08x min:0x%08x found:0x%08x", + vcb->vcbVN, startingBlock, endingBlock, currentBlock, + firstBlock, stopBlock, minBlocks, foundBlocks); + } + } + + if (updated_free_extent && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) { + int i; + u_int32_t min_start = vcb->totalBlocks; + + // set the nextAllocation pointer to the smallest free block number + // we've seen so on the next mount we won't rescan unnecessarily + lck_spin_lock(&vcb->vcbFreeExtLock); + for(i=0; i < (int)vcb->vcbFreeExtCnt; i++) { + if (vcb->vcbFreeExt[i].startBlock < min_start) { + min_start = vcb->vcbFreeExt[i].startBlock; + } + } + lck_spin_unlock(&vcb->vcbFreeExtLock); + if (min_start != vcb->totalBlocks) { + if (min_start < vcb->nextAllocation) { + vcb->nextAllocation = min_start; + } + if (min_start < vcb->sparseAllocation) { + vcb->sparseAllocation = min_start; + } + } + } + + if (buffer) + (void) ReleaseBitmapBlock(vcb, blockRef, false); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + + return err; +} + + +/* + * Count number of bits set in the given 32-bit unsigned number + * + * Returns: + * Number of bits set + */ +static int num_bits_set(u_int32_t num) +{ + int count; + + for (count = 0; num; count++) { + num &= num - 1; + } + + return count; +} + +/* + * For a given range of blocks, find the total number of blocks + * allocated. If 'stop_on_first' is true, it stops as soon as it + * encounters the first allocated block. This option is useful + * to determine if any block is allocated or not. + * + * Inputs: + * startingBlock First allocation block number of the range to be scanned. + * numBlocks Total number of blocks that need to be scanned. + * stop_on_first Stop the search after the first allocated block is found. + * + * Output: + * allocCount Total number of allocation blocks allocated in the given range. + * + * On error, it is the number of allocated blocks found + * before the function got an error. + * + * If 'stop_on_first' is set, + * allocCount = 1 if any allocated block was found. + * allocCount = 0 if no allocated block was found. + * + * Returns: + * 0 on success, non-zero on failure. 
+ */ +static int +hfs_isallocated_internal(struct hfsmount *hfsmp, u_int32_t startingBlock, + u_int32_t numBlocks, Boolean stop_on_first, u_int32_t *allocCount) +{ + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t wordsLeft; // Number of words left in this bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to test) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + u_int32_t blockCount = 0; + int error; + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_START, startingBlock, numBlocks, stop_on_first, 0, 0); + + /* + * Pre-read the bitmap block containing the first word of allocation + */ + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_TENTATIVE); + if (error) + goto JustReturn; + + /* + * Initialize currentWord, and wordsLeft. + */ + { + u_int32_t wordIndexInBlock; + + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + } + + /* + * First test any non word aligned bits. + */ + firstBit = startingBlock % kBitsPerWord; + if (firstBit != 0) { + bitMask = kAllBitsSetInWord >> firstBit; + numBits = kBitsPerWord - firstBit; + if (numBits > numBlocks) { + numBits = numBlocks; + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); + } + numBlocks -= numBits; + ++currentWord; + --wordsLeft; + } + + /* + * Test whole words (32 blocks) at a time. + */ + while (numBlocks >= kBitsPerWord) { + if (wordsLeft == 0) { + /* Read in the next bitmap block. */ + startingBlock += bitsPerBlock; + + buffer = NULL; + error = ReleaseBitmapBlock(hfsmp, blockRef, false); + if (error) goto Exit; + + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_TENTATIVE); + if (error) goto Exit; + + /* Readjust currentWord and wordsLeft. */ + currentWord = buffer; + wordsLeft = wordsPerBlock; + } + if (*currentWord != 0) { + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord); + } + numBlocks -= kBitsPerWord; + ++currentWord; + --wordsLeft; + } + + /* + * Test any remaining blocks. 
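+ *
+ * Worked example (hypothetical remainder): with numBlocks == 3, the mask
+ * built below is ~(kAllBitsSetInWord >> 3) == ~0x1FFFFFFF == 0xE0000000,
+ * so only the three high-order bits of the final word get tested.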
+ */ + if (numBlocks != 0) { + bitMask = ~(kAllBitsSetInWord >> numBlocks); + if (wordsLeft == 0) { + /* Read in the next bitmap block */ + startingBlock += bitsPerBlock; + + buffer = NULL; + error = ReleaseBitmapBlock(hfsmp, blockRef, false); + if (error) goto Exit; + + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_TENTATIVE); + if (error) goto Exit; + + currentWord = buffer; + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); + } + } +Exit: + if (buffer) { + (void)ReleaseBitmapBlock(hfsmp, blockRef, false); + } + if (allocCount) { + *allocCount = blockCount; + } + +JustReturn: + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_END, error, 0, blockCount, 0, 0); + + return (error); +} + +/* + * Count total number of blocks that are allocated in the given + * range from the bitmap. This is used to preflight total blocks + * that need to be relocated during volume resize. + * + * The journal or allocation file lock must be held. + * + * Returns: + * 0 on success, non-zero on failure. + * On failure, allocCount is zero. + */ + int +hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, u_int32_t *allocCount) +{ + return hfs_isallocated_internal(hfsmp, startBlock, numBlocks, false, allocCount); +} + +/* + * Test to see if any blocks in a range are allocated. + * + * Note: On error, this function returns 1, which means that + * one or more blocks in the range are allocated. This function + * is primarily used for volume resize and we do not want + * to report to the caller that the blocks are free when we + * were not able to deterministically find it out. So on error, + * we always report that the blocks are allocated. + * + * The journal or allocation file lock must be held. + * + * Returns + * 0 if all blocks in the range are free. + * 1 if blocks in the range are allocated, or there was an error. + */ + int +hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) +{ + int error; + u_int32_t allocCount; + + error = hfs_isallocated_internal(hfsmp, startingBlock, numBlocks, true, &allocCount); + if (error) { + /* On error, we always say that the blocks are allocated + * so that volume resize does not return false success. + */ + return 1; + } else { + /* The function was deterministically able to find out + * if there was any block allocated or not. In that case, + * the value in allocCount is good enough to be returned + * back to the caller. + */ + return allocCount; + } +} + +/* + * CONFIG_HFS_RBTREE + * Check to see if the red-black tree is live. Allocation file lock must be held + * shared or exclusive to call this function. Note that we may call this even if + * HFS is built without activating the red-black tree code. + */ +int +hfs_isrbtree_active(struct hfsmount *hfsmp){ + +#pragma unused (hfsmp) + + /* Just return 0 for now */ + return 0; +} + + + +/* Summary Table Functions */ +/* + * hfs_check_summary: + * + * This function should be used to query the summary table to see if we can + * bypass a bitmap block or not when we're trying to find a free allocation block. + * + * + * Inputs: + * allocblock - allocation block number. Will be used to infer the correct summary bit. + * hfsmp -- filesystem in question. 
+ * + * Output Arg: + * *freeblocks - set to 1 if we believe at least one free blocks in this vcbVBMIOSize + * page of bitmap file. + * + * + * Returns: + * 0 on success + * EINVAL on error + * + */ + +static int hfs_check_summary (struct hfsmount *hfsmp, uint32_t allocblock, uint32_t *freeblocks) { + + int err = EINVAL; + if (hfsmp->vcbVBMIOSize) { + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + uint32_t index; + if (hfs_get_summary_index (hfsmp, allocblock, &index)) { + *freeblocks = 0; + return EINVAL; + } + + /* Ok, now that we have the bit index into the array, what byte is it in ? */ + uint32_t byteindex = index / kBitsPerByte; + uint8_t current_byte = hfsmp->hfs_summary_table[byteindex]; + uint8_t bit_in_byte = index % kBitsPerByte; + + if (current_byte & (1 << bit_in_byte)) { + /* + * We do not believe there is anything free in the + * entire vcbVBMIOSize'd block. + */ + *freeblocks = 0; + } + else { + /* Looks like there might be a free block here... */ + *freeblocks = 1; + } + } + err = 0; + } + + return err; +} + + +#if 0 +/* + * hfs_get_next_summary + * + * From a given allocation block, jump to the allocation block at the start of the + * next vcbVBMIOSize boundary. This is useful when trying to quickly skip over + * large swaths of bitmap once we have determined that the bitmap is relatively full. + * + * Inputs: hfsmount, starting allocation block number + * Output Arg: *newblock will contain the allocation block number to start + * querying. + * + * Returns: + * 0 on success + * EINVAL if the block argument is too large to be used, or the summary table not live. + * EFBIG if there are no more summary bits to be queried + */ +static int +hfs_get_next_summary (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock) { + + u_int32_t bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; + u_int32_t start_offset; + u_int32_t next_offset; + int err = EINVAL; + + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + if ((err = hfs_get_summary_index(hfsmp, block, &start_offset))) { + return err; + } + + next_offset = start_offset++; + + if ((start_offset >= hfsmp->hfs_summary_size) || (next_offset >= hfsmp->hfs_summary_size)) { + /* Can't jump to the next summary bit. */ + return EINVAL; + } + + /* Otherwise, compute and return */ + *newblock = next_offset * bits_per_iosize; + if (*newblock >= hfsmp->totalBlocks) { + return EINVAL; + } + err = 0; + } + + return err; +} + +#endif + +/* + * hfs_release_summary + * + * Given an extent that is about to be de-allocated on-disk, determine the number + * of summary bitmap bits that need to be marked as 'potentially available'. + * Then go ahead and mark them as free. + * + * Inputs: + * hfsmp - hfs mount + * block - starting allocation block. + * length - length of the extent. + * + * Returns: + * EINVAL upon any errors. 
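+ *
+ * Illustrative call (a sketch only; the block numbers are hypothetical):
+ *
+ *	// A 100-block extent starting at block 50000 was just deallocated on
+ *	// disk, so mark every summary bit covering it as potentially free.
+ *	(void) hfs_release_summary(hfsmp, 50000, 100);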
+ */ +static int hfs_release_summary(struct hfsmount *hfsmp, uint32_t start_blk, uint32_t length) { + int err = EINVAL; + uint32_t end_blk = (start_blk + length) - 1; + + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + /* Figure out what the starting / ending block's summary bits are */ + uint32_t start_bit; + uint32_t end_bit; + uint32_t current_bit; + + err = hfs_get_summary_index (hfsmp, start_blk, &start_bit); + if (err) { + goto release_err; + } + err = hfs_get_summary_index (hfsmp, end_blk, &end_bit); + if (err) { + goto release_err; + } + + if (ALLOC_DEBUG) { + if (start_bit > end_bit) { + panic ("HFS: start > end!, %d %d ", start_bit, end_bit); + } + } + current_bit = start_bit; + while (current_bit <= end_bit) { + err = hfs_set_summary (hfsmp, current_bit, 0); + current_bit++; + } + } + +release_err: + return err; +} + +/* + * hfs_find_summary_free + * + * Given a allocation block as input, returns an allocation block number as output as a + * suggestion for where to start scanning the bitmap in order to find free blocks. It will + * determine the vcbVBMIOsize of the input allocation block, convert that into a summary + * bit, then keep iterating over the summary bits in order to find the first free one. + * + * Inputs: + * hfsmp - hfs mount + * block - starting allocation block + * newblock - output block as suggestion + * + * Returns: + * 0 on success + * ENOSPC if we could not find a free block + */ + +int hfs_find_summary_free (struct hfsmount *hfsmp, uint32_t block, uint32_t *newblock) { + + int err = ENOSPC; + uint32_t bit_index = 0; + uint32_t maybe_has_blocks = 0; + + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + uint32_t byte_index; + uint8_t curbyte; + uint8_t bit_in_byte; + uint32_t summary_cap; + + /* + * We generate a cap for the summary search because the summary table + * always represents a full summary of the bitmap FILE, which may + * be way more bits than are necessary for the actual filesystem + * whose allocations are mapped by the bitmap. + * + * Compute how much of hfs_summary_size is useable for the given number + * of allocation blocks eligible on this FS. + */ + err = hfs_get_summary_index (hfsmp, hfsmp->allocLimit - 1, &summary_cap); + if (err) { + goto summary_exit; + } + + /* Check the starting block first */ + err = hfs_check_summary (hfsmp, block, &maybe_has_blocks); + if (err) { + goto summary_exit; + } + + if (maybe_has_blocks) { + /* + * It looks like the initial start block could have something. + * Short-circuit and just use that. + */ + *newblock = block; + goto summary_exit; + } + + /* + * OK, now we know that the first block was useless. + * Get the starting summary bit, and find it in the array + */ + maybe_has_blocks = 0; + err = hfs_get_summary_index (hfsmp, block, &bit_index); + if (err) { + goto summary_exit; + } + + /* Iterate until we find something. */ + while (bit_index <= summary_cap) { + byte_index = bit_index / kBitsPerByte; + curbyte = hfsmp->hfs_summary_table[byte_index]; + bit_in_byte = bit_index % kBitsPerByte; + + if (curbyte & (1 << bit_in_byte)) { + /* nothing here. increment and move on */ + bit_index++; + } + else { + /* + * found something! convert bit_index back into + * an allocation block for use. 'newblock' will now + * contain the proper allocation block # based on the bit + * index. 
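+ * (Worked example with a hypothetical 4K vcbVBMIOSize: one summary bit
+ * spans 4096 * 8 == 32768 allocation blocks, so bit_index 3 converts
+ * back to allocation block 3 * 32768 == 98304.)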
+ */ + err = hfs_get_summary_allocblock (hfsmp, bit_index, newblock); + if (err) { + goto summary_exit; + } + maybe_has_blocks = 1; + break; + } + } + + /* If our loop didn't find anything, set err to ENOSPC */ + if (maybe_has_blocks == 0) { + err = ENOSPC; + } + } + + /* If the summary table is not active for this mount, we'll just return ENOSPC */ +summary_exit: + if (maybe_has_blocks) { + err = 0; + } + + return err; +} + +/* + * hfs_get_summary_allocblock + * + * Convert a summary bit into an allocation block number to use to start searching for free blocks. + * + * Inputs: + * hfsmp - hfs mount + * summarybit - summmary bit index + * *alloc - allocation block number in the bitmap file. + * + * Output: + * 0 on success + * EINVAL on failure + */ +int hfs_get_summary_allocblock (struct hfsmount *hfsmp, uint32_t + summarybit, uint32_t *alloc) { + uint32_t bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; + uint32_t allocblk; + + allocblk = summarybit * bits_per_iosize; + + if (allocblk >= hfsmp->totalBlocks) { + return EINVAL; + } + else { + *alloc = allocblk; + } + + return 0; +} + + +/* + * hfs_set_summary: + * + * This function should be used to manipulate the summary table + * + * The argument 'inuse' will set the value of the bit in question to one or zero + * depending on its value. + * + * Inputs: + * hfsmp - hfs mount + * summarybit - the bit index into the summary table to set/unset. + * inuse - the value to assign to the bit. + * + * Returns: + * 0 on success + * EINVAL on error + * + */ + +static int hfs_set_summary (struct hfsmount *hfsmp, uint32_t summarybit, uint32_t inuse) { + + int err = EINVAL; + if (hfsmp->vcbVBMIOSize) { + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + + if (ALLOC_DEBUG) { + if (hfsmp->hfs_summary_table == NULL) { + panic ("hfs_set_summary: no table for %p ", hfsmp); + } + } + + /* Ok, now that we have the bit index into the array, what byte is it in ? */ + uint32_t byte_index = summarybit / kBitsPerByte; + uint8_t current_byte = hfsmp->hfs_summary_table[byte_index]; + uint8_t bit_in_byte = summarybit % kBitsPerByte; + + if (inuse) { + current_byte = (current_byte | (1 << bit_in_byte)); + } + else { + current_byte = (current_byte & ~(1 << bit_in_byte)); + } + + hfsmp->hfs_summary_table[byte_index] = current_byte; + } + err = 0; + } + + return err; +} + + +/* + * hfs_get_summary_index: + * + * This is a helper function which determines what summary bit represents the vcbVBMIOSize worth + * of IO against the bitmap file. + * + * Returns: + * 0 on success + * EINVAL on failure + */ +static int hfs_get_summary_index (struct hfsmount *hfsmp, uint32_t block, uint32_t* index) { + uint32_t summary_bit; + uint32_t bits_per_iosize; + int err = EINVAL; + + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + /* Is the input block bigger than the total number of blocks? */ + if (block >= hfsmp->totalBlocks) { + return EINVAL; + } + + /* Is there even a vbmIOSize set? */ + if (hfsmp->vcbVBMIOSize == 0) { + return EINVAL; + } + + bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; + + summary_bit = block / bits_per_iosize; + + *index = summary_bit; + err = 0; + } + + return err; +} + +/* + * hfs_init_summary + * + * From a given mount structure, compute how big the summary table should be for the given + * filesystem, then allocate and bzero the memory. 
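+ *
+ * Worked sizing example (hypothetical geometry): with a bitmap file of 1024
+ * allocation blocks, an 8192-byte allocation block size and a 4096-byte
+ * vcbVBMIOSize, the table needs (1024 * 8192) / 4096 == 2048 summary bits,
+ * which, with the one byte of slop added below, is 2048 / 8 + 1 == 257 bytes.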
+ * + * Returns: + * 0 on success + * EINVAL on failure + */ +int +hfs_init_summary (struct hfsmount *hfsmp) { + + uint32_t summary_size; + uint32_t summary_size_bytes; + uint8_t *summary_table; + + if (hfsmp->hfs_allocation_cp == NULL) { + if (ALLOC_DEBUG) { + printf("hfs: summary table cannot progress without a bitmap cnode! \n"); + } + return EINVAL; + } + /* + * The practical maximum size of the summary table is 16KB: + * + * (512MB maximum bitmap size / (4k -- min alloc block size)) / 8 bits/byte. + * + * HFS+ will allow filesystems with allocation block sizes smaller than 4k, but + * the end result is that we'll start to issue I/O in 2k or 1k sized chunks, which makes + * supporting this much worse. The math would instead look like this: + * (512MB / 2k) / 8 == 32k. + * + * So, we will disallow the summary table if the allocation block size is < 4k. + */ + + if (hfsmp->blockSize < HFS_MIN_SUMMARY_BLOCKSIZE) { + printf("hfs: summary table not allowed on FS with block size of %d\n", hfsmp->blockSize); + return EINVAL; + } + + summary_size = hfsmp->hfs_allocation_cp->c_blocks; + + if (ALLOC_DEBUG) { + printf("HFS Summary Table Initialization: Bitmap %u blocks\n", + hfsmp->hfs_allocation_cp->c_blocks); + } + + /* + * If the bitmap IO size is not the same as the allocation block size then + * then re-compute the number of summary bits necessary. Note that above, the + * the default size is the number of allocation blocks in the bitmap *FILE* + * (not the number of bits in the bitmap itself). If the allocation block size + * is large enough though, we may need to increase this. + */ + if (hfsmp->blockSize != hfsmp->vcbVBMIOSize) { + uint64_t lrg_size = (uint64_t) hfsmp->hfs_allocation_cp->c_blocks * (uint64_t) hfsmp->blockSize; + lrg_size = lrg_size / (uint64_t)hfsmp->vcbVBMIOSize; + + /* With a full bitmap and 64k-capped iosize chunks, this would be 64k */ + summary_size = (uint32_t) lrg_size; + } + + /* + * If the block size is the same as the IO Size, then the total number of blocks + * is already equal to the number of IO units, which is our number of summary bits. + */ + + summary_size_bytes = summary_size / kBitsPerByte; + /* Always add one byte, just in case we have a dangling number of bits */ + summary_size_bytes++; + + if (ALLOC_DEBUG) { + printf("HFS Summary Table: vcbVBMIOSize %d summary bits %d \n", hfsmp->vcbVBMIOSize, summary_size); + printf("HFS Summary Table Size (in bytes) %d \n", summary_size_bytes); + } + + /* Store the field in the mount point */ + hfsmp->hfs_summary_size = summary_size; + hfsmp->hfs_summary_bytes = summary_size_bytes; + + summary_table = hfs_mallocz(summary_size_bytes); + + /* enable the summary table */ + hfsmp->hfs_flags |= HFS_SUMMARY_TABLE; + hfsmp->hfs_summary_table = summary_table; + + if (ALLOC_DEBUG) { + if (hfsmp->hfs_summary_table == NULL) { + panic ("HFS Summary Init: no table for %p\n", hfsmp); + } + } + return 0; +} + +/* + * hfs_rebuild_summary + * + * This function should be used to allocate a new hunk of memory for use as a summary + * table, then copy the existing data into it. We use it whenever the filesystem's size + * changes. When a resize is in progress, you can still use the extant summary + * table if it is active. + * + * Inputs: + * hfsmp -- FS in question + * newlength -- new length of the FS in allocation blocks. + * + * Outputs: + * 0 on success, EINVAL on failure. If this function fails, the summary table + * will be disabled for future use. 
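+ *
+ * Illustrative call site (this mirrors UpdateAllocLimit() further below,
+ * which is where a resize lands after changing how many blocks the bitmap
+ * has to describe):
+ *
+ *	hfsmp->allocLimit = new_end_block;
+ *	(void) hfs_rebuild_summary(hfsmp);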
+ * + */ +static int hfs_rebuild_summary (struct hfsmount *hfsmp) { + + uint32_t new_summary_size; + + new_summary_size = hfsmp->hfs_allocation_cp->c_blocks; + + + if (ALLOC_DEBUG) { + printf("HFS Summary Table Re-init: bitmap %u blocks\n", new_summary_size); + } + + /* + * If the bitmap IO size is not the same as the allocation block size, then re-compute + * the number of summary bits necessary. Note that above, the default size is the number + * of allocation blocks in the bitmap *FILE* (not the number of bits that the bitmap manages). + * If the allocation block size is large enough though, we may need to increase this, as + * bitmap IO is capped at 64k per IO + */ + if (hfsmp->blockSize != hfsmp->vcbVBMIOSize) { + uint64_t lrg_size = (uint64_t) hfsmp->hfs_allocation_cp->c_blocks * (uint64_t) hfsmp->blockSize; + lrg_size = lrg_size / (uint64_t)hfsmp->vcbVBMIOSize; + + /* With a full bitmap and 64k-capped iosize chunks, this would be 64k */ + new_summary_size = (uint32_t) lrg_size; + } + + /* + * Ok, we have the new summary bitmap theoretical max size. See if it's the same as + * what we've got already... + */ + if (new_summary_size != hfsmp->hfs_summary_size) { + uint32_t summarybytes = new_summary_size / kBitsPerByte; + uint32_t copysize; + uint8_t *newtable; + /* Add one byte for slop */ + summarybytes++; + + if (ALLOC_DEBUG) { + printf("HFS Summary Table: vcbVBMIOSize %d summary bits %d \n", hfsmp->vcbVBMIOSize, new_summary_size); + printf("HFS Summary Table Size (in bytes) %d \n", summarybytes); + } + + newtable = hfs_mallocz(summarybytes); + + /* + * The new table may be smaller than the old one. If this is true, then + * we can't copy the full size of the existing summary table into the new + * one. + * + * The converse is not an issue since we bzeroed the table above. + */ + copysize = hfsmp->hfs_summary_bytes; + if (summarybytes < hfsmp->hfs_summary_bytes) { + copysize = summarybytes; + } + memcpy (newtable, hfsmp->hfs_summary_table, copysize); + + /* We're all good. Destroy the old copy and update ptrs */ + hfs_free(hfsmp->hfs_summary_table, hfsmp->hfs_summary_bytes); + + hfsmp->hfs_summary_table = newtable; + hfsmp->hfs_summary_size = new_summary_size; + hfsmp->hfs_summary_bytes = summarybytes; + } + + return 0; +} + + +#if ALLOC_DEBUG +/* + * hfs_validate_summary + * + * Validation routine for the summary table. Debug-only function. + * + * Bitmap lock must be held. + * + */ +void hfs_validate_summary (struct hfsmount *hfsmp) { + uint32_t i; + int err; + + /* + * Iterate over all of the bits in the summary table, and verify if + * there really are free blocks in the pages that we believe may + * may contain free blocks. + */ + + if (hfsmp->hfs_summary_table == NULL) { + panic ("HFS Summary: No HFS summary table!"); + } + + /* 131072 bits == 16384 bytes. This is the theoretical max size of the summary table. we add 1 byte for slop */ + if (hfsmp->hfs_summary_size == 0 || hfsmp->hfs_summary_size > 131080) { + panic("HFS Summary: Size is bad! 
%d", hfsmp->hfs_summary_size); + } + + if (hfsmp->vcbVBMIOSize == 0) { + panic("HFS Summary: no VCB VBM IO Size !"); + } + + printf("hfs: summary validation beginning on %s\n", hfsmp->vcbVN); + printf("hfs: summary validation %d summary bits, %d summary blocks\n", hfsmp->hfs_summary_size, hfsmp->totalBlocks); + + + /* iterate through all possible summary bits */ + for (i = 0; i < hfsmp->hfs_summary_size ; i++) { + + uint32_t bits_per_iosize = hfsmp->vcbVBMIOSize * kBitsPerByte; + uint32_t byte_offset = hfsmp->vcbVBMIOSize * i; + + /* Compute the corresponding allocation block for the summary bit. */ + uint32_t alloc_block = i * bits_per_iosize; + + /* + * We use a uint32_t pointer here because it will speed up + * access to the real bitmap data on disk. + */ + uint32_t *block_data; + struct buf *bp; + int counter; + int counter_max; + int saw_free_bits = 0; + + /* Get the block */ + if ((err = ReadBitmapRange (hfsmp, byte_offset, hfsmp->vcbVBMIOSize, &block_data, &bp))) { + panic ("HFS Summary: error (%d) in ReadBitmapRange!", err); + } + + /* Query the status of the bit and then make sure we match */ + uint32_t maybe_has_free_blocks; + err = hfs_check_summary (hfsmp, alloc_block, &maybe_has_free_blocks); + if (err) { + panic ("HFS Summary: hfs_check_summary returned error (%d) ", err); + } + counter_max = hfsmp->vcbVBMIOSize / kBytesPerWord; + + for (counter = 0; counter < counter_max; counter++) { + uint32_t word = block_data[counter]; + + /* We assume that we'll not find any free bits here. */ + if (word != kAllBitsSetInWord) { + if (maybe_has_free_blocks) { + /* All done */ + saw_free_bits = 1; + break; + } + else { + panic ("HFS Summary: hfs_check_summary saw free bits!"); + } + } + } + + if (maybe_has_free_blocks && (saw_free_bits == 0)) { + panic ("HFS Summary: did not see free bits !"); + } + + /* Release the block. */ + if ((err = ReleaseScanBitmapRange (bp))) { + panic ("HFS Summary: Error (%d) in ReleaseScanBitmapRange", err); + } + } + + printf("hfs: summary validation completed successfully on %s\n", hfsmp->vcbVN); + + return; +} +#endif + +/* + * hfs_alloc_scan_range: + * + * This function should be used to scan large ranges of the allocation bitmap + * at one time. It makes two key assumptions: + * + * 1) Bitmap lock is held during the duration of the call (exclusive) + * 2) There are no pages in the buffer cache for any of the bitmap + * blocks that we may encounter. It *MUST* be completely empty. + * + * The expected use case is when we are scanning the bitmap in full while we are + * still mounting the filesystem in order to issue TRIMs or build up the summary + * table for the mount point. It should be done after any potential journal replays + * are completed and their I/Os fully issued. + * + * The key reason for assumption (2) above is that this function will try to issue + * I/O against the bitmap file in chunks as large a possible -- essentially as + * much as the buffer layer will handle (1MB). Because the size of these I/Os + * is larger than what would be expected during normal runtime we must invalidate + * the buffers as soon as we are done with them so that they do not persist in + * the buffer cache for other threads to find, as they'll typically be doing + * allocation-block size I/Os instead. + * + * Input Args: + * hfsmp - hfs mount data structure + * startbit - allocation block # to start our scan. It must be aligned + * on a vcbVBMIOsize boundary. 
+ * list - journal trim list data structure for issuing TRIMs + * + * Output Args: + * bitToScan - Return the next bit to scan if this function is called again. + * Caller will supply this into the next invocation + * of this call as 'startbit'. + */ + +static int hfs_alloc_scan_range(struct hfsmount *hfsmp, u_int32_t startbit, + u_int32_t *bitToScan, struct jnl_trim_list *list) { + + int error; + int readwrite = 1; + u_int32_t curAllocBlock; + struct buf *blockRef = NULL; + u_int32_t *buffer = NULL; + u_int32_t free_offset = 0; //tracks the start of the current free range + u_int32_t size = 0; // tracks the length of the current free range. + u_int32_t iosize = 0; //how much io we should generate against the bitmap + u_int32_t byte_off; // byte offset into the bitmap file. + u_int32_t completed_size; // how much io was actually completed + u_int32_t last_bitmap_block; + u_int32_t current_word; + u_int32_t word_index = 0; + + /* summary table building */ + uint32_t summary_bit = 0; + uint32_t saw_free_blocks = 0; + uint32_t last_marked = 0; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + readwrite = 0; + } + + /* + * Compute how much I/O we should generate here. + * hfs_scan_range_size will validate that the start bit + * converted into a byte offset into the bitmap file, + * is aligned on a VBMIOSize boundary. + */ + error = hfs_scan_range_size (hfsmp, startbit, &iosize); + if (error) { + if (ALLOC_DEBUG) { + panic ("hfs_alloc_scan_range: hfs_scan_range_size error %d\n", error); + } + return error; + } + + if (iosize < hfsmp->vcbVBMIOSize) { + if (ALLOC_DEBUG) { + panic ("hfs_alloc_scan_range: iosize too small! (iosize %d)\n", iosize); + } + return EINVAL; + } + + /* hfs_scan_range_size should have verified startbit. Convert it to bytes */ + byte_off = startbit / kBitsPerByte; + + /* + * When the journal replays blocks, it does so by writing directly to the disk + * device (bypassing any filesystem vnodes and such). When it finishes its I/Os + * it also immediately re-reads and invalidates the range covered by the bp so + * it does not leave anything lingering in the cache (for iosize reasons). + * + * As such, it is safe to do large I/Os here with ReadBitmapRange. + * + * NOTE: It is not recommended, but it is possible to call the function below + * on sections of the bitmap that may be in core already as long as the pages are not + * dirty. In that case, we'd notice that something starting at that + * logical block of the bitmap exists in the metadata cache, and we'd check + * if the iosize requested is the same as what was already allocated for it. + * Odds are pretty good we're going to request something larger. In that case, + * we just free the existing memory associated with the buf and reallocate a + * larger range. This function should immediately invalidate it as soon as we're + * done scanning, so this shouldn't cause any coherency issues. + */ + + error = ReadBitmapRange(hfsmp, byte_off, iosize, &buffer, &blockRef); + if (error) { + if (ALLOC_DEBUG) { + panic ("hfs_alloc_scan_range: start %d iosize %d ReadBitmapRange error %d\n", startbit, iosize, error); + } + return error; + } + + /* + * At this point, we have a giant wired buffer that represents some portion of + * the bitmap file that we want to analyze. We may not have gotten all 'iosize' + * bytes though, so clip our ending bit to what we actually read in. 
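+ * (Hypothetical example: if only 524288 of the requested 'iosize' bytes
+ * came back, buf_count() below reports 524288 and this chunk covers
+ * 524288 * 8 == 4194304 allocation blocks starting at 'startbit'.)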
+ */ + completed_size = buf_count(blockRef); + last_bitmap_block = completed_size * kBitsPerByte; + last_bitmap_block = last_bitmap_block + startbit; + + /* Cap the last block to the total number of blocks if required */ + if (last_bitmap_block > hfsmp->totalBlocks) { + last_bitmap_block = hfsmp->totalBlocks; + } + + /* curAllocBlock represents the logical block we're analyzing. */ + curAllocBlock = startbit; + word_index = 0; + size = 0; + + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + if (hfs_get_summary_index (hfsmp, startbit, &summary_bit)) { + error = EINVAL; + if (ALLOC_DEBUG) { + panic ("hfs_alloc_scan_range: Could not acquire summary index for %u", startbit); + } + return error; + } + /* + * summary_bit should now be set to the summary bit corresponding to + * the allocation block of the first bit that we're supposed to scan + */ + } + saw_free_blocks = 0; + + while (curAllocBlock < last_bitmap_block) { + u_int32_t bit; + + /* Update the summary table as needed */ + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + if (ALLOC_DEBUG) { + if (hfsmp->hfs_summary_table == NULL) { + panic ("hfs_alloc_scan_range: no summary table!"); + } + } + + uint32_t temp_summary; + error = hfs_get_summary_index (hfsmp, curAllocBlock, &temp_summary); + if (error) { + if (ALLOC_DEBUG) { + panic ("hfs_alloc_scan_range: could not get summary index for %u", curAllocBlock); + } + return EINVAL; + } + + if (ALLOC_DEBUG) { + if (temp_summary < summary_bit) { + panic ("hfs_alloc_scan_range: backwards summary bit?\n"); + } + } + + /* + * If temp_summary is greater than summary_bit, then this + * means that the next allocation block crosses a vcbVBMIOSize boundary + * and we should treat this range of on-disk data as part of a new summary + * bit. + */ + if (temp_summary > summary_bit) { + if (saw_free_blocks == 0) { + /* Mark the bit as totally consumed in the summary table */ + hfs_set_summary (hfsmp, summary_bit, 1); + } + else { + /* Mark the bit as potentially free in summary table */ + hfs_set_summary (hfsmp, summary_bit, 0); + } + last_marked = summary_bit; + /* + * Any time we set the summary table, update our counter which tracks + * what the last bit that was fully marked in the summary table. + * + * Then reset our marker which says we haven't seen a free bit yet. + */ + saw_free_blocks = 0; + summary_bit = temp_summary; + } + } /* End summary table conditions */ + + current_word = SWAP_BE32(buffer[word_index]); + /* Iterate through the word 1 bit at a time... 
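+ * Each pass below tests (current_word & (kHighBitInWordMask >> bit)):
+ * bit 0 checks the most significant bit, 0x80000000, and bit 31 the least
+ * significant bit, with curAllocBlock advancing in lock step so that it
+ * always names the allocation block being tested.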
*/ + for (bit = 0 ; bit < kBitsPerWord ; bit++, curAllocBlock++) { + if (curAllocBlock >= last_bitmap_block) { + break; + } + u_int32_t allocated = (current_word & (kHighBitInWordMask >> bit)); + + if (allocated) { + if (size != 0) { + if (readwrite) { + /* Insert the previously tracked range of free blocks to the trim list */ + hfs_track_unmap_blocks (hfsmp, free_offset, size, list); + } + add_free_extent_cache (hfsmp, free_offset, size); + size = 0; + free_offset = 0; + } + } + else { + /* Not allocated */ + size++; + if (free_offset == 0) { + /* Start a new run of free spcae at curAllocBlock */ + free_offset = curAllocBlock; + } + if (saw_free_blocks == 0) { + saw_free_blocks = 1; + } + } + } /* end for loop iterating through the word */ + + if (curAllocBlock < last_bitmap_block) { + word_index++; + } + + } /* End while loop (iterates through last_bitmap_block) */ + + + /* + * We've (potentially) completed our pass through this region of bitmap, + * but one thing we may not have done is updated that last summary bit for + * the last page we scanned, because we would have never transitioned across + * a vcbVBMIOSize boundary again. Check for that and update the last bit + * as needed. + * + * Note that 'last_bitmap_block' is *not* inclusive WRT the very last bit in the bitmap + * for the region of bitmap on-disk that we were scanning. (it is one greater). + */ + if ((curAllocBlock >= last_bitmap_block) && + (hfsmp->hfs_flags & HFS_SUMMARY_TABLE)) { + uint32_t temp_summary; + /* temp_block should be INSIDE the region we just scanned, so subtract 1 */ + uint32_t temp_block = last_bitmap_block - 1; + error = hfs_get_summary_index (hfsmp, temp_block, &temp_summary); + if (error) { + if (ALLOC_DEBUG) { + panic ("hfs_alloc_scan_range: end bit curAllocBlock %u, last_bitmap_block %u", curAllocBlock, last_bitmap_block); + } + return EINVAL; + } + + /* Did we already update this in the table? */ + if (temp_summary > last_marked) { + if (saw_free_blocks == 0) { + hfs_set_summary (hfsmp, temp_summary, 1); + } + else { + hfs_set_summary (hfsmp, temp_summary, 0); + } + } + } + + /* + * We may have been tracking a range of free blocks that hasn't been inserted yet. + * Keep the logic for the TRIM and free extent separate from that of the summary + * table management even though they are closely linked. + */ + if (size != 0) { + if (readwrite) { + hfs_track_unmap_blocks (hfsmp, free_offset, size, list); + } + add_free_extent_cache (hfsmp, free_offset, size); + } + + /* + * curAllocBlock represents the next block we need to scan when we return + * to this function. + */ + *bitToScan = curAllocBlock; + ReleaseScanBitmapRange(blockRef); + + return 0; + +} + + + +/* + * Compute the maximum I/O size to generate against the bitmap file + * Will attempt to generate at LEAST VBMIOsize I/Os for interior ranges of the bitmap. + * + * Inputs: + * hfsmp -- hfsmount to look at + * bitmap_off -- bit offset into the bitmap file + * + * Outputs: + * iosize -- iosize to generate. + * + * Returns: + * 0 on success; EINVAL otherwise + */ +static int hfs_scan_range_size (struct hfsmount *hfsmp, uint32_t bitmap_st, uint32_t *iosize) { + + /* + * The maximum bitmap size is 512MB regardless of ABN size, so we can get away + * with 32 bit math in this function. + */ + + uint32_t bitmap_len; + uint32_t remaining_bitmap; + uint32_t target_iosize; + uint32_t bitmap_off; + + /* Is this bit index not word aligned? If so, immediately fail. 
*/ + if (bitmap_st % kBitsPerWord) { + if (ALLOC_DEBUG) { + panic ("hfs_scan_range_size unaligned start bit! bitmap_st %d \n", bitmap_st); + } + return EINVAL; + } + + /* bitmap_off is in bytes, not allocation blocks/bits */ + bitmap_off = bitmap_st / kBitsPerByte; + + if ((hfsmp->totalBlocks <= bitmap_st) || (bitmap_off > (512 * 1024 * 1024))) { + if (ALLOC_DEBUG) { + panic ("hfs_scan_range_size: invalid start! bitmap_st %d, bitmap_off %d\n", bitmap_st, bitmap_off); + } + return EINVAL; + } + + /* + * Also invalid if it's not at least aligned to HFS bitmap logical + * block boundaries. We don't have to emit an iosize that's an + * exact multiple of the VBMIOSize, but it must start on such + * a boundary. + * + * The vcbVBMIOSize may be SMALLER than the allocation block size + * on a FS with giant allocation blocks, but it will never be + * greater than it, so it should be safe to start I/O + * aligned on a VBMIOsize boundary. + */ + if (bitmap_off & (hfsmp->vcbVBMIOSize - 1)) { + if (ALLOC_DEBUG) { + panic ("hfs_scan_range_size: unaligned start! bitmap_off %d\n", bitmap_off); + } + return EINVAL; + } + + /* + * Generate the total bitmap file length in bytes, then round up + * that value to the end of the last allocation block, if needed (It + * will probably be needed). We won't scan past the last actual + * allocation block. + * + * Unless we're completing the bitmap scan (or bitmap < 1MB), we + * have to complete the I/O on VBMIOSize boundaries, but we can only read + * up until the end of the bitmap file. + */ + bitmap_len = roundup(hfsmp->totalBlocks, hfsmp->blockSize * 8) / 8; + + remaining_bitmap = bitmap_len - bitmap_off; + + /* + * io size is the MIN of the maximum I/O we can generate or the + * remaining amount of bitmap. + */ + target_iosize = MIN((MAXBSIZE), remaining_bitmap); + *iosize = target_iosize; + + return 0; +} + + + + +/* + * This function is basically the same as hfs_isallocated, except it's designed for + * use with the red-black tree validation code. It assumes we're only checking whether + * one bit is active, and that we're going to pass in the buf to use, since GenerateTree + * calls ReadBitmapBlock and will have that buf locked down for the duration of its operation. + * + * This should not be called in general purpose scanning code. + */ +int hfs_isallocated_scan(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t *bp_buf) { + + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to test) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t bitsPerBlock; + uintptr_t blockRef = 0; + u_int32_t numBlocks = 1; + u_int32_t *buffer = NULL; + + int inuse = 0; + int error; + + + if (bp_buf) { + /* just use passed-in buffer if avail. */ + buffer = bp_buf; + } + else { + /* + * Pre-read the bitmap block containing the first word of allocation + */ + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef, + HFS_ALLOC_IGNORE_TENTATIVE); + if (error) + return (error); + } + + /* + * Initialize currentWord, and wordsLeft. + */ + u_int32_t wordIndexInBlock; + + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + + /* + * First test any non word aligned bits. 
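+ *
+ * Worked example (hypothetical alignment): if startingBlock % 32 == 5, then
+ * firstBit is 5, numBits is clipped to 1 (numBlocks is 1 here), and the mask
+ * collapses to (kAllBitsSetInWord >> 5) & ~(kAllBitsSetInWord >> 6), i.e.
+ * 0x04000000, exactly the one bit that describes startingBlock.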
+ */ + firstBit = startingBlock % kBitsPerWord; + bitMask = kAllBitsSetInWord >> firstBit; + numBits = kBitsPerWord - firstBit; + if (numBits > numBlocks) { + numBits = numBlocks; + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + inuse = 1; + goto Exit; + } + ++currentWord; + +Exit: + if(bp_buf == NULL) { + if (buffer) { + (void)ReleaseBitmapBlock(hfsmp, blockRef, false); + } + } + return (inuse); + + + +} + +/* + * This function resets all of the data structures relevant to the + * free extent cache stored in the hfsmount struct. + * + * If we are using the red-black tree code then we need to account for the fact that + * we may encounter situations where we need to jettison the tree. If that is the + * case, then we fail-over to the bitmap scanning logic, but we need to ensure that + * the free ext cache is zeroed before we start using it. + * + * We also reset and disable the cache when allocLimit is updated... which + * is when a volume is being resized (via hfs_truncatefs() or hfs_extendfs()). + * It is independent of the type of allocator being used currently. + */ +void ResetVCBFreeExtCache(struct hfsmount *hfsmp) +{ + int bytes; + void *freeExt; + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_START, 0, 0, 0, 0, 0); + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + /* reset Free Extent Count */ + hfsmp->vcbFreeExtCnt = 0; + + /* reset the actual array */ + bytes = kMaxFreeExtents * sizeof(HFSPlusExtentDescriptor); + freeExt = (void*)(hfsmp->vcbFreeExt); + + bzero (freeExt, bytes); + + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, 0, 0); + + return; +} + +/* + * This function is used to inform the allocator if we have to effectively shrink + * or grow the total number of allocation blocks via hfs_truncatefs or hfs_extendfs. + * + * The bitmap lock must be held when calling this function. This function also modifies the + * allocLimit field in the hfs mount point structure in the general case. + * + * In the shrinking case, we'll have to remove all free extents from the red-black + * tree past the specified offset new_end_block. In the growth case, we'll have to force + * a re-scan of the new allocation blocks from our current allocLimit to the new end block. + * + * new_end_block represents the total number of blocks available for allocation in the resized + * filesystem. Block #new_end_block should not be allocatable in the resized filesystem since it + * will be out of the (0, n-1) range that are indexable in the bitmap. + * + * Returns 0 on success + * errno on failure + */ +u_int32_t UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block) { + + /* + * Update allocLimit to the argument specified + */ + hfsmp->allocLimit = new_end_block; + + /* Invalidate the free extent cache completely so that + * it does not have any extents beyond end of current + * volume. + */ + ResetVCBFreeExtCache(hfsmp); + + /* Force a rebuild of the summary table. 
*/ + (void) hfs_rebuild_summary (hfsmp); + + // Delete any tentative ranges that are in the area we're shrinking + struct rl_entry *range, *next_range; + TAILQ_FOREACH_SAFE(range, &hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS], + rl_link, next_range) { + if (rl_overlap(range, new_end_block, RL_INFINITY) != RL_NOOVERLAP) + hfs_release_reserved(hfsmp, range, HFS_TENTATIVE_BLOCKS); + } + + return 0; +} + +/* + * Remove an extent from the list of free extents. + * + * This is a low-level routine. It does not handle overlaps or splitting; + * that is the responsibility of the caller. The input extent must exactly + * match an extent already in the list; it will be removed, and any following + * extents in the list will be shifted up. + * + * Inputs: + * startBlock - Start of extent to remove + * blockCount - Number of blocks in extent to remove + * + * Result: + * The index of the extent that was removed. + */ +static void remove_free_extent_list(struct hfsmount *hfsmp, int index) +{ + if (index < 0 || (uint32_t)index >= hfsmp->vcbFreeExtCnt) { + if (ALLOC_DEBUG) + panic("hfs: remove_free_extent_list: %p: index (%d) out of range (0, %u)", hfsmp, index, hfsmp->vcbFreeExtCnt); + else + printf("hfs: remove_free_extent_list: %p: index (%d) out of range (0, %u)", hfsmp, index, hfsmp->vcbFreeExtCnt); + return; + } + int shift_count = hfsmp->vcbFreeExtCnt - index - 1; + if (shift_count > 0) { + memmove(&hfsmp->vcbFreeExt[index], &hfsmp->vcbFreeExt[index+1], shift_count * sizeof(hfsmp->vcbFreeExt[0])); + } + hfsmp->vcbFreeExtCnt--; +} + + +/* + * Add an extent to the list of free extents. + * + * This is a low-level routine. It does not handle overlaps or coalescing; + * that is the responsibility of the caller. This routine *does* make + * sure that the extent it is adding is inserted in the correct location. + * If the list is full, this routine will handle either removing the last + * extent in the list to make room for the new extent, or ignoring the + * new extent if it is "worse" than the last extent in the list. + * + * Inputs: + * startBlock - Start of extent to add + * blockCount - Number of blocks in extent to add + * + * Result: + * The index where the extent that was inserted, or kMaxFreeExtents + * if the extent was not inserted (the list was full, and the extent + * being added was "worse" than everything in the list). + */ +static int add_free_extent_list(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) +{ + uint32_t i; + + /* ALLOC_DEBUG: Make sure no extents in the list overlap or are contiguous with the input extent. */ + if (ALLOC_DEBUG) { + uint32_t endBlock = startBlock + blockCount; + for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { + if (endBlock < hfsmp->vcbFreeExt[i].startBlock || + startBlock > (hfsmp->vcbFreeExt[i].startBlock + hfsmp->vcbFreeExt[i].blockCount)) { + continue; + } + panic("hfs: add_free_extent_list: %p: extent(%u %u) overlaps existing extent (%u %u) at index %d", + hfsmp, startBlock, blockCount, hfsmp->vcbFreeExt[i].startBlock, hfsmp->vcbFreeExt[i].blockCount, i); + } + } + + /* Figure out what index the new extent should be inserted at. */ + for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + /* The list is sorted by increasing offset. */ + if (startBlock < hfsmp->vcbFreeExt[i].startBlock) { + break; + } + } else { + /* The list is sorted by decreasing size. 
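+ * (Hypothetical example: with cached extent sizes {100, 60, 20}, a new
+ * 50-block extent fails this test at i == 0 and i == 1 and breaks at
+ * i == 2, so it is inserted ahead of the 20-block entry.)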
*/ + if (blockCount > hfsmp->vcbFreeExt[i].blockCount) { + break; + } + } + } + + /* When we get here, i is the index where the extent should be inserted. */ + if (i == kMaxFreeExtents) { + /* + * The new extent is worse than anything already in the list, + * and the list is full, so just ignore the extent to be added. + */ + return i; + } + + /* + * Grow the list (if possible) to make room for an insert. + */ + if (hfsmp->vcbFreeExtCnt < kMaxFreeExtents) + hfsmp->vcbFreeExtCnt++; + + /* + * If we'll be keeping any extents after the insert position, then shift them. + */ + int shift_count = hfsmp->vcbFreeExtCnt - i - 1; + if (shift_count > 0) { + memmove(&hfsmp->vcbFreeExt[i+1], &hfsmp->vcbFreeExt[i], shift_count * sizeof(hfsmp->vcbFreeExt[0])); + } + + /* Finally, store the new extent at its correct position. */ + hfsmp->vcbFreeExt[i].startBlock = startBlock; + hfsmp->vcbFreeExt[i].blockCount = blockCount; + return i; +} + + +/* + * Remove an entry from free extent cache after it has been allocated. + * + * This is a high-level routine. It handles removing a portion of a + * cached extent, potentially splitting it into two (if the cache was + * already full, throwing away the extent that would sort last). It + * also handles removing an extent that overlaps multiple extents in + * the cache. + * + * Inputs: + * hfsmp - mount point structure + * startBlock - starting block of the extent to be removed. + * blockCount - number of blocks of the extent to be removed. + */ +static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) +{ + u_int32_t i, insertedIndex; + u_int32_t currentStart, currentEnd, endBlock; + int extentsRemoved = 0; + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); + + endBlock = startBlock + blockCount; + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + /* + * Iterate over all of the extents in the free extent cache, removing or + * updating any entries that overlap with the input extent. + */ + for (i = 0; i < hfsmp->vcbFreeExtCnt; ++i) { + currentStart = hfsmp->vcbFreeExt[i].startBlock; + currentEnd = currentStart + hfsmp->vcbFreeExt[i].blockCount; + + /* + * If the current extent is entirely before or entirely after the + * the extent to be removed, then we keep it as-is. + */ + if (currentEnd <= startBlock || currentStart >= endBlock) { + continue; + } + + /* + * If the extent being removed entirely contains the current extent, + * then remove the current extent. + */ + if (startBlock <= currentStart && endBlock >= currentEnd) { + remove_free_extent_list(hfsmp, i); + + /* + * We just removed the extent at index i. The extent at + * index i+1 just got shifted to index i. So decrement i + * to undo the loop's "++i", and the next iteration will + * examine index i again, which contains the next extent + * in the list. + */ + --i; + ++extentsRemoved; + continue; + } + + /* + * If the extent being removed is strictly "in the middle" of the + * current extent, then we need to split the current extent into + * two discontiguous extents (the "head" and "tail"). The good + * news is that we don't need to examine any other extents in + * the list. 
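+ * (Hypothetical example: removing blocks 120..129 from a cached extent
+ * covering blocks 100..149 drops that entry and re-adds the head
+ * 100..119 and the tail 130..149 as two separate cache entries.)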
+ */ + if (startBlock > currentStart && endBlock < currentEnd) { + remove_free_extent_list(hfsmp, i); + add_free_extent_list(hfsmp, currentStart, startBlock - currentStart); + add_free_extent_list(hfsmp, endBlock, currentEnd - endBlock); + break; + } + + /* + * The only remaining possibility is that the extent to be removed + * overlaps the start or end (but not both!) of the current extent. + * So we need to replace the current extent with a shorter one. + * + * The only tricky part is that the updated extent might be at a + * different index than the original extent. If the updated extent + * was inserted after the current extent, then we need to re-examine + * the entry at index i, since it now contains the extent that was + * previously at index i+1. If the updated extent was inserted + * before or at the same index as the removed extent, then the + * following extents haven't changed position. + */ + remove_free_extent_list(hfsmp, i); + if (startBlock > currentStart) { + /* Remove the tail of the current extent. */ + insertedIndex = add_free_extent_list(hfsmp, currentStart, startBlock - currentStart); + } else { + /* Remove the head of the current extent. */ + insertedIndex = add_free_extent_list(hfsmp, endBlock, currentEnd - endBlock); + } + if (insertedIndex > i) { + --i; /* Undo the "++i" in the loop, so we examine the entry at index i again. */ + } + } + + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + + sanity_check_free_ext(hfsmp, 0); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, extentsRemoved, 0); + + return; +} + + +/* + * Add an entry to free extent cache after it has been deallocated. + * + * This is a high-level routine. It will merge overlapping or contiguous + * extents into a single, larger extent. + * + * If the extent provided has blocks beyond current allocLimit, it is + * clipped to allocLimit (so that we won't accidentally find and allocate + * space beyond allocLimit). + * + * Inputs: + * hfsmp - mount point structure + * startBlock - starting block of the extent to be removed. + * blockCount - number of blocks of the extent to be removed. + * + * Returns: + * true - if the extent was added successfully to the list + * false - if the extent was not added to the list, maybe because + * the extent was beyond allocLimit, or is not best + * candidate to be put in the cache. + */ +static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) +{ + Boolean retval = false; + uint32_t endBlock; + uint32_t currentEnd; + uint32_t i; + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); + +#if DEBUG + for (i = 0; i < 2; ++i) { + struct rl_entry *range; + TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[i], rl_link) { + hfs_assert(rl_overlap(range, startBlock, + startBlock + blockCount - 1) == RL_NOOVERLAP); + } + } +#endif + + /* No need to add extent that is beyond current allocLimit */ + if (startBlock >= hfsmp->allocLimit) { + goto out_not_locked; + } + + /* If end of the free extent is beyond current allocLimit, clip the extent */ + if ((startBlock + blockCount) > hfsmp->allocLimit) { + blockCount = hfsmp->allocLimit - startBlock; + } + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + /* + * Make a pass through the free extent cache, looking for known extents that + * overlap or are contiguous with the extent to be added. 
We'll remove those + * extents from the cache, and incorporate them into the new extent to be added. + */ + endBlock = startBlock + blockCount; + for (i=0; i < hfsmp->vcbFreeExtCnt; ++i) { + currentEnd = hfsmp->vcbFreeExt[i].startBlock + hfsmp->vcbFreeExt[i].blockCount; + if (hfsmp->vcbFreeExt[i].startBlock > endBlock || currentEnd < startBlock) { + /* Extent i does not overlap and is not contiguous, so keep it. */ + continue; + } else { + /* We need to remove extent i and combine it with the input extent. */ + if (hfsmp->vcbFreeExt[i].startBlock < startBlock) + startBlock = hfsmp->vcbFreeExt[i].startBlock; + if (currentEnd > endBlock) + endBlock = currentEnd; + + remove_free_extent_list(hfsmp, i); + /* + * We just removed the extent at index i. The extent at + * index i+1 just got shifted to index i. So decrement i + * to undo the loop's "++i", and the next iteration will + * examine index i again, which contains the next extent + * in the list. + */ + --i; + } + } + add_free_extent_list(hfsmp, startBlock, endBlock - startBlock); + + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + +out_not_locked: + sanity_check_free_ext(hfsmp, 0); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, retval, 0); + + return retval; +} + +/* Debug function to check if the free extent cache is good or not */ +static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated) +{ + u_int32_t i, j; + + /* Do not do anything if debug is not on */ + if (ALLOC_DEBUG == 0) { + return; + } + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + if (hfsmp->vcbFreeExtCnt > kMaxFreeExtents) + panic("hfs: %p: free extent count (%u) is too large", hfsmp, hfsmp->vcbFreeExtCnt); + + /* + * Iterate the Free extent cache and ensure no entries are bogus or refer to + * allocated blocks. + */ + for(i=0; i < hfsmp->vcbFreeExtCnt; i++) { + u_int32_t start, nblocks; + + start = hfsmp->vcbFreeExt[i].startBlock; + nblocks = hfsmp->vcbFreeExt[i].blockCount; + + /* Check if any of the blocks in free extent cache are allocated. + * This should not be enabled always because it might take + * very long for large extents that get added to the list. + * + * We have to drop vcbFreeExtLock while we call hfs_isallocated + * because it is going to do I/O. Note that the free extent + * cache could change. That's a risk we take when using this + * debugging code. (Another alternative would be to try to + * detect when the free extent cache changed, and perhaps + * restart if the list changed while we dropped the lock.) 
+ */ + if (check_allocated) { + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + if (hfs_isallocated(hfsmp, start, nblocks)) { + panic("hfs: %p: slot %d:(%u,%u) in the free extent array is allocated\n", + hfsmp, i, start, nblocks); + } + lck_spin_lock(&hfsmp->vcbFreeExtLock); + } + + /* Check if any part of the extent is beyond allocLimit */ + if ((start > hfsmp->allocLimit) || ((start + nblocks) > hfsmp->allocLimit)) { + panic ("hfs: %p: slot %d:(%u,%u) in the free extent array is beyond allocLimit=%u\n", + hfsmp, i, start, nblocks, hfsmp->allocLimit); + } + + /* Check if there are any duplicate start blocks */ + for(j=i+1; j < hfsmp->vcbFreeExtCnt; j++) { + if (start == hfsmp->vcbFreeExt[j].startBlock) { + panic("hfs: %p: slot %d:(%u,%u) and %d:(%u,%u) are duplicate\n", + hfsmp, i, start, nblocks, j, hfsmp->vcbFreeExt[j].startBlock, + hfsmp->vcbFreeExt[j].blockCount); + } + } + + /* Check if the entries are out of order */ + if ((i+1) != hfsmp->vcbFreeExtCnt) { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + /* sparse devices are sorted by starting block number (ascending) */ + if (hfsmp->vcbFreeExt[i].startBlock > hfsmp->vcbFreeExt[i+1].startBlock) { + panic ("hfs: %p: SPARSE %d:(%u,%u) and %d:(%u,%u) are out of order\n", + hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock, + hfsmp->vcbFreeExt[i+1].blockCount); + } + } else { + /* normally sorted by block count (descending) */ + if (hfsmp->vcbFreeExt[i].blockCount < hfsmp->vcbFreeExt[i+1].blockCount) { + panic ("hfs: %p: %d:(%u,%u) and %d:(%u,%u) are out of order\n", + hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock, + hfsmp->vcbFreeExt[i+1].blockCount); + } + } + } + } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); +} + +#define BIT_RIGHT_MASK(bit) (0xffffffffffffffffull >> (bit)) + +static int clzll(uint64_t x) +{ + if (x == 0) + return 64; + else + return __builtin_clzll(x); +} + +#if !HFS_ALLOC_TEST + +static errno_t get_more_bits(bitmap_context_t *bitmap_ctx) +{ + uint32_t start_bit; + uint32_t iosize = 0; + uint32_t byte_offset; + uint32_t last_bitmap_block; + int error; + struct hfsmount *hfsmp = bitmap_ctx->hfsmp; +#if !HFS_ALLOC_TEST + uint64_t lock_elapsed; +#endif + + + if (bitmap_ctx->bp) + ReleaseScanBitmapRange(bitmap_ctx->bp); + + if (msleep(NULL, NULL, PINOD | PCATCH, + "hfs_fsinfo", NULL) == EINTR) { + return EINTR; + } + +#if !HFS_ALLOC_TEST + /* + * Let someone else use the allocation map after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME . + * lock_start is initialized in hfs_find_free_extents(). + */ + absolutetime_to_nanoseconds(mach_absolute_time() - bitmap_ctx->lock_start, &lock_elapsed); + + if (lock_elapsed >= HFS_FSINFO_MAX_LOCKHELD_TIME) { + + hfs_systemfile_unlock(hfsmp, bitmap_ctx->lockflags); + + /* add tsleep here to force context switch and fairness */ + tsleep((caddr_t)get_more_bits, PRIBIO, "hfs_fsinfo", 1); + + hfs_journal_lock(hfsmp); + + /* Flush the journal and wait for all I/Os to finish up */ + error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + if (error) { + hfs_journal_unlock(hfsmp); + return error; + } + + /* + * Take bitmap lock to ensure it is not being modified while journal is still held. + * Since we are reading larger than normal blocks from the bitmap, which + * might confuse other parts of the bitmap code using normal blocks, we + * take exclusive lock here. 
+ */ + bitmap_ctx->lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + bitmap_ctx->lock_start = mach_absolute_time(); + + /* Release the journal lock */ + hfs_journal_unlock(hfsmp); + + /* + * Bitmap is read in large block size (up to 1MB), + * unlike the runtime which reads the bitmap in the + * 4K block size. If the bitmap is read by both ways + * at the same time, it can result in multiple buf_t with + * different sizes and potentially case data corruption. + * To avoid this, we invalidate all the existing buffers + * associated with the bitmap vnode. + */ + error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + if (error) { + /* hfs_systemfile_unlock will be called in the caller */ + return error; + } + } +#endif + + start_bit = bitmap_ctx->run_offset; + + if (start_bit >= bitmap_ctx->hfsmp->totalBlocks) { + bitmap_ctx->chunk_end = 0; + bitmap_ctx->bp = NULL; + bitmap_ctx->bitmap = NULL; + return 0; + } + + hfs_assert(start_bit % 8 == 0); + + /* + * Compute how much I/O we should generate here. + * hfs_scan_range_size will validate that the start bit + * converted into a byte offset into the bitmap file, + * is aligned on a VBMIOSize boundary. + */ + error = hfs_scan_range_size (bitmap_ctx->hfsmp, start_bit, &iosize); + if (error) + return error; + + hfs_assert(iosize != 0); + + /* hfs_scan_range_size should have verified startbit. Convert it to bytes */ + byte_offset = start_bit / kBitsPerByte; + + /* + * When the journal replays blocks, it does so by writing directly to the disk + * device (bypassing any filesystem vnodes and such). When it finishes its I/Os + * it also immediately re-reads and invalidates the range covered by the bp so + * it does not leave anything lingering in the cache (for iosize reasons). + * + * As such, it is safe to do large I/Os here with ReadBitmapRange. + * + * NOTE: It is not recommended, but it is possible to call the function below + * on sections of the bitmap that may be in core already as long as the pages are not + * dirty. In that case, we'd notice that something starting at that + * logical block of the bitmap exists in the metadata cache, and we'd check + * if the iosize requested is the same as what was already allocated for it. + * Odds are pretty good we're going to request something larger. In that case, + * we just free the existing memory associated with the buf and reallocate a + * larger range. This function should immediately invalidate it as soon as we're + * done scanning, so this shouldn't cause any coherency issues. + */ + error = ReadBitmapRange(bitmap_ctx->hfsmp, byte_offset, iosize, (uint32_t **)&bitmap_ctx->bitmap, &bitmap_ctx->bp); + if (error) + return error; + + /* + * At this point, we have a giant wired buffer that represents some portion of + * the bitmap file that we want to analyze. We may not have gotten all 'iosize' + * bytes though, so clip our ending bit to what we actually read in. 
+ */ + last_bitmap_block = start_bit + buf_count(bitmap_ctx->bp) * kBitsPerByte; + + /* Cap the last block to the total number of blocks if required */ + if (last_bitmap_block > bitmap_ctx->hfsmp->totalBlocks) + last_bitmap_block = bitmap_ctx->hfsmp->totalBlocks; + + bitmap_ctx->chunk_current = 0; // new chunk of bitmap + bitmap_ctx->chunk_end = last_bitmap_block - start_bit; + + return 0; +} + +#endif // !HFS_ALLOC_TEST + +// Returns number of contiguous bits set at start +static int bit_count_set(void *bitmap, int start, int end) +{ + if (start == end) + return 0; + + hfs_assert(end > start); + + const int start_bit = start & 63; + const int end_bit = end & 63; + + uint64_t *p = (uint64_t *)bitmap + start / 64; + uint64_t x = ~OSSwapBigToHostInt64(*p); + + if ((start & ~63) == (end & ~63)) { + // Start and end in same 64 bits + x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit); + return clzll(x) - start_bit; + } + + // Deal with initial unaligned bit + x &= BIT_RIGHT_MASK(start_bit); + + if (x) + return clzll(x) - start_bit; + + // Go fast + ++p; + int count = 64 - start_bit; + int nquads = (end - end_bit - start - 1) / 64; + + while (nquads--) { + if (*p != 0xffffffffffffffffull) { + x = ~OSSwapBigToHostInt64(*p); + return count + clzll(x); + } + ++p; + count += 64; + } + + if (end_bit) { + x = ~OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit); + count += clzll(x); + } + + return count; +} + +/* Returns the number of a run of cleared bits: + * bitmap is a single chunk of memory being examined + * start: the start bit relative to the current buffer to be examined; start is inclusive. + * end: the end bit relative to the current buffer to be examined; end is not inclusive. + */ +static int bit_count_clr(void *bitmap, int start, int end) +{ + if (start == end) + return 0; + + hfs_assert(end > start); + + const int start_bit = start & 63; + const int end_bit = end & 63; + + uint64_t *p = (uint64_t *)bitmap + start / 64; + uint64_t x = OSSwapBigToHostInt64(*p); + + if ((start & ~63) == (end & ~63)) { + // Start and end in same 64 bits + x = (x & BIT_RIGHT_MASK(start_bit)) | BIT_RIGHT_MASK(end_bit); + + return clzll(x) - start_bit; + } + + // Deal with initial unaligned bit + x &= BIT_RIGHT_MASK(start_bit); + + if (x) + return clzll(x) - start_bit; + + // Go fast + ++p; + int count = 64 - start_bit; + int nquads = (end - end_bit - start - 1) / 64; + + while (nquads--) { + if (*p) { + x = OSSwapBigToHostInt64(*p); + return count + clzll(x); + } + ++p; + count += 64; + } + + if (end_bit) { + x = OSSwapBigToHostInt64(*p) | BIT_RIGHT_MASK(end_bit); + + count += clzll(x); + } + + return count; +} + +#if !HFS_ALLOC_TEST +static errno_t update_summary_table(bitmap_context_t *bitmap_ctx, uint32_t start, uint32_t count, bool set) +{ + uint32_t end, start_summary_bit, end_summary_bit; + errno_t error = 0; + + if (count == 0) + goto out; + + if (!ISSET(bitmap_ctx->hfsmp->hfs_flags, HFS_SUMMARY_TABLE)) + return 0; + + if (hfs_get_summary_index (bitmap_ctx->hfsmp, start, &start_summary_bit)) { + error = EINVAL; + goto out; + } + + end = start + count - 1; + if (hfs_get_summary_index (bitmap_ctx->hfsmp, end, &end_summary_bit)) { + error = EINVAL; + goto out; + } + + // if summary table bit has been updated with free block previously, leave it. 
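+ // Editor's note, added for exposition (not part of the original change):
+ // last_free_summary_bit covers the tail of the preceding free run. If this
+ // allocated run begins in that same summary chunk, the chunk still contains
+ // free blocks, so its summary bit is skipped here instead of being
+ // overwritten on behalf of the allocated run.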
+ if ((start_summary_bit == bitmap_ctx->last_free_summary_bit) && set) + start_summary_bit++; + + for (uint32_t summary_bit = start_summary_bit; summary_bit <= end_summary_bit; summary_bit++) + hfs_set_summary (bitmap_ctx->hfsmp, summary_bit, set); + + if (!set) + bitmap_ctx->last_free_summary_bit = end_summary_bit; + +out: + return error; + +} +#endif //!HFS_ALLOC_TEST + +/* + * Read in chunks of the bitmap into memory, and find a run of cleared/set bits; + * the run can extend across chunk boundaries. + * bit_count_clr can be passed to get a run of cleared bits. + * bit_count_set can be passed to get a run of set bits. + */ +static errno_t hfs_bit_count(bitmap_context_t *bitmap_ctx, int (*fn)(void *, int ,int), uint32_t *bit_count) +{ + int count; + errno_t error = 0; + + *bit_count = 0; + + do { + if (bitmap_ctx->run_offset == 0 || bitmap_ctx->chunk_current == bitmap_ctx->chunk_end) { + if ((error = get_more_bits(bitmap_ctx)) != 0) + goto out; + } + + if (bitmap_ctx->chunk_end == 0) + break; + + count = fn(bitmap_ctx->bitmap, bitmap_ctx->chunk_current, bitmap_ctx->chunk_end); + + bitmap_ctx->run_offset += count; + bitmap_ctx->chunk_current += count; + *bit_count += count; + + } while (bitmap_ctx->chunk_current >= bitmap_ctx->chunk_end && count); + +out: + return error; + +} + +// Returns count of number of bits clear +static errno_t hfs_bit_count_clr(bitmap_context_t *bitmap_ctx, uint32_t *count) +{ + return hfs_bit_count(bitmap_ctx, bit_count_clr, count); +} + +// Returns count of number of bits set +static errno_t hfs_bit_count_set(bitmap_context_t *bitmap_ctx, uint32_t *count) +{ + return hfs_bit_count(bitmap_ctx, bit_count_set, count); +} + +static uint32_t hfs_bit_offset(bitmap_context_t *bitmap_ctx) +{ + return bitmap_ctx->run_offset; +} + +/* + * Perform a full scan of the bitmap file. + * Note: during the scan of bitmap file, it may drop and reacquire the + * bitmap lock to let someone else use the bitmap for fairness. + * Currently it is used by HFS_GET_FSINFO statistic gathing, which + * is run while other processes might perform HFS operations. + */ + +errno_t hfs_find_free_extents(struct hfsmount *hfsmp, + void (*callback)(void *data, off_t free_extent_size), void *callback_arg) +{ + struct bitmap_context bitmap_ctx; + uint32_t count; + errno_t error = 0; + + if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) { + error = hfs_init_summary(hfsmp); + if (error) + return error; + } + + bzero(&bitmap_ctx, sizeof(struct bitmap_context)); + + /* + * The journal maintains list of recently deallocated blocks to + * issue DKIOCUNMAPs when the corresponding journal transaction is + * flushed to the disk. To avoid any race conditions, we only + * want one active trim list. Therefore we make sure that the + * journal trim list is sync'ed, empty, and not modifiable for + * the duration of our scan. + * + * Take the journal lock before flushing the journal to the disk. + * We will keep on holding the journal lock till we don't get the + * bitmap lock to make sure that no new journal transactions can + * start. This will make sure that the journal trim list is not + * modified after the journal flush and before getting bitmap lock. + * We can release the journal lock after we acquire the bitmap + * lock as it will prevent any further block deallocations. 
+ */ + hfs_journal_lock(hfsmp); + + /* Flush the journal and wait for all I/Os to finish up */ + error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + if (error) { + hfs_journal_unlock(hfsmp); + return error; + } + + /* + * Take bitmap lock to ensure it is not being modified. + * Since we are reading larger than normal blocks from the bitmap, which + * might confuse other parts of the bitmap code using normal blocks, we + * take exclusive lock here. + */ + bitmap_ctx.lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + +#if !HFS_ALLOC_TEST + bitmap_ctx.lock_start = mach_absolute_time(); +#endif + + /* Release the journal lock */ + hfs_journal_unlock(hfsmp); + + /* + * Bitmap is read in large block size (up to 1MB), + * unlike the runtime which reads the bitmap in the + * 4K block size. If the bitmap is read by both ways + * at the same time, it can result in multiple buf_t with + * different sizes and potentially case data corruption. + * To avoid this, we invalidate all the existing buffers + * associated with the bitmap vnode. + */ + error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + if (error) + goto out; + + /* + * Get the list of all free extent ranges. hfs_alloc_scan_range() + * will call hfs_fsinfo_data_add() to account for all the free + * extent ranges found during scan. + */ + bitmap_ctx.hfsmp = hfsmp; + bitmap_ctx.run_offset = 0; + + while (bitmap_ctx.run_offset < hfsmp->totalBlocks) { + + uint32_t start = hfs_bit_offset(&bitmap_ctx); + + if ((error = hfs_bit_count_clr(&bitmap_ctx, &count)) != 0) + goto out; + + if (count) + callback(callback_arg, hfs_blk_to_bytes(count, hfsmp->blockSize)); + + if ((error = update_summary_table(&bitmap_ctx, start, count, false)) != 0) + goto out; + + start = hfs_bit_offset(&bitmap_ctx); + + if ((error = hfs_bit_count_set(&bitmap_ctx, &count)) != 0) + goto out; + + if ((error = update_summary_table(&bitmap_ctx, start, count, true)) != 0) + goto out; + } + +out: + if (bitmap_ctx.lockflags) { + hfs_systemfile_unlock(hfsmp, bitmap_ctx.lockflags); + } + + return error; +} + diff --git a/core/hfs.h b/core/hfs.h new file mode 100644 index 0000000..786199c --- /dev/null +++ b/core/hfs.h @@ -0,0 +1,1171 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __HFS__ +#define __HFS__ + +/* If set to 1, enables the code to allocate blocks from the start + * of the disk instead of the nextAllocation for sparse devices like + * sparse disk images or sparsebundle images. The free extent cache + * for such volumes is also maintained based on the start block instead + * of number of contiguous allocation blocks. These devices prefer + * allocation of blocks near the start of the disk to avoid the + * increasing the image size, but it can also result in file fragmentation. + */ +#define HFS_SPARSE_DEV 1 + +#if DEBUG +#define HFS_CHECK_LOCK_ORDER 1 +#endif + +#define HFS_TMPDBG 0 + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../hfs_encodings/hfs_encodings.h" + +#include "hfs_journal.h" +#include "hfs_format.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" +#include "hfs_macos_defs.h" +#include "hfs_hotfiles.h" +#include "hfs_fsctl.h" + +__BEGIN_DECLS + +#if CONFIG_PROTECT +/* Forward declare the cprotect struct */ +struct cprotect; +#endif + +/* + * Just reported via MIG interface. + */ +#define VERSION_STRING "hfs-2 (4-12-99)" + +#define HFS_LINK_MAX 32767 + +#define HFS_MAX_DEFERED_ALLOC (1024*1024) + +#define HFS_MAX_FILES (UINT32_MAX - kHFSFirstUserCatalogNodeID) + +// 400 megs is a "big" file (i.e. one that when deleted +// would touch enough data that we should break it into +// multiple separate transactions) +#define HFS_BIGFILE_SIZE (400LL * 1024LL * 1024LL) + +enum { kMDBSize = 512 }; /* Size of I/O transfer to read entire MDB */ + +enum { kMasterDirectoryBlock = 2 }; /* MDB offset on disk in 512-byte blocks */ +enum { kMDBOffset = kMasterDirectoryBlock * 512 }; /* MDB offset on disk in bytes */ + +#define kRootDirID kHFSRootFolderID + + +/* number of locked buffer caches to hold for b-tree meta data */ +#define kMaxLockedMetaBuffers 32 + +extern struct timezone gTimeZone; + + +/* How many free extents to cache per volume */ +#define kMaxFreeExtents 10 + +/* Maximum file size that we're willing to defrag on open */ +#define HFS_MAX_DEFRAG_SIZE 104857600 // 100 * 1024 * 1024 (100MB) +#define HFS_INITIAL_DEFRAG_SIZE 20971520 // 20 * 1024 * 1024 (20MB) + + +/* The maximum time hfs locks can be held while performing hfs statistics gathering */ +#define HFS_FSINFO_MAX_LOCKHELD_TIME 20 * 1000000ULL /* at most 20 milliseconds. */ + +/* + * HFS_MINFREE gives the minimum acceptable percentage + * of file system blocks which may be free (but this + * minimum will never exceed HFS_MAXRESERVE bytes). If + * the free block count drops below this level only the + * superuser may continue to allocate blocks. + */ +#define HFS_MINFREE 1 +#define HFS_MAXRESERVE ((u_int64_t)(250*1024*1024)) +#define HFS_BT_MAXRESERVE ((u_int64_t)(10*1024*1024)) + +/* + * The system distinguishes between the desirable low-disk + * notifiaction levels for root volumes and non-root volumes. 
+ * The various thresholds are computed as a fraction of the + * volume size, all capped at a certain fixed level + */ + +#define HFS_ROOTVERYLOWDISKTRIGGERFRACTION 5 +#define HFS_ROOTVERYLOWDISKTRIGGERLEVEL ((u_int64_t)(512*1024*1024)) +#define HFS_ROOTLOWDISKTRIGGERFRACTION 10 +#define HFS_ROOTLOWDISKTRIGGERLEVEL ((u_int64_t)(1024*1024*1024)) +#define HFS_ROOTNEARLOWDISKTRIGGERFRACTION 10.5 +#define HFS_ROOTNEARLOWDISKTRIGGERLEVEL ((u_int64_t)(1024*1024*1024 + 100*1024*1024)) +#define HFS_ROOTLOWDISKSHUTOFFFRACTION 11 +#define HFS_ROOTLOWDISKSHUTOFFLEVEL ((u_int64_t)(1024*1024*1024 + 250*1024*1024)) + +#define HFS_VERYLOWDISKTRIGGERFRACTION 1 +#define HFS_VERYLOWDISKTRIGGERLEVEL ((u_int64_t)(150*1024*1024)) +#define HFS_LOWDISKTRIGGERFRACTION 2 +#define HFS_LOWDISKTRIGGERLEVEL ((u_int64_t)(500*1024*1024)) +#define HFS_NEARLOWDISKTRIGGERFRACTION 10 +#define HFS_NEARLOWDISKTRIGGERLEVEL ((uint64_t)(1024*1024*1024)) +#define HFS_LOWDISKSHUTOFFFRACTION 12 +#define HFS_LOWDISKSHUTOFFLEVEL ((u_int64_t)(1024*1024*1024 + 200*1024*1024)) + +/* Internal Data structures*/ + +/* This structure describes the HFS specific mount structure data. */ +typedef struct hfsmount { + u_int32_t hfs_flags; /* see below */ + + /* Physical Description */ + u_int32_t hfs_logical_block_size; /* Logical block size of the disk as reported by ioctl(DKIOCGETBLOCKSIZE), always a multiple of 512 */ + daddr64_t hfs_logical_block_count; /* Number of logical blocks on the disk, as reported by ioctl(DKIOCGETBLOCKCOUNT) */ + u_int64_t hfs_logical_bytes; /* Number of bytes on the disk device this HFS is mounted on (blockcount * blocksize) */ + /* + * Regarding the two AVH sector fields below: + * Under normal circumstances, the filesystem's notion of the "right" location for the AVH is such that + * the partition and filesystem's are in sync. However, during a filesystem resize, HFS proactively + * writes a new AVH at the end of the filesystem, assuming that the partition will be resized accordingly. + * + * However, it is not technically a corruption if the partition size is never modified. As a result, we need + * to keep two copies of the AVH around "just in case" the partition size is not modified. 
+ */ + daddr64_t hfs_partition_avh_sector; /* location of Alt VH w.r.t partition size */ + daddr64_t hfs_fs_avh_sector; /* location of Alt VH w.r.t filesystem size */ + + u_int32_t hfs_physical_block_size; /* Physical block size of the disk as reported by ioctl(DKIOCGETPHYSICALBLOCKSIZE) */ + u_int32_t hfs_log_per_phys; /* Number of logical blocks per physical block size */ + + /* Access to VFS and devices */ + struct mount *hfs_mp; /* filesystem vfs structure */ + struct vnode *hfs_devvp; /* block device mounted vnode */ + struct vnode * hfs_extents_vp; + struct vnode * hfs_catalog_vp; + struct vnode * hfs_allocation_vp; + struct vnode * hfs_attribute_vp; + struct vnode * hfs_startup_vp; + struct vnode * hfs_attrdata_vp; /* pseudo file */ + struct cnode * hfs_extents_cp; + struct cnode * hfs_catalog_cp; + struct cnode * hfs_allocation_cp; + struct cnode * hfs_attribute_cp; + struct cnode * hfs_startup_cp; + dev_t hfs_raw_dev; /* device mounted */ + u_int32_t hfs_logBlockSize; /* Size of buffer cache buffer for I/O */ + + /* Default values for HFS standard and non-init access */ + uid_t hfs_uid; /* uid to set as owner of the files */ + gid_t hfs_gid; /* gid to set as owner of the files */ + mode_t hfs_dir_mask; /* mask to and with directory protection bits */ + mode_t hfs_file_mask; /* mask to and with file protection bits */ + u_int32_t hfs_encoding; /* Default encoding for non hfs+ volumes */ + + /* Persistent fields (on disk, dynamic) */ + time_t hfs_mtime; /* file system last modification time */ + u_int32_t hfs_filecount; /* number of files in file system */ + u_int32_t hfs_dircount; /* number of directories in file system */ + u_int32_t freeBlocks; /* free allocation blocks */ + u_int32_t reclaimBlocks; /* number of blocks we are reclaiming during resize */ + u_int32_t tentativeBlocks; /* tentative allocation blocks -- see note below */ + u_int32_t nextAllocation; /* start of next allocation search */ + u_int32_t sparseAllocation; /* start of allocations for sparse devices */ + u_int32_t vcbNxtCNID; /* next unused catalog node ID - protected by catalog lock */ + u_int32_t vcbWrCnt; /* file system write count */ + u_int64_t encodingsBitmap; /* in-use encodings */ + u_int16_t vcbNmFls; /* HFS Only - root dir file count */ + u_int16_t vcbNmRtDirs; /* HFS Only - root dir directory count */ + + /* Persistent fields (on disk, static) */ + u_int16_t vcbSigWord; + + // Volume will be inconsistent if header is not flushed + bool hfs_header_dirty; + + // Volume header is dirty, but won't be inconsistent if not flushed + bool hfs_header_minor_change; + + u_int32_t vcbAtrb; + u_int32_t vcbJinfoBlock; + u_int32_t localCreateDate;/* volume create time from volume header (For HFS+, value is in local time) */ + time_t hfs_itime; /* file system creation time (creation date of the root folder) */ + time_t hfs_btime; /* file system last backup time */ + u_int32_t blockSize; /* size of allocation blocks */ + u_int32_t totalBlocks; /* total allocation blocks */ + u_int32_t allocLimit; /* Do not allocate this block or beyond */ + /* + * NOTE: When resizing a volume to make it smaller, allocLimit is set to the allocation + * block number which will contain the new alternate volume header. At all other times, + * allocLimit is set to totalBlocks. The allocation code uses allocLimit instead of + * totalBlocks to limit which blocks may be allocated, so that during a resize, we don't + * put new content into the blocks we're trying to truncate away. 
+ */ + int32_t vcbClpSiz; + u_int32_t vcbFndrInfo[8]; + int16_t vcbVBMSt; /* HFS only */ + int16_t vcbAlBlSt; /* HFS only */ + + /* vcb stuff */ + u_int8_t vcbVN[256]; /* volume name in UTF-8 */ + u_int32_t volumeNameEncodingHint; + u_int32_t hfsPlusIOPosOffset; /* Disk block where HFS+ starts */ + u_int32_t vcbVBMIOSize; /* volume bitmap I/O size */ + + /* cache of largest known free extents */ + u_int32_t vcbFreeExtCnt; + HFSPlusExtentDescriptor vcbFreeExt[kMaxFreeExtents]; + lck_spin_t vcbFreeExtLock; + + /* Summary Table */ + u_int8_t *hfs_summary_table; /* Each bit is 1 vcbVBMIOSize of bitmap, byte indexed */ + u_int32_t hfs_summary_size; /* number of BITS in summary table defined above (not bytes!) */ + u_int32_t hfs_summary_bytes; /* number of BYTES in summary table */ + + u_int32_t scan_var; /* For initializing the summary table */ + + + u_int32_t reserveBlocks; /* free block reserve */ + u_int32_t loanedBlocks; /* blocks on loan for delayed allocations */ + u_int32_t lockedBlocks; /* blocks reserved and locked */ + + /* + * HFS+ Private system directories (two). Any access + * (besides looking at the cd_cnid) requires holding + * the Catalog File lock. + */ + struct cat_desc hfs_private_desc[2]; + struct cat_attr hfs_private_attr[2]; + + u_int32_t hfs_metadata_createdate; +#if CONFIG_HFS_STD + hfs_to_unicode_func_t hfs_get_unicode; + unicode_to_hfs_func_t hfs_get_hfsname; +#endif + + /* Quota variables: */ + struct quotafile hfs_qfiles[MAXQUOTAS]; /* quota files */ + + /* Journaling variables: */ + struct journal *jnl; // the journal for this volume (if one exists) + struct vnode *jvp; // device where the journal lives (may be equal to devvp) + u_int32_t jnl_start; // start block of the journal file (so we don't delete it) + u_int32_t jnl_size; + u_int32_t hfs_jnlfileid; + u_int32_t hfs_jnlinfoblkid; + lck_rw_t hfs_global_lock; + thread_t hfs_global_lockowner; + u_int32_t hfs_transaction_nesting; + + /* + * Notification variables + * See comments in hfs mount code for what the + * default levels are set to. + */ + u_int32_t hfs_notification_conditions; + u_int32_t hfs_freespace_notify_dangerlimit; + u_int32_t hfs_freespace_notify_warninglimit; + u_int32_t hfs_freespace_notify_nearwarninglimit; + u_int32_t hfs_freespace_notify_desiredlevel; + + /* time mounted and last mounted mod time "snapshot" */ + time_t hfs_mount_time; + time_t hfs_last_mounted_mtime; + + /* Metadata allocation zone variables: */ + u_int32_t hfs_metazone_start; + u_int32_t hfs_metazone_end; + u_int32_t hfs_hotfile_start; + u_int32_t hfs_hotfile_end; + u_int32_t hfs_min_alloc_start; + u_int32_t hfs_freed_block_count; + u_int64_t hfs_cs_hotfile_size; // in bytes + int hfs_hotfile_freeblks; + int hfs_hotfile_blk_adjust; // since we pass this to OSAddAtomic, this needs to be 4-byte aligned + int hfs_hotfile_maxblks; + int hfs_overflow_maxblks; + int hfs_catalog_maxblks; + + /* Hot File Clustering variables: */ + lck_mtx_t hfc_mutex; /* serialize hot file stages */ + enum hfc_stage hfc_stage; /* what are we up to... */ + time_t hfc_timebase; /* recording period start time */ + time_t hfc_timeout; /* recording period stop time */ + struct hotfile_data *hfc_recdata; + struct hotfilelist *hfc_filelist; + uint32_t hfc_maxfiles; /* maximum files to track */ + struct vnode * hfc_filevp; + + /* defrag-on-open variables */ + int hfs_defrag_nowait; //issue defrags now, regardless of whether or not we've gone past 3 min. 
+ uint64_t hfs_defrag_max; //maximum file size we'll defragment on this mount + +#if HFS_SPARSE_DEV + /* Sparse device variables: */ + struct vnode * hfs_backingvp; + u_int32_t hfs_last_backingstatfs; + u_int32_t hfs_sparsebandblks; + u_int64_t hfs_backingfs_maxblocks; +#endif + size_t hfs_max_inline_attrsize; + + lck_mtx_t hfs_mutex; /* protects access to hfsmount data */ + + uint32_t hfs_syncers; // Count of the number of syncers running + enum { + HFS_THAWED, + HFS_WANT_TO_FREEZE, // This state stops hfs_sync from starting + HFS_FREEZING, // We're in this state whilst we're flushing + HFS_FROZEN // Everything gets blocked in hfs_lock_global + } hfs_freeze_state; + union { + /* + * When we're freezing (HFS_FREEZING) but not yet + * frozen (HFS_FROZEN), we record the freezing thread + * so that we stop other threads from taking locks, + * but allow the freezing thread. + */ + const struct thread *hfs_freezing_thread; + /* + * Once we have frozen (HFS_FROZEN), we record the + * process so that if it dies, we can automatically + * unfreeze. + */ + proc_t hfs_freezing_proc; + }; + + thread_t hfs_downgrading_thread; /* thread who's downgrading to rdonly */ + + /* Resize variables: */ + u_int32_t hfs_resize_blocksmoved; + u_int32_t hfs_resize_totalblocks; + u_int32_t hfs_resize_progress; +#if CONFIG_PROTECT + /* Data Protection fields */ + cpx_t hfs_resize_cpx; + u_int16_t hfs_running_cp_major_vers; + uint32_t default_cp_class; /* default effective class value */ + uint64_t cproot_flags; + uint8_t cp_crypto_generation; + cp_lock_state_t hfs_cp_lock_state; /* per-mount device lock state info */ +#if HFS_CONFIG_KEY_ROLL + uint32_t hfs_auto_roll_min_key_os_version; + uint32_t hfs_auto_roll_max_key_os_version; +#endif +#if HFS_TMPDBG +#if !SECURE_KERNEL + boolean_t hfs_cp_verbose; +#endif +#endif + +#endif + + /* the full UUID of the volume, not the one stored in finderinfo */ + uuid_t hfs_full_uuid; + + /* Per mount cnode hash variables: */ + lck_mtx_t hfs_chash_mutex; /* protects access to cnode hash table */ + u_long hfs_cnodehash; /* size of cnode hash table - 1 */ + LIST_HEAD(cnodehashhead, cnode) *hfs_cnodehashtbl; /* base of cnode hash */ + + /* Per mount fileid hash variables (protected by catalog lock!) */ + u_long hfs_idhash; /* size of cnid/fileid hash table -1 */ + LIST_HEAD(idhashhead, cat_preflightid) *hfs_idhashtbl; /* base of ID hash */ + + // Records the oldest outstanding sync request + struct timeval hfs_sync_req_oldest; + + /* Records the syncer thread so that we can avoid the syncer + queing more syncs. */ + thread_t hfs_syncer_thread; + + // Not currently used except for debugging purposes + // Since we pass this to OSAddAtomic, this needs to be 4-byte aligned. + uint32_t hfs_active_threads; + + enum { + // These are indices into the array below + + // Tentative ranges can be claimed back at any time + HFS_TENTATIVE_BLOCKS = 0, + + // Locked ranges cannot be claimed back, but the allocation + // won't have been written to disk yet + HFS_LOCKED_BLOCKS = 1, + }; + // These lists are not sorted like a range list usually is + struct rl_head hfs_reserved_ranges[2]; +} hfsmount_t; + +/* + * HFS_META_DELAY is a duration (in usecs) used for triggering the + * hfs_syncer() routine. We will back off if writes are in + * progress, but... + * HFS_MAX_META_DELAY is the maximum time we will allow the + * syncer to be delayed. 
+ */ +enum { + HFS_META_DELAY = 100 * 1000, // 0.1 secs + HFS_MAX_META_DELAY = 5000 * 1000 // 5 secs +}; + +#define HFS_META_DELAY_TS \ + (struct timespec){ 0, HFS_META_DELAY * NSEC_PER_USEC } + +typedef hfsmount_t ExtendedVCB; + +/* Aliases for legacy (Mac OS 9) field names */ +#define vcbLsMod hfs_mtime +#define vcbVolBkUp hfs_btime +#define extentsRefNum hfs_extents_vp +#define catalogRefNum hfs_catalog_vp +#define allocationsRefNum hfs_allocation_vp +#define vcbFilCnt hfs_filecount +#define vcbDirCnt hfs_dircount + +static inline void MarkVCBDirty(hfsmount_t *hfsmp) +{ + hfsmp->hfs_header_dirty = true; +} + +static inline void MarkVCBClean(hfsmount_t *hfsmp) +{ + hfsmp->hfs_header_dirty = false; + hfsmp->hfs_header_minor_change = false; +} + +static inline bool IsVCBDirty(ExtendedVCB *vcb) +{ + return vcb->hfs_header_minor_change || vcb->hfs_header_dirty; +} + +// Header is changed but won't be inconsistent if we don't write it +static inline void hfs_note_header_minor_change(hfsmount_t *hfsmp) +{ + hfsmp->hfs_header_minor_change = true; +} + +// Must header be flushed for volume to be consistent? +static inline bool hfs_header_needs_flushing(hfsmount_t *hfsmp) +{ + return (hfsmp->hfs_header_dirty + || ISSET(hfsmp->hfs_catalog_cp->c_flag, C_MODIFIED) + || ISSET(hfsmp->hfs_extents_cp->c_flag, C_MODIFIED) + || (hfsmp->hfs_attribute_cp + && ISSET(hfsmp->hfs_attribute_cp->c_flag, C_MODIFIED)) + || (hfsmp->hfs_allocation_cp + && ISSET(hfsmp->hfs_allocation_cp->c_flag, C_MODIFIED)) + || (hfsmp->hfs_startup_cp + && ISSET(hfsmp->hfs_startup_cp->c_flag, C_MODIFIED))); +} + +/* + * There are two private directories in HFS+. + * + * One contains inodes for files that are hardlinked or open/unlinked. + * The other contains inodes for directories that are hardlinked. + */ +enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; + +#define HFS_ALLOCATOR_SCAN_INFLIGHT 0x0001 /* scan started */ +#define HFS_ALLOCATOR_SCAN_COMPLETED 0x0002 /* initial scan was completed */ + +/* HFS mount point flags */ +#define HFS_READ_ONLY 0x00001 +#define HFS_UNKNOWN_PERMS 0x00002 +#define HFS_WRITEABLE_MEDIA 0x00004 +#define HFS_CLEANED_ORPHANS 0x00008 +#define HFS_X 0x00010 +#define HFS_CASE_SENSITIVE 0x00020 +#define HFS_STANDARD 0x00040 +#define HFS_METADATA_ZONE 0x00080 +#define HFS_FRAGMENTED_FREESPACE 0x00100 +#define HFS_NEED_JNL_RESET 0x00200 +#define HFS_HAS_SPARSE_DEVICE 0x00400 +#define HFS_RESIZE_IN_PROGRESS 0x00800 +#define HFS_QUOTAS 0x01000 +#define HFS_CREATING_BTREE 0x02000 +/* When set, do not update nextAllocation in the mount structure */ +#define HFS_SKIP_UPDATE_NEXT_ALLOCATION 0x04000 +/* When set, the file system supports extent-based extended attributes */ +#define HFS_XATTR_EXTENTS 0x08000 +#define HFS_FOLDERCOUNT 0x10000 +/* When set, the file system exists on a virtual device, like disk image */ +#define HFS_VIRTUAL_DEVICE 0x20000 +/* When set, we're in hfs_changefs, so hfs_sync should do nothing. */ +#define HFS_IN_CHANGEFS 0x40000 +/* When set, we are in process of downgrading or have downgraded to read-only, + * so hfs_start_transaction should return EROFS. 
+ */ +#define HFS_RDONLY_DOWNGRADE 0x80000 +#define HFS_DID_CONTIG_SCAN 0x100000 +#define HFS_UNMAP 0x200000 +#define HFS_SSD 0x400000 +#define HFS_SUMMARY_TABLE 0x800000 +#define HFS_CS 0x1000000 +#define HFS_CS_METADATA_PIN 0x2000000 +#define HFS_CS_HOTFILE_PIN 0x4000000 /* cooperative fusion (enables a hotfile variant) */ +#define HFS_FEATURE_BARRIER 0x8000000 /* device supports barrier-only flush */ +#define HFS_CS_SWAPFILE_PIN 0x10000000 +#define HFS_RUN_SYNCER 0x20000000 + +/* Macro to update next allocation block in the HFS mount structure. If + * the HFS_SKIP_UPDATE_NEXT_ALLOCATION is set, do not update + * nextAllocation block. + */ +#define HFS_UPDATE_NEXT_ALLOCATION(hfsmp, new_nextAllocation) \ + { \ + if ((hfsmp->hfs_flags & HFS_SKIP_UPDATE_NEXT_ALLOCATION) == 0)\ + hfsmp->nextAllocation = new_nextAllocation; \ + } \ + +/* Macro for incrementing and decrementing the folder count in a cnode + * attribute only if the HFS_FOLDERCOUNT bit is set in the mount flags + * and kHFSHasFolderCount bit is set in the cnode flags. Currently these + * bits are only set for case sensitive HFS+ volumes. + */ +#define INC_FOLDERCOUNT(hfsmp, cattr) \ + if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && \ + (cattr.ca_recflags & kHFSHasFolderCountMask)) { \ + cattr.ca_dircount++; \ + } \ + +#define DEC_FOLDERCOUNT(hfsmp, cattr) \ + if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && \ + (cattr.ca_recflags & kHFSHasFolderCountMask) && \ + (cattr.ca_dircount > 0)) { \ + cattr.ca_dircount--; \ + } \ + +typedef struct filefork FCB; + +/* + * Macros for creating item names for our special/private directories. + */ +#define MAKE_INODE_NAME(name, size, linkno) \ + (void) snprintf((name), size, "%s%d", HFS_INODE_PREFIX, (linkno)) +#define HFS_INODE_PREFIX_LEN 5 + +#define MAKE_DIRINODE_NAME(name, size, linkno) \ + (void) snprintf((name), size, "%s%d", HFS_DIRINODE_PREFIX, (linkno)) +#define HFS_DIRINODE_PREFIX_LEN 4 + +#define MAKE_DELETED_NAME(NAME, size, FID) \ + (void) snprintf((NAME), size, "%s%d", HFS_DELETE_PREFIX, (FID)) +#define HFS_DELETE_PREFIX_LEN 4 + + +#define HFS_AVERAGE_NAME_SIZE 22 +#define AVERAGE_HFSDIRENTRY_SIZE (8+HFS_AVERAGE_NAME_SIZE+4) + +#define STD_DIRENT_LEN(namlen) \ + ((sizeof(struct dirent) - (NAME_MAX+1)) + (((namlen)+1 + 3) &~ 3)) + +#define EXT_DIRENT_LEN(namlen) \ + ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7) + + +enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; + + +/* macro to determine if hfs or hfsplus */ +#define ISHFSPLUS(VCB) ((VCB)->vcbSigWord == kHFSPlusSigWord) +#define ISHFS(VCB) ((VCB)->vcbSigWord == kHFSSigWord) + + +/* + * Various ways to acquire a VFS mount point pointer: + */ +#define VTOVFS(VP) vnode_mount((VP)) +#define HFSTOVFS(HFSMP) ((HFSMP)->hfs_mp) +#define VCBTOVFS(VCB) HFSTOVFS(VCB) + +/* + * Various ways to acquire an HFS mount point pointer: + */ +#define VTOHFS(VP) ((struct hfsmount *)vfs_fsprivate(vnode_mount((VP)))) +#define VFSTOHFS(MP) ((struct hfsmount *)vfs_fsprivate((MP))) +#define VCBTOHFS(VCB) (VCB) +#define FCBTOHFS(FCB) ((struct hfsmount *)vfs_fsprivate(vnode_mount((FCB)->ff_cp->c_vp))) + +/* + * Various ways to acquire a VCB (legacy) pointer: + */ +#define VTOVCB(VP) VTOHFS(VP) +#define VFSTOVCB(MP) VFSTOHFS(MP) +#define HFSTOVCB(HFSMP) (HFSMP) +#define FCBTOVCB(FCB) FCBTOHFS(FCB) + + +#define E_NONE 0 +#define kHFSBlockSize 512 + +/* + * Macros for getting the MDB/VH sector and offset + */ +#define HFS_PRI_SECTOR(blksize) (1024 / (blksize)) +#define HFS_PRI_OFFSET(blksize) ((blksize) > 1024 ? 
1024 : 0) + +#define HFS_ALT_SECTOR(blksize, blkcnt) (((blkcnt) - 1) - (512 / (blksize))) +#define HFS_ALT_OFFSET(blksize) ((blksize) > 1024 ? (blksize) - 1024 : 0) + +/* Convert the logical sector number to be aligned on physical block size boundary. + * We are assuming the partition is a multiple of physical block size. + */ +#define HFS_PHYSBLK_ROUNDDOWN(sector_num, log_per_phys) ((sector_num / log_per_phys) * log_per_phys) + +/* + * HFS specific fcntl()'s + */ +#define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) +#define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) +/* See HFSIOC_EXT_BULKACCESS and friends for HFS specific fsctls*/ + + + +/* + * This is the straight GMT conversion constant: + * 00:00:00 January 1, 1970 - 00:00:00 January 1, 1904 + * (3600 * 24 * ((365 * (1970 - 1904)) + (((1970 - 1904) / 4) + 1))) + */ +#define MAC_GMT_FACTOR 2082844800UL + +static inline __attribute__((const)) +off_t hfs_blk_to_bytes(uint32_t blk, uint32_t blk_size) +{ + return (off_t)blk * blk_size; // Avoid the overflow +} + +/* + * For now, we use EIO to indicate consistency issues. It is safe to + * return or assign an error value to HFS_EINCONSISTENT but it is + * *not* safe to compare against it because EIO can be generated for + * other reasons. We take advantage of the fact that == has + * left-to-right associativity and so any uses of: + * + * if (error == HFS_EINCONSISTENT) + * + * will produce a compiler warning: "comparison between pointer and + * integer". + * + * Note that not everwhere is consistent with the use of + * HFS_EINCONSISTENT. Some places return EINVAL, EIO directly or + * other error codes. + */ +#define HFS_EINCONSISTENT (void *)0 == (void *)0 ? EIO : EIO + +#define HFS_ERESERVEDNAME -8 + +extern int (**hfs_specop_p)(void *); + +/***************************************************************************** + FUNCTION PROTOTYPES +******************************************************************************/ + +/***************************************************************************** + hfs_vnop_xxx functions from different files +******************************************************************************/ +int hfs_vnop_readdirattr(struct vnop_readdirattr_args *); /* in hfs_attrlist.c */ +int hfs_vnop_getattrlistbulk(struct vnop_getattrlistbulk_args *); /* in hfs_attrlist.c */ + +int hfs_vnop_inactive(struct vnop_inactive_args *); /* in hfs_cnode.c */ +int hfs_vnop_reclaim(struct vnop_reclaim_args *); /* in hfs_cnode.c */ + +int hfs_set_backingstore (struct vnode *vp, int val); /* in hfs_cnode.c */ +int hfs_is_backingstore (struct vnode *vp, int *val); /* in hfs_cnode.c */ + +int hfs_vnop_link(struct vnop_link_args *); /* in hfs_link.c */ + +int hfs_vnop_lookup(struct vnop_lookup_args *); /* in hfs_lookup.c */ + +int hfs_vnop_search(struct vnop_searchfs_args *); /* in hfs_search.c */ + +int hfs_vnop_read(struct vnop_read_args *); /* in hfs_readwrite.c */ +int hfs_vnop_write(struct vnop_write_args *); /* in hfs_readwrite.c */ +int hfs_vnop_ioctl(struct vnop_ioctl_args *); /* in hfs_readwrite.c */ +int hfs_vnop_select(struct vnop_select_args *); /* in hfs_readwrite.c */ +int hfs_vnop_strategy(struct vnop_strategy_args *); /* in hfs_readwrite.c */ +int hfs_vnop_allocate(struct vnop_allocate_args *); /* in hfs_readwrite.c */ +int hfs_vnop_pagein(struct vnop_pagein_args *); /* in hfs_readwrite.c */ +int hfs_vnop_pageout(struct vnop_pageout_args *); /* in hfs_readwrite.c */ +int hfs_vnop_bwrite(struct vnop_bwrite_args *); /* in hfs_readwrite.c */ +int 
hfs_vnop_blktooff(struct vnop_blktooff_args *); /* in hfs_readwrite.c */ +int hfs_vnop_offtoblk(struct vnop_offtoblk_args *); /* in hfs_readwrite.c */ +int hfs_vnop_blockmap(struct vnop_blockmap_args *); /* in hfs_readwrite.c */ +errno_t hfs_flush_invalid_ranges(vnode_t vp); /* in hfs_readwrite.c */ + +int hfs_vnop_getxattr(struct vnop_getxattr_args *); /* in hfs_xattr.c */ +int hfs_vnop_setxattr(struct vnop_setxattr_args *); /* in hfs_xattr.c */ +int hfs_vnop_removexattr(struct vnop_removexattr_args *); /* in hfs_xattr.c */ +int hfs_vnop_listxattr(struct vnop_listxattr_args *); /* in hfs_xattr.c */ +#if NAMEDSTREAMS +extern int hfs_vnop_getnamedstream(struct vnop_getnamedstream_args*); +extern int hfs_vnop_makenamedstream(struct vnop_makenamedstream_args*); +extern int hfs_vnop_removenamedstream(struct vnop_removenamedstream_args*); +#endif + + +/***************************************************************************** + Functions from MacOSStubs.c +******************************************************************************/ +time_t to_bsd_time(u_int32_t hfs_time); + +u_int32_t to_hfs_time(time_t bsd_time); + + +/***************************************************************************** + Functions from hfs_notifications.c +******************************************************************************/ +void hfs_generate_volume_notifications(struct hfsmount *hfsmp); + + +/***************************************************************************** + Functions from hfs_readwrite.c +******************************************************************************/ +extern int hfs_relocate(struct vnode *, u_int32_t, kauth_cred_t, struct proc *); + +/* flags for hfs_pin_block_range() and hfs_pin_vnode() */ +#define HFS_PIN_IT 0x0001 +#define HFS_UNPIN_IT 0x0002 +#define HFS_TEMP_PIN 0x0004 +#define HFS_EVICT_PIN 0x0008 +#define HFS_DATALESS_PIN 0x0010 + +// +// pin/un-pin an explicit range of blocks to the "fast" (usually ssd) device +// +int hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks); + +// +// pin/un-pin all the extents belonging to a vnode. 
+// also, if it is non-null, "num_blocks_pinned" returns the number of blocks pin/unpinned by the function +// +int hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned); + + +int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid, uint8_t forktype, uint32_t *pinned); + + +/* Flags for HFS truncate */ +#define HFS_TRUNCATE_SKIPTIMES 0x00000002 /* implied by skipupdate; it is a subset */ + + +extern int hfs_truncate(struct vnode *, off_t, int, int, vfs_context_t); + +extern int hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, + struct filefork *rsrcfork, u_int32_t fileid); + +extern int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp); + +extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *); + +extern errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock); + + +/***************************************************************************** + Functions from hfs_resize.c +******************************************************************************/ +int hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context); +int hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context); + + +/***************************************************************************** + Functions from hfs_vfsops.c +******************************************************************************/ + +extern void hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result); + +/* used as a callback by the journaling code */ +extern void hfs_sync_metadata(void *arg); + +extern int hfs_vget(struct hfsmount *, cnid_t, struct vnode **, int, int); + +extern void hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding); + +enum volop {VOL_UPDATE, VOL_MKDIR, VOL_RMDIR, VOL_MKFILE, VOL_RMFILE}; +extern int hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot); + +enum { + HFS_FVH_WAIT = 0x0001, + HFS_FVH_WRITE_ALT = 0x0002, + HFS_FVH_FLUSH_IF_DIRTY = 0x0004, +}; +typedef uint32_t hfs_flush_volume_header_options_t; +int hfs_flushvolumeheader(struct hfsmount *hfsmp, hfs_flush_volume_header_options_t); + +extern int hfs_extendfs(struct hfsmount *, u_int64_t, vfs_context_t); +extern int hfs_truncatefs(struct hfsmount *, u_int64_t, vfs_context_t); +extern int hfs_resize_progress(struct hfsmount *, u_int32_t *); + +/* If a runtime corruption is detected, mark the volume inconsistent + * bit in the volume attributes. 
+ */ + +typedef enum { + HFS_INCONSISTENCY_DETECTED, + + // Used when unable to rollback an operation that failed + HFS_ROLLBACK_FAILED, + + // Used when the latter part of an operation failed, but we chose not to roll back + HFS_OP_INCOMPLETE, + + // Used when someone told us to force an fsck on next mount + HFS_FSCK_FORCED, +} hfs_inconsistency_reason_t; + +void hfs_mark_inconsistent(struct hfsmount *hfsmp, + hfs_inconsistency_reason_t reason); + +void hfs_scan_blocks (struct hfsmount *hfsmp); +int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context); + +/***************************************************************************** + Functions from hfs_vfsutils.c +******************************************************************************/ +u_int32_t BestBlockSizeFit(u_int32_t allocationBlockSize, + u_int32_t blockSizeLimit, + u_int32_t baseMultiple); + +#if CONFIG_HFS_STD +OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, + struct proc *p); +#endif +OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, + off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args, kauth_cred_t cred); + +OSErr hfs_ValidateHFSPlusVolumeHeader(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp); + +extern int hfsUnmount(struct hfsmount *hfsmp, struct proc *p); + +extern bool overflow_extents(struct filefork *fp); + +extern int hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, kauth_cred_t cred, + struct proc *p, int invokesuperuserstatus); + +extern int check_for_dataless_file(struct vnode *vp, uint64_t op_type); +extern int hfs_generate_document_id(struct hfsmount *hfsmp, uint32_t *docid); +extern void hfs_pin_fs_metadata(struct hfsmount *hfsmp); + +/* Return information about number of metadata blocks for volume */ +extern int hfs_getinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfsinfo_metadata *hinfo); + +/* + * Journal lock function prototypes + */ +int hfs_lock_global (struct hfsmount *hfsmp, enum hfs_locktype locktype); +void hfs_unlock_global (struct hfsmount *hfsmp); + +/* HFS mount lock/unlock prototypes */ +void hfs_lock_mount (struct hfsmount *hfsmp); +void hfs_unlock_mount (struct hfsmount *hfsmp); + + +/* HFS System file locking */ +#define SFL_CATALOG 0x0001 +#define SFL_EXTENTS 0x0002 +#define SFL_BITMAP 0x0004 +#define SFL_ATTRIBUTE 0x0008 +#define SFL_STARTUP 0x0010 +#define SFL_VM_PRIV 0x0020 +#define SFL_VALIDMASK (SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE | SFL_STARTUP | SFL_VM_PRIV) + +extern u_int32_t GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, const char *name, + struct cat_attr *fattr, struct cat_fork *forkinfo); + +extern void hfs_remove_orphans(struct hfsmount *); + +u_int32_t GetLogicalBlockSize(struct vnode *vp); + +extern u_int32_t hfs_free_cnids(struct hfsmount * hfsmp); +extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve); + +short MacToVFSError(OSErr err); + +void hfs_metadatazone_init(struct hfsmount *hfsmp, int disable); + +/* HFS directory hint functions. 
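+ * (Editor's note, added for exposition and not part of the original change:
+ * a directoryhint_t caches the position reached in a large directory between
+ * successive readdir/readdirattr calls, so enumeration can resume where it
+ * left off instead of rescanning the catalog from the first entry.)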
*/ +extern directoryhint_t * hfs_getdirhint(struct cnode *, int, int); +extern void hfs_reldirhint(struct cnode *, directoryhint_t *); +extern void hfs_reldirhints(struct cnode *, int); +extern void hfs_insertdirhint(struct cnode *, directoryhint_t *); + +extern int hfs_namecmp(const u_int8_t *str1, size_t len1, const u_int8_t *str2, size_t len2); + +extern int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, + void *_args, off_t embeddedOffset, daddr64_t mdb_offset, + HFSMasterDirectoryBlock *mdbp, kauth_cred_t cred); + +extern int hfs_virtualmetafile(struct cnode *); + +extern int hfs_start_transaction(struct hfsmount *hfsmp); +extern int hfs_end_transaction(struct hfsmount *hfsmp); +extern void hfs_journal_lock(struct hfsmount *hfsmp); +extern void hfs_journal_unlock(struct hfsmount *hfsmp); +extern void hfs_syncer_lock(struct hfsmount *hfsmp); +extern void hfs_syncer_unlock(struct hfsmount *hfsmp); +extern void hfs_syncer_wait(struct hfsmount *hfsmp, struct timespec *ts); +extern void hfs_syncer_wakeup(struct hfsmount *hfsmp); +extern void hfs_syncer(void *arg, wait_result_t); +extern void hfs_sync_ejectable(struct hfsmount *hfsmp); + +typedef enum hfs_flush_mode { + HFS_FLUSH_JOURNAL, // Flush journal + HFS_FLUSH_JOURNAL_META, // Flush journal and metadata blocks + HFS_FLUSH_FULL, // Flush journal and does a cache flush + HFS_FLUSH_CACHE, // Flush track cache to media + HFS_FLUSH_BARRIER, // Barrier-only flush to ensure write order + HFS_FLUSH_JOURNAL_BARRIER // Flush journal with barrier +} hfs_flush_mode_t; + +extern errno_t hfs_flush(struct hfsmount *hfsmp, hfs_flush_mode_t mode); + +extern void hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents); + +/* Erase unused Catalog nodes due to . */ +extern int hfs_erase_unused_nodes(struct hfsmount *hfsmp); + +extern uint64_t hfs_usecs_to_deadline(uint64_t usecs); + +extern int hfs_freeze(struct hfsmount *hfsmp); +extern int hfs_thaw(struct hfsmount *hfsmp, const struct proc *process); + +void hfs_close_jvp(hfsmount_t *hfsmp); + +// Return a heap address suitable for logging or tracing +uintptr_t obfuscate_addr(void *addr); + +#if CONFIG_HFS_STD +int hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, + ByteCount *actualDstLen, unsigned char* dstStr); +int utf8_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, const unsigned char* srcStr, + Str31 dstStr); +int unicode_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, u_int16_t* srcStr, Str31 dstStr, int retry); +#endif + +void *hfs_malloc(size_t size); +void hfs_free(void *ptr, size_t size); +void *hfs_mallocz(size_t size); + +typedef enum { + HFS_CNODE_ZONE, + HFS_FILEFORK_ZONE, + HFS_DIRHINT_ZONE, + HFS_NUM_ZONES +} hfs_zone_kind_t; + +typedef struct hfs_zone_entry { + hfs_zone_kind_t hze_kind; + size_t hze_elem_size; + const char * hze_name; + boolean_t hze_noencrypt; +} hfs_zone_entry_t; + +typedef struct hfs_zone { + zone_t hz_zone; + size_t hz_elem_size; +} hfs_zone_t; + +void hfs_init_zones(void); +void *hfs_zalloc(hfs_zone_kind_t type); +void hfs_zfree(void *ptr, hfs_zone_kind_t type); + +void hfs_sysctl_register(void); +void hfs_sysctl_unregister(void); + +#if HFS_MALLOC_DEBUG + +void hfs_alloc_trace_disable(void); +void hfs_alloc_trace_enable(void); +bool hfs_dump_allocations(void); + +#endif // HFS_MALLOC_DEBUG + +/***************************************************************************** + Functions from hfs_vnops.c +******************************************************************************/ +int 
hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags); + +int hfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct proc *p); + +int hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, struct proc *p); + +int hfs_vnop_create(struct vnop_create_args *ap); + +int hfs_vnop_remove(struct vnop_remove_args*); + +#define kMaxSecsForFsync 5 +#define HFS_SYNCTRANS 1 +extern int hfs_btsync(struct vnode *vp, int sync_transaction); + +extern void replace_desc(struct cnode *cp, struct cat_desc *cdp); + +extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, + struct vnode **rvpp); + +typedef enum { + // Push all modifications to disk (including minor ones) + HFS_UPDATE_FORCE = 0x01, +} hfs_update_options_t; + +extern int hfs_update(struct vnode *, int options); + +typedef enum hfs_sync_mode { + HFS_FSYNC, + HFS_FSYNC_FULL, + HFS_FSYNC_BARRIER +} hfs_fsync_mode_t; + +extern int hfs_fsync(struct vnode *, int, hfs_fsync_mode_t, struct proc *); + +const struct cat_fork * +hfs_prepare_fork_for_update(filefork_t *ff, + const struct cat_fork *cf, + struct cat_fork *cf_buf, + uint32_t block_size); + +struct decmpfs_cnode; +struct decmpfs_cnode *hfs_lazy_init_decmpfs_cnode (struct cnode *cp); + +/***************************************************************************** + Functions from hfs_xattr.c +******************************************************************************/ + +/* + * Maximum extended attribute size supported for all extended attributes except + * resource fork and finder info. + */ +#define HFS_XATTR_MAXSIZE INT32_MAX + +/* Number of bits used to represent maximum extended attribute size */ +#define HFS_XATTR_SIZE_BITS 31 + +int hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey); +int hfs_buildattrkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key); +void hfs_xattr_init(struct hfsmount * hfsmp); +int file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID); +int init_attrdata_vnode(struct hfsmount *hfsmp); +int hfs_xattr_read(vnode_t vp, const char *name, void *data, size_t *size); +int hfs_getxattr_internal(cnode_t *, struct vnop_getxattr_args *, + struct hfsmount *, u_int32_t); +int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size); +int hfs_setxattr_internal(struct cnode *, const void *, size_t, + struct vnop_setxattr_args *, struct hfsmount *, u_int32_t); +extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid, + bool *open_transaction); + +int hfs_removexattr_by_id (struct hfsmount *hfsmp, uint32_t fileid, const char *xattr_name ); + +extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state); + + + +/***************************************************************************** + Functions from hfs_link.c +******************************************************************************/ + +extern int hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, + struct componentname *cnp, int skip_reserve); +extern int hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, + cnid_t *prevlinkid, cnid_t *nextlinkid); +extern int hfs_lookup_lastlink(struct hfsmount *hfsmp, cnid_t linkfileid, + cnid_t *nextlinkid, struct cat_desc *cdesc); +extern void hfs_privatedir_init(struct hfsmount *, enum privdirtype); + +extern void hfs_savelinkorigin(cnode_t *cp, cnid_t parentcnid); +extern void hfs_relorigins(struct cnode *cp); +extern void hfs_relorigin(struct cnode *cp, cnid_t parentcnid); 
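+/*
+ * Illustrative usage sketch, added for exposition; it is not part of the
+ * original change, elides locking, and the directory cnode 'dcp' is
+ * hypothetical. The link-origin helpers declared nearby let a cnode with
+ * multiple hard links remember which parent directory it was most recently
+ * reached through, so that ".." and path reconstruction resolve to that
+ * parent:
+ *
+ *     hfs_savelinkorigin(cp, dcp->c_cnid);        // remember this parent
+ *     ...
+ *     if (hfs_haslinkorigin(cp))
+ *         parent = hfs_currentparent(cp, false);  // have_lock == false
+ *     ...
+ *     hfs_relorigin(cp, dcp->c_cnid);             // drop the stale origin
+ */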
+extern int hfs_haslinkorigin(cnode_t *cp); +extern cnid_t hfs_currentparent(cnode_t *cp, bool have_lock); +extern cnid_t hfs_currentcnid(cnode_t *cp); +errno_t hfs_first_link(hfsmount_t *hfsmp, cnode_t *cp, cnid_t *link_id); + + +/***************************************************************************** + Functions from VolumeAllocation.c + ******************************************************************************/ +extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks); + +extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, u_int32_t *alloc_count); + +extern int hfs_isrbtree_active (struct hfsmount *hfsmp); + +/***************************************************************************** + Functions from hfs_fsinfo.c + ******************************************************************************/ +extern errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data); +extern void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry); + +struct hfs_sysctl_chain { + struct sysctl_oid *oid; + struct hfs_sysctl_chain *next; +}; + +extern struct hfs_sysctl_chain *sysctl_list; + +SYSCTL_DECL(_vfs_generic_hfs); + +#define HFS_SYSCTL(kind, parent, flags, name, ...) \ + SYSCTL_##kind(parent, flags, name, __VA_ARGS__); \ + struct hfs_sysctl_chain hfs_sysctl_##parent##_##name##_chain = { \ + .oid = &sysctl_##parent##_##name \ + }; \ + static __attribute__((__constructor__)) void \ + hfs_sysctl_register_##parent##_##name(void) { \ + hfs_sysctl_##parent##_##name##_chain.next = sysctl_list; \ + sysctl_list = &hfs_sysctl_##parent##_##name##_chain; \ + } + +__END_DECLS + +#undef assert +#define assert Do_not_use_assert__Use_hfs_assert_instead + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* __HFS__ */ diff --git a/core/hfs_alloc_trace.h b/core/hfs_alloc_trace.h new file mode 100644 index 0000000..304a194 --- /dev/null +++ b/core/hfs_alloc_trace.h @@ -0,0 +1,34 @@ +// +// hfs_alloc_trace.h +// hfs +// +// Created by Chris Suter on 8/19/15. +// +// + +#ifndef hfs_alloc_trace_h +#define hfs_alloc_trace_h + +#include +#include + +enum { + HFS_ALLOC_BACKTRACE_LEN = 4, +}; + +#pragma pack(push, 8) + +struct hfs_alloc_trace_info { + int entry_count; + bool more; + struct hfs_alloc_info_entry { + uint64_t ptr; + uint64_t sequence; + uint64_t size; + uint64_t backtrace[HFS_ALLOC_BACKTRACE_LEN]; + } entries[]; +}; + +#pragma pack(pop) + +#endif /* hfs_alloc_trace_h */ diff --git a/core/hfs_attrlist.c b/core/hfs_attrlist.c new file mode 100644 index 0000000..1fa4268 --- /dev/null +++ b/core/hfs_attrlist.c @@ -0,0 +1,1743 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * hfs_attrlist.c - HFS attribute list processing + * + * Copyright (c) 1998-2002, Apple Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hfs.h" +#include "hfs_cnode.h" +#include "hfs_mount.h" +#include "hfs_dbg.h" +#include "hfs_attrlist.h" +#include "hfs_btreeio.h" +#include "hfs_cprotect.h" + +/* Packing routines: */ + +static void packnameattr(struct attrblock *abp, struct vnode *vp, + const u_int8_t *name, int namelen); + +static void packcommonattr(struct attrblock *abp, struct hfsmount *hfsmp, + struct vnode *vp, struct cat_desc * cdp, + struct cat_attr * cap, struct vfs_context *ctx); + +static void packfileattr(struct attrblock *abp, struct hfsmount *hfsmp, + struct cat_attr *cattrp, struct cat_fork *datafork, + struct cat_fork *rsrcfork, struct vnode *vp); + +static void packdirattr(struct attrblock *abp, struct hfsmount *hfsmp, + struct vnode *vp, struct cat_desc * descp, + struct cat_attr * cattrp); + +static u_int32_t hfs_real_user_access(vnode_t vp, vfs_context_t ctx); + +static void get_vattr_data_for_attrs(struct attrlist *, struct vnode_attr *, + struct hfsmount *, struct vnode *, struct cat_desc *, struct cat_attr *, + struct cat_fork *, struct cat_fork *, vfs_context_t); + +static void vattr_data_for_common_attrs(struct attrlist *, struct vnode_attr *, + struct hfsmount *, struct vnode *, struct cat_desc *, struct cat_attr *, + vfs_context_t); + +static void vattr_data_for_dir_attrs(struct attrlist *, struct vnode_attr *, + struct hfsmount *, struct vnode *, struct cat_desc *, struct cat_attr *); + +static void vattr_data_for_file_attrs(struct attrlist *, struct vnode_attr *, + struct hfsmount *, struct cat_attr *, struct cat_fork *, struct cat_fork *, + struct vnode *vp); + +static int hfs_readdirattr_internal(struct vnode *, struct attrlist *, + struct vnode_attr *, uio_t, uint64_t, int, uint32_t *, int *, int *, + vfs_context_t); + +/* + * readdirattr operation will return attributes for the items in the + * directory specified. + * + * It does not do . and .. entries. The problem is if you are at the root of the + * hfs directory and go to .. you could be crossing a mountpoint into a + * different (ufs) file system. The attributes that apply for it may not + * apply for the file system you are doing the readdirattr on. To make life + * simpler, this call will only return entries in its directory, hfs like. + */ +int +hfs_vnop_readdirattr(ap) + struct vnop_readdirattr_args /* { + struct vnode *a_vp; + struct attrlist *a_alist; + struct uio *a_uio; + u_long a_maxcount; + u_long a_options; + u_long *a_newstate; + int *a_eofflag; + u_long *a_actualcount; + vfs_context_t a_context; + } */ *ap; +{ + int error; + struct attrlist *alist = ap->a_alist; + + /* Check for invalid options and buffer space. 
*/
+ if (((ap->a_options & ~(FSOPT_NOINMEMUPDATE | FSOPT_NOFOLLOW)) != 0) ||
+ (ap->a_maxcount <= 0)) {
+ return (EINVAL);
+ }
+ /*
+ * Reject requests for unsupported attributes.
+ */
+ if ((alist->bitmapcount != ATTR_BIT_MAP_COUNT) ||
+ (alist->commonattr & ~HFS_ATTR_CMN_VALID) ||
+ (alist->volattr != 0) ||
+ (alist->dirattr & ~HFS_ATTR_DIR_VALID) ||
+ (alist->fileattr & ~HFS_ATTR_FILE_VALID) ||
+ (alist->forkattr != 0)) {
+ return (EINVAL);
+ }
+
+ error = hfs_readdirattr_internal(ap->a_vp, alist, NULL, ap->a_uio,
+ (uint64_t)ap->a_options, ap->a_maxcount, ap->a_newstate,
+ ap->a_eofflag, (int *)ap->a_actualcount, ap->a_context);
+
+ return (error);
+}
+
+
+/*
+ * getattrlistbulk, like readdirattr, will return attributes for the items in
+ * the directory specified.
+ *
+ * It does not do . and .. entries. The problem is if you are at the root of the
+ * hfs directory and go to .. you could be crossing a mountpoint into a
+ * different (ufs) file system. The attributes that apply for it may not
+ * apply for the file system you are doing the readdirattr on. To make life
+ * simpler, this call will only return entries in its directory, hfs like.
+ */
+int
+hfs_vnop_getattrlistbulk(ap)
+ struct vnop_getattrlistbulk_args /* {
+ struct vnodeop_desc *a_desc;
+ vnode_t a_vp;
+ struct attrlist *a_alist;
+ struct vnode_attr *a_vap;
+ struct uio *a_uio;
+ void *a_private;
+ uint64_t a_options;
+ int32_t *a_eofflag;
+ int32_t *a_actualcount;
+ vfs_context_t a_context;
+ } */ *ap;
+{
+ int error = 0;
+
+ error = hfs_readdirattr_internal(ap->a_vp, ap->a_alist, ap->a_vap,
+ ap->a_uio, (uint64_t)ap->a_options, 0, NULL, ap->a_eofflag,
+ (int *)ap->a_actualcount, ap->a_context);
+
+ return (error);
+}
+
+/*
+ * Common function for both hfs_vnop_readdirattr and hfs_vnop_getattrlistbulk.
+ * This either fills in a vnode_attr structure or fills in an attribute buffer.
+ * Currently the difference in behaviour required for the two vnops is keyed
+ * on whether the passed in vnode_attr pointer is null or not. If the pointer
+ * is null we fill in the buffer passed, and if it is not null we fill in the fields
+ * of the vnode_attr structure.
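+ *
+ * For example (a sketch of the two call sites above, with trailing argument
+ * names abbreviated): hfs_vnop_readdirattr passes a NULL vnode_attr pointer,
+ *
+ *     hfs_readdirattr_internal(vp, alist, NULL, uio, options, maxcount, ...);
+ *
+ * and therefore packs the legacy attribute buffer, while
+ * hfs_vnop_getattrlistbulk passes the caller's vnode_attr,
+ *
+ *     hfs_readdirattr_internal(vp, alist, ap->a_vap, uio, options, 0, NULL, ...);
+ *
+ * and therefore fills in vnode_attr fields for vfs_attr_pack().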
+ */
+int
+hfs_readdirattr_internal(struct vnode *dvp, struct attrlist *alist,
+ struct vnode_attr *vap, uio_t uio, uint64_t options, int maxcount,
+ uint32_t *newstate, int *eofflag, int *actualcount, vfs_context_t ctx)
+{
+ struct cnode *dcp;
+ struct hfsmount * hfsmp;
+ u_int32_t fixedblocksize;
+ u_int32_t maxattrblocksize = 0;
+ u_int32_t currattrbufsize;
+ void *attrbufptr = NULL;
+ void *attrptr = NULL;
+ void *varptr = NULL;
+ caddr_t namebuf = NULL;
+ struct attrblock attrblk;
+ int error = 0;
+ int index = 0;
+ int i = 0;
+ struct cat_desc *lastdescp = NULL;
+ struct cat_entrylist *ce_list = NULL;
+ directoryhint_t *dirhint = NULL;
+ unsigned int tag;
+ int maxentries = 0;
+ int lockflags;
+ u_int32_t dirchg = 0;
+ int reachedeof = 0;
+ int internal_actualcount;
+ int internal_eofflag;
+
+ /* Let's make sure we always have something assigned to actualcount; minimal change required */
+ if (actualcount == NULL) {
+ actualcount = &internal_actualcount;
+ }
+ /* Let's make sure we always have something assigned to eofflag; minimal change required */
+ if (eofflag == NULL) {
+ eofflag = &internal_eofflag;
+ }
+
+ *(actualcount) = 0;
+ *(eofflag) = 0;
+
+ if ((uio_resid(uio) <= 0) || (uio_iovcnt(uio) > 1))
+ return (EINVAL);
+
+ if (VTOC(dvp)->c_bsdflags & UF_COMPRESSED) {
+ int compressed = hfs_file_is_compressed(VTOC(dvp), 0); /* 0 == take the cnode lock */
+
+ if (!compressed) {
+ error = check_for_dataless_file(dvp, NAMESPACE_HANDLER_READ_OP);
+ if (error) {
+ return error;
+ }
+ }
+ }
+
+ /*
+ * Take an exclusive directory lock since we manipulate the directory hints
+ */
+ if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
+ return (error);
+ }
+ dcp = VTOC(dvp);
+ hfsmp = VTOHFS(dvp);
+
+ dirchg = dcp->c_dirchangecnt;
+
+ /* Extract directory index and tag (sequence number) from uio_offset */
+ index = uio_offset(uio) & HFS_INDEX_MASK;
+ tag = uio_offset(uio) & ~HFS_INDEX_MASK;
+
+ /*
+ * We can't just use the valence as an optimization to avoid
+ * going to the catalog. It might be wrong (== 0), and that would
+ * cause us to avoid iterating the directory when it might actually have
+ * contents. Instead, use the catalog to tell us when we've hit EOF
+ * for this directory.
+ */
+
+ /* Get a buffer to hold packed attributes. */
+ fixedblocksize = (sizeof(u_int32_t) + hfs_attrblksize(alist)); /* 4 bytes for length */
+
+ if (!vap) {
+ maxattrblocksize = fixedblocksize;
+ if (alist->commonattr & ATTR_CMN_NAME)
+ maxattrblocksize += kHFSPlusMaxFileNameBytes + 1;
+
+ attrbufptr = hfs_malloc(maxattrblocksize);
+ attrptr = attrbufptr;
+ varptr = (char *)attrbufptr + fixedblocksize; /* Point to variable-length storage */
+ } else {
+ if ((alist->commonattr & ATTR_CMN_NAME) && !vap->va_name) {
+ namebuf = hfs_malloc(MAXPATHLEN);
+ if (!namebuf) {
+ error = ENOMEM;
+ goto exit2;
+ }
+ vap->va_name = namebuf;
+ }
+ }
+ /* Get a detached directory hint (cnode must be locked exclusive) */
+ dirhint = hfs_getdirhint(dcp, ((index - 1) & HFS_INDEX_MASK) | tag, TRUE);
+
+ /* Hide tag from catalog layer. */
+ dirhint->dh_index &= HFS_INDEX_MASK;
+ if (dirhint->dh_index == HFS_INDEX_MASK) {
+ dirhint->dh_index = -1;
+ }
+
+ /*
+ * Obtain a list of catalog entries and pack their attributes until
+ * the output buffer is full or maxcount entries have been packed.
+ */
+
+ /*
+ * Constrain our list size.
+ */ + maxentries = uio_resid(uio) / (fixedblocksize + HFS_AVERAGE_NAME_SIZE); + /* There is maxcount for the bulk vnop */ + if (!vap) + maxentries = min(maxentries, maxcount); + maxentries = min(maxentries, MAXCATENTRIES); + if (maxentries < 1) { + error = EINVAL; + goto exit2; + } + + /* Initialize a catalog entry list. */ + ce_list = hfs_mallocz(CE_LIST_SIZE(maxentries)); + ce_list->maxentries = maxentries; + + /* + * Populate the ce_list from the catalog file. + */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + error = cat_getentriesattr(hfsmp, dirhint, ce_list, &reachedeof); + /* Don't forget to release the descriptors later! */ + + hfs_systemfile_unlock(hfsmp, lockflags); + + if ((error == ENOENT) || (reachedeof != 0)) { + *(eofflag) = TRUE; + error = 0; + } + if (error) { + goto exit1; + } + + dcp->c_touch_acctime = TRUE; + + /* + * Check for a FS corruption in the valence. We're holding the cnode lock + * exclusive since we need to serialize the directory hints, so if we found + * that the valence reported 0, but we actually found some items here, then + * silently minimally self-heal and bump the valence to 1. + */ + if ((dcp->c_entries == 0) && (ce_list->realentries > 0)) { + dcp->c_entries++; + dcp->c_flag |= C_MODIFIED; + printf("%s : repairing valence to non-zero!\n", __FUNCTION__); + /* force an update on dcp while we're still holding the lock. */ + hfs_update(dvp, 0); + } + + /* + * Drop the directory lock so we don't deadlock when we: + * - acquire a child cnode lock + * - make calls to vnode_authorize() + * - make calls to kauth_cred_ismember_gid() + */ + hfs_unlock(dcp); + dcp = NULL; + + /* Process the catalog entries. */ + for (i = 0; i < (int)ce_list->realentries; ++i) { + struct cnode *cp = NULL; + struct vnode *vp = NULL; + struct cat_desc * cdescp; + struct cat_attr * cattrp; + struct cat_fork c_datafork; + struct cat_fork c_rsrcfork; + + bzero(&c_datafork, sizeof(c_datafork)); + bzero(&c_rsrcfork, sizeof(c_rsrcfork)); + cdescp = &ce_list->entry[i].ce_desc; + cattrp = &ce_list->entry[i].ce_attr; + c_datafork.cf_size = ce_list->entry[i].ce_datasize; + c_datafork.cf_blocks = ce_list->entry[i].ce_datablks; + c_rsrcfork.cf_size = ce_list->entry[i].ce_rsrcsize; + c_rsrcfork.cf_blocks = ce_list->entry[i].ce_rsrcblks; + + if (((alist->commonattr & ATTR_CMN_USERACCESS) && + (cattrp->ca_recflags & kHFSHasSecurityMask)) +#if CONFIG_PROTECT + || + ((alist->commonattr & ATTR_CMN_DATA_PROTECT_FLAGS) && (vap)) +#endif + ) { + /* + * Obtain vnode for our vnode_authorize() calls. + */ + if (hfs_vget(hfsmp, cattrp->ca_fileid, &vp, 0, 0) != 0) { + vp = NULL; + } + } else if (vap || !(options & FSOPT_NOINMEMUPDATE)) { + /* Get in-memory cnode data (if any). */ + vp = hfs_chash_getvnode(hfsmp, cattrp->ca_fileid, 0, 0, 0); + } + if (vp != NULL) { + cp = VTOC(vp); + /* Only use cnode's decriptor for non-hardlinks */ + if (!(cp->c_flag & C_HARDLINK)) + cdescp = &cp->c_desc; + cattrp = &cp->c_attr; + if (cp->c_datafork) { + c_datafork.cf_size = cp->c_datafork->ff_size; + c_datafork.cf_blocks = cp->c_datafork->ff_blocks; + } + if (cp->c_rsrcfork) { + c_rsrcfork.cf_size = cp->c_rsrcfork->ff_size; + c_rsrcfork.cf_blocks = cp->c_rsrcfork->ff_blocks; + } + /* All done with cnode. 
*/ + hfs_unlock(cp); + cp = NULL; + } + + if (!vap) { + *((u_int32_t *)attrptr) = 0; + attrptr = ((u_int32_t *)attrptr) + 1; + attrblk.ab_attrlist = alist; + attrblk.ab_attrbufpp = &attrptr; + attrblk.ab_varbufpp = &varptr; + attrblk.ab_flags = 0; + attrblk.ab_blocksize = maxattrblocksize; + attrblk.ab_context = ctx; + + /* Pack catalog entries into attribute buffer. */ + hfs_packattrblk(&attrblk, hfsmp, vp, cdescp, cattrp, &c_datafork, &c_rsrcfork, ctx); + currattrbufsize = ((char *)varptr - (char *)attrbufptr); + + /* All done with vnode. */ + if (vp != NULL) { + vnode_put(vp); + vp = NULL; + } + + /* Make sure there's enough buffer space remaining. */ + // LP64todo - fix this! + if (uio_resid(uio) < 0 || + currattrbufsize > (u_int32_t)uio_resid(uio)) { + break; + } else { + *((u_int32_t *)attrbufptr) = currattrbufsize; + error = uiomove((caddr_t)attrbufptr, currattrbufsize, uio); + if (error != E_NONE) { + break; + } + attrptr = attrbufptr; + /* Point to variable-length storage */ + varptr = (char *)attrbufptr + fixedblocksize; + /* Save the last valid catalog entry */ + lastdescp = &ce_list->entry[i].ce_desc; + index++; + *actualcount += 1; + + /* Termination checks */ + if ((--maxcount <= 0) || + // LP64todo - fix this! + uio_resid(uio) < 0 || + ((u_int32_t)uio_resid(uio) < (fixedblocksize + HFS_AVERAGE_NAME_SIZE))){ + break; + } + } + } else { + size_t orig_resid = (size_t)uio_resid(uio); + size_t resid; + + get_vattr_data_for_attrs(alist, vap, hfsmp, vp, cdescp, + cattrp, &c_datafork, &c_rsrcfork, ctx); + +#if CONFIG_PROTECT + if ((alist->commonattr & ATTR_CMN_DATA_PROTECT_FLAGS) && + vp) { + cp_key_class_t class; + + if (!cp_vnode_getclass(vp, &class)) { + VATTR_RETURN(vap, va_dataprotect_class, + (uint32_t)class); + } + } +#endif + error = vfs_attr_pack(vp, uio, alist, options, vap, + NULL, ctx); + + /* All done with vnode. */ + if (vp) { + vnode_put(vp); + vp = NULL; + } + + resid = uio_resid(uio); + + /* Was this entry succesful ? */ + if (error || resid == orig_resid) + break; + + /* Save the last valid catalog entry */ + lastdescp = &ce_list->entry[i].ce_desc; + index++; + *actualcount += 1; + + /* Do we have the bare minimum for the next entry ? */ + if (resid < sizeof(uint32_t)) + break; + } + } /* for each catalog entry */ + + /* + * If we couldn't fit all the entries requested in the user's buffer, + * it's not EOF. + */ + if (*eofflag && (*actualcount < (int)ce_list->realentries)) + *eofflag = 0; + + /* If we skipped catalog entries for reserved files that should + * not be listed in namespace, update the index accordingly. + */ + if (ce_list->skipentries) { + index += ce_list->skipentries; + ce_list->skipentries = 0; + } + + /* + * If there are more entries then save the last name. + * Key this behavior based on whether or not we observed EOFFLAG. + * + * Do not use the valence as a way to determine if we hit EOF, since + * it can be wrong. Use the catalog's output only. 
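+ *
+ * (Sketch of how this saved position is consumed: exit1 below packs it
+ * with uio_setoffset(uio, index | tag), and the next call into this
+ * function recovers it near the top via
+ *     index = uio_offset(uio) & HFS_INDEX_MASK;
+ *     tag = uio_offset(uio) & ~HFS_INDEX_MASK;
+ * using the tag to look up this directory hint again with hfs_getdirhint().)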
+ */ + if ((*(eofflag) == 0) && (lastdescp != NULL)) { + + /* Remember last entry */ + if ((dirhint->dh_desc.cd_flags & CD_HASBUF) && + (dirhint->dh_desc.cd_nameptr != NULL)) { + dirhint->dh_desc.cd_flags &= ~CD_HASBUF; + vfs_removename((const char *)dirhint->dh_desc.cd_nameptr); + } + if (lastdescp->cd_nameptr != NULL) { + dirhint->dh_desc.cd_namelen = lastdescp->cd_namelen; + dirhint->dh_desc.cd_nameptr = (const u_int8_t *) + vfs_addname((const char *)lastdescp->cd_nameptr, lastdescp->cd_namelen, 0, 0); + dirhint->dh_desc.cd_flags |= CD_HASBUF; + } else { + dirhint->dh_desc.cd_namelen = 0; + dirhint->dh_desc.cd_nameptr = NULL; + } + dirhint->dh_index = index - 1; + dirhint->dh_desc.cd_cnid = lastdescp->cd_cnid; + dirhint->dh_desc.cd_hint = lastdescp->cd_hint; + dirhint->dh_desc.cd_encoding = lastdescp->cd_encoding; + } + + /* All done with the catalog descriptors. */ + for (i = 0; i < (int)ce_list->realentries; ++i) + cat_releasedesc(&ce_list->entry[i].ce_desc); + ce_list->realentries = 0; + + (void) hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + dcp = VTOC(dvp); + +exit1: + /* Pack directory index and tag into uio_offset. */ + while (tag == 0) tag = (++dcp->c_dirhinttag) << HFS_INDEX_BITS; + uio_setoffset(uio, index | tag); + dirhint->dh_index |= tag; + +exit2: + if (newstate) + *newstate = dirchg; + + /* + * Drop directory hint on error or if there are no more entries, + * only if EOF was seen. + */ + if (dirhint) { + if ((error != 0) || *(eofflag)) + hfs_reldirhint(dcp, dirhint); + else + hfs_insertdirhint(dcp, dirhint); + } + if (namebuf) { + hfs_free(namebuf, MAXPATHLEN); + vap->va_name = NULL; + } + if (attrbufptr) + hfs_free(attrbufptr, maxattrblocksize); + if (ce_list) + hfs_free(ce_list, CE_LIST_SIZE(maxentries)); + + if (vap && *actualcount && error) + error = 0; + + hfs_unlock(dcp); + return (error); +} + + +/*==================== Attribute list support routines ====================*/ + +/* + * Pack cnode attributes into an attribute block. + */ +void +hfs_packattrblk(struct attrblock *abp, + struct hfsmount *hfsmp, + struct vnode *vp, + struct cat_desc *descp, + struct cat_attr *attrp, + struct cat_fork *datafork, + struct cat_fork *rsrcfork, + struct vfs_context *ctx) +{ + struct attrlist *attrlistp = abp->ab_attrlist; + + if (attrlistp->commonattr) + packcommonattr(abp, hfsmp, vp, descp, attrp, ctx); + + if (attrlistp->dirattr && S_ISDIR(attrp->ca_mode)) + packdirattr(abp, hfsmp, vp, descp,attrp); + + if (attrlistp->fileattr && !S_ISDIR(attrp->ca_mode)) + packfileattr(abp, hfsmp, attrp, datafork, rsrcfork, vp); +} + +static char* +mountpointname(struct mount *mp) +{ + struct vfsstatfs *vsfs = vfs_statfs(mp); + + size_t namelength = strlen(vsfs->f_mntonname); + int foundchars = 0; + char *c; + + if (namelength == 0) + return (NULL); + + /* + * Look backwards through the name string, looking for + * the first slash encountered (which must precede the + * last part of the pathname). 
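+ *
+ * For example (hypothetical mount path): for a volume mounted at
+ * "/Volumes/blah-1" this returns a pointer to "blah-1"; for a volume
+ * mounted at "/" the backwards scan never sees a non-slash character, so
+ * the loop falls through and f_mntonname itself is returned below.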
+ */
+ for (c = vsfs->f_mntonname + namelength - 1;
+ namelength > 0; --c, --namelength) {
+ if (*c != '/') {
+ foundchars = 1;
+ } else if (foundchars) {
+ return (c + 1);
+ }
+ }
+
+ return vsfs->f_mntonname;
+}
+
+
+static void
+packnameattr(
+ struct attrblock *abp,
+ struct vnode *vp,
+ const u_int8_t *name,
+ int namelen)
+{
+ void *varbufptr;
+ struct attrreference * attr_refptr;
+ char *mpname;
+ size_t mpnamelen;
+ u_int32_t attrlength;
+ u_int8_t empty = 0;
+
+ /* A cnode's name may be incorrect for the root of a mounted
+ * filesystem (it can be mounted on a different directory name
+ * than the name of the volume, such as "blah-1"). So for the
+ * root directory, it's best to return the last element of the
+ * location where the volume's mounted:
+ */
+ if ((vp != NULL) && vnode_isvroot(vp) &&
+ (mpname = mountpointname(vnode_mount(vp)))) {
+ mpnamelen = strlen(mpname);
+
+ /* Trim off any trailing slashes: */
+ while ((mpnamelen > 0) && (mpname[mpnamelen-1] == '/'))
+ --mpnamelen;
+
+ /* If there's anything left, use it instead of the volume's name */
+ if (mpnamelen > 0) {
+ name = (u_int8_t *)mpname;
+ namelen = mpnamelen;
+ }
+ }
+ if (name == NULL) {
+ name = &empty;
+ namelen = 0;
+ }
+
+ varbufptr = *abp->ab_varbufpp;
+ attr_refptr = (struct attrreference *)(*abp->ab_attrbufpp);
+
+ attrlength = namelen + 1;
+ attr_refptr->attr_dataoffset = (char *)varbufptr - (char *)attr_refptr;
+ attr_refptr->attr_length = attrlength;
+ (void) strncpy((char *)varbufptr, (const char *) name, attrlength);
+ /*
+ * Advance beyond the space just allocated and
+ * round up to the next 4-byte boundary:
+ */
+ varbufptr = ((char *)varbufptr) + attrlength + ((4 - (attrlength & 3)) & 3);
+ ++attr_refptr;
+
+ *abp->ab_attrbufpp = attr_refptr;
+ *abp->ab_varbufpp = varbufptr;
+}
+
+static void
+packcommonattr(
+ struct attrblock *abp,
+ struct hfsmount *hfsmp,
+ struct vnode *vp,
+ struct cat_desc * cdp,
+ struct cat_attr * cap,
+ struct vfs_context * ctx)
+{
+ attrgroup_t attr = abp->ab_attrlist->commonattr;
+ struct mount *mp = HFSTOVFS(hfsmp);
+ void *attrbufptr = *abp->ab_attrbufpp;
+ void *varbufptr = *abp->ab_varbufpp;
+ boolean_t is_64_bit = proc_is64bit(vfs_context_proc(ctx));
+ uid_t cuid = 1;
+ int isroot = 0;
+
+ if (attr & (ATTR_CMN_OWNERID | ATTR_CMN_GRPID)) {
+ cuid = kauth_cred_getuid(vfs_context_ucred(ctx));
+ isroot = cuid == 0;
+ }
+
+ if (ATTR_CMN_NAME & attr) {
+ packnameattr(abp, vp, cdp->cd_nameptr, cdp->cd_namelen);
+ attrbufptr = *abp->ab_attrbufpp;
+ varbufptr = *abp->ab_varbufpp;
+ }
+ if (ATTR_CMN_DEVID & attr) {
+ *((dev_t *)attrbufptr) = hfsmp->hfs_raw_dev;
+ attrbufptr = ((dev_t *)attrbufptr) + 1;
+ }
+ if (ATTR_CMN_FSID & attr) {
+ fsid_t fsid;
+
+ fsid.val[0] = hfsmp->hfs_raw_dev;
+ fsid.val[1] = vfs_typenum(mp);
+ *((fsid_t *)attrbufptr) = fsid;
+ attrbufptr = ((fsid_t *)attrbufptr) + 1;
+ }
+ if (ATTR_CMN_OBJTYPE & attr) {
+ *((fsobj_type_t *)attrbufptr) = IFTOVT(cap->ca_mode);
+ attrbufptr = ((fsobj_type_t *)attrbufptr) + 1;
+ }
+ if (ATTR_CMN_OBJTAG & attr) {
+ *((fsobj_tag_t *)attrbufptr) = VT_HFS;
+ attrbufptr = ((fsobj_tag_t *)attrbufptr) + 1;
+ }
+ /*
+ * Exporting file IDs from HFS Plus:
+ *
+ * For "normal" files the c_fileid is the same value as the
+ * c_cnid. But for hard link files, they are different - the
+ * c_cnid belongs to the active directory entry (ie the link)
+ * and the c_fileid is for the actual inode (ie the data file).
+ * + * The stat call (getattr) will always return the c_fileid + * and Carbon APIs, which are hardlink-ignorant, will always + * receive the c_cnid (from getattrlist). + */ + if (ATTR_CMN_OBJID & attr) { + ((fsobj_id_t *)attrbufptr)->fid_objno = cdp->cd_cnid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + if (ATTR_CMN_OBJPERMANENTID & attr) { + ((fsobj_id_t *)attrbufptr)->fid_objno = cdp->cd_cnid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + if (ATTR_CMN_PAROBJID & attr) { + ((fsobj_id_t *)attrbufptr)->fid_objno = cdp->cd_parentcnid; + ((fsobj_id_t *)attrbufptr)->fid_generation = 0; + attrbufptr = ((fsobj_id_t *)attrbufptr) + 1; + } + if (ATTR_CMN_SCRIPT & attr) { + *((text_encoding_t *)attrbufptr) = cdp->cd_encoding; + attrbufptr = ((text_encoding_t *)attrbufptr) + 1; + } + if (ATTR_CMN_CRTIME & attr) { + if (is_64_bit) { + ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_itime; + ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; + } + else { + ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_itime; + ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; + } + } + if (ATTR_CMN_MODTIME & attr) { + if (is_64_bit) { + ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_mtime; + ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; + } + else { + ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_mtime; + ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; + } + } + if (ATTR_CMN_CHGTIME & attr) { + if (is_64_bit) { + ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_ctime; + ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; + } + else { + ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_ctime; + ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; + } + } + if (ATTR_CMN_ACCTIME & attr) { + if (is_64_bit) { + ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_atime; + ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; + } + else { + ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_atime; + ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; + } + } + if (ATTR_CMN_BKUPTIME & attr) { + if (is_64_bit) { + ((struct user64_timespec *)attrbufptr)->tv_sec = cap->ca_btime; + ((struct user64_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user64_timespec *)attrbufptr) + 1; + } + else { + ((struct user32_timespec *)attrbufptr)->tv_sec = cap->ca_btime; + ((struct user32_timespec *)attrbufptr)->tv_nsec = 0; + attrbufptr = ((struct user32_timespec *)attrbufptr) + 1; + } + } + if (ATTR_CMN_FNDRINFO & attr) { + u_int8_t *finfo = NULL; + bcopy(&cap->ca_finderinfo, attrbufptr, sizeof(u_int8_t) * 32); + finfo = (u_int8_t*)attrbufptr; + + /* Don't expose a symlink's private type/creator. 
*/ + if (S_ISLNK(cap->ca_mode)) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)attrbufptr; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* advance 16 bytes into the attrbuf */ + finfo = finfo + 16; + + /* also don't expose the date_added or write_gen_counter fields */ + if (S_ISREG(cap->ca_mode) || S_ISLNK(cap->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } + else if (S_ISDIR(cap->ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } + + attrbufptr = (char *)attrbufptr + sizeof(u_int8_t) * 32; + } + if (ATTR_CMN_OWNERID & attr) { + uid_t nuid = cap->ca_uid; + + if (!isroot) { + if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) + nuid = cuid; + else if (nuid == UNKNOWNUID) + nuid = cuid; + } + + *((uid_t *)attrbufptr) = nuid; + attrbufptr = ((uid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_GRPID & attr) { + gid_t ngid = cap->ca_gid; + + if (!isroot) { + gid_t cgid = kauth_cred_getgid(vfs_context_ucred(ctx)); + if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) + ngid = cgid; + else if (ngid == UNKNOWNUID) + ngid = cgid; + } + + *((gid_t *)attrbufptr) = ngid; + attrbufptr = ((gid_t *)attrbufptr) + 1; + } + if (ATTR_CMN_ACCESSMASK & attr) { + /* + * [2856576] Since we are dynamically changing the owner, also + * effectively turn off the set-user-id and set-group-id bits, + * just like chmod(2) would when changing ownership. This prevents + * a security hole where set-user-id programs run as whoever is + * logged on (or root if nobody is logged in yet!) + */ + *((u_int32_t *)attrbufptr) = (cap->ca_uid == UNKNOWNUID) ? + cap->ca_mode & ~(S_ISUID | S_ISGID) : cap->ca_mode; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FLAGS & attr) { + *((u_int32_t *)attrbufptr) = cap->ca_flags; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_USERACCESS & attr) { + u_int32_t user_access; + + /* Take the long path when we have an ACL */ + if ((vp != NULLVP) && (cap->ca_recflags & kHFSHasSecurityMask)) { + user_access = hfs_real_user_access(vp, abp->ab_context); + } else { + user_access = DerivePermissionSummary(cap->ca_uid, cap->ca_gid, + cap->ca_mode, mp, vfs_context_ucred(ctx), 0); + } + /* Also consider READ-ONLY file system. 
*/ + if (vfs_flags(mp) & MNT_RDONLY) { + user_access &= ~W_OK; + } + /* Locked objects are not writable either */ + if ((cap->ca_flags & UF_IMMUTABLE) && (vfs_context_suser(abp->ab_context) != 0)) + user_access &= ~W_OK; + if ((cap->ca_flags & SF_IMMUTABLE) && (vfs_context_suser(abp->ab_context) == 0)) + user_access &= ~W_OK; + + *((u_int32_t *)attrbufptr) = user_access; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_CMN_FILEID & attr) { + *((u_int64_t *)attrbufptr) = cap->ca_fileid; + attrbufptr = ((u_int64_t *)attrbufptr) + 1; + } + if (ATTR_CMN_PARENTID & attr) { + *((u_int64_t *)attrbufptr) = cdp->cd_parentcnid; + attrbufptr = ((u_int64_t *)attrbufptr) + 1; + } + + *abp->ab_attrbufpp = attrbufptr; + *abp->ab_varbufpp = varbufptr; +} + +static void +packdirattr( + struct attrblock *abp, + struct hfsmount *hfsmp, + struct vnode *vp, + struct cat_desc * descp, + struct cat_attr * cattrp) +{ + attrgroup_t attr = abp->ab_attrlist->dirattr; + void *attrbufptr = *abp->ab_attrbufpp; + u_int32_t entries; + + /* + * The DIR_LINKCOUNT is the count of real directory hard links. + * (i.e. its not the sum of the implied "." and ".." references + * typically used in stat's st_nlink field) + */ + if (ATTR_DIR_LINKCOUNT & attr) { + *((u_int32_t *)attrbufptr) = cattrp->ca_linkcount; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_DIR_ENTRYCOUNT & attr) { + entries = cattrp->ca_entries; + + if (descp->cd_parentcnid == kHFSRootParentID) { + if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) + --entries; /* hide private dir */ + if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) + --entries; /* hide private dir */ + if (hfsmp->jnl || + ((hfsmp->vcbAtrb & kHFSVolumeJournaledMask) && + (hfsmp->hfs_flags & HFS_READ_ONLY))) + entries -= 2; /* hide the journal files */ + } + + *((u_int32_t *)attrbufptr) = entries; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_DIR_MOUNTSTATUS & attr) { + if (vp != NULL && vnode_mountedhere(vp) != NULL) + *((u_int32_t *)attrbufptr) = DIR_MNTSTATUS_MNTPOINT; + else + *((u_int32_t *)attrbufptr) = 0; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + *abp->ab_attrbufpp = attrbufptr; +} + +static void +packfileattr( + struct attrblock *abp, + struct hfsmount *hfsmp, + struct cat_attr *cattrp, + struct cat_fork *datafork, + struct cat_fork *rsrcfork, + struct vnode *vp) +{ +#if !HFS_COMPRESSION +#pragma unused(vp) +#endif + attrgroup_t attr = abp->ab_attrlist->fileattr; + void *attrbufptr = *abp->ab_attrbufpp; + void *varbufptr = *abp->ab_varbufpp; + u_int32_t allocblksize; + + allocblksize = HFSTOVCB(hfsmp)->blockSize; + + off_t datasize = datafork->cf_size; + off_t totalsize = datasize + rsrcfork->cf_size; +#if HFS_COMPRESSION + int handle_compressed; + handle_compressed = (cattrp->ca_flags & UF_COMPRESSED);// && hfs_file_is_compressed(VTOC(vp), 1); + + if (handle_compressed) { + if (attr & (ATTR_FILE_DATALENGTH|ATTR_FILE_TOTALSIZE)) { + if ( 0 == hfs_uncompressed_size_of_compressed_file(hfsmp, vp, cattrp->ca_fileid, &datasize, 1) ) { /* 1 == don't take the cnode lock */ + /* total size of a compressed file is just the data size */ + totalsize = datasize; + } + } + } +#endif + + if (ATTR_FILE_LINKCOUNT & attr) { + *((u_int32_t *)attrbufptr) = cattrp->ca_linkcount; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_TOTALSIZE & attr) { + *((off_t *)attrbufptr) = totalsize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_ALLOCSIZE & attr) { + *((off_t *)attrbufptr) = + (off_t)cattrp->ca_blocks * 
(off_t)allocblksize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_IOBLOCKSIZE & attr) { + *((u_int32_t *)attrbufptr) = hfsmp->hfs_logBlockSize; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_CLUMPSIZE & attr) { + *((u_int32_t *)attrbufptr) = hfsmp->vcbClpSiz; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + if (ATTR_FILE_DEVTYPE & attr) { + if (S_ISBLK(cattrp->ca_mode) || S_ISCHR(cattrp->ca_mode)) + *((u_int32_t *)attrbufptr) = (u_int32_t)cattrp->ca_rdev; + else + *((u_int32_t *)attrbufptr) = 0; + attrbufptr = ((u_int32_t *)attrbufptr) + 1; + } + + if (ATTR_FILE_DATALENGTH & attr) { + *((off_t *)attrbufptr) = datasize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + +#if HFS_COMPRESSION + /* fake the data fork size on a decmpfs compressed file to reflect the + * uncompressed size. This ensures proper reading and copying of these files. + * NOTE: we may need to get the vnode here because the vnode parameter + * passed by hfs_vnop_readdirattr() may be null. + */ + + if ( handle_compressed ) { + if (attr & ATTR_FILE_DATAALLOCSIZE) { + *((off_t *)attrbufptr) = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (attr & ATTR_FILE_RSRCLENGTH) { + *((off_t *)attrbufptr) = 0; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (attr & ATTR_FILE_RSRCALLOCSIZE) { + *((off_t *)attrbufptr) = 0; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + } + else +#endif + { + if (ATTR_FILE_DATAALLOCSIZE & attr) { + *((off_t *)attrbufptr) = (off_t)datafork->cf_blocks * (off_t)allocblksize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_RSRCLENGTH & attr) { + *((off_t *)attrbufptr) = rsrcfork->cf_size; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + if (ATTR_FILE_RSRCALLOCSIZE & attr) { + *((off_t *)attrbufptr) = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; + attrbufptr = ((off_t *)attrbufptr) + 1; + } + } + *abp->ab_attrbufpp = attrbufptr; + *abp->ab_varbufpp = varbufptr; +} + +/* + * Calculate the total size of an attribute block. 
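+ *
+ * Worked example (assuming sizeof(struct attrreference) == 8): a request
+ * for ATTR_CMN_NAME | ATTR_CMN_FILEID adds 8 + 8 = 16 bytes here; the
+ * caller (hfs_readdirattr_internal) then adds a 4-byte length prefix,
+ * giving a fixedblocksize of 20, and the name string itself goes into the
+ * variable-length area that follows, 4-byte aligned, with the
+ * attrreference recording its offset and length.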
+ */ +int +hfs_attrblksize(struct attrlist *attrlist) +{ + int size; + attrgroup_t a; + int sizeof_timespec; + boolean_t is_64_bit = proc_is64bit(current_proc()); + + if (is_64_bit) + sizeof_timespec = sizeof(struct user64_timespec); + else + sizeof_timespec = sizeof(struct user32_timespec); + + hfs_assert((attrlist->commonattr & ~ATTR_CMN_VALIDMASK) == 0); + + hfs_assert((attrlist->volattr & ~ATTR_VOL_VALIDMASK) == 0); + + hfs_assert((attrlist->dirattr & ~ATTR_DIR_VALIDMASK) == 0); + + hfs_assert((attrlist->fileattr & ~ATTR_FILE_VALIDMASK) == 0); + + // disable this because it will break the simulator/build machines each + // time a new _CMNEXT_ bit is added + // hfs_assert(((attrlist->forkattr & ~ATTR_FORK_VALIDMASK) == 0) || + // ((attrlist->forkattr & ~ATTR_CMNEXT_VALIDMASK) == 0)); + + size = 0; + + if ((a = attrlist->commonattr) != 0) { + if (a & ATTR_CMN_NAME) size += sizeof(struct attrreference); + if (a & ATTR_CMN_DEVID) size += sizeof(dev_t); + if (a & ATTR_CMN_FSID) size += sizeof(fsid_t); + if (a & ATTR_CMN_OBJTYPE) size += sizeof(fsobj_type_t); + if (a & ATTR_CMN_OBJTAG) size += sizeof(fsobj_tag_t); + if (a & ATTR_CMN_OBJID) size += sizeof(fsobj_id_t); + if (a & ATTR_CMN_OBJPERMANENTID) size += sizeof(fsobj_id_t); + if (a & ATTR_CMN_PAROBJID) size += sizeof(fsobj_id_t); + if (a & ATTR_CMN_SCRIPT) size += sizeof(text_encoding_t); + if (a & ATTR_CMN_CRTIME) size += sizeof_timespec; + if (a & ATTR_CMN_MODTIME) size += sizeof_timespec; + if (a & ATTR_CMN_CHGTIME) size += sizeof_timespec; + if (a & ATTR_CMN_ACCTIME) size += sizeof_timespec; + if (a & ATTR_CMN_BKUPTIME) size += sizeof_timespec; + if (a & ATTR_CMN_FNDRINFO) size += 32 * sizeof(u_int8_t); + if (a & ATTR_CMN_OWNERID) size += sizeof(uid_t); + if (a & ATTR_CMN_GRPID) size += sizeof(gid_t); + if (a & ATTR_CMN_ACCESSMASK) size += sizeof(u_int32_t); + if (a & ATTR_CMN_FLAGS) size += sizeof(u_int32_t); + if (a & ATTR_CMN_USERACCESS) size += sizeof(u_int32_t); + if (a & ATTR_CMN_FILEID) size += sizeof(u_int64_t); + if (a & ATTR_CMN_PARENTID) size += sizeof(u_int64_t); + } + if ((a = attrlist->dirattr) != 0) { + if (a & ATTR_DIR_LINKCOUNT) size += sizeof(u_int32_t); + if (a & ATTR_DIR_ENTRYCOUNT) size += sizeof(u_int32_t); + if (a & ATTR_DIR_MOUNTSTATUS) size += sizeof(u_int32_t); + } + if ((a = attrlist->fileattr) != 0) { + if (a & ATTR_FILE_LINKCOUNT) size += sizeof(u_int32_t); + if (a & ATTR_FILE_TOTALSIZE) size += sizeof(off_t); + if (a & ATTR_FILE_ALLOCSIZE) size += sizeof(off_t); + if (a & ATTR_FILE_IOBLOCKSIZE) size += sizeof(u_int32_t); + if (a & ATTR_FILE_CLUMPSIZE) size += sizeof(u_int32_t); + if (a & ATTR_FILE_DEVTYPE) size += sizeof(u_int32_t); + if (a & ATTR_FILE_DATALENGTH) size += sizeof(off_t); + if (a & ATTR_FILE_DATAALLOCSIZE) size += sizeof(off_t); + if (a & ATTR_FILE_RSRCLENGTH) size += sizeof(off_t); + if (a & ATTR_FILE_RSRCALLOCSIZE) size += sizeof(off_t); + } + + return (size); +} + +#define KAUTH_DIR_WRITE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | \ + KAUTH_VNODE_ADD_SUBDIRECTORY | \ + KAUTH_VNODE_DELETE_CHILD) + +#define KAUTH_DIR_READ_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY) + +#define KAUTH_DIR_EXECUTE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH) + +#define KAUTH_FILE_WRITE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA) + +#define KAUTH_FILE_READRIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA) + +#define KAUTH_FILE_EXECUTE_RIGHTS (KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE) + + +/* + * Compute the same [expensive] user_access value as getattrlist does + */ +static 
u_int32_t +hfs_real_user_access(vnode_t vp, vfs_context_t ctx) +{ + u_int32_t user_access = 0; + + if (vnode_isdir(vp)) { + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_WRITE_RIGHTS, ctx) == 0) + user_access |= W_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_READ_RIGHTS, ctx) == 0) + user_access |= R_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_DIR_EXECUTE_RIGHTS, ctx) == 0) + user_access |= X_OK; + } else { + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_WRITE_RIGHTS, ctx) == 0) + user_access |= W_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_READRIGHTS, ctx) == 0) + user_access |= R_OK; + if (vnode_authorize(vp, NULLVP, KAUTH_FILE_EXECUTE_RIGHTS, ctx) == 0) + user_access |= X_OK; + } + return (user_access); +} + + +u_int32_t +DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, mode_t obj_mode, + struct mount *mp, kauth_cred_t cred, __unused struct proc *p) +{ + u_int32_t permissions; + + if (obj_uid == UNKNOWNUID) + obj_uid = kauth_cred_getuid(cred); + + /* User id 0 (root) always gets access. */ + if (!suser(cred, NULL)) { + permissions = R_OK | W_OK | X_OK; + goto Exit; + }; + + /* Otherwise, check the owner. */ + if (hfs_owner_rights(VFSTOHFS(mp), obj_uid, cred, NULL, false) == 0) { + permissions = ((u_int32_t)obj_mode & S_IRWXU) >> 6; + goto Exit; + } + + /* Otherwise, check the groups. */ + if (! (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)) { + int is_member; + + if (kauth_cred_ismember_gid(cred, obj_gid, &is_member) == 0 && is_member) { + permissions = ((u_int32_t)obj_mode & S_IRWXG) >> 3; + goto Exit; + } + } + + /* Otherwise, settle for 'others' access. */ + permissions = (u_int32_t)obj_mode & S_IRWXO; + +Exit: + return (permissions); +} + + +/* + * =========================================================================== + * Support functions for filling up a vnode_attr structure based on attributes + * requested. + * =========================================================================== + */ +void +get_vattr_data_for_attrs(struct attrlist *alp, struct vnode_attr *vap, + struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc *descp, + struct cat_attr *atrp, struct cat_fork *datafork, struct cat_fork *rsrcfork, + vfs_context_t ctx) +{ + if (alp->commonattr || alp->forkattr) { + vattr_data_for_common_attrs(alp, vap, hfsmp, vp, descp, atrp, + ctx); + } + + if (alp->dirattr && S_ISDIR(atrp->ca_mode)) + vattr_data_for_dir_attrs(alp, vap, hfsmp, vp, descp, atrp); + + if (alp->fileattr && !S_ISDIR(atrp->ca_mode)) { + vattr_data_for_file_attrs(alp, vap, hfsmp, atrp, datafork, + rsrcfork, vp); + } +} + +static void +copy_name_attr(struct vnode_attr *vap, struct vnode *vp, const u_int8_t *name, + int namelen) +{ + char *mpname; + size_t mpnamelen; + u_int32_t attrlength; + u_int8_t empty = 0; + + /* A cnode's name may be incorrect for the root of a mounted + * filesystem (it can be mounted on a different directory name + * than the name of the volume, such as "blah-1"). 
So for the
+ * root directory, it's best to return the last element of the
+ * location where the volume's mounted:
+ */
+ if ((vp != NULL) && vnode_isvroot(vp) &&
+ (mpname = mountpointname(vnode_mount(vp)))) {
+ mpnamelen = strlen(mpname);
+
+ /* Trim off any trailing slashes: */
+ while ((mpnamelen > 0) && (mpname[mpnamelen-1] == '/'))
+ --mpnamelen;
+
+ /* If there's anything left, use it instead of the volume's name */
+ if (mpnamelen > 0) {
+ name = (u_int8_t *)mpname;
+ namelen = mpnamelen;
+ }
+ }
+
+ if (name == NULL) {
+ name = &empty;
+ namelen = 0;
+ }
+
+ attrlength = namelen + 1;
+ (void) strncpy((char *)vap->va_name, (const char *) name, attrlength);
+ /*
+ * round up to 8 and zero out the rounded up bytes.
+ */
+ attrlength = min(kHFSPlusMaxFileNameBytes, ((attrlength + 7) & ~0x07));
+ bzero(vap->va_name + attrlength, kHFSPlusMaxFileNameBytes - attrlength);
+}
+
+static void
+vattr_data_for_common_attrs( struct attrlist *alp, struct vnode_attr *vap,
+ struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc *cdp,
+ struct cat_attr *cap, vfs_context_t ctx)
+{
+ attrgroup_t attr = alp->commonattr;
+ struct mount *mp = HFSTOVFS(hfsmp);
+ uid_t cuid = 1;
+ int isroot = 0;
+
+ if (attr & (ATTR_CMN_OWNERID | ATTR_CMN_GRPID)) {
+ cuid = kauth_cred_getuid(vfs_context_ucred(ctx));
+ isroot = cuid == 0;
+ }
+
+ if (ATTR_CMN_NAME & attr) {
+ if (vap->va_name) {
+ copy_name_attr(vap, vp, cdp->cd_nameptr,
+ cdp->cd_namelen);
+ VATTR_SET_SUPPORTED(vap, va_name);
+ } else {
+ VATTR_CLEAR_SUPPORTED(vap, va_name);
+ }
+ }
+
+ if (ATTR_CMN_DEVID & attr) {
+ vap->va_devid = hfsmp->hfs_raw_dev;
+ VATTR_SET_SUPPORTED(vap, va_devid);
+ }
+
+ if (ATTR_CMN_FSID & attr) {
+ vap->va_fsid64.val[0] = hfsmp->hfs_raw_dev;
+ vap->va_fsid64.val[1] = vfs_typenum(mp);
+ VATTR_SET_SUPPORTED(vap, va_fsid64);
+ }
+ /*
+ * We always provide the objtype even if not asked because VFS helper
+ * functions depend on knowing the object's type.
+ */
+ vap->va_objtype = IFTOVT(cap->ca_mode);
+ VATTR_SET_SUPPORTED(vap, va_objtype);
+
+ if (ATTR_CMN_OBJTAG & attr) {
+ vap->va_objtag = VT_HFS;
+ VATTR_SET_SUPPORTED(vap, va_objtag);
+ }
+ /*
+ * Exporting file IDs from HFS Plus:
+ *
+ * For "normal" files the c_fileid is the same value as the
+ * c_cnid. But for hard link files, they are different - the
+ * c_cnid belongs to the active directory entry (ie the link)
+ * and the c_fileid is for the actual inode (ie the data file).
+ *
+ * The stat call (getattr) will always return the c_fileid
+ * and Carbon APIs, which are hardlink-ignorant, will always
+ * receive the c_cnid (from getattrlist).
+ *
+ * Forkattrs are now repurposed for Common Extended Attributes.
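+ *
+ * Hypothetical illustration (the IDs are invented for the example): an
+ * inode with c_fileid 1234 reached through two hard links with c_cnids
+ * 5678 and 5679 reports va_fileid == 1234 through either link, while
+ * va_linkid (ATTR_CMN_OBJID / ATTR_CMNEXT_LINKID) is 5678 or 5679
+ * depending on which link was used to reach it.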
+ */ + if ((ATTR_CMN_OBJID & attr) || (ATTR_CMN_OBJPERMANENTID & attr) || + alp->forkattr & ATTR_CMNEXT_LINKID) { + vap->va_linkid = cdp->cd_cnid; + VATTR_SET_SUPPORTED(vap, va_linkid); + } + + if (ATTR_CMN_PAROBJID & attr) { + vap->va_parentid = cdp->cd_parentcnid; + VATTR_SET_SUPPORTED(vap, va_parentid); + } + + if (ATTR_CMN_SCRIPT & attr) { + vap->va_encoding = cdp->cd_encoding; + VATTR_SET_SUPPORTED(vap, va_encoding); + } + + if (ATTR_CMN_CRTIME & attr) { + vap->va_create_time.tv_sec = cap->ca_itime; + vap->va_create_time.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_create_time); + } + + if (ATTR_CMN_MODTIME & attr) { + vap->va_modify_time.tv_sec = cap->ca_mtime; + vap->va_modify_time.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_modify_time); + } + + if (ATTR_CMN_CHGTIME & attr) { + vap->va_change_time.tv_sec = cap->ca_ctime; + vap->va_change_time.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_change_time); + } + + if (ATTR_CMN_ACCTIME & attr) { + vap->va_access_time.tv_sec = cap->ca_atime; + vap->va_access_time.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_access_time); + } + + if (ATTR_CMN_BKUPTIME & attr) { + vap->va_backup_time.tv_sec = cap->ca_btime; + vap->va_backup_time.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_backup_time); + } + + if (ATTR_CMN_FNDRINFO & attr) { + u_int8_t *finfo = NULL; + + bcopy(&cap->ca_finderinfo, &vap->va_finderinfo[0], + sizeof(u_int8_t) * 32); + finfo = (u_int8_t*)(&vap->va_finderinfo[0]); + + /* Don't expose a symlink's private type/creator. */ + if (S_ISLNK(cap->ca_mode)) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)finfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* advance 16 bytes into the attrbuf */ + finfo = finfo + 16; + + /* also don't expose the date_added or write_gen_counter fields */ + if (S_ISREG(cap->ca_mode) || S_ISLNK(cap->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = + (struct FndrExtendedFileInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } else if (S_ISDIR(cap->ca_mode)) { + struct FndrExtendedDirInfo *extinfo = + (struct FndrExtendedDirInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } + + VATTR_SET_SUPPORTED(vap, va_finderinfo); + } + + if (ATTR_CMN_OWNERID & attr) { + uid_t nuid = cap->ca_uid; + + if (!isroot) { + if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) + nuid = cuid; + else if (nuid == UNKNOWNUID) + nuid = cuid; + } + + vap->va_uid = nuid; + VATTR_SET_SUPPORTED(vap, va_uid); + } + + if (ATTR_CMN_GRPID & attr) { + gid_t ngid = cap->ca_gid; + + if (!isroot) { + gid_t cgid = kauth_cred_getgid(vfs_context_ucred(ctx)); + if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) + ngid = cgid; + else if (ngid == UNKNOWNUID) + ngid = cgid; + } + + vap->va_gid = ngid; + VATTR_SET_SUPPORTED(vap, va_gid); + } + + if (ATTR_CMN_ACCESSMASK & attr) { + uint32_t nmode; + /* + * [2856576] Since we are dynamically changing the owner, also + * effectively turn off the set-user-id and set-group-id bits, + * just like chmod(2) would when changing ownership. This prevents + * a security hole where set-user-id programs run as whoever is + * logged on (or root if nobody is logged in yet!) + */ + nmode = (cap->ca_uid == UNKNOWNUID) ? 
+ cap->ca_mode & ~(S_ISUID | S_ISGID) : cap->ca_mode; + + vap->va_mode = nmode; + VATTR_SET_SUPPORTED(vap, va_mode); + } + + if (ATTR_CMN_FLAGS & attr) { + vap->va_flags = cap->ca_flags; + VATTR_SET_SUPPORTED(vap, va_flags); + } + + if (ATTR_CMN_GEN_COUNT & attr) { + vap->va_write_gencount = hfs_get_gencount_from_blob( + (const uint8_t *)cap->ca_finderinfo, cap->ca_mode); + VATTR_SET_SUPPORTED(vap, va_write_gencount); + } + + if (ATTR_CMN_DOCUMENT_ID & attr) { + vap->va_document_id = hfs_get_document_id_from_blob( + (const uint8_t *)cap->ca_finderinfo, cap->ca_mode); + VATTR_SET_SUPPORTED(vap, va_document_id); + } + + if (ATTR_CMN_USERACCESS & attr) { + u_int32_t user_access; + + /* Take the long path when we have an ACL */ + if ((vp != NULLVP) && (cap->ca_recflags & kHFSHasSecurityMask)) { + user_access = hfs_real_user_access(vp, ctx); + } else { + user_access = DerivePermissionSummary(cap->ca_uid, cap->ca_gid, + cap->ca_mode, mp, vfs_context_ucred(ctx), 0); + } + /* Also consider READ-ONLY file system. */ + if (vfs_flags(mp) & MNT_RDONLY) { + user_access &= ~W_OK; + } + /* Locked objects are not writable either */ + if ((cap->ca_flags & UF_IMMUTABLE) && (vfs_context_suser(ctx) != 0)) + user_access &= ~W_OK; + if ((cap->ca_flags & SF_IMMUTABLE) && (vfs_context_suser(ctx) == 0)) + user_access &= ~W_OK; + + vap->va_user_access = user_access; + VATTR_SET_SUPPORTED(vap, va_user_access); + } + + /* + * Right now the best we can do is tell if we *don't* have extended + * security (like hfs_vnop_getattr). + */ + if (ATTR_CMN_EXTENDED_SECURITY & attr) { + if (!(cap->ca_recflags & kHFSHasSecurityMask)) { + vap->va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; + VATTR_SET_SUPPORTED(vap, va_acl); + } + } + + if (ATTR_CMN_FILEID & attr) { + vap->va_fileid = cap->ca_fileid; + VATTR_SET_SUPPORTED(vap, va_fileid); + } + + if (ATTR_CMN_PARENTID & attr) { + vap->va_parentid = cdp->cd_parentcnid; + VATTR_SET_SUPPORTED(vap, va_parentid); + } + + if (ATTR_CMN_ADDEDTIME & attr) { + if (cap->ca_recflags & kHFSHasDateAddedMask) { + vap->va_addedtime.tv_sec = hfs_get_dateadded_from_blob( + (const uint8_t *)cap->ca_finderinfo, cap->ca_mode); + vap->va_addedtime.tv_nsec = 0; + VATTR_SET_SUPPORTED(vap, va_addedtime); + } + } +} + +static void +vattr_data_for_dir_attrs(struct attrlist *alp, struct vnode_attr *vap, + struct hfsmount *hfsmp, struct vnode *vp, struct cat_desc * descp, + struct cat_attr * cattrp) +{ + attrgroup_t attr = alp->dirattr; + u_int32_t entries; + + /* + * The DIR_LINKCOUNT is the count of real directory hard links. + * (i.e. its not the sum of the implied "." and ".." references + * typically used in stat's st_nlink field) + */ + if (ATTR_DIR_LINKCOUNT & attr) { + vap->va_dirlinkcount = cattrp->ca_linkcount; + VATTR_SET_SUPPORTED(vap, va_dirlinkcount); + } + if (ATTR_DIR_ENTRYCOUNT & attr) { + entries = cattrp->ca_entries; + + if (descp->cd_parentcnid == kHFSRootParentID) { + if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) + --entries; /* hide private dir */ + if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) + --entries; /* hide private dir */ + if (hfsmp->jnl || + ((hfsmp->vcbAtrb & kHFSVolumeJournaledMask) && + (hfsmp->hfs_flags & HFS_READ_ONLY))) + entries -= 2; /* hide the journal files */ + } + + vap->va_nchildren = entries; + VATTR_SET_SUPPORTED(vap, va_nchildren); + } + + if (ATTR_DIR_MOUNTSTATUS & attr) { + /* + * There is not vnode_attr for mount point status. + * XXX. Should there be ? 
+ */ + u_int32_t mstatus = 0; + + if (vp != NULL && vnode_mountedhere(vp) != NULL) + mstatus = DIR_MNTSTATUS_MNTPOINT; + } +} + +static void +vattr_data_for_file_attrs(struct attrlist *alp, struct vnode_attr *vap, + struct hfsmount *hfsmp, struct cat_attr *cattrp, struct cat_fork *datafork, + struct cat_fork *rsrcfork, struct vnode *vp) +{ +#if !HFS_COMPRESSION +#pragma unused(vp) +#endif + attrgroup_t attr = alp->fileattr; + off_t da_size, rsrc_len, rsrc_alloc; + u_int32_t allocblksize; + + allocblksize = HFSTOVCB(hfsmp)->blockSize; + + off_t datasize = datafork->cf_size; + off_t totalsize = datasize + rsrcfork->cf_size; +#if HFS_COMPRESSION + int handle_compressed; + handle_compressed = (cattrp->ca_flags & UF_COMPRESSED);// && hfs_file_is_compressed(VTOC(vp), 1); + + if (handle_compressed) { + if (attr & (ATTR_FILE_DATALENGTH|ATTR_FILE_TOTALSIZE)) { + if ( 0 == hfs_uncompressed_size_of_compressed_file(hfsmp, vp, cattrp->ca_fileid, &datasize, 1) ) { /* 1 == don't take the cnode lock */ + /* total size of a compressed file is just the data size */ + totalsize = datasize; + } + } + } +#endif + + if (ATTR_FILE_LINKCOUNT & attr) { + vap->va_nlink = cattrp->ca_linkcount; + VATTR_SET_SUPPORTED(vap, va_nlink); + } + if (ATTR_FILE_TOTALSIZE & attr) { + VATTR_RETURN(vap, va_total_size, totalsize); + } + if (ATTR_FILE_ALLOCSIZE & attr) { + VATTR_RETURN(vap, va_total_alloc, + (off_t)cattrp->ca_blocks * (off_t)allocblksize ); + } + if (ATTR_FILE_IOBLOCKSIZE & attr) { + VATTR_RETURN(vap, va_iosize, hfsmp->hfs_logBlockSize); + } + + /* ATTR_FILE_CLUMPSIZE is obsolete */ + + if (ATTR_FILE_DEVTYPE & attr) { + dev_t dev = 0; + + if (S_ISBLK(cattrp->ca_mode) || S_ISCHR(cattrp->ca_mode)) + dev = (u_int32_t)cattrp->ca_rdev; + + VATTR_RETURN(vap, va_rdev, dev); + } + + if (ATTR_FILE_DATALENGTH & attr) { + VATTR_RETURN(vap, va_data_size, datasize); + } +#if HFS_COMPRESSION + /* fake the data fork size on a decmpfs compressed file to reflect the + * uncompressed size. This ensures proper reading and copying of these + * files. + * NOTE: we may need to get the vnode here because the vnode parameter + * passed by hfs_vnop_readdirattr() may be null. + */ + + if (handle_compressed) { + da_size = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; + rsrc_len = 0; + rsrc_alloc = 0; + } + else +#endif + { + da_size = (off_t)datafork->cf_blocks * (off_t)allocblksize; + rsrc_len = rsrcfork->cf_size; + rsrc_alloc = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; + } + + if (ATTR_FILE_DATAALLOCSIZE & attr) { + VATTR_RETURN(vap, va_data_alloc, da_size); + } + + if (ATTR_FILE_RSRCLENGTH & attr) { + VATTR_RETURN(vap, va_rsrc_length, rsrc_len); + } + + if (ATTR_FILE_RSRCALLOCSIZE & attr) { + VATTR_RETURN(vap, va_rsrc_alloc, rsrc_alloc); + } +} diff --git a/core/hfs_attrlist.h b/core/hfs_attrlist.h new file mode 100644 index 0000000..b9ceb09 --- /dev/null +++ b/core/hfs_attrlist.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2002-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _HFS_ATTRLIST_H_ +#define _HFS_ATTRLIST_H_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#include +#include + +#include "hfs_catalog.h" +#include "hfs_cnode.h" + + +struct attrblock { + struct attrlist * ab_attrlist; + void ** ab_attrbufpp; + void ** ab_varbufpp; + int ab_flags; + int ab_blocksize; + vfs_context_t ab_context; +}; + +/* + * The following define the attributes that HFS supports: + */ + +#define HFS_ATTR_CMN_VALID \ + (ATTR_CMN_NAME | ATTR_CMN_DEVID | \ + ATTR_CMN_FSID | ATTR_CMN_OBJTYPE | \ + ATTR_CMN_OBJTAG | ATTR_CMN_OBJID | \ + ATTR_CMN_OBJPERMANENTID | ATTR_CMN_PAROBJID | \ + ATTR_CMN_SCRIPT | ATTR_CMN_CRTIME | \ + ATTR_CMN_MODTIME | ATTR_CMN_CHGTIME | \ + ATTR_CMN_ACCTIME | ATTR_CMN_BKUPTIME | \ + ATTR_CMN_FNDRINFO |ATTR_CMN_OWNERID | \ + ATTR_CMN_GRPID | ATTR_CMN_ACCESSMASK | \ + ATTR_CMN_FLAGS | ATTR_CMN_USERACCESS | \ + ATTR_CMN_FILEID | ATTR_CMN_PARENTID ) + +#define HFS_ATTR_CMN_SEARCH_VALID \ + (ATTR_CMN_NAME | ATTR_CMN_OBJID | \ + ATTR_CMN_PAROBJID | ATTR_CMN_CRTIME | \ + ATTR_CMN_MODTIME | ATTR_CMN_CHGTIME | \ + ATTR_CMN_ACCTIME | ATTR_CMN_BKUPTIME | \ + ATTR_CMN_FNDRINFO | ATTR_CMN_OWNERID | \ + ATTR_CMN_GRPID | ATTR_CMN_ACCESSMASK | \ + ATTR_CMN_FILEID | ATTR_CMN_PARENTID ) + + + +#define HFS_ATTR_DIR_VALID \ + (ATTR_DIR_LINKCOUNT | ATTR_DIR_ENTRYCOUNT | ATTR_DIR_MOUNTSTATUS) + +#define HFS_ATTR_DIR_SEARCH_VALID \ + (ATTR_DIR_ENTRYCOUNT) + +#define HFS_ATTR_FILE_VALID \ + (ATTR_FILE_LINKCOUNT |ATTR_FILE_TOTALSIZE | \ + ATTR_FILE_ALLOCSIZE | ATTR_FILE_IOBLOCKSIZE | \ + ATTR_FILE_CLUMPSIZE | ATTR_FILE_DEVTYPE | \ + ATTR_FILE_DATALENGTH | ATTR_FILE_DATAALLOCSIZE | \ + ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE) + +#define HFS_ATTR_FILE_SEARCH_VALID \ + (ATTR_FILE_DATALENGTH | ATTR_FILE_DATAALLOCSIZE | \ + ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE ) + +extern int hfs_attrblksize(struct attrlist *attrlist); + +extern u_int32_t DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, + mode_t obj_mode, struct mount *mp, + kauth_cred_t cred, struct proc *p); + +extern void hfs_packattrblk(struct attrblock *abp, struct hfsmount *hfsmp, + struct vnode *vp, struct cat_desc *descp, struct cat_attr *attrp, + struct cat_fork *datafork, struct cat_fork *rsrcfork, struct vfs_context *ctx); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* ! _HFS_ATTRLIST_H_ */ diff --git a/core/hfs_btreeio.c b/core/hfs_btreeio.c new file mode 100644 index 0000000..ec2072e --- /dev/null +++ b/core/hfs_btreeio.c @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2000-2017 Apple Inc. 
All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + + +#include "hfs.h" +#include "hfs_cnode.h" +#include "hfs_dbg.h" +#include "hfs_endian.h" +#include "hfs_btreeio.h" + +#include "FileMgrInternal.h" +#include "BTreesPrivate.h" + +/* From bsd/vfs/vfs_bio.c */ +extern int bdwrite_internal(struct buf *, int); + +static int ClearBTNodes(struct vnode *vp, int blksize, off_t offset, off_t amount); +static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp); + +void btree_swap_node(struct buf *bp, __unused void *arg); + +/* + * Return btree node size for given vnode. + * + * Returns: + * For btree vnode, returns btree node size. + * For non-btree vnodes, returns 0. + */ +u_int16_t get_btree_nodesize(struct vnode *vp) +{ + BTreeControlBlockPtr btree; + u_int16_t node_size = 0; + + if (vnode_issystem(vp)) { + btree = (BTreeControlBlockPtr) VTOF(vp)->fcbBTCBPtr; + if (btree) { + node_size = btree->nodeSize; + } + } + + return node_size; +} + +OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemCount minBlockCount) +{ + BTreeControlBlockPtr bTreePtr; + + hfs_assert(vp != NULL); + hfs_assert(blockSize >= kMinNodeSize); + if (blockSize > MAXBSIZE ) + return (fsBTBadNodeSize); + + bTreePtr = (BTreeControlBlockPtr)VTOF(vp)->fcbBTCBPtr; + bTreePtr->nodeSize = blockSize; + + return (E_NONE); +} + + +OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions options, BlockDescriptor *block) +{ + OSStatus retval = E_NONE; + struct buf *bp = NULL; + u_int8_t allow_empty_node; + + /* If the btree block is being read using hint, it is + * fine for the swap code to find zeroed out nodes. 
+ */ + if (options & kGetBlockHint) { + allow_empty_node = true; + } else { + allow_empty_node = false; + } + + if (options & kGetEmptyBlock) { + daddr64_t blkno; + off_t offset; + + offset = (daddr64_t)blockNum * (daddr64_t)block->blockSize; + bp = buf_getblk(vp, (daddr64_t)blockNum, block->blockSize, 0, 0, BLK_META); + if (bp && !hfs_vnop_blockmap(&(struct vnop_blockmap_args){ + .a_vp = vp, + .a_foffset = offset, + .a_size = block->blockSize, + .a_bpn = &blkno + })) { + buf_setblkno(bp, blkno); + } + } else { + retval = buf_meta_bread(vp, (daddr64_t)blockNum, block->blockSize, NOCRED, &bp); + } + if (bp == NULL) + retval = -1; //XXX need better error + + if (retval == E_NONE) { + block->blockHeader = bp; + block->buffer = (char *)buf_dataptr(bp); + block->blockNum = buf_lblkno(bp); + block->blockReadFromDisk = (buf_fromcache(bp) == 0); /* not found in cache ==> came from disk */ + + // XXXdbg + block->isModified = 0; + + /* Check and endian swap B-Tree node (only if it's a valid block) */ + if (!(options & kGetEmptyBlock)) { + + /* This happens when we first open the b-tree, we might not have all the node data on hand */ + if ((((BTNodeDescriptor *)block->buffer)->kind == kBTHeaderNode) && + (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != buf_count(bp)) && + (SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != buf_count(bp))) { + + /* + * Don't swap the node descriptor, record offsets, or other records. + * This record will be invalidated and re-read with the correct node + * size once the B-tree control block is set up with the node size + * from the header record. + */ + retval = hfs_swap_BTNode (block, vp, kSwapBTNodeHeaderRecordOnly, allow_empty_node); + + } else { + /* + * In this case, we have enough data in-hand to do basic validation + * on the B-Tree node. + */ + if (block->blockReadFromDisk) { + /* + * The node was just read from disk, so always swap/check it. + * This is necessary on big endian since the test below won't trigger. + */ + retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node); + } + else { + /* + * Block wasn't read from disk; it was found in the cache. + */ + if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x0e00) { + /* + * The node was left in the cache in non-native order, so swap it. + * This only happens on little endian, after the node is written + * back to disk. + */ + retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node); + } + else if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x000e) { + /* + * The node was in-cache in native-endianness. We don't need to do + * anything here, because the node is ready to use. Set retval == 0. + */ + retval = 0; + } + /* + * If the node doesn't have hex 14 (0xe) in the last two bytes of the buffer, + * it doesn't necessarily mean that this is a bad node. Zeroed nodes that are + * marked as unused in the b-tree map node would be OK and not have valid content. + */ + } + } + + /* + * If we got an error, then the node is only partially swapped. + * We mark the buffer invalid so that the next attempt to get the + * node will read it and attempt to swap again, and will notice + * the error again. If we didn't do this, the next attempt to get + * the node might use the partially swapped node as-is. 
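
The cache-order test above hinges on the fact that the first record in any node starts right after the 14-byte node descriptor, so the last two bytes of the node (the first record offset) read 0x000E in native order and 0x0E00 when big-endian bytes are viewed on a little-endian host. Below is a condensed, stand-alone restatement of that test as an editorial sketch; it is not part of the patch and the names are invented.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

enum btnode_order { BT_NODE_EMPTY_OR_UNKNOWN, BT_NODE_HOST_ORDER, BT_NODE_BIG_ENDIAN };

/* Classify a cached node by its trailing record offset, as GetBTreeBlock() does. */
static enum btnode_order
btnode_cache_order(const void *node, size_t node_size)
{
    uint16_t first_record_offset;

    memcpy(&first_record_offset,
           (const uint8_t *)node + node_size - sizeof(first_record_offset),
           sizeof(first_record_offset));

    if (first_record_offset == 0x000E)   /* sizeof(BTNodeDescriptor), native order */
        return BT_NODE_HOST_ORDER;
    if (first_record_offset == 0x0E00)   /* big-endian bytes on a little-endian host */
        return BT_NODE_BIG_ENDIAN;
    return BT_NODE_EMPTY_OR_UNKNOWN;     /* e.g. a zeroed, unused map-node block */
}
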
+ */ + if (retval) + buf_markinvalid(bp); + } + } + + if (retval) { + if (bp) + buf_brelse(bp); + block->blockHeader = NULL; + block->buffer = NULL; + } + + return (retval); +} + + +void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr) +{ + struct hfsmount *hfsmp = VTOHFS(vp); + struct buf *bp = NULL; + + if (hfsmp->jnl == NULL) { + return; + } + + bp = (struct buf *) blockPtr->blockHeader; + if (bp == NULL) { + panic("hfs: ModifyBlockStart: null bp for blockdescptr %p?!?\n", blockPtr); + return; + } + + journal_modify_block_start(hfsmp->jnl, bp); + blockPtr->isModified = 1; +} + +void +btree_swap_node(struct buf *bp, __unused void *arg) +{ + // struct hfsmount *hfsmp = (struct hfsmount *)arg; + int retval; + struct vnode *vp = buf_vnode(bp); + BlockDescriptor block; + + /* Prepare the block pointer */ + block.blockHeader = bp; + block.buffer = (char *)buf_dataptr(bp); + block.blockNum = buf_lblkno(bp); + /* not found in cache ==> came from disk */ + block.blockReadFromDisk = (buf_fromcache(bp) == 0); + block.blockSize = buf_count(bp); + + /* Swap the data now that this node is ready to go to disk. + * We allow swapping of zeroed out nodes here because we might + * be writing node whose last record just got deleted. + */ + retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, true); + if (retval) + panic("hfs: btree_swap_node: about to write corrupt node!\n"); +} + + +static int +btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp) +{ + return journal_modify_block_end(hfsmp->jnl, bp, btree_swap_node, hfsmp); +} + + +OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options) +{ + struct hfsmount *hfsmp = VTOHFS(vp); + OSStatus retval = E_NONE; + struct buf *bp = NULL; + + bp = (struct buf *) blockPtr->blockHeader; + + if (bp == NULL) { + retval = -1; + goto exit; + } + + if (options & kTrashBlock) { + buf_markinvalid(bp); + + if (hfsmp->jnl && (buf_flags(bp) & B_LOCKED)) { + journal_kill_block(hfsmp->jnl, bp); + } else { + buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + } + + /* Don't let anyone else try to use this bp, it's been consumed */ + blockPtr->blockHeader = NULL; + + } else { + if (options & kForceWriteBlock) { + if (hfsmp->jnl) { + if (blockPtr->isModified == 0) { + panic("hfs: releaseblock: modified is 0 but forcewrite set! bp %p\n", bp); + } + + retval = btree_journal_modify_block_end(hfsmp, bp); + blockPtr->isModified = 0; + } else { + retval = VNOP_BWRITE(bp); + } + + /* Don't let anyone else try to use this bp, it's been consumed */ + blockPtr->blockHeader = NULL; + + } else if (options & kMarkBlockDirty) { + struct timeval tv; + microuptime(&tv); + if ((options & kLockTransaction) && hfsmp->jnl == NULL) { + /* + * + * Set the B_LOCKED flag and unlock the buffer, causing buf_brelse to move + * the buffer onto the LOCKED free list. This is necessary, otherwise + * getnewbuf() would try to reclaim the buffers using buf_bawrite, which + * isn't going to work. + * + */ + /* Don't hog all the buffers... */ + if (count_lock_queue() > kMaxLockedMetaBuffers) { + hfs_btsync(vp, HFS_SYNCTRANS); + /* Rollback sync time to cause a sync on lock release... */ + (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1)); + } + buf_setflags(bp, B_LOCKED); + } + + /* + * Delay-write this block. + * If the maximum delayed buffers has been exceeded then + * free up some buffers and fall back to an asynchronous write. 
+ */ + if (hfsmp->jnl) { + if (blockPtr->isModified == 0) { + panic("hfs: releaseblock: modified is 0 but markdirty set! bp %p\n", bp); + } + retval = btree_journal_modify_block_end(hfsmp, bp); + blockPtr->isModified = 0; + } else if (bdwrite_internal(bp, 1) != 0) { + hfs_btsync(vp, 0); + /* Rollback sync time to cause a sync on lock release... */ + (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1)); + + buf_clearflags(bp, B_LOCKED); + buf_bawrite(bp); + } + + /* Don't let anyone else try to use this bp, it's been consumed */ + blockPtr->blockHeader = NULL; + + } else { + // check if we had previously called journal_modify_block_start() + // on this block and if so, abort it (which will call buf_brelse()). + if (hfsmp->jnl && blockPtr->isModified) { + // XXXdbg - I don't want to call modify_block_abort() + // because I think it may be screwing up the + // journal and blowing away a block that has + // valid data in it. + // + // journal_modify_block_abort(hfsmp->jnl, bp); + //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp); + btree_journal_modify_block_end(hfsmp, bp); + blockPtr->isModified = 0; + } else { + buf_brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + } + + /* Don't let anyone else try to use this bp, it's been consumed */ + blockPtr->blockHeader = NULL; + } + } + +exit: + return (retval); +} + + +OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) +{ +#pragma unused (maxEOF) + + OSStatus retval = 0, ret = 0; + int64_t actualBytesAdded, origSize; + u_int64_t bytesToAdd; + u_int32_t startAllocation; + u_int32_t fileblocks; + BTreeInfoRec btInfo; + ExtendedVCB *vcb; + FCB *filePtr; + struct proc *p = NULL; + int64_t trim = 0; + int lockflags = 0; + + filePtr = GetFileControlBlock(vp); + + if ( (off_t)minEOF > filePtr->fcbEOF ) + { + bytesToAdd = minEOF - filePtr->fcbEOF; + + if (bytesToAdd < filePtr->ff_clumpsize) + bytesToAdd = filePtr->ff_clumpsize; //XXX why not always be a mutiple of clump size? + } + else + { + return -1; + } + + vcb = VTOVCB(vp); + + /* + * The Extents B-tree can't have overflow extents. ExtendFileC will + * return an error if an attempt is made to extend the Extents B-tree + * when the resident extents are exhausted. + */ + + /* Protect allocation bitmap and extents overflow file. */ + lockflags = SFL_BITMAP; + if (VTOC(vp)->c_fileid != kHFSExtentsFileID) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(vcb, lockflags, HFS_EXCLUSIVE_LOCK); + + (void) BTGetInformation(filePtr, 0, &btInfo); + +#if 0 // XXXdbg + /* + * The b-tree code expects nodes to be contiguous. So when + * the allocation block size is less than the b-tree node + * size, we need to force disk allocations to be contiguous. + */ + if (vcb->blockSize >= btInfo.nodeSize) { + extendFlags = 0; + } else { + /* Ensure that all b-tree nodes are contiguous on disk */ + extendFlags = kEFContigMask; + } +#endif + + origSize = filePtr->fcbEOF; + fileblocks = filePtr->ff_blocks; + startAllocation = vcb->nextAllocation; + + // loop trying to get a contiguous chunk that's an integer multiple + // of the btree node size. if we can't get a contiguous chunk that + // is at least the node size then we break out of the loop and let + // the error propagate back up. 
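
The comment above describes the retry policy of the allocation loop that follows: on dskFulErr the request is halved, then rounded down so it stays an integer multiple of the node size, and the loop gives up once even a single node no longer fits. A condensed editorial sketch of that step (the helper name is invented and is not part of the patch):

#include <stdint.h>

/* Compute the next, smaller extension request after a disk-full failure. */
static uint64_t
next_extend_request(uint64_t bytes_to_add, uint32_t node_size)
{
    bytes_to_add >>= 1;                         /* halve the request */
    if (bytes_to_add < node_size)
        return 0;                               /* too small: caller gives up */
    bytes_to_add -= bytes_to_add % node_size;   /* keep it a node-size multiple */
    return bytes_to_add;
}
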
+ while((off_t)bytesToAdd >= btInfo.nodeSize) { + do { + retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, + kEFContigMask | kEFMetadataMask | kEFNoClumpMask, + (int64_t *)&actualBytesAdded); + if (retval == dskFulErr && actualBytesAdded == 0) { + bytesToAdd >>= 1; + if (bytesToAdd < btInfo.nodeSize) { + break; + } else if ((bytesToAdd % btInfo.nodeSize) != 0) { + // make sure it's an integer multiple of the nodeSize + bytesToAdd -= (bytesToAdd % btInfo.nodeSize); + } + } + } while (retval == dskFulErr && actualBytesAdded == 0); + + if (retval == dskFulErr && actualBytesAdded == 0 && bytesToAdd <= btInfo.nodeSize) { + break; + } + + filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; + bytesToAdd = minEOF - filePtr->fcbEOF; + } + + /* + * If a new extent was added then move the roving allocator + * reference forward by the current b-tree file size so + * there's plenty of room to grow. + */ + if ((retval == 0) && + ((VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) == 0) && + (vcb->nextAllocation > startAllocation) && + ((vcb->nextAllocation + fileblocks) < vcb->allocLimit)) { + HFS_UPDATE_NEXT_ALLOCATION(vcb, vcb->nextAllocation + fileblocks); + } + + filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; + + // XXXdbg ExtendFileC() could have returned an error even though + // it grew the file to be big enough for our needs. If this is + // the case, we don't care about retval so we blow it away. + // + if (filePtr->fcbEOF >= (off_t)minEOF && retval != 0) { + retval = 0; + } + + // XXXdbg if the file grew but isn't large enough or isn't an + // even multiple of the nodeSize then trim things back. if + // the file isn't large enough we trim back to the original + // size. otherwise we trim back to be an even multiple of the + // btree node size. + // + if ((filePtr->fcbEOF < (off_t)minEOF) || ((filePtr->fcbEOF - origSize) % btInfo.nodeSize) != 0) { + + if (filePtr->fcbEOF < (off_t)minEOF) { + retval = dskFulErr; + + if (filePtr->fcbEOF < origSize) { + panic("hfs: btree file eof %lld less than orig size %lld!\n", + filePtr->fcbEOF, origSize); + } + + trim = filePtr->fcbEOF - origSize; + } else { + trim = ((filePtr->fcbEOF - origSize) % btInfo.nodeSize); + } + + ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0, 0, FTOC(filePtr)->c_fileid, 0); + filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; + + // XXXdbg - panic if the file didn't get trimmed back properly + if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) { + panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb %p\n", + filePtr->fcbEOF, btInfo.nodeSize, filePtr); + } + + if (ret) { + // XXXdbg - this probably doesn't need to be a panic() + panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %ld)\n", + filePtr->fcbEOF, trim, (long)ret); + goto out; + } + } + + if(VTOC(vp)->c_fileid != kHFSExtentsFileID) { + /* + * Get any extents overflow b-tree changes to disk ASAP! 
+ */ + (void) BTFlushPath(VTOF(vcb->extentsRefNum)); + (void) hfs_fsync(vcb->extentsRefNum, MNT_WAIT, 0, p); + } + hfs_systemfile_unlock(vcb, lockflags); + lockflags = 0; + + if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) { + panic("hfs: extendbtree: fcb %p has eof 0x%llx not a multiple of 0x%x (trim %llx)\n", + filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim); + } + + /* + * Update the Alternate MDB or Alternate VolumeHeader + */ + VTOC(vp)->c_flag |= C_MODIFIED; + if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) || + (VTOC(vp)->c_fileid == kHFSCatalogFileID) || + (VTOC(vp)->c_fileid == kHFSAttributesFileID) + ) { + MarkVCBDirty( vcb ); + ret = hfs_flushvolumeheader(VCBTOHFS(vcb), HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + } else { + VTOC(vp)->c_touch_chgtime = TRUE; + VTOC(vp)->c_touch_modtime = TRUE; + (void) hfs_update(vp, 0); + } + + ret = ClearBTNodes(vp, btInfo.nodeSize, origSize, (filePtr->fcbEOF - origSize)); +out: + if (retval == 0) + retval = ret; + + if (lockflags) + hfs_systemfile_unlock(vcb, lockflags); + + return retval; +} + + +/* + * Clear out (zero) new b-tree nodes on disk. + */ +static int +ClearBTNodes(struct vnode *vp, int blksize, off_t offset, off_t amount) +{ + struct hfsmount *hfsmp = VTOHFS(vp); + struct buf *bp = NULL; + daddr64_t blk; + daddr64_t blkcnt; + + blk = offset / blksize; + blkcnt = amount / blksize; + + while (blkcnt > 0) { + bp = buf_getblk(vp, blk, blksize, 0, 0, BLK_META); + if (bp == NULL) + continue; + + // XXXdbg + if (hfsmp->jnl) { + // XXXdbg -- skipping this for now since it makes a transaction + // become *way* too large + //journal_modify_block_start(hfsmp->jnl, bp); + } + bzero((char *)buf_dataptr(bp), blksize); + + buf_markaged(bp); + + // XXXdbg + if (hfsmp->jnl) { + // XXXdbg -- skipping this for now since it makes a transaction + // become *way* too large + //journal_modify_block_end(hfsmp->jnl, bp); + + // XXXdbg - remove this once we decide what to do with the + // writes to the journal + if ((blk % 32) == 0) + VNOP_BWRITE(bp); + else + buf_bawrite(bp); + } else { + /* wait/yield every 32 blocks so we don't hog all the buffers */ + if ((blk % 32) == 0) + VNOP_BWRITE(bp); + else + buf_bawrite(bp); + } + --blkcnt; + ++blk; + } + + return (0); +} + + +extern char hfs_attrname[]; + +/* + * Create an HFS+ Attribute B-tree File. + * + * No global resources should be held. + */ +int +hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t nodecnt) +{ + struct vnode* vp = NULLVP; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct cat_fork cfork; + BlockDescriptor blkdesc; + BTNodeDescriptor *ndp; + BTHeaderRec *bthp; + BTreeControlBlockPtr btcb = NULL; + struct buf *bp = NULL; + void * buffer; + u_int8_t *bitmap; + u_int16_t *index; + u_int32_t node_num, num_map_nodes; + u_int32_t bytes_per_map_record; + u_int32_t temp; + u_int16_t offset; + int intrans = 0; + int result; + int newvnode_flags = 0; + +again: + /* + * Serialize creation using HFS_CREATING_BTREE flag. + */ + hfs_lock_mount (hfsmp); + if (hfsmp->hfs_flags & HFS_CREATING_BTREE) { + /* Someone else beat us, wait for them to finish. */ + (void) msleep(&hfsmp->hfs_attribute_cp, &hfsmp->hfs_mutex, + PDROP | PINOD, "hfs_create_attr_btree", 0); + if (hfsmp->hfs_attribute_vp) { + return (0); + } + goto again; + } + hfsmp->hfs_flags |= HFS_CREATING_BTREE; + hfs_unlock_mount (hfsmp); + + /* Check if were out of usable disk space. 
*/ + if ((hfs_freeblks(hfsmp, 1) == 0)) { + result = ENOSPC; + goto exit; + } + + /* + * Set up Attribute B-tree vnode + * (this must be done before we start a transaction + * or take any system file locks) + */ + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_parentcnid = kHFSRootParentID; + cndesc.cd_flags |= CD_ISMETA; + cndesc.cd_nameptr = (const u_int8_t *)hfs_attrname; + cndesc.cd_namelen = strlen(hfs_attrname); + cndesc.cd_cnid = kHFSAttributesFileID; + + bzero(&cnattr, sizeof(cnattr)); + cnattr.ca_linkcount = 1; + cnattr.ca_mode = S_IFREG; + cnattr.ca_fileid = cndesc.cd_cnid; + + bzero(&cfork, sizeof(cfork)); + cfork.cf_clump = nodesize * nodecnt; + + result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, + &cfork, &vp, &newvnode_flags); + if (result) { + goto exit; + } + /* + * Set up Attribute B-tree control block + */ + btcb = hfs_mallocz(sizeof(*btcb)); + + btcb->nodeSize = nodesize; + btcb->maxKeyLength = kHFSPlusAttrKeyMaximumLength; + btcb->btreeType = 0xFF; + btcb->attributes = kBTVariableIndexKeysMask | kBTBigKeysMask; + btcb->version = kBTreeVersion; + btcb->writeCount = 1; + btcb->flags = 0; /* kBTHeaderDirty */ + btcb->fileRefNum = vp; + btcb->getBlockProc = GetBTreeBlock; + btcb->releaseBlockProc = ReleaseBTreeBlock; + btcb->setEndOfForkProc = ExtendBTreeFile; + btcb->keyCompareProc = (KeyCompareProcPtr)hfs_attrkeycompare; + + /* + * NOTE: We must make sure to zero out this pointer if we error out in this function! + * If we don't, then unmount will treat it as a valid pointer which can lead to a + * use-after-free + */ + VTOF(vp)->fcbBTCBPtr = btcb; + + /* + * Allocate some space + */ + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit; + } + intrans = 1; + + /* Note ExtendBTreeFile will acquire the necessary system file locks. */ + result = ExtendBTreeFile(vp, nodesize, cfork.cf_clump); + if (result) + goto exit; + + btcb->totalNodes = VTOF(vp)->ff_size / nodesize; + + /* + * Figure out how many map nodes we'll need. + * + * bytes_per_map_record = the number of bytes in the map record of a + * map node. Since that is the only record in the node, it is the size + * of the node minus the node descriptor at the start, and two record + * offsets at the end of the node. The "- 2" is to round the size down + * to a multiple of 4 bytes (since sizeof(BTNodeDescriptor) is not a + * multiple of 4). + * + * The value "temp" here is the number of *bits* in the map record of + * the header node. 
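
As a worked check of the sizing arithmetic below (editorial; it assumes the usual on-disk sizes sizeof(BTNodeDescriptor) == 14, sizeof(BTHeaderRec) == 106 and kBTreeHeaderUserBytes == 128), take an 8192-byte node size:

    bytes_per_map_record = 8192 - 14 - 2*2 - 2             = 8172 bytes
    header map record    = 8 * (8192 - 14 - 106 - 128 - 8) = 63488 bits

so the header node's own map record covers 63,488 nodes, and a tree created with 100,000 total nodes would need howmany(100000 - 63488, 8172 * 8) = 1 additional map node.
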
+ */ + bytes_per_map_record = nodesize - sizeof(BTNodeDescriptor) - 2*sizeof(u_int16_t) - 2; + temp = 8 * (nodesize - sizeof(BTNodeDescriptor) + - sizeof(BTHeaderRec) + - kBTreeHeaderUserBytes + - 4 * sizeof(u_int16_t)); + if (btcb->totalNodes > temp) { + num_map_nodes = howmany(btcb->totalNodes - temp, bytes_per_map_record * 8); + } + else { + num_map_nodes = 0; + } + + btcb->freeNodes = btcb->totalNodes - 1 - num_map_nodes; + + /* + * Initialize the b-tree header on disk + */ + bp = buf_getblk(vp, 0, nodesize, 0, 0, BLK_META); + if (bp == NULL) { + result = EIO; + goto exit; + } + + buffer = (void *)buf_dataptr(bp); + blkdesc.buffer = buffer; + blkdesc.blockHeader = (void *)bp; + blkdesc.blockReadFromDisk = 0; + blkdesc.isModified = 0; + + ModifyBlockStart(vp, &blkdesc); + + if (buf_size(bp) != nodesize) + panic("hfs_create_attr_btree: bad buffer size (%d)\n", buf_size(bp)); + + bzero(buffer, nodesize); + index = (u_int16_t *)buffer; + + /* FILL IN THE NODE DESCRIPTOR: */ + ndp = (BTNodeDescriptor *)buffer; + if (num_map_nodes != 0) + ndp->fLink = 1; + ndp->kind = kBTHeaderNode; + ndp->numRecords = 3; + offset = sizeof(BTNodeDescriptor); + index[(nodesize / 2) - 1] = offset; + + /* FILL IN THE HEADER RECORD: */ + bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset); + bthp->nodeSize = nodesize; + bthp->totalNodes = btcb->totalNodes; + bthp->freeNodes = btcb->freeNodes; + bthp->clumpSize = cfork.cf_clump; + bthp->btreeType = 0xFF; + bthp->attributes = kBTVariableIndexKeysMask | kBTBigKeysMask; + bthp->maxKeyLength = kHFSPlusAttrKeyMaximumLength; + bthp->keyCompareType = kHFSBinaryCompare; + offset += sizeof(BTHeaderRec); + index[(nodesize / 2) - 2] = offset; + + /* FILL IN THE USER RECORD: */ + offset += kBTreeHeaderUserBytes; + index[(nodesize / 2) - 3] = offset; + + /* Mark the header node and map nodes in use in the map record. + * + * NOTE: Assumes that the header node's map record has at least + * (num_map_nodes + 1) bits. + */ + bitmap = (u_int8_t *) buffer + offset; + temp = num_map_nodes + 1; /* +1 for the header node */ + while (temp >= 8) { + *(bitmap++) = 0xFF; + temp -= 8; + } + *bitmap = ~(0xFF >> temp); + + offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec) + - kBTreeHeaderUserBytes - (4 * sizeof(int16_t)); + index[(nodesize / 2) - 4] = offset; + + if (hfsmp->jnl) { + result = btree_journal_modify_block_end(hfsmp, bp); + } else { + result = VNOP_BWRITE(bp); + } + if (result) + goto exit; + + /* Create the map nodes: node numbers 1 .. 
num_map_nodes */ + for (node_num=1; node_num <= num_map_nodes; ++node_num) { + bp = buf_getblk(vp, node_num, nodesize, 0, 0, BLK_META); + if (bp == NULL) { + result = EIO; + goto exit; + } + buffer = (void *)buf_dataptr(bp); + blkdesc.buffer = buffer; + blkdesc.blockHeader = (void *)bp; + blkdesc.blockReadFromDisk = 0; + blkdesc.isModified = 0; + + ModifyBlockStart(vp, &blkdesc); + + bzero(buffer, nodesize); + index = (u_int16_t *)buffer; + + /* Fill in the node descriptor */ + ndp = (BTNodeDescriptor *)buffer; + if (node_num != num_map_nodes) + ndp->fLink = node_num + 1; + ndp->kind = kBTMapNode; + ndp->numRecords = 1; + offset = sizeof(BTNodeDescriptor); + index[(nodesize / 2) - 1] = offset; + + + /* Fill in the map record's offset */ + /* Note: We assume that the map record is all zeroes */ + offset = sizeof(BTNodeDescriptor) + bytes_per_map_record; + index[(nodesize / 2) - 2] = offset; + + if (hfsmp->jnl) { + result = btree_journal_modify_block_end(hfsmp, bp); + } else { + result = VNOP_BWRITE(bp); + } + if (result) + goto exit; + } + + /* Update vp/cp for attribute btree */ + hfs_lock_mount (hfsmp); + hfsmp->hfs_attribute_cp = VTOC(vp); + hfsmp->hfs_attribute_vp = vp; + hfs_unlock_mount (hfsmp); + + (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + + if (intrans) { + hfs_end_transaction(hfsmp); + intrans = 0; + } + + /* Initialize the vnode for virtual attribute data file */ + result = init_attrdata_vnode(hfsmp); + if (result) { + printf("hfs_create_attr_btree: vol=%s init_attrdata_vnode() error=%d\n", hfsmp->vcbVN, result); + } + +exit: + + if (vp && result) { + /* + * If we're about to error out, then make sure to zero out the B-Tree control block pointer + * from the filefork of the EA B-Tree cnode/vnode. Failing to do this will lead to a use + * after free at unmount or BTFlushPath. Since we're about to error out anyway, this memory + * will be freed. + */ + VTOF(vp)->fcbBTCBPtr = NULL; + } + + + if (vp) { + hfs_unlock(VTOC(vp)); + } + if (result) { + hfs_free(btcb, sizeof(*btcb)); + if (vp) { + vnode_put(vp); + } + /* XXX need to give back blocks ? */ + } + if (intrans) { + hfs_end_transaction(hfsmp); + } + + /* + * All done, clear HFS_CREATING_BTREE, and wake up any sleepers. + */ + hfs_lock_mount (hfsmp); + hfsmp->hfs_flags &= ~HFS_CREATING_BTREE; + wakeup((caddr_t)&hfsmp->hfs_attribute_cp); + hfs_unlock_mount (hfsmp); + + return (result); +} + diff --git a/core/hfs_btreeio.h b/core/hfs_btreeio.h new file mode 100644 index 0000000..740734d --- /dev/null +++ b/core/hfs_btreeio.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2005-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _HFS_BTREEIO_H_ +#define _HFS_BTREEIO_H_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + +#include "hfs.h" +#include "BTreesInternal.h" + +/* BTree accessor routines */ +extern OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, + ItemCount minBlockCount); + +extern OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, + GetBlockOptions options, BlockDescriptor *block); + +extern OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, + ReleaseBlockOptions options); + +extern OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF); + +extern void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr); + +int hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t nodecnt); + +u_int16_t get_btree_nodesize(struct vnode *vp); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* ! _HFS_BTREEIO_H_ */ diff --git a/core/hfs_catalog.c b/core/hfs_catalog.c new file mode 100644 index 0000000..d48a106 --- /dev/null +++ b/core/hfs_catalog.c @@ -0,0 +1,4813 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_format.h" +#include "hfs_endian.h" + +#include "BTreesInternal.h" +#include "BTreesPrivate.h" +#include "HFSUnicodeWrappers.h" + + +/* + * Initialization of an FSBufferDescriptor structure. 
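
The BDINIT() macro defined just below is the standard way this file hands a record buffer to the B-tree layer. A minimal usage sketch follows (editorial, not part of the patch; it assumes the hfs headers already included in this file, and the search call is abbreviated):

CatalogRecord rec;
FSBufferDescriptor btdata;
u_int16_t reclen;

BDINIT(btdata, &rec);   /* bufferAddress = &rec, itemSize = sizeof(rec), itemCount = 1 */
/* result = BTSearchRecord(fcb, iterator, &btdata, &reclen, iterator); */
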
+ */ +#define BDINIT(bd, addr) { \ + (bd).bufferAddress = (addr); \ + (bd).itemSize = sizeof(*(addr)); \ + (bd).itemCount = 1; \ +} + + +struct btobj { + BTreeIterator iterator; + HFSPlusCatalogKey key; + CatalogRecord data; +}; + +struct update_state { + struct cat_desc * s_desc; + struct cat_attr * s_attr; + const struct cat_fork * s_datafork; + const struct cat_fork * s_rsrcfork; + struct hfsmount * s_hfsmp; +}; + +struct position_state { + int error; + u_int32_t count; + u_int32_t index; + u_int32_t parentID; + struct hfsmount *hfsmp; +}; + +/* Map file mode type to directory entry types */ +u_char modetodirtype[16] = { + DT_REG, DT_FIFO, DT_CHR, DT_UNKNOWN, + DT_DIR, DT_UNKNOWN, DT_BLK, DT_UNKNOWN, + DT_REG, DT_UNKNOWN, DT_LNK, DT_UNKNOWN, + DT_SOCK, DT_UNKNOWN, DT_WHT, DT_UNKNOWN +}; +#define MODE_TO_DT(mode) (modetodirtype[((mode) & S_IFMT) >> 12]) + + +#define HFS_LOOKUP_SYSFILE 0x1 /* If set, allow lookup of system files */ +#define HFS_LOOKUP_HARDLINK 0x2 /* If set, allow lookup of hard link records and not resolve the hard links */ +#define HFS_LOOKUP_CASESENSITIVE 0x4 /* If set, verify results of a file/directory record match input case */ +static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, + struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid); + +int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, + struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp); + +/* Internal catalog support routines */ + +static int cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, + struct position_state *state); + +static int resolvelinkid(struct hfsmount *hfsmp, u_int32_t linkref, ino_t *ino); + +static int getkey(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key); + +static int buildkey(struct hfsmount *hfsmp, struct cat_desc *descp, + HFSPlusCatalogKey *key, int retry); + +static void buildthreadkey(HFSCatalogNodeID parentID, int std_hfs, CatalogKey *key); + +static void buildrecord(struct cat_attr *attrp, cnid_t cnid, int std_hfs, u_int32_t encoding, CatalogRecord *crp, u_int32_t *recordSize); + +static int catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *state); + +static int builddesc(const HFSPlusCatalogKey *key, cnid_t cnid, u_int32_t hint, u_int32_t encoding, + int isdir, struct cat_desc *descp); + +static void getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct cat_attr * attrp); + +#if CONFIG_HFS_STD +static void promotekey(struct hfsmount *hfsmp, const HFSCatalogKey *hfskey, HFSPlusCatalogKey *keyp, u_int32_t *encoding); +static void promotefork(struct hfsmount *hfsmp, const struct HFSCatalogFile *file, int resource, struct cat_fork * forkp); +static void promoteattr(struct hfsmount *hfsmp, const CatalogRecord *dataPtr, struct HFSPlusCatalogFile *crp); +#endif + +static cnid_t getcnid(const CatalogRecord *crp); +static u_int32_t getencoding(const CatalogRecord *crp); +static cnid_t getparentcnid(const CatalogRecord *recp); + +static int isadir(const CatalogRecord *crp); + +static int buildthread(void *keyp, void *recp, int std_hfs, int directory); + +static int cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalogFile *crp); + +static int cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, + const struct cat_fork *dataforkp, const struct cat_fork *rsrcforkp); + + + +/* HFS ID 
Hashtable Functions */ +#define IDHASH(hfsmp, inum) (&hfsmp->hfs_idhashtbl[(inum) & hfsmp->hfs_idhash]) + +/* Initialize the HFS ID hash table */ +void +hfs_idhash_init (struct hfsmount *hfsmp) { + /* secured by catalog lock so no lock init needed */ + hfsmp->hfs_idhashtbl = hashinit(HFS_IDHASH_DEFAULT, M_TEMP, &hfsmp->hfs_idhash); +} + +/* Free the HFS ID hash table */ +void +hfs_idhash_destroy (struct hfsmount *hfsmp) { + /* during failed mounts & unmounts */ + FREE(hfsmp->hfs_idhashtbl, M_TEMP); +} + +/* +from hfs_catalog.h: +typedef struct cat_preflightid { + cnid_t fileid; + LIST_ENTRY(cat_preflightid) id_hash; +} cat_preflightid_t; + +from hfs.h: + u_long hfs_idhash; / size of cnid/fileid hash table -1 / + LIST_HEAD(idhashhead, cat_preflightid) *hfs_idhashtbl; / base of ID hash / +*/ + +/* + * Check the run-time ID hashtable. + * + * The catalog lock must be held (like other functions in this file). + * + * Returns: + * 1 if the ID is in the hash table. + * 0 if the ID is not in the hash table + */ +int cat_check_idhash (struct hfsmount *hfsmp, cnid_t test_fileid) { + + cat_preflightid_t *preflight; + int found = 0; + + for (preflight = IDHASH(hfsmp, test_fileid)->lh_first; preflight ; preflight = preflight->id_hash.le_next) { + if (preflight->fileid == test_fileid) { + found = 1; + break; + } + } + + return found; +} + +/* Insert the supplied preflight into the ID hash table */ +int cat_insert_idhash (struct hfsmount *hfsmp, cat_preflightid_t *preflight) { + + if (preflight) { + LIST_INSERT_HEAD(IDHASH(hfsmp, (preflight->fileid)), preflight, id_hash); + return 0; + } + return -1; +} + + +/* Remove the data structure with the specified ID from the hashtable */ +int cat_remove_idhash (cat_preflightid_t *preflight) { + + if ((preflight) && ((preflight->id_hash.le_next || preflight->id_hash.le_prev))) { + LIST_REMOVE (preflight, id_hash); + preflight->id_hash.le_next = NULL; + preflight->id_hash.le_prev = NULL; + + return 0; + } + + return -1; +} + +/* + * Acquire a new CNID for use. + * + * This is slightly more complicated than just pulling the value from the + * hfsmount data structure. We need to validate that the ID is not in-use + * even if we've not wrapped around and that there are not any lingering + * or orphaned fileIDs for this ID. + * + * Also validate that there are not any pending insertions into the + * catalog by checking the ID hash table. + */ +int +cat_acquire_cnid (struct hfsmount *hfsmp, cnid_t *new_cnid) +{ + uint32_t nextCNID; + struct BTreeIterator *iterator; + FSBufferDescriptor btdata; + uint16_t datasize; + CatalogRecord *recp; + int result = 0; + int std_hfs; + int wrapped = 0; + + std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); + /* + * Get the next CNID. We can change it since we hold the catalog lock. + */ +nextid: + nextCNID = hfsmp->vcbNxtCNID; + if (nextCNID == 0xFFFFFFFF) { + if (std_hfs) { + return (ENOSPC); + } else { + wrapped++; + if (wrapped > 1) { + /* don't allow more than one wrap-around */ + return ENOSPC; + } + hfs_lock_mount (hfsmp); + hfsmp->vcbNxtCNID = kHFSFirstUserCatalogNodeID; + hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; + hfs_unlock_mount (hfsmp); + } + } else { + hfsmp->vcbNxtCNID++; + } + hfs_note_header_minor_change(hfsmp); + + /* First check that there are not any entries pending in the hash table with this ID */ + if (cat_check_idhash (hfsmp, nextCNID)) { + /* Someone wants to insert this into the catalog but hasn't done so yet. 
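
Taken together, the hash helpers above and cat_acquire_cnid() here implement a reservation protocol: acquire an ID that is provably unused, publish it in the ID hash so concurrent creators skip it, then retire the entry once the catalog record has actually been inserted. A condensed usage sketch (editorial, not part of the patch; error handling is elided, the function name is invented, the catalog lock is assumed held, and the types come from the hfs headers this file already includes):

static int
reserve_and_publish_cnid(struct hfsmount *hfsmp, cat_preflightid_t *preflight)
{
    cnid_t new_cnid;
    int error;

    error = cat_acquire_cnid(hfsmp, &new_cnid);   /* vend an ID with no thread record,
                                                     orphaned EAs, or stale cnode */
    if (error)
        return error;

    preflight->fileid = new_cnid;
    (void) cat_insert_idhash(hfsmp, preflight);   /* make the pending ID visible */

    /* ... caller inserts the catalog record under the same lock ... */

    (void) cat_remove_idhash(preflight);          /* retire the reservation */
    return 0;
}
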
Skip it */ + goto nextid; + } + + /* Check to see if a thread record exists for the target ID we just got */ + iterator = hfs_mallocz(sizeof(*iterator)); + buildthreadkey(nextCNID, std_hfs, (CatalogKey *)&iterator->key); + + recp = hfs_malloc(sizeof(CatalogRecord)); + BDINIT(btdata, recp); + + result = BTSearchRecord(hfsmp->hfs_catalog_cp->c_datafork, iterator, &btdata, &datasize, iterator); + hfs_free(recp, sizeof(CatalogRecord)); + hfs_free(iterator, sizeof(*iterator)); + + if (result == btNotFound) { + /* Good. File ID was not in use. Move on to checking EA B-Tree */ + result = file_attribute_exist (hfsmp, nextCNID); + if (result == EEXIST) { + /* This CNID has orphaned EAs. Skip it and move on to the next one */ + result = 0; + goto nextid; + } + if (result) { + /* For any other error, return the result */ + return result; + } + + /* + * Now validate that there are no lingering cnodes with this ID. If a cnode + * has been removed on-disk (marked C_NOEXISTS), but has not yet been reclaimed, + * then it will still have an entry in the cnode hash table. This means that + * a subsequent lookup will find THAT entry and believe this one has been deleted + * prematurely. If there is a lingering cnode, then just skip this entry and move on. + * + * Note that we pass (existence_only == 1) argument to hfs_chash_snoop. + */ + if (!std_hfs && (hfsmp->vcbAtrb & kHFSCatalogNodeIDsReusedMask)) { + if (hfs_chash_snoop (hfsmp, nextCNID, 1, NULL, NULL) == 0) { + goto nextid; + } + } + + /* + * If we get here, then we didn't see any thread records, orphaned EAs, + * or stale cnodes. This ID is safe to vend out. + */ + *new_cnid = nextCNID; + } + else if (result == noErr) { + /* move on to the next ID */ + goto nextid; + } + else { + /* For any other situation, just bail out */ + return EIO; + } + + return 0; + +} + +int +cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, __unused proc_t p) +{ + int lockflags = 0; + int result; + + if (hfsmp->hfs_catalog_cp->c_lockowner != current_thread()) + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + result = BTReserveSpace(hfsmp->hfs_catalog_cp->c_datafork, ops, (void*)cookie); + + if (lockflags) + hfs_systemfile_unlock(hfsmp, lockflags); + + return MacToVFSError(result); +} + +void +cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie, __unused proc_t p) +{ + int lockflags = 0; + + if (hfsmp->hfs_catalog_cp->c_lockowner != current_thread()) + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + (void) BTReleaseReserve(hfsmp->hfs_catalog_cp->c_datafork, (void*)cookie); + + if (lockflags) + hfs_systemfile_unlock(hfsmp, lockflags); +} + +void +cat_convertattr( + struct hfsmount *hfsmp, + CatalogRecord * recp, + struct cat_attr *attrp, + struct cat_fork *datafp, + struct cat_fork *rsrcfp) +{ + int std_hfs = HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord; + + if (std_hfs == 0) { + getbsdattr(hfsmp, (struct HFSPlusCatalogFile *)recp, attrp); + } +#if CONFIG_HFS_STD + else { + struct HFSPlusCatalogFile cnoderec; + + promoteattr(hfsmp, recp, &cnoderec); + getbsdattr(hfsmp, &cnoderec, attrp); + } +#endif + + if (isadir(recp)) { + bzero(datafp, sizeof(*datafp)); + } +#if CONFIG_HFS_STD + else if (std_hfs) { + promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, 0, datafp); + promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, 1, rsrcfp); + } +#endif + else { + /* Convert the data fork. 
*/ + datafp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; + datafp->cf_new_size = 0; + datafp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (attrp->ca_atime >= hfsmp->hfc_timebase)) { + datafp->cf_bytesread = + recp->hfsPlusFile.dataFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + datafp->cf_bytesread = 0; + } + datafp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.dataFork.extents[0], + &datafp->cf_extents[0], sizeof(HFSPlusExtentRecord)); + + /* Convert the resource fork. */ + rsrcfp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; + rsrcfp->cf_new_size = 0; + rsrcfp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (attrp->ca_atime >= hfsmp->hfc_timebase)) { + datafp->cf_bytesread = + recp->hfsPlusFile.resourceFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + datafp->cf_bytesread = 0; + } + rsrcfp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.resourceFork.extents[0], + &rsrcfp->cf_extents[0], sizeof(HFSPlusExtentRecord)); + } +} + +/* + * Convert a raw catalog key and record into an in-core catalog descriptor. + * + * Note: The caller is responsible for releasing the catalog descriptor. + */ +int +cat_convertkey( + struct hfsmount *hfsmp, + CatalogKey *key, + CatalogRecord * recp, + struct cat_desc *descp) +{ + int std_hfs = HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord; + HFSPlusCatalogKey * pluskey = NULL; + u_int32_t encoding; + cnid_t cnid = 0; + int err = 0; + + if (std_hfs == 0) { + pluskey = (HFSPlusCatalogKey *)key; + encoding = getencoding(recp); + } +#if CONFIG_HFS_STD + else { + pluskey = hfs_malloc(sizeof(HFSPlusCatalogKey)); + promotekey(hfsmp, (HFSCatalogKey *)key, pluskey, &encoding); + } +#endif + + /* Get the CNID before calling builddesc. Need to error check it. */ + cnid = getcnid(recp); + if (cnid == 0) { + /* If ths CNID == 0, it's invalid. Mark as corrupt */ + hfs_mark_inconsistent (hfsmp, HFS_INCONSISTENCY_DETECTED); + err = EINVAL; + } + else { + builddesc(pluskey, cnid, 0, encoding, isadir(recp), descp); + } + +#if CONFIG_HFS_STD + if (std_hfs) { + hfs_free(pluskey, sizeof(*pluskey)); + } +#endif + + return err; +} + + +/* + * cat_releasedesc + */ +void +cat_releasedesc(struct cat_desc *descp) +{ + const u_int8_t * name; + + if (descp == NULL) + return; + + if ((descp->cd_flags & CD_HASBUF) && + (descp->cd_nameptr != NULL)) { + name = descp->cd_nameptr; + descp->cd_nameptr = NULL; + descp->cd_namelen = 0; + vfs_removename((const char *)name); + } + descp->cd_nameptr = NULL; + descp->cd_namelen = 0; + descp->cd_flags &= ~CD_HASBUF; +} + +/* + * These Catalog functions allow access to the HFS Catalog (database). + * The catalog b-tree lock must be acquired before calling any of these routines. + */ + +/* + * cat_lookup - lookup a catalog node using a cnode descriptor + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). + */ +int +cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, int force_casesensitive_lookup, + struct cat_desc *outdescp, struct cat_attr *attrp, + struct cat_fork *forkp, cnid_t *desc_cnid) +{ + CatalogKey * keyp; + int std_hfs; + int result; + int flags; + + std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); + flags = force_casesensitive_lookup ? 
HFS_LOOKUP_CASESENSITIVE : 0; + + keyp = hfs_malloc(sizeof(CatalogKey)); + + result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)keyp, 1); + if (result) + goto exit; + + result = cat_lookupbykey(hfsmp, keyp, flags, descp->cd_hint, wantrsrc, outdescp, attrp, forkp, desc_cnid); + + if (result == ENOENT) { + if (!std_hfs) { + struct cat_desc temp_desc; + if (outdescp == NULL) { + bzero(&temp_desc, sizeof(temp_desc)); + outdescp = &temp_desc; + } + result = cat_lookupmangled(hfsmp, descp, wantrsrc, outdescp, attrp, forkp); + if (desc_cnid) { + *desc_cnid = outdescp->cd_cnid; + } + if (outdescp == &temp_desc) { + /* Release the local copy of desc */ + cat_releasedesc(outdescp); + } + } else if (hfsmp->hfs_encoding != kTextEncodingMacRoman) { + // make MacRoman key from utf-8 + // result = cat_lookupbykey(hfsmp, keyp, descp->cd_hint, attrp, forkp); + // update desc text encoding so that other catalog ops succeed + } + } +exit: + hfs_free(keyp, sizeof(*keyp)); + + return (result); +} + +int +cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) +{ + struct BTreeIterator *iterator; + struct FSBufferDescriptor file_data; + struct HFSCatalogFile file_rec; + u_int16_t datasize; + FCB *fcb; + int result; + + if (HFSTOVCB(hfsmp)->vcbSigWord != kHFSSigWord) + return (EINVAL); + + fcb = GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); + + iterator = hfs_mallocz(2 * sizeof(*iterator)); + result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator[0].key, 0); + if (result) + goto exit; + + BDINIT(file_data, &file_rec); + result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]); + if (result) + goto exit; + + if (file_rec.recordType != kHFSFileRecord) { + result = EISDIR; + goto exit; + } + + if ((file_rec.flags & kHFSThreadExistsMask) == 0) { + struct FSBufferDescriptor thread_data; + struct HFSCatalogThread thread_rec; + + file_rec.flags |= kHFSThreadExistsMask; + BDINIT(thread_data, &thread_rec); + thread_data.itemSize = buildthread(&iterator[0].key, &thread_rec, 1, 0); + buildthreadkey(file_rec.fileID, 1, (CatalogKey *)&iterator[1].key); + + result = BTInsertRecord(fcb, &iterator[1], &thread_data, thread_data.itemSize); + if (result) + goto exit; + + (void) BTReplaceRecord(fcb, &iterator[0], &file_data, datasize); + (void) BTFlushPath(fcb); + } +exit: + (void) BTFlushPath(fcb); + hfs_free(iterator, 2 * sizeof(*iterator)); + + return MacToVFSError(result); +} + + +/* + * cat_findname - obtain a descriptor from cnid + * + * Only a thread lookup is performed. + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). + + */ +int +cat_findname(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp) +{ + struct BTreeIterator * iterator; + FSBufferDescriptor btdata; + CatalogKey * keyp; + CatalogRecord * recp; + int isdir; + int result; + int std_hfs; + + isdir = 0; +#if CONFIG_HFS_STD + std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); +#else + std_hfs = 0; +#endif + + iterator = hfs_malloc(sizeof(*iterator)); + buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); + iterator->hint.nodeNum = 0; + + recp = hfs_malloc(sizeof(CatalogRecord)); + BDINIT(btdata, recp); + + result = BTSearchRecord(VTOF(hfsmp->hfs_catalog_vp), iterator, &btdata, NULL, NULL); + if (result) + goto exit; + + /* Turn thread record into a cnode key (in place). 
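
Both cat_findname() and cat_idlookup() rely on the same trick that the switch statement below performs in place: an HFS+ thread record already carries the parent directory ID and the object's name, which is exactly the catalog key of the file or folder record, so the key length is kHFSPlusCatalogKeyMinimumLength (6 bytes: the 4-byte parent ID plus the 2-byte name-length field) plus two bytes per name character. A non-in-place restatement as an editorial sketch (the helper name is invented; types come from the hfs headers this file already includes):

/* Build the catalog key that a thread record points back to. */
static void
thread_record_to_key(const HFSPlusCatalogThread *thread, HFSPlusCatalogKey *key)
{
    key->parentID  = thread->parentID;
    key->nodeName  = thread->nodeName;     /* struct copy of the Unicode name */
    key->keyLength = kHFSPlusCatalogKeyMinimumLength +
                     (thread->nodeName.length * sizeof(u_int16_t));
}
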
*/ + switch (recp->recordType) { + +#if CONFIG_HFS_STD + case kHFSFolderThreadRecord: + isdir = 1; + /* fall through */ + case kHFSFileThreadRecord: + keyp = (CatalogKey *)((char *)&recp->hfsThread.reserved + 6); + keyp->hfs.keyLength = kHFSCatalogKeyMinimumLength + keyp->hfs.nodeName[0]; + break; +#endif + + case kHFSPlusFolderThreadRecord: + isdir = 1; + /* fall through */ + case kHFSPlusFileThreadRecord: + keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; + keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + + (keyp->hfsPlus.nodeName.length * 2); + break; + default: + result = ENOENT; + goto exit; + } + +#if CONFIG_HFS_STD + if (std_hfs) { + HFSPlusCatalogKey * pluskey = NULL; + u_int32_t encoding; + + pluskey = hfs_malloc(sizeof(HFSPlusCatalogKey)); + promotekey(hfsmp, &keyp->hfs, pluskey, &encoding); + builddesc(pluskey, cnid, 0, encoding, isdir, outdescp); + hfs_free(pluskey, sizeof(*pluskey)); + } else +#endif + { + builddesc((HFSPlusCatalogKey *)keyp, cnid, 0, 0, isdir, outdescp); + } + +exit: + hfs_free(recp, sizeof(*recp)); + hfs_free(iterator, sizeof(*iterator)); + + return MacToVFSError(result); +} + +/* + * cat_idlookup - lookup a catalog node using a cnode id + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). + */ +int +cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, int allow_system_files, int wantrsrc, + struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) +{ + struct BTreeIterator * iterator; + FSBufferDescriptor btdata; + u_int16_t datasize; + CatalogKey * keyp; + CatalogRecord * recp; + int result; + int std_hfs; + + std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); + + iterator = hfs_mallocz(sizeof(*iterator)); + buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); + + recp = hfs_malloc(sizeof(CatalogRecord)); + BDINIT(btdata, recp); + + result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, + &btdata, &datasize, iterator); + if (result) + goto exit; + + /* Turn thread record into a cnode key (in place) */ + switch (recp->recordType) { + +#if CONFIG_HFS_STD + case kHFSFileThreadRecord: + case kHFSFolderThreadRecord: + keyp = (CatalogKey *)((char *)&recp->hfsThread.reserved + 6); + + /* check for NULL name */ + if (keyp->hfs.nodeName[0] == 0) { + result = ENOENT; + goto exit; + } + + keyp->hfs.keyLength = kHFSCatalogKeyMinimumLength + keyp->hfs.nodeName[0]; + break; +#endif + + case kHFSPlusFileThreadRecord: + case kHFSPlusFolderThreadRecord: + keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; + + /* check for NULL name */ + if (keyp->hfsPlus.nodeName.length == 0) { + result = ENOENT; + goto exit; + } + + keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + + (keyp->hfsPlus.nodeName.length * 2); + break; + + default: + result = ENOENT; + goto exit; + } + + result = cat_lookupbykey(hfsmp, keyp, + ((allow_system_files != 0) ? HFS_LOOKUP_SYSFILE : 0), + 0, wantrsrc, outdescp, attrp, forkp, NULL); + /* No corresponding file/folder record found for a thread record, + * mark the volume inconsistent. + */ + if (result == 0 && outdescp) { + cnid_t dcnid = outdescp->cd_cnid; + /* + * Just for sanity's case, let's make sure that + * the key in the thread matches the key in the record. 
+ */ + if (cnid != dcnid) { + printf("hfs: cat_idlookup: Requested cnid (%d / %08x) != dcnid (%d / %08x)\n", cnid, cnid, dcnid, dcnid); + result = ENOENT; + } + } +exit: + hfs_free(recp, sizeof(*recp)); + hfs_free(iterator, sizeof(*iterator)); + + return MacToVFSError(result); +} + + +/* + * cat_lookupmangled - lookup a catalog node using a mangled name + */ +int +cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, + struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) +{ + cnid_t fileID; + u_int32_t prefixlen; + int result; + u_int8_t utf8[NAME_MAX + 1]; + u_int32_t utf8len; + u_int16_t unicode[kHFSPlusMaxFileNameChars + 1]; + size_t unicodelen; + + if (wantrsrc) + return (ENOENT); + + fileID = GetEmbeddedFileID(descp->cd_nameptr, descp->cd_namelen, &prefixlen); + if (fileID < (cnid_t)kHFSFirstUserCatalogNodeID) + return (ENOENT); + + if (fileID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + fileID == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid || + fileID == hfsmp->hfs_jnlfileid || + fileID == hfsmp->hfs_jnlinfoblkid) { + return (ENOENT); + } + + result = cat_idlookup(hfsmp, fileID, 0, 0, outdescp, attrp, forkp); + if (result) + return (ENOENT); + /* It must be in the correct directory */ + if (descp->cd_parentcnid != outdescp->cd_parentcnid) + goto falsematch; + + /* + * Compare the mangled version of file name looked up from the + * disk with the mangled name provided by the user. Note that + * this comparison is case-sensitive, which should be fine + * since we're trying to prevent user space from constructing + * a mangled name that differs from the one they'd get from the + * file system. + */ + result = utf8_decodestr(outdescp->cd_nameptr, outdescp->cd_namelen, + unicode, &unicodelen, sizeof(unicode), ':', 0); + if (result) { + goto falsematch; + } + result = ConvertUnicodeToUTF8Mangled(unicodelen, unicode, + sizeof(utf8), &utf8len, utf8, fileID); + if ((result != 0) || + ((u_int16_t)descp->cd_namelen != utf8len) || + (bcmp(descp->cd_nameptr, utf8, utf8len) != 0)) { + goto falsematch; + } + + return (0); + +falsematch: + cat_releasedesc(outdescp); + return (ENOENT); +} + + +/* + * cat_lookupbykey - lookup a catalog node using a cnode key + */ +static int +cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, + struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid) +{ + struct BTreeIterator * iterator; + FSBufferDescriptor btdata; + CatalogRecord * recp; + u_int16_t datasize; + int result; + int std_hfs; + u_int32_t ilink = 0; + cnid_t cnid = 0; + u_int32_t encoding = 0; + cnid_t parentid = 0; + + std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); + + recp = hfs_malloc(sizeof(CatalogRecord)); + BDINIT(btdata, recp); + iterator = hfs_mallocz(sizeof(*iterator)); + iterator->hint.nodeNum = hint; + bcopy(keyp, &iterator->key, sizeof(CatalogKey)); + + result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, + &btdata, &datasize, iterator); + if (result) + goto exit; + + /* Save the cnid, parentid, and encoding now in case there's a hard link or inode */ + cnid = getcnid(recp); + if (cnid == 0) { + /* CNID of 0 is invalid. 
Mark as corrupt */ + hfs_mark_inconsistent (hfsmp, HFS_INCONSISTENCY_DETECTED); + result = EINVAL; + goto exit; + } + + if (std_hfs == 0) { + parentid = keyp->hfsPlus.parentID; + } + + encoding = getencoding(recp); + hint = iterator->hint.nodeNum; + + /* Hide the journal files (if any) */ + if ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) && + ((cnid == hfsmp->hfs_jnlfileid) || (cnid == hfsmp->hfs_jnlinfoblkid)) && + !(flags & HFS_LOOKUP_SYSFILE)) { + result = HFS_ERESERVEDNAME; + goto exit; + } + + if (!std_hfs && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE)) { + /* Make sure the case of the file was correct if requested */ + if (flags & HFS_LOOKUP_CASESENSITIVE) { + if (0 != cat_binarykeycompare(&keyp->hfsPlus, (HFSPlusCatalogKey *)&iterator->key)) { + result = HFS_ERESERVEDNAME; + goto exit; + } + } + } + + /* + * When a hardlink link is encountered, auto resolve it. + * + * The catalog record will change, and possibly its type. + */ + if (!std_hfs + && (attrp || forkp) + && (recp->recordType == kHFSPlusFileRecord) + && ((to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_itime) || + (to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_metadata_createdate))) { + int isdirlink = 0; + int isfilelink = 0; + + if ((SWAP_BE32(recp->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && + (SWAP_BE32(recp->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator)) { + isfilelink = 1; + } else if ((recp->hfsPlusFile.flags & kHFSHasLinkChainMask) && + (SWAP_BE32(recp->hfsPlusFile.userInfo.fdType) == kHFSAliasType) && + (SWAP_BE32(recp->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator)) { + isdirlink = 1; + } + if ((isfilelink || isdirlink) && !(flags & HFS_LOOKUP_HARDLINK)) { + ilink = recp->hfsPlusFile.hl_linkReference; + (void) cat_resolvelink(hfsmp, ilink, isdirlink, (struct HFSPlusCatalogFile *)recp); + } + } + + if (attrp != NULL) { + if (std_hfs == 0) { + getbsdattr(hfsmp, (struct HFSPlusCatalogFile *)recp, attrp); + if (ilink) { + /* Update the inode number for this hard link */ + attrp->ca_linkref = ilink; + } + + /* + * Set kHFSHasLinkChainBit for hard links, and reset it for all + * other items. Also set linkCount to 1 for regular files. + * + * Due to some bug (rdar://8505977), some regular files can have + * kHFSHasLinkChainBit set and linkCount more than 1 even if they + * are not really hard links. The runtime code should not consider + * these files has hard links. Therefore we reset the kHFSHasLinkChainBit + * and linkCount for regular file before we vend it out. This might + * also result in repairing the bad files on disk, if the corresponding + * file is modified and updated on disk. + */ + if (ilink) { + /* This is a hard link and the link count bit was not set */ + if (!(attrp->ca_recflags & kHFSHasLinkChainMask)) { + printf ("hfs: set hardlink bit on vol=%s cnid=%u inoid=%u\n", hfsmp->vcbVN, cnid, ilink); + attrp->ca_recflags |= kHFSHasLinkChainMask; + } + } else { + /* Make sure that this non-hard link (regular) record is not + * an inode record that was looked up and we do not end up + * reseting the hard link bit on it. 
+ */ + if ((parentid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + (parentid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid)) { + /* This is not a hard link or inode and the link count bit was set */ + if (attrp->ca_recflags & kHFSHasLinkChainMask) { + printf ("hfs: clear hardlink bit on vol=%s cnid=%u\n", hfsmp->vcbVN, cnid); + attrp->ca_recflags &= ~kHFSHasLinkChainMask; + } + /* This is a regular file and the link count was more than 1 */ + if (S_ISREG(attrp->ca_mode) && (attrp->ca_linkcount > 1)) { + printf ("hfs: set linkcount=1 on vol=%s cnid=%u old=%u\n", hfsmp->vcbVN, cnid, attrp->ca_linkcount); + attrp->ca_linkcount = 1; + } + } + } + } +#if CONFIG_HFS_STD + else { + struct HFSPlusCatalogFile cnoderec; + + promoteattr(hfsmp, recp, &cnoderec); + getbsdattr(hfsmp, &cnoderec, attrp); + } +#endif + } + if (forkp != NULL) { + if (isadir(recp)) { + bzero(forkp, sizeof(*forkp)); + } +#if CONFIG_HFS_STD + else if (std_hfs) { + promotefork(hfsmp, (HFSCatalogFile *)&recp->hfsFile, wantrsrc, forkp); + } +#endif + else if (wantrsrc) { + /* Convert the resource fork. */ + forkp->cf_size = recp->hfsPlusFile.resourceFork.logicalSize; + forkp->cf_new_size = 0; + forkp->cf_blocks = recp->hfsPlusFile.resourceFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { + forkp->cf_bytesread = + recp->hfsPlusFile.resourceFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + forkp->cf_bytesread = 0; + } + forkp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.resourceFork.extents[0], + &forkp->cf_extents[0], sizeof(HFSPlusExtentRecord)); + } else { + int i; + u_int32_t validblks; + + /* Convert the data fork. */ + forkp->cf_size = recp->hfsPlusFile.dataFork.logicalSize; + forkp->cf_new_size = 0; + forkp->cf_blocks = recp->hfsPlusFile.dataFork.totalBlocks; + if ((hfsmp->hfc_stage == HFC_RECORDING) && + (to_bsd_time(recp->hfsPlusFile.accessDate) >= hfsmp->hfc_timebase)) { + forkp->cf_bytesread = + recp->hfsPlusFile.dataFork.clumpSize * + HFSTOVCB(hfsmp)->blockSize; + } else { + forkp->cf_bytesread = 0; + } + forkp->cf_vblocks = 0; + bcopy(&recp->hfsPlusFile.dataFork.extents[0], + &forkp->cf_extents[0], sizeof(HFSPlusExtentRecord)); + + /* Validate the fork's resident extents. */ + validblks = 0; + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + if (forkp->cf_extents[i].startBlock + forkp->cf_extents[i].blockCount >= hfsmp->totalBlocks) { + /* Suppress any bad extents so a remove can succeed. */ + forkp->cf_extents[i].startBlock = 0; + forkp->cf_extents[i].blockCount = 0; + /* Disable writes */ + if (attrp != NULL) { + attrp->ca_mode &= S_IFMT | S_IRUSR | S_IRGRP | S_IROTH; + } + } else { + validblks += forkp->cf_extents[i].blockCount; + } + } + /* Adjust for any missing blocks. */ + if ((validblks < forkp->cf_blocks) && (forkp->cf_extents[7].blockCount == 0)) { + off_t psize; + + /* + * This is technically a volume corruption. + * If the total number of blocks calculated by iterating + summing + * the extents in the resident extent records, is less than that + * which is reported in the catalog entry, we should force a fsck. + * Only modifying ca_blocks here is not guaranteed to make it out + * to disk; it is a runtime-only field. + * + * Note that we could have gotten into this state if we had invalid ranges + * that existed in borrowed blocks that somehow made it out to disk. + * The cnode's on disk block count should never be greater + * than that which is in its extent records. 
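
The validation above walks the eight extents stored directly in the catalog record, drops any extent that would run past the end of the volume, and, when no overflow extents can account for the difference, clamps the fork's block count and size to what is actually mapped. The core of that check, restated as a stand-alone helper (editorial sketch, not part of the patch; the name is invented and the types come from the hfs headers this file already includes):

/* Sum the blocks covered by well-formed resident extents. */
static u_int32_t
count_valid_resident_blocks(const HFSPlusExtentDescriptor extents[kHFSPlusExtentDensity],
                            u_int32_t total_volume_blocks)
{
    u_int32_t valid = 0;

    for (int i = 0; i < kHFSPlusExtentDensity; ++i) {
        if (extents[i].startBlock + extents[i].blockCount >= total_volume_blocks)
            continue;                    /* bad extent: treat it as unmapped */
        valid += extents[i].blockCount;
    }
    return valid;
}
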
+ */ + + (void) hfs_mark_inconsistent (hfsmp, HFS_INCONSISTENCY_DETECTED); + + forkp->cf_blocks = validblks; + if (attrp != NULL) { + attrp->ca_blocks = validblks + recp->hfsPlusFile.resourceFork.totalBlocks; + } + psize = (off_t)validblks * (off_t)hfsmp->blockSize; + if (psize < forkp->cf_size) { + forkp->cf_size = psize; + } + + } + } + } + if (descp != NULL) { + HFSPlusCatalogKey * pluskey = NULL; + + if (std_hfs == 0) { + pluskey = (HFSPlusCatalogKey *)&iterator->key; + } +#if CONFIG_HFS_STD + else { + pluskey = hfs_malloc(sizeof(HFSPlusCatalogKey)); + promotekey(hfsmp, (HFSCatalogKey *)&iterator->key, pluskey, &encoding); + + } +#endif + + builddesc(pluskey, cnid, hint, encoding, isadir(recp), descp); + +#if CONFIG_HFS_STD + if (std_hfs) { + hfs_free(pluskey, sizeof(*pluskey)); + } +#endif + + } + + if (desc_cnid != NULL) { + *desc_cnid = cnid; + } +exit: + hfs_free(iterator, sizeof(*iterator)); + hfs_free(recp, sizeof(*recp)); + + return MacToVFSError(result); +} + + +/* + * cat_create - create a node in the catalog + * using MacRoman encoding + * + * NOTE: both the catalog file and attribute file locks must + * be held before calling this function. + * + * The caller is responsible for releasing the output + * catalog descriptor (when supplied outdescp is non-null). + */ +int +cat_create(struct hfsmount *hfsmp, cnid_t new_fileid, struct cat_desc *descp, struct cat_attr *attrp, + struct cat_desc *out_descp) +{ + FCB * fcb; + struct btobj * bto; + FSBufferDescriptor btdata; + u_int32_t datalen; + int std_hfs; + int result = 0; + u_int32_t encoding = kTextEncodingMacRoman; + int modeformat; + + modeformat = attrp->ca_mode & S_IFMT; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); + + /* The caller is expected to reserve a CNID before calling this function! 
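+	 * (The new_fileid is normally reserved with cat_acquire_cnid() while the
+	 * catalog lock is held -- the same helper cat_createlink() calls below.)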
*/ + + /* Get space for iterator, key and data */ + bto = hfs_malloc(sizeof(struct btobj)); + bto->iterator.hint.nodeNum = 0; + + result = buildkey(hfsmp, descp, &bto->key, 0); + if (result) + goto exit; + + /* + * Insert the thread record first + */ + if (!std_hfs || (modeformat == S_IFDIR)) { + datalen = buildthread((void*)&bto->key, &bto->data, std_hfs, + S_ISDIR(attrp->ca_mode)); + btdata.bufferAddress = &bto->data; + btdata.itemSize = datalen; + btdata.itemCount = 1; + + /* Caller asserts the following: + * 1) this CNID is not in use by any orphaned EAs + * 2) There are no lingering cnodes (removed on-disk but still in-core) with this CNID + * 3) There are no thread or catalog records for this ID + */ + buildthreadkey(new_fileid, std_hfs, (CatalogKey *) &bto->iterator.key); + result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); + if (result) { + goto exit; + } + } + + /* + * Now insert the file/directory record + */ + buildrecord(attrp, new_fileid, std_hfs, encoding, &bto->data, &datalen); + btdata.bufferAddress = &bto->data; + btdata.itemSize = datalen; + btdata.itemCount = 1; + + bcopy(&bto->key, &bto->iterator.key, sizeof(bto->key)); + + result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); + if (result) { + if (result == btExists) + result = EEXIST; + + /* Back out the thread record */ + if (!std_hfs || S_ISDIR(attrp->ca_mode)) { + buildthreadkey(new_fileid, std_hfs, (CatalogKey *)&bto->iterator.key); + if (BTDeleteRecord(fcb, &bto->iterator)) { + /* Error on deleting extra thread record, mark + * volume inconsistent + */ + printf ("hfs: cat_create() failed to delete thread record id=%u on vol=%s\n", new_fileid, hfsmp->vcbVN); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } + } + goto exit; + } + + /* + * Insert was successful, update name, parent and volume + */ + if (out_descp != NULL) { + HFSPlusCatalogKey * pluskey = NULL; + + if (std_hfs == 0) { + pluskey = (HFSPlusCatalogKey *)&bto->iterator.key; + } +#if CONFIG_HFS_STD + else { + pluskey = hfs_malloc(sizeof(HFSPlusCatalogKey)); + promotekey(hfsmp, (HFSCatalogKey *)&bto->iterator.key, pluskey, &encoding); + } +#endif + + builddesc(pluskey, new_fileid, bto->iterator.hint.nodeNum, + encoding, S_ISDIR(attrp->ca_mode), out_descp); +#if CONFIG_HFS_STD + if (std_hfs) { + hfs_free(pluskey, sizeof(*pluskey)); + } +#endif + + } + attrp->ca_fileid = new_fileid; + +exit: + (void) BTFlushPath(fcb); + hfs_free(bto, sizeof(*bto)); + + return MacToVFSError(result); +} + + +/* + * cnode_rename - rename a catalog node + * + * Assumes that the target's directory exists. + * + * Order of B-tree operations: + * 1. BTSearchRecord(from_cnode, &data); + * 2. BTInsertRecord(to_cnode, &data); + * 3. BTDeleteRecord(from_cnode); + * 4. BTDeleteRecord(from_thread); + * 5. BTInsertRecord(to_thread); + * + * Note: The caller is responsible for releasing the output + * catalog descriptor (when supplied out_cdp is non-null). 
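 * If an undo of a partially completed rename cannot be applied, the
 * volume is marked inconsistent (HFS_ROLLBACK_FAILED) so a subsequent
 * fsck can repair it.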
+ */ +int +cat_rename ( + struct hfsmount * hfsmp, + struct cat_desc * from_cdp, + struct cat_desc * todir_cdp, + struct cat_desc * to_cdp, + struct cat_desc * out_cdp ) +{ + struct BTreeIterator * to_iterator = NULL; + struct BTreeIterator * from_iterator = NULL; + FSBufferDescriptor btdata; + CatalogRecord * recp = NULL; + HFSPlusCatalogKey * to_key; + ExtendedVCB * vcb; + FCB * fcb; + u_int16_t datasize; + int result = 0; + int sourcegone = 0; + int skipthread = 0; + int directory = from_cdp->cd_flags & CD_ISDIR; + int is_dirlink = 0; + int std_hfs; + u_int32_t encoding = 0; + + vcb = HFSTOVCB(hfsmp); + fcb = GetFileControlBlock(vcb->catalogRefNum); + std_hfs = (vcb->vcbSigWord == kHFSSigWord); + + if (from_cdp->cd_namelen == 0 || to_cdp->cd_namelen == 0) + return (EINVAL); + + from_iterator = hfs_mallocz(sizeof(*from_iterator)); + if ((result = buildkey(hfsmp, from_cdp, (HFSPlusCatalogKey *)&from_iterator->key, 0))) + goto exit; + + to_iterator = hfs_mallocz(sizeof(*to_iterator)); + if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0))) + goto exit; + + to_key = (HFSPlusCatalogKey *)&to_iterator->key; + recp = hfs_malloc(sizeof(CatalogRecord)); + BDINIT(btdata, recp); + + /* + * When moving a directory, make sure its a valid move. + */ + if (directory && (from_cdp->cd_parentcnid != to_cdp->cd_parentcnid)) { + struct BTreeIterator *dir_iterator = NULL; + + cnid_t cnid = from_cdp->cd_cnid; + cnid_t pathcnid = todir_cdp->cd_parentcnid; + + /* First check the obvious ones */ + if (cnid == fsRtDirID || + cnid == to_cdp->cd_parentcnid || + cnid == pathcnid) { + result = EINVAL; + goto exit; + } + /* now allocate the dir_iterator */ + dir_iterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + /* + * Traverse destination path all the way back to the root + * making sure that source directory is not encountered. + * + */ + while (pathcnid > fsRtDirID) { + buildthreadkey(pathcnid, std_hfs, (CatalogKey *)&dir_iterator->key); + result = BTSearchRecord(fcb, dir_iterator, &btdata, &datasize, NULL); + if (result) { + hfs_free(dir_iterator, sizeof(*dir_iterator)); + goto exit; + } + pathcnid = getparentcnid(recp); + if (pathcnid == cnid || pathcnid == 0) { + result = EINVAL; + hfs_free(dir_iterator, sizeof(*dir_iterator)); + goto exit; + } + } + hfs_free(dir_iterator, sizeof(*dir_iterator)); + } + + /* + * Step 1: Find cnode data at old location + */ + result = BTSearchRecord(fcb, from_iterator, &btdata, + &datasize, from_iterator); + if (result) { + if (std_hfs || (result != btNotFound)) + goto exit; + + struct cat_desc temp_desc; + + /* Probably the node has mangled name */ + result = cat_lookupmangled(hfsmp, from_cdp, 0, &temp_desc, NULL, NULL); + if (result) + goto exit; + + /* The file has mangled name. Search the cnode data using full name */ + bzero(from_iterator, sizeof(*from_iterator)); + result = buildkey(hfsmp, &temp_desc, (HFSPlusCatalogKey *)&from_iterator->key, 0); + if (result) { + cat_releasedesc(&temp_desc); + goto exit; + } + + result = BTSearchRecord(fcb, from_iterator, &btdata, &datasize, from_iterator); + if (result) { + cat_releasedesc(&temp_desc); + goto exit; + } + + cat_releasedesc(&temp_desc); + } + + /* Check if the source is directory hard link. 
We do not change + * directory flag because it is later used to initialize result descp + */ + if ((!std_hfs) && + (directory) && + (recp->recordType == kHFSPlusFileRecord) && + (recp->hfsPlusFile.flags & kHFSHasLinkChainMask)) { + is_dirlink = 1; + } + + /* + * Update the text encoding (on disk and in descriptor), + * using hfs_pickencoding to get the new encoding when available. + * + * Note that hardlink inodes don't require a text encoding hint. + */ + if (!std_hfs && + todir_cdp->cd_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid && + todir_cdp->cd_parentcnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + +#if TARGET_OS_OSX + encoding = hfs_pickencoding(to_key->nodeName.unicode, to_key->nodeName.length); +#else // !TARGET_OS_OSX + encoding = kTextEncodingMacRoman; +#endif // TARGET_OS_OSX + + hfs_setencodingbits(hfsmp, encoding); + recp->hfsPlusFile.textEncoding = encoding; + if (out_cdp) + out_cdp->cd_encoding = encoding; + } + +#if CONFIG_HFS_STD + if (std_hfs && !directory && + !(recp->hfsFile.flags & kHFSThreadExistsMask)) { + skipthread = 1; + } +#endif + +#if 0 + /* + * If the keys are identical then there's nothing left to do! + * + * update the hint and exit + * + */ + if (std_hfs && hfskeycompare(to_key, iter->key) == 0) + goto exit; + if (!std_hfs && hfspluskeycompare(to_key, iter->key) == 0) + goto exit; +#endif + + /* Step 2: Insert cnode at new location */ + result = BTInsertRecord(fcb, to_iterator, &btdata, datasize); + if (result == btExists) { + int fromtype = recp->recordType; + cnid_t cnid = 0; + + if (from_cdp->cd_parentcnid != to_cdp->cd_parentcnid) + goto exit; /* EEXIST */ + + /* Find cnode data at new location */ + result = BTSearchRecord(fcb, to_iterator, &btdata, &datasize, NULL); + if (result) + goto exit; + + /* Get the CNID after calling searchrecord */ + cnid = getcnid (recp); + if (cnid == 0) { + hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); + result = EINVAL; + goto exit; + } + + if ((fromtype != recp->recordType) || + (from_cdp->cd_cnid != cnid)) { + result = EEXIST; + goto exit; /* EEXIST */ + } + /* The old name is a case variant and must be removed */ + result = BTDeleteRecord(fcb, from_iterator); + if (result) + goto exit; + + /* Insert cnode (now that case duplicate is gone) */ + result = BTInsertRecord(fcb, to_iterator, &btdata, datasize); + if (result) { + /* Try and restore original before leaving */ + // XXXdbg + #if 1 + { + int err; + err = BTInsertRecord(fcb, from_iterator, &btdata, datasize); + if (err) { + printf("hfs: cat_create: could not undo (BTInsert = %d)\n", err); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + result = err; + goto exit; + } + } + #else + (void) BTInsertRecord(fcb, from_iterator, &btdata, datasize); + #endif + goto exit; + } + sourcegone = 1; + } + if (result) + goto exit; + + /* Step 3: Remove cnode from old location */ + if (!sourcegone) { + result = BTDeleteRecord(fcb, from_iterator); + if (result) { + /* Try and delete new record before leaving */ + // XXXdbg + #if 1 + { + int err; + err = BTDeleteRecord(fcb, to_iterator); + if (err) { + printf("hfs: cat_create: could not undo (BTDelete = %d)\n", err); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + result = err; + goto exit; + } + } + #else + (void) BTDeleteRecord(fcb, to_iterator); + #endif + goto exit; + } + } + + /* #### POINT OF NO RETURN #### */ + + /* + * Step 4: Remove cnode's old thread record + */ + buildthreadkey(from_cdp->cd_cnid, std_hfs, (CatalogKey *)&from_iterator->key); + (void) BTDeleteRecord(fcb, 
from_iterator); + + /* + * Step 5: Insert cnode's new thread record + * (optional for HFS files) + */ + if (!skipthread) { + /* For directory hard links, always create a file thread + * record. For everything else, use the directory flag. + */ + if (is_dirlink) { + datasize = buildthread(&to_iterator->key, recp, std_hfs, false); + } else { + datasize = buildthread(&to_iterator->key, recp, std_hfs, directory); + } + btdata.itemSize = datasize; + buildthreadkey(from_cdp->cd_cnid, std_hfs, (CatalogKey *)&from_iterator->key); + result = BTInsertRecord(fcb, from_iterator, &btdata, datasize); + } + + if (out_cdp) { + HFSPlusCatalogKey * pluskey = NULL; + + if (std_hfs == 0) { + pluskey = (HFSPlusCatalogKey *)&to_iterator->key; + } +#if CONFIG_HFS_STD + else { + pluskey = hfs_malloc(sizeof(HFSPlusCatalogKey)); + promotekey(hfsmp, (HFSCatalogKey *)&to_iterator->key, pluskey, &encoding); + + /* Save the real encoding hint in the Finder Info (field 4). */ + if (directory && from_cdp->cd_cnid == kHFSRootFolderID) { + u_int32_t realhint; + + realhint = hfs_pickencoding(pluskey->nodeName.unicode, pluskey->nodeName.length); + vcb->vcbFndrInfo[4] = SET_HFS_TEXT_ENCODING(realhint); + } + } +#endif + + builddesc(pluskey, from_cdp->cd_cnid, to_iterator->hint.nodeNum, + encoding, directory, out_cdp); +#if CONFIG_HFS_STD + if (std_hfs) { + hfs_free(pluskey, sizeof(*pluskey)); + } +#endif + + } +exit: + (void) BTFlushPath(fcb); + if (from_iterator) + hfs_free(from_iterator, sizeof(*from_iterator)); + if (to_iterator) + hfs_free(to_iterator, sizeof(*to_iterator)); + if (recp) + hfs_free(recp, sizeof(*recp)); + return MacToVFSError(result); +} + + +/* + * cat_delete - delete a node from the catalog + * + * Order of B-tree operations: + * 1. BTDeleteRecord(cnode); + * 2. BTDeleteRecord(thread); + * 3. BTUpdateRecord(parent); + */ +int +cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp) +{ + FCB * fcb; + BTreeIterator *iterator; + cnid_t cnid; + int std_hfs; + int result; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); + + /* Preflight check: + * + * The root directory cannot be deleted + * A directory must be empty + * A file must be zero length (no blocks) + */ + if (descp->cd_cnid < kHFSFirstUserCatalogNodeID || + descp->cd_parentcnid == kHFSRootParentID) + return (EINVAL); + + /* XXX Preflight Missing */ + + /* Borrow the btcb iterator since we have an exclusive catalog lock. */ + iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; + iterator->hint.nodeNum = 0; + + /* + * Derive a key from either the file ID (for a virtual inode) + * or the descriptor. + */ + if (descp->cd_namelen == 0) { + result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key); + cnid = attrp->ca_fileid; + } else { + result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0); + cnid = descp->cd_cnid; + } + if (result) + goto exit; + + /* Delete record */ + result = BTDeleteRecord(fcb, iterator); + if (result) { + if (std_hfs || (result != btNotFound)) + goto exit; + + struct cat_desc temp_desc; + + /* Probably the node has mangled name */ + result = cat_lookupmangled(hfsmp, descp, 0, &temp_desc, attrp, NULL); + if (result) + goto exit; + + /* The file has mangled name. 
Delete the file using full name */ + bzero(iterator, sizeof(*iterator)); + result = buildkey(hfsmp, &temp_desc, (HFSPlusCatalogKey *)&iterator->key, 0); + cnid = temp_desc.cd_cnid; + if (result) { + cat_releasedesc(&temp_desc); + goto exit; + } + + result = BTDeleteRecord(fcb, iterator); + if (result) { + cat_releasedesc(&temp_desc); + goto exit; + } + + cat_releasedesc(&temp_desc); + } + + /* Delete thread record. On error, mark volume inconsistent */ + buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); + if (BTDeleteRecord(fcb, iterator)) { + if (!std_hfs) { + printf ("hfs: cat_delete() failed to delete thread record id=%u on vol=%s\n", cnid, hfsmp->vcbVN); + hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); + } + } + +exit: + (void) BTFlushPath(fcb); + + return MacToVFSError(result); +} + + +/* + * cat_update_internal - update the catalog node described by descp + * using the data from attrp and forkp. + * If update_hardlink is true, the hard link catalog record is updated + * and not the inode catalog record. + */ +static int +cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, + const struct cat_fork *dataforkp, const struct cat_fork *rsrcforkp) +{ + FCB * fcb; + BTreeIterator * iterator; + struct update_state state; + int result; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + state.s_desc = descp; + state.s_attr = attrp; + state.s_datafork = dataforkp; + state.s_rsrcfork = rsrcforkp; + state.s_hfsmp = hfsmp; + + /* Borrow the btcb iterator since we have an exclusive catalog lock. */ + iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; + + /* + * For open-deleted files we need to do a lookup by cnid + * (using thread rec). + * + * For hard links and if not requested by caller, the target + * of the update is the inode itself (not the link record) + * so a lookup by fileid (i.e. thread rec) is needed. + */ + if ((update_hardlink == false) && + ((descp->cd_cnid != attrp->ca_fileid) || + (descp->cd_namelen == 0) || + (attrp->ca_recflags & kHFSHasLinkChainMask))) { + result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key); + } else { + result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0); + } + if (result) + goto exit; + + /* Pass a node hint */ + iterator->hint.nodeNum = descp->cd_hint; + + result = BTUpdateRecord(fcb, iterator, + (IterateCallBackProcPtr)catrec_update, &state); + if (result) + goto exit; + + /* Update the node hint. */ + descp->cd_hint = iterator->hint.nodeNum; + +exit: + (void) BTFlushPath(fcb); + + return MacToVFSError(result); +} + +/* + * cat_update - update the catalog node described by descp + * using the data from attrp and forkp. + */ +int +cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, + const struct cat_fork *dataforkp, const struct cat_fork *rsrcforkp) +{ + return cat_update_internal(hfsmp, false, descp, attrp, dataforkp, rsrcforkp); +} + +/* + * catrec_update - Update the fields of a catalog record + * This is called from within BTUpdateRecord. 
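 * The record is updated in place; a non-zero return from this callback
 * (the sanity checks below return btNotFound) is propagated back to the
 * caller of BTUpdateRecord.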
+ */ +static int +catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *state) +{ + struct cat_desc *descp; + struct cat_attr *attrp; + const struct cat_fork *forkp; + struct hfsmount *hfsmp; + long blksize; + + descp = state->s_desc; + attrp = state->s_attr; + hfsmp = state->s_hfsmp; + blksize = HFSTOVCB(hfsmp)->blockSize; + + switch (crp->recordType) { + +#if CONFIG_HFS_STD + case kHFSFolderRecord: { + HFSCatalogFolder *dir; + + dir = (struct HFSCatalogFolder *)crp; + /* Do a quick sanity check */ + if ((ckp->hfs.parentID != descp->cd_parentcnid) || + (dir->folderID != descp->cd_cnid)) + return (btNotFound); + dir->valence = attrp->ca_entries; + dir->createDate = UTCToLocal(to_hfs_time(attrp->ca_itime)); + dir->modifyDate = UTCToLocal(to_hfs_time(attrp->ca_mtime)); + dir->backupDate = UTCToLocal(to_hfs_time(attrp->ca_btime)); + bcopy(&attrp->ca_finderinfo[0], &dir->userInfo, 16); + bcopy(&attrp->ca_finderinfo[16], &dir->finderInfo, 16); + break; + } + case kHFSFileRecord: { + HFSCatalogFile *file; + int i; + + file = (struct HFSCatalogFile *)crp; + /* Do a quick sanity check */ + if ((ckp->hfs.parentID != descp->cd_parentcnid) || + (file->fileID != attrp->ca_fileid)) + return (btNotFound); + file->createDate = UTCToLocal(to_hfs_time(attrp->ca_itime)); + file->modifyDate = UTCToLocal(to_hfs_time(attrp->ca_mtime)); + file->backupDate = UTCToLocal(to_hfs_time(attrp->ca_btime)); + bcopy(&attrp->ca_finderinfo[0], &file->userInfo, 16); + bcopy(&attrp->ca_finderinfo[16], &file->finderInfo, 16); + if (state->s_rsrcfork) { + forkp = state->s_rsrcfork; + file->rsrcLogicalSize = forkp->cf_size; + file->rsrcPhysicalSize = forkp->cf_blocks * blksize; + for (i = 0; i < kHFSExtentDensity; ++i) { + file->rsrcExtents[i].startBlock = + (u_int16_t)forkp->cf_extents[i].startBlock; + file->rsrcExtents[i].blockCount = + (u_int16_t)forkp->cf_extents[i].blockCount; + } + } + if (state->s_datafork) { + forkp = state->s_datafork; + file->dataLogicalSize = forkp->cf_size; + file->dataPhysicalSize = forkp->cf_blocks * blksize; + for (i = 0; i < kHFSExtentDensity; ++i) { + file->dataExtents[i].startBlock = + (u_int16_t)forkp->cf_extents[i].startBlock; + file->dataExtents[i].blockCount = + (u_int16_t)forkp->cf_extents[i].blockCount; + } + } + + /* Synchronize the lock state */ + if (attrp->ca_flags & (SF_IMMUTABLE | UF_IMMUTABLE)) + file->flags |= kHFSFileLockedMask; + else + file->flags &= ~kHFSFileLockedMask; + break; + } +#endif + + case kHFSPlusFolderRecord: { + HFSPlusCatalogFolder *dir; + + dir = (struct HFSPlusCatalogFolder *)crp; + /* Do a quick sanity check */ + if (dir->folderID != attrp->ca_fileid) { + printf("hfs: catrec_update: id %d != %d, vol=%s\n", dir->folderID, attrp->ca_fileid, hfsmp->vcbVN); + return (btNotFound); + } + dir->flags = attrp->ca_recflags; + dir->valence = attrp->ca_entries; + dir->createDate = to_hfs_time(attrp->ca_itime); + dir->contentModDate = to_hfs_time(attrp->ca_mtime); + dir->backupDate = to_hfs_time(attrp->ca_btime); + dir->accessDate = to_hfs_time(attrp->ca_atime); + attrp->ca_atimeondisk = attrp->ca_atime; + dir->attributeModDate = to_hfs_time(attrp->ca_ctime); + /* Note: directory hardlink inodes don't require a text encoding hint. 
*/ + if (ckp->hfsPlus.parentID != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + dir->textEncoding = descp->cd_encoding; + } + dir->folderCount = attrp->ca_dircount; + bcopy(&attrp->ca_finderinfo[0], &dir->userInfo, 32); + /* + * Update the BSD Info if it was already initialized on + * disk or if the runtime values have been modified. + * + * If the BSD info was already initialized, but + * MNT_UNKNOWNPERMISSIONS is set, then the runtime IDs are + * probably different than what was on disk. We don't want + * to overwrite the on-disk values (so if we turn off + * MNT_UNKNOWNPERMISSIONS, the old IDs get used again). + * This way, we can still change fields like the mode or + * dates even when MNT_UNKNOWNPERMISSIONS is set. + * + * Note that if MNT_UNKNOWNPERMISSIONS is set, hfs_chown + * won't change the uid or gid from their defaults. So, if + * the BSD info wasn't set, and the runtime values are not + * default, then what changed was the mode or flags. We + * have to set the uid and gid to something, so use the + * supplied values (which will be default), which has the + * same effect as creating a new file while + * MNT_UNKNOWNPERMISSIONS is set. + */ + if ((dir->bsdInfo.fileMode != 0) || + (attrp->ca_flags != 0) || + (attrp->ca_uid != hfsmp->hfs_uid) || + (attrp->ca_gid != hfsmp->hfs_gid) || + ((attrp->ca_mode & ALLPERMS) != + (hfsmp->hfs_dir_mask & ACCESSPERMS))) { + if ((dir->bsdInfo.fileMode == 0) || + (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { + dir->bsdInfo.ownerID = attrp->ca_uid; + dir->bsdInfo.groupID = attrp->ca_gid; + } + dir->bsdInfo.ownerFlags = attrp->ca_flags & 0x000000FF; + dir->bsdInfo.adminFlags = attrp->ca_flags >> 16; + dir->bsdInfo.fileMode = attrp->ca_mode; + /* A directory hardlink has a link count. */ + if (attrp->ca_linkcount > 1 || dir->hl_linkCount > 1) { + dir->hl_linkCount = attrp->ca_linkcount; + } + } + break; + } + case kHFSPlusFileRecord: { + HFSPlusCatalogFile *file; + int is_dirlink; + + file = (struct HFSPlusCatalogFile *)crp; + /* Do a quick sanity check */ + if (file->fileID != attrp->ca_fileid) + return (btNotFound); + file->flags = attrp->ca_recflags; + file->createDate = to_hfs_time(attrp->ca_itime); + file->contentModDate = to_hfs_time(attrp->ca_mtime); + file->backupDate = to_hfs_time(attrp->ca_btime); + file->accessDate = to_hfs_time(attrp->ca_atime); + attrp->ca_atimeondisk = attrp->ca_atime; + file->attributeModDate = to_hfs_time(attrp->ca_ctime); + /* + * Note: file hardlink inodes don't require a text encoding + * hint, but they do have a first link value. + */ + if (ckp->hfsPlus.parentID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { + file->hl_firstLinkID = attrp->ca_firstlink; + } else { + file->textEncoding = descp->cd_encoding; + } + bcopy(&attrp->ca_finderinfo[0], &file->userInfo, 32); + /* + * Update the BSD Info if it was already initialized on + * disk or if the runtime values have been modified. + * + * If the BSD info was already initialized, but + * MNT_UNKNOWNPERMISSIONS is set, then the runtime IDs are + * probably different than what was on disk. We don't want + * to overwrite the on-disk values (so if we turn off + * MNT_UNKNOWNPERMISSIONS, the old IDs get used again). + * This way, we can still change fields like the mode or + * dates even when MNT_UNKNOWNPERMISSIONS is set. + * + * Note that if MNT_UNKNOWNPERMISSIONS is set, hfs_chown + * won't change the uid or gid from their defaults. 
So, if + * the BSD info wasn't set, and the runtime values are not + * default, then what changed was the mode or flags. We + * have to set the uid and gid to something, so use the + * supplied values (which will be default), which has the + * same effect as creating a new file while + * MNT_UNKNOWNPERMISSIONS is set. + * + * Do not modify bsdInfo for directory hard link records. + * They are set during creation and are not modifiable, so just + * leave them alone. + */ + is_dirlink = (file->flags & kHFSHasLinkChainMask) && + (SWAP_BE32(file->userInfo.fdType) == kHFSAliasType) && + (SWAP_BE32(file->userInfo.fdCreator) == kHFSAliasCreator); + + if (!is_dirlink && + ((file->bsdInfo.fileMode != 0) || + (attrp->ca_flags != 0) || + (attrp->ca_uid != hfsmp->hfs_uid) || + (attrp->ca_gid != hfsmp->hfs_gid) || + ((attrp->ca_mode & ALLPERMS) != + (hfsmp->hfs_file_mask & ACCESSPERMS)))) { + if ((file->bsdInfo.fileMode == 0) || + (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { + file->bsdInfo.ownerID = attrp->ca_uid; + file->bsdInfo.groupID = attrp->ca_gid; + } + file->bsdInfo.ownerFlags = attrp->ca_flags & 0x000000FF; + file->bsdInfo.adminFlags = attrp->ca_flags >> 16; + file->bsdInfo.fileMode = attrp->ca_mode; + } + if (state->s_rsrcfork) { + forkp = state->s_rsrcfork; + file->resourceFork.logicalSize = forkp->cf_size; + file->resourceFork.totalBlocks = forkp->cf_blocks; + bcopy(&forkp->cf_extents[0], &file->resourceFork.extents, + sizeof(HFSPlusExtentRecord)); + /* Push blocks read to disk */ + file->resourceFork.clumpSize = + howmany(forkp->cf_bytesread, blksize); + } + if (state->s_datafork) { + forkp = state->s_datafork; + file->dataFork.logicalSize = forkp->cf_size; + file->dataFork.totalBlocks = forkp->cf_blocks; + bcopy(&forkp->cf_extents[0], &file->dataFork.extents, + sizeof(HFSPlusExtentRecord)); + /* Push blocks read to disk */ + file->dataFork.clumpSize = + howmany(forkp->cf_bytesread, blksize); + } + + if ((file->resourceFork.extents[0].startBlock != 0) && + (file->resourceFork.extents[0].startBlock == + file->dataFork.extents[0].startBlock)) { + panic("hfs: catrec_update: rsrc fork == data fork"); + } + + /* Synchronize the lock state */ + if (attrp->ca_flags & (SF_IMMUTABLE | UF_IMMUTABLE)) + file->flags |= kHFSFileLockedMask; + else + file->flags &= ~kHFSFileLockedMask; + + /* Push out special field if necessary */ + if (S_ISBLK(attrp->ca_mode) || S_ISCHR(attrp->ca_mode)) { + file->bsdInfo.special.rawDevice = attrp->ca_rdev; + } + else { + /* + * Protect against the degenerate case where the descriptor contains the + * raw inode ID in its CNID field. If the HFSPlusCatalogFile record indicates + * the linkcount was greater than 1 (the default value), then it must have become + * a hardlink. In this case, update the linkcount from the cat_attr passed in. + */ + if ((descp->cd_cnid != attrp->ca_fileid) || (attrp->ca_linkcount > 1 ) || + (file->hl_linkCount > 1)) { + file->hl_linkCount = attrp->ca_linkcount; + } + } + break; + } + default: + return (btNotFound); + } + return (0); +} + +/* This function sets kHFSHasChildLinkBit in a directory hierarchy in the + * catalog btree of given cnid by walking up the parent chain till it reaches + * either the root folder, or the private metadata directory for storing + * directory hard links. This function updates the corresponding in-core + * cnode, if any, and the directory record in the catalog btree. + * On success, returns zero. On failure, returns non-zero value. 
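 * (kHFSHasChildLinkBit marks a folder whose subtree contains a directory
 * hard link; walking up until the root folder, or the private
 * directory-inode folder, is reached tags every ancestor of the link.)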
+ */ +int +cat_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid) +{ + int retval = 0; + int lockflags = 0; + struct cat_desc desc; + struct cat_attr attr; + + while ((cnid != kHFSRootFolderID) && (cnid != kHFSRootParentID) && + (cnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid)) { + /* Update the bit in corresponding cnode, if any, in the hash. + * If the cnode has the bit already set, stop the traversal. + */ + retval = hfs_chash_set_childlinkbit(hfsmp, cnid); + if (retval == 0) { + break; + } + + /* Update the catalog record on disk if either cnode was not + * found in the hash, or if a cnode was found and the cnode + * did not have the bit set previously. + */ + retval = hfs_start_transaction(hfsmp); + if (retval) { + break; + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + /* Look up our catalog folder record */ + retval = cat_idlookup(hfsmp, cnid, 0, 0, &desc, &attr, NULL); + if (retval) { + hfs_systemfile_unlock(hfsmp, lockflags); + hfs_end_transaction(hfsmp); + break; + } + + /* Update the bit in the catalog record */ + attr.ca_recflags |= kHFSHasChildLinkMask; + retval = cat_update(hfsmp, &desc, &attr, NULL, NULL); + if (retval) { + hfs_systemfile_unlock(hfsmp, lockflags); + hfs_end_transaction(hfsmp); + cat_releasedesc(&desc); + break; + } + + hfs_systemfile_unlock(hfsmp, lockflags); + hfs_end_transaction(hfsmp); + + cnid = desc.cd_parentcnid; + cat_releasedesc(&desc); + } + + return retval; +} + +/* This function traverses the parent directory hierarchy from the given + * directory to one level below root directory and checks if any of its + * ancestors is - + * 1. A directory hard link. + * 2. The 'pointed at' directory. + * If any of these conditions fail or an internal error is encountered + * during look up of the catalog record, this function returns non-zero value. 
+ */ +int +cat_check_link_ancestry(struct hfsmount *hfsmp, cnid_t cnid, cnid_t pointed_at_cnid) +{ + HFSPlusCatalogKey *keyp; + BTreeIterator *ip; + FSBufferDescriptor btdata; + HFSPlusCatalogFolder folder; + FCB *fcb; + int invalid; + int result; + + invalid = 0; + BDINIT(btdata, &folder); + ip = hfs_malloc(sizeof(*ip)); + keyp = (HFSPlusCatalogKey *)&ip->key; + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + while (cnid != kHFSRootParentID) { + /* Check if the 'pointed at' directory is an ancestor */ + if (pointed_at_cnid == cnid) { + invalid = 1; + break; + } + if ((result = getkey(hfsmp, cnid, (CatalogKey *)keyp))) { + printf("hfs: cat_check_link_ancestry: getkey failed id=%u, vol=%s\n", cnid, hfsmp->vcbVN); + invalid = 1; /* On errors, assume an invalid parent */ + break; + } + if ((result = BTSearchRecord(fcb, ip, &btdata, NULL, NULL))) { + printf("hfs: cat_check_link_ancestry: cannot find id=%u, vol=%s\n", cnid, hfsmp->vcbVN); + invalid = 1; /* On errors, assume an invalid parent */ + break; + } + /* Check if this ancestor is a directory hard link */ + if (folder.flags & kHFSHasLinkChainMask) { + invalid = 1; + break; + } + cnid = keyp->parentID; + } + hfs_free(ip, sizeof(*ip)); + return (invalid); +} + + +/* + * update_siblinglinks_callback - update a link's chain + */ + +struct linkupdate_state { + cnid_t filelinkid; + cnid_t prevlinkid; + cnid_t nextlinkid; +}; + +static int +update_siblinglinks_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct linkupdate_state *state) +{ + HFSPlusCatalogFile *file; + + if (crp->recordType != kHFSPlusFileRecord) { + printf("hfs: update_siblinglinks_callback: unexpected rec type %d\n", crp->recordType); + return (btNotFound); + } + + file = (struct HFSPlusCatalogFile *)crp; + if (file->flags & kHFSHasLinkChainMask) { + if (state->prevlinkid != HFS_IGNORABLE_LINK) { + file->hl_prevLinkID = state->prevlinkid; + } + if (state->nextlinkid != HFS_IGNORABLE_LINK) { + file->hl_nextLinkID = state->nextlinkid; + } + } else { + printf("hfs: update_siblinglinks_callback: file %d isn't a chain\n", file->fileID); + } + return (0); +} + +/* + * cat_update_siblinglinks - update a link's chain + */ +int +cat_update_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid) +{ + FCB * fcb; + BTreeIterator * iterator; + struct linkupdate_state state; + int result; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + state.filelinkid = linkfileid; + state.prevlinkid = prevlinkid; + state.nextlinkid = nextlinkid; + + /* Create an iterator for use by us temporarily */ + iterator = hfs_mallocz(sizeof(*iterator)); + + result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key); + if (result == 0) { + result = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr)update_siblinglinks_callback, &state); + (void) BTFlushPath(fcb); + } else { + printf("hfs: cat_update_siblinglinks: couldn't resolve cnid=%d, vol=%s\n", linkfileid, hfsmp->vcbVN); + } + + hfs_free(iterator, sizeof(*iterator)); + return MacToVFSError(result); +} + +/* + * cat_lookuplink - lookup a link by it's name + */ +int +cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) +{ + FCB * fcb; + BTreeIterator * iterator; + struct FSBufferDescriptor btdata; + struct HFSPlusCatalogFile file; + int result; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + /* Create an iterator for use by us temporarily */ + iterator = hfs_mallocz(sizeof(*iterator)); + + if ((result = buildkey(hfsmp, descp, 
(HFSPlusCatalogKey *)&iterator->key, 0))) { + goto exit; + } + BDINIT(btdata, &file); + + if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { + goto exit; + } + if (file.recordType != kHFSPlusFileRecord) { + result = ENOENT; + goto exit; + } + *linkfileid = file.fileID; + + if (file.flags & kHFSHasLinkChainMask) { + *prevlinkid = file.hl_prevLinkID; + *nextlinkid = file.hl_nextLinkID; + } else { + *prevlinkid = 0; + *nextlinkid = 0; + } +exit: + hfs_free(iterator, sizeof(*iterator)); + return MacToVFSError(result); +} + + +/* + * cat_lookup_siblinglinks - lookup previous and next link ID for link using its cnid + */ +int +cat_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) +{ + FCB * fcb; + BTreeIterator * iterator; + struct FSBufferDescriptor btdata; + struct HFSPlusCatalogFile file; + int result; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + /* Create an iterator for use by us temporarily */ + iterator = hfs_mallocz(sizeof(*iterator)); + + if ((result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key))) { + goto exit; + } + BDINIT(btdata, &file); + + if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { + goto exit; + } + /* The prev/next chain is only valid when kHFSHasLinkChainMask is set. */ + if (file.flags & kHFSHasLinkChainMask) { + cnid_t parent; + + parent = ((HFSPlusCatalogKey *)&iterator->key)->parentID; + + /* directory inodes don't have a chain (its in an EA) */ + if (parent == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + result = ENOLINK; /* signal to caller to get head of list */ + } else if (parent == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { + *prevlinkid = 0; + *nextlinkid = file.hl_firstLinkID; + } else { + *prevlinkid = file.hl_prevLinkID; + *nextlinkid = file.hl_nextLinkID; + } + } else { + *prevlinkid = 0; + *nextlinkid = 0; + } +exit: + hfs_free(iterator, sizeof(*iterator)); + return MacToVFSError(result); +} + + +/* + * cat_lookup_lastlink - find the last sibling link in the chain (no "next" ptr) + */ +int +cat_lookup_lastlink(struct hfsmount *hfsmp, cnid_t linkfileid, + cnid_t *lastlink, struct cat_desc *cdesc) +{ + FCB * fcb; + BTreeIterator * iterator; + struct FSBufferDescriptor btdata; + struct HFSPlusCatalogFile file; + int result = 0; + int itercount = 0; + int foundlast = 0; + cnid_t currentlink = linkfileid; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + /* Create an iterator for use by us temporarily */ + iterator = hfs_malloc(sizeof(*iterator)); + + while ((foundlast == 0) && (itercount < HFS_LINK_MAX )) { + itercount++; + bzero(iterator, sizeof(*iterator)); + + if ((result = getkey(hfsmp, currentlink, (CatalogKey *)&iterator->key))) { + goto exit; + } + BDINIT(btdata, &file); + + if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { + goto exit; + } + + /* The prev/next chain is only valid when kHFSHasLinkChainMask is set. */ + if (file.flags & kHFSHasLinkChainMask) { + cnid_t parent; + + parent = ((HFSPlusCatalogKey *)&iterator->key)->parentID; + /* + * The raw inode for a directory hardlink doesn't have a chain. + * Its link information lives in an EA. + */ + if (parent == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + /* We don't iterate to find the oldest directory hardlink. 
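+				 * ENOLINK signals the caller to fetch the chain head from the
+				 * directory inode's extended attribute instead.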
*/ + result = ENOLINK; + goto exit; + } + else if (parent == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { + /* Raw inode for file hardlink (the base inode) */ + currentlink = file.hl_firstLinkID; + + /* + * One minor special-casing here is necessary. + * If our ID brought us to the raw hardlink inode, and it does + * not have any siblings, then it's an open-unlinked file, and we + * should not proceed any further. + */ + if (currentlink == 0) { + result = ENOLINK; + goto exit; + } + } + else { + /* Otherwise, this item's parent is a legitimate directory in the namespace */ + if (file.hl_nextLinkID == 0) { + /* If nextLinkID is 0, then we found the end; no more hardlinks */ + foundlast = 1; + *lastlink = currentlink; + /* + * Since we had to construct a catalog key to do this lookup + * we still hold it in-hand. We might as well use it to build + * the descriptor that the caller asked for. + */ + builddesc ((HFSPlusCatalogKey*)&iterator->key, currentlink, 0, 0, 0, cdesc); + break; + } + + currentlink = file.hl_nextLinkID; + } + } + else { + /* Sorry, can't help you without a link chain */ + result = ENOLINK; + goto exit; + } + } +exit: + /* If we didn't find what we were looking for, zero out the args */ + if (foundlast == 0) { + if (cdesc) { + bzero (cdesc, sizeof(struct cat_desc)); + } + if (lastlink) { + *lastlink = 0; + } + } + + hfs_free(iterator, sizeof(*iterator)); + return MacToVFSError(result); +} + + +/* + * cat_createlink - create a link in the catalog + * + * The following cat_attr fields are expected to be set: + * ca_linkref + * ca_itime + * ca_mode (S_IFREG) + * ca_recflags + * ca_flags + * ca_finderinfo (type and creator) + */ +int +cat_createlink(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, + cnid_t nextlinkid, cnid_t *linkfileid) +{ + FCB * fcb; + struct btobj * bto; + FSBufferDescriptor btdata; + HFSPlusForkData *rsrcforkp; + u_int32_t nextCNID; + u_int32_t datalen; + int thread_inserted = 0; + int alias_allocated = 0; + int result = 0; + int std_hfs; + + std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + /* + * Get the next CNID. Note that we are currently holding catalog lock. + */ + result = cat_acquire_cnid(hfsmp, &nextCNID); + if (result) { + return result; + } + + /* Get space for iterator, key and data */ + bto = hfs_malloc(sizeof(struct btobj)); + bto->iterator.hint.nodeNum = 0; + rsrcforkp = &bto->data.hfsPlusFile.resourceFork; + + result = buildkey(hfsmp, descp, &bto->key, 0); + if (result) { + printf("hfs: cat_createlink: err %d from buildkey\n", result); + goto exit; + } + + /* + * Insert the thread record first. + */ + datalen = buildthread((void*)&bto->key, &bto->data, 0, 0); + btdata.bufferAddress = &bto->data; + btdata.itemSize = datalen; + btdata.itemCount = 1; + + buildthreadkey(nextCNID, 0, (CatalogKey *) &bto->iterator.key); + result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); + if (result) { + goto exit; + } + thread_inserted = 1; + + /* + * Now insert the link record. 
+ */ + buildrecord(attrp, nextCNID, 0, kTextEncodingMacUnicode, &bto->data, &datalen); + + bto->data.hfsPlusFile.hl_prevLinkID = 0; + bto->data.hfsPlusFile.hl_nextLinkID = nextlinkid; + bto->data.hfsPlusFile.hl_linkReference = attrp->ca_linkref; + + /* For directory hard links, create alias in resource fork */ + if (descp->cd_flags & CD_ISDIR) { + if ((result = cat_makealias(hfsmp, attrp->ca_linkref, &bto->data.hfsPlusFile))) { + goto exit; + } + alias_allocated = 1; + } + btdata.bufferAddress = &bto->data; + btdata.itemSize = datalen; + btdata.itemCount = 1; + + bcopy(&bto->key, &bto->iterator.key, sizeof(bto->key)); + + result = BTInsertRecord(fcb, &bto->iterator, &btdata, datalen); + if (result) { + if (result == btExists) + result = EEXIST; + goto exit; + } + if (linkfileid != NULL) { + *linkfileid = nextCNID; + } +exit: + if (result) { + if (thread_inserted) { + printf("hfs: cat_createlink: BTInsertRecord err=%d, vol=%s\n", MacToVFSError(result), hfsmp->vcbVN); + + buildthreadkey(nextCNID, 0, (CatalogKey *)&bto->iterator.key); + if (BTDeleteRecord(fcb, &bto->iterator)) { + printf("hfs: cat_createlink() failed to delete thread record on volume %s\n", hfsmp->vcbVN); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } + } + if (alias_allocated && rsrcforkp->extents[0].startBlock != 0) { + (void) BlockDeallocate(hfsmp, rsrcforkp->extents[0].startBlock, + rsrcforkp->extents[0].blockCount, 0); + rsrcforkp->extents[0].startBlock = 0; + rsrcforkp->extents[0].blockCount = 0; + } + } + (void) BTFlushPath(fcb); + hfs_free(bto, sizeof(*bto)); + + return MacToVFSError(result); +} + +/* Directory hard links are visible as aliases on pre-Leopard systems and + * as normal directories on Leopard or later. All directory hard link aliases + * have the same resource fork content except for the three uniquely + * identifying values that are updated in the resource fork data when the alias + * is created. The following array is the constant resource fork data used + * only for creating directory hard link aliases. 
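+ * (The three values are the volume create date, the folder ID of the
+ * private directory-hardlink parent, and the target directory inode's
+ * own ID; cat_makealias() writes them at the offsets defined in the
+ * enum below.)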
+ */ +static const char hfs_dirlink_alias_rsrc[] = { + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x9e, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x32, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x2b, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x9e, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x32, 0x00, 0x00, 0x61, 0x6c, 0x69, 0x73, + 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* Constants for directory hard link alias */ +enum { + /* Size of resource fork data array for directory hard link alias */ + kHFSAliasSize = 0x1d0, + + /* Volume type for ejectable devices like disk image */ + kHFSAliasVolTypeEjectable = 0x5, + + /* Offset for volume create date, in Mac OS local time */ + kHFSAliasVolCreateDateOffset = 0x12a, + + /* Offset for the type of volume */ + kHFSAliasVolTypeOffset = 0x130, + + /* Offset for folder ID of the parent directory of the directory inode */ + kHFSAliasParentIDOffset = 0x132, + + /* Offset for folder ID of the directory inode */ + kHFSAliasTargetIDOffset = 0x176, +}; + +/* Create and write an 
alias that points at the directory represented by given + * inode number on the same volume. Directory hard links are visible as + * aliases in pre-Leopard systems and this function creates these aliases. + * + * Note: This code is very specific to creating alias for the purpose + * of directory hard links only, and should not be generalized. + */ +static int +cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalogFile *crp) +{ + struct buf *bp; + daddr64_t blkno; + u_int32_t blkcount; + int blksize; + int sectorsize; + int result; + HFSPlusForkData *rsrcforkp; + char *alias; + uint32_t *valptr; + + rsrcforkp = &(crp->resourceFork); + + blksize = hfsmp->blockSize; + blkcount = howmany(kHFSAliasSize, blksize); + sectorsize = hfsmp->hfs_logical_block_size; + bzero(rsrcforkp, sizeof(HFSPlusForkData)); + + /* Allocate some disk space for the alias content. */ + result = BlockAllocate(hfsmp, 0, blkcount, blkcount, + HFS_ALLOC_FORCECONTIG | HFS_ALLOC_METAZONE, + &rsrcforkp->extents[0].startBlock, + &rsrcforkp->extents[0].blockCount); + /* Did it fail with an out of space error? If so, re-try and allow journal flushing. */ + if (result == dskFulErr ) { + result = BlockAllocate(hfsmp, 0, blkcount, blkcount, + HFS_ALLOC_FORCECONTIG | HFS_ALLOC_METAZONE | HFS_ALLOC_FLUSHTXN, + &rsrcforkp->extents[0].startBlock, + &rsrcforkp->extents[0].blockCount); + } + if (result) { + rsrcforkp->extents[0].startBlock = 0; + goto exit; + } + + /* Acquire a buffer cache block for our block. */ + blkno = ((u_int64_t)rsrcforkp->extents[0].startBlock * (u_int64_t)blksize) / sectorsize; + blkno += hfsmp->hfsPlusIOPosOffset / sectorsize; + + bp = buf_getblk(hfsmp->hfs_devvp, blkno, roundup(kHFSAliasSize, hfsmp->hfs_logical_block_size), 0, 0, BLK_META); + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp); + } + + /* Generate alias content */ + alias = (char *)buf_dataptr(bp); + bzero(alias, buf_size(bp)); + bcopy(hfs_dirlink_alias_rsrc, alias, kHFSAliasSize); + + /* Set the volume create date, local time in Mac OS format */ + valptr = (uint32_t *)(alias + kHFSAliasVolCreateDateOffset); + *valptr = OSSwapHostToBigInt32(hfsmp->localCreateDate); + + /* If the file system is on a virtual device like disk image, + * update the volume type to be ejectable device. + */ + if (hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) { + *(uint16_t *)(alias + kHFSAliasVolTypeOffset) = + OSSwapHostToBigInt16(kHFSAliasVolTypeEjectable); + } + + /* Set id of the parent of the target directory */ + valptr = (uint32_t *)(alias + kHFSAliasParentIDOffset); + *valptr = OSSwapHostToBigInt32(hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid); + + /* Set id of the target directory */ + valptr = (uint32_t *)(alias + kHFSAliasTargetIDOffset); + *valptr = OSSwapHostToBigInt32(inode_num); + + /* Write alias content to disk. */ + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); + } else if ((result = buf_bwrite(bp))) { + goto exit; + } + + /* Finish initializing the fork data. 
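+	 * logicalSize covers just the kHFSAliasSize bytes of alias content;
+	 * the fork keeps the full allocation block(s) reserved above as
+	 * totalBlocks.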
*/ + rsrcforkp->logicalSize = kHFSAliasSize; + rsrcforkp->totalBlocks = rsrcforkp->extents[0].blockCount; + +exit: + if (result && rsrcforkp->extents[0].startBlock != 0) { + (void) BlockDeallocate(hfsmp, rsrcforkp->extents[0].startBlock, rsrcforkp->extents[0].blockCount, 0); + rsrcforkp->extents[0].startBlock = 0; + rsrcforkp->extents[0].blockCount = 0; + rsrcforkp->logicalSize = 0; + rsrcforkp->totalBlocks = 0; + } + return (result); +} + +/* + * cat_deletelink - delete a link from the catalog + */ +int +cat_deletelink(struct hfsmount *hfsmp, struct cat_desc *descp) +{ + struct HFSPlusCatalogFile file; + struct cat_attr cattr; + uint32_t totalBlocks; + int i; + int result; + + bzero(&file, sizeof (file)); + bzero(&cattr, sizeof (cattr)); + cattr.ca_fileid = descp->cd_cnid; + + /* Directory links have alias content to remove. */ + if (descp->cd_flags & CD_ISDIR) { + FCB * fcb; + BTreeIterator * iterator; + struct FSBufferDescriptor btdata; + + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + /* Borrow the btcb iterator since we have an exclusive catalog lock. */ + iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; + iterator->hint.nodeNum = 0; + + if ((result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0))) { + goto exit; + } + BDINIT(btdata, &file); + + if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { + goto exit; + } + } + + result = cat_delete(hfsmp, descp, &cattr); + + if ((result == 0) && + (descp->cd_flags & CD_ISDIR) && + (file.recordType == kHFSPlusFileRecord)) { + + totalBlocks = file.resourceFork.totalBlocks; + + for (i = 0; (i < 8) && (totalBlocks > 0); i++) { + if ((file.resourceFork.extents[i].blockCount == 0) && + (file.resourceFork.extents[i].startBlock == 0)) { + break; + } + + (void) BlockDeallocate(hfsmp, + file.resourceFork.extents[i].startBlock, + file.resourceFork.extents[i].blockCount, 0); + + totalBlocks -= file.resourceFork.extents[i].blockCount; + file.resourceFork.extents[i].startBlock = 0; + file.resourceFork.extents[i].blockCount = 0; + } + } +exit: + return (result); +} + + +/* + * Callback to collect directory entries. + * Called with readattr_state for each item in a directory. + */ +struct readattr_state { + struct hfsmount *hfsmp; + struct cat_entrylist *list; + cnid_t dir_cnid; + int stdhfs; + int error; + int reached_eof; +}; + +static int +getentriesattr_callback(const CatalogKey *key, const CatalogRecord *rec, + struct readattr_state *state) +{ + struct cat_entrylist *list = state->list; + struct hfsmount *hfsmp = state->hfsmp; + struct cat_entry *cep; + cnid_t parentcnid; + + if (list->realentries >= list->maxentries) + return (0); /* stop */ + + parentcnid = state->stdhfs ? 
key->hfs.parentID : key->hfsPlus.parentID; + + switch(rec->recordType) { + case kHFSPlusFolderRecord: + case kHFSPlusFileRecord: +#if CONFIG_HFS_STD + case kHFSFolderRecord: + case kHFSFileRecord: +#endif + if (parentcnid != state->dir_cnid) { + state->error = ENOENT; + state->reached_eof = 1; + return (0); /* stop */ + } + break; + default: + state->error = ENOENT; + return (0); /* stop */ + } + + /* Hide the private system directories and journal files */ + if (parentcnid == kHFSRootFolderID) { + if (rec->recordType == kHFSPlusFolderRecord) { + if (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + rec->hfsPlusFolder.folderID == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + list->skipentries++; + return (1); /* continue */ + } + } + if ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) && + (rec->recordType == kHFSPlusFileRecord) && + ((rec->hfsPlusFile.fileID == hfsmp->hfs_jnlfileid) || + (rec->hfsPlusFile.fileID == hfsmp->hfs_jnlinfoblkid))) { + list->skipentries++; + return (1); /* continue */ + } + } + + cep = &list->entry[list->realentries++]; + + if (state->stdhfs == 0) { + getbsdattr(hfsmp, (const struct HFSPlusCatalogFile *)rec, &cep->ce_attr); + builddesc((const HFSPlusCatalogKey *)key, getcnid(rec), 0, getencoding(rec), + isadir(rec), &cep->ce_desc); + + if (rec->recordType == kHFSPlusFileRecord) { + cep->ce_datasize = rec->hfsPlusFile.dataFork.logicalSize; + cep->ce_datablks = rec->hfsPlusFile.dataFork.totalBlocks; + cep->ce_rsrcsize = rec->hfsPlusFile.resourceFork.logicalSize; + cep->ce_rsrcblks = rec->hfsPlusFile.resourceFork.totalBlocks; + + /* Save link reference for later processing. */ + if ((SWAP_BE32(rec->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && + (SWAP_BE32(rec->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator)) { + cep->ce_attr.ca_linkref = rec->hfsPlusFile.bsdInfo.special.iNodeNum; + } else if ((rec->hfsPlusFile.flags & kHFSHasLinkChainMask) && + (SWAP_BE32(rec->hfsPlusFile.userInfo.fdType) == kHFSAliasType) && + (SWAP_BE32(rec->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator)) { + cep->ce_attr.ca_linkref = rec->hfsPlusFile.bsdInfo.special.iNodeNum; + } + } + } +#if CONFIG_HFS_STD + else { + struct HFSPlusCatalogFile cnoderec; + HFSPlusCatalogKey * pluskey; + u_int32_t encoding; + + promoteattr(hfsmp, rec, &cnoderec); + getbsdattr(hfsmp, &cnoderec, &cep->ce_attr); + + pluskey = hfs_malloc(sizeof(HFSPlusCatalogKey)); + promotekey(hfsmp, (const HFSCatalogKey *)key, pluskey, &encoding); + builddesc(pluskey, getcnid(rec), 0, encoding, isadir(rec), &cep->ce_desc); + hfs_free(pluskey, sizeof(*pluskey)); + + if (rec->recordType == kHFSFileRecord) { + int blksize = HFSTOVCB(hfsmp)->blockSize; + + cep->ce_datasize = rec->hfsFile.dataLogicalSize; + cep->ce_datablks = rec->hfsFile.dataPhysicalSize / blksize; + cep->ce_rsrcsize = rec->hfsFile.rsrcLogicalSize; + cep->ce_rsrcblks = rec->hfsFile.rsrcPhysicalSize / blksize; + } + } +#endif + + return (list->realentries < list->maxentries); +} + +/* + * Pack a cat_entrylist buffer with attributes from the catalog + * + * Note: index is zero relative + */ +int +cat_getentriesattr(struct hfsmount *hfsmp, directoryhint_t *dirhint, struct cat_entrylist *ce_list, int *reachedeof) +{ + FCB* fcb; + CatalogKey * key; + BTreeIterator * iterator; + struct readattr_state state; + cnid_t parentcnid; + int i; + int std_hfs; + int index; + int have_key; + int result = 0; + int reached_eof = 0; + + ce_list->realentries = 0; + + fcb 
= GetFileControlBlock(HFSTOVCB(hfsmp)->catalogRefNum); + std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); + parentcnid = dirhint->dh_desc.cd_parentcnid; + + bzero (&state, sizeof(struct readattr_state)); + + state.hfsmp = hfsmp; + state.list = ce_list; + state.dir_cnid = parentcnid; + state.stdhfs = std_hfs; + state.error = 0; + + iterator = hfs_mallocz(sizeof(*iterator)); + key = (CatalogKey *)&iterator->key; + have_key = 0; + iterator->hint.nodeNum = dirhint->dh_desc.cd_hint; + index = dirhint->dh_index + 1; + + /* + * Attempt to build a key from cached filename + */ + if (dirhint->dh_desc.cd_namelen != 0) { + if (buildkey(hfsmp, &dirhint->dh_desc, (HFSPlusCatalogKey *)key, 0) == 0) { + have_key = 1; + } + } + + /* + * If the last entry wasn't cached then position the btree iterator + */ + if ((index == 0) || !have_key) { + /* + * Position the iterator at the directory's thread record. + * (i.e. just before the first entry) + */ + buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); + result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + if (result) { + result = MacToVFSError(result); + goto exit; + } + + /* + * Iterate until we reach the entry just + * before the one we want to start with. + */ + if (index > 0) { + struct position_state ps; + + ps.error = 0; + ps.count = 0; + ps.index = index; + ps.parentID = dirhint->dh_desc.cd_parentcnid; + ps.hfsmp = hfsmp; + + result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)cat_findposition, &ps); + if (ps.error) + result = ps.error; + else + result = MacToVFSError(result); + + if (result) { + /* + * Note: the index may now point to EOF if the directory + * was modified in between system calls. We will return + * ENOENT from cat_findposition if this is the case, and + * when we bail out with an error, our caller (hfs_readdirattr_internal) + * will suppress the error and indicate EOF to its caller. + */ + result = MacToVFSError(result); + goto exit; + } + } + } + + /* Fill list with entries starting at iterator->key. */ + result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)getentriesattr_callback, &state); + + if (state.error) { + result = state.error; + reached_eof = state.reached_eof; + } + else if (ce_list->realentries == 0) { + result = ENOENT; + reached_eof = 1; + } + else { + result = MacToVFSError(result); + } + + if (std_hfs) + goto exit; + + /* + * Resolve any hard links. + */ + for (i = 0; i < (int)ce_list->realentries; ++i) { + struct FndrFileInfo *fip; + struct cat_entry *cep; + struct HFSPlusCatalogFile filerec; + int isdirlink = 0; + int isfilelink = 0; + + cep = &ce_list->entry[i]; + if (cep->ce_attr.ca_linkref == 0) + continue; + + /* Note: Finder info is still in Big Endian */ + fip = (struct FndrFileInfo *)&cep->ce_attr.ca_finderinfo; + + if (S_ISREG(cep->ce_attr.ca_mode) && + (SWAP_BE32(fip->fdType) == kHardLinkFileType) && + (SWAP_BE32(fip->fdCreator) == kHFSPlusCreator)) { + isfilelink = 1; + } + if (S_ISREG(cep->ce_attr.ca_mode) && + (SWAP_BE32(fip->fdType) == kHFSAliasType) && + (SWAP_BE32(fip->fdCreator) == kHFSAliasCreator) && + (cep->ce_attr.ca_recflags & kHFSHasLinkChainMask)) { + isdirlink = 1; + } + if (isfilelink || isdirlink) { + if (cat_resolvelink(hfsmp, cep->ce_attr.ca_linkref, isdirlink, &filerec) != 0) + continue; + /* Repack entry from inode record. 
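+			 * Only the attributes and fork sizes are replaced from the
+			 * resolved inode; the name and descriptor already built from the
+			 * link's own catalog key are left intact.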
*/ + getbsdattr(hfsmp, &filerec, &cep->ce_attr); + cep->ce_datasize = filerec.dataFork.logicalSize; + cep->ce_datablks = filerec.dataFork.totalBlocks; + cep->ce_rsrcsize = filerec.resourceFork.logicalSize; + cep->ce_rsrcblks = filerec.resourceFork.totalBlocks; + } + } + +exit: + hfs_free(iterator, sizeof(*iterator)); + *reachedeof = reached_eof; + return MacToVFSError(result); +} + +#define SMALL_DIRENTRY_SIZE (int)(sizeof(struct dirent) - (MAXNAMLEN + 1) + 8) + +/* + * Callback to pack directory entries. + * Called with packdirentry_state for each item in a directory. + */ + +/* Hard link information collected during cat_getdirentries. */ +struct linkinfo { + u_int32_t link_ref; + user_addr_t dirent_addr; +}; +typedef struct linkinfo linkinfo_t; + +/* State information for the getdirentries_callback function. */ +struct packdirentry_state { + int cbs_flags; /* VNODE_READDIR_* flags */ + u_int32_t cbs_parentID; + u_int32_t cbs_index; + uio_t cbs_uio; + ExtendedVCB * cbs_hfsmp; + int cbs_result; + int32_t cbs_nlinks; + int32_t cbs_maxlinks; + linkinfo_t * cbs_linkinfo; + struct cat_desc * cbs_desc; + u_int8_t * cbs_namebuf; + /* + * The following fields are only used for NFS readdir, which + * uses the next file id as the seek offset of each entry. + */ + struct direntry * cbs_direntry; + struct direntry * cbs_prevdirentry; + u_int32_t cbs_previlinkref; + Boolean cbs_hasprevdirentry; + Boolean cbs_eof; +}; + +/* + * getdirentries callback for HFS Plus directories. + */ +static int +getdirentries_callback(const CatalogKey *ckp, const CatalogRecord *crp, + struct packdirentry_state *state) +{ + struct hfsmount *hfsmp; + const CatalogName *cnp; + cnid_t curID; + OSErr result; + struct dirent catent; + struct direntry * entry = NULL; + time_t itime; + u_int32_t ilinkref = 0; + u_int32_t curlinkref = 0; + cnid_t cnid; + int hide = 0; + u_int8_t type = DT_UNKNOWN; + u_int8_t is_mangled = 0; + u_int8_t is_link = 0; + u_int8_t *nameptr; + user_addr_t uiobase = USER_ADDR_NULL; + size_t namelen = 0; + size_t maxnamelen; + size_t uiosize = 0; + caddr_t uioaddr; + Boolean stop_after_pack = false; + + hfsmp = state->cbs_hfsmp; + curID = ckp->hfsPlus.parentID; + + /* We're done when parent directory changes */ + if (state->cbs_parentID != curID) { + /* + * If the parent ID is different from curID this means we've hit + * the EOF for the directory. To help future callers, we mark + * the cbs_eof boolean. However, we should only mark the EOF + * boolean if we're about to return from this function. + * + * This is because this callback function does its own uiomove + * to get the data to userspace. If we set the boolean before determining + * whether or not the current entry has enough room to write its + * data to userland, we could fool the callers of this catalog function + * into thinking they've hit EOF earlier than they really would have. + * In that case, we'd know that we have more entries to process and + * send to userland, but we didn't have enough room. + * + * To be safe, we mark cbs_eof here ONLY for the cases where we know we're + * about to return and won't write any new data back + * to userland. In the stop_after_pack case, we'll set this boolean + * regardless, so it's slightly safer to let that logic mark the boolean, + * especially since it's closer to the return of this function. 
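+	 *
+	 * In short: for extended (NFS-style) calls that still have a buffered
+	 * previous entry we fall through with stop_after_pack set so that last
+	 * entry can be emitted; in every other case we flag EOF, note ENOENT,
+	 * and stop right here.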
+ */ + + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { + /* The last record has not been returned yet, so we + * want to stop after packing the last item + */ + if (state->cbs_hasprevdirentry) { + stop_after_pack = true; + } else { + state->cbs_eof = true; + state->cbs_result = ENOENT; + return (0); /* stop */ + } + } else { + state->cbs_eof = true; + state->cbs_result = ENOENT; + return (0); /* stop */ + } + } + + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { + entry = state->cbs_direntry; + nameptr = (u_int8_t *)&entry->d_name[0]; + if (state->cbs_flags & VNODE_READDIR_NAMEMAX) { + /* + * The NFS server sometimes needs to make filenames fit in + * NAME_MAX bytes (since its client may not be able to + * handle a longer name). In that case, NFS will ask us + * to mangle the name to keep it short enough. + */ + maxnamelen = NAME_MAX + 1; + } else { + maxnamelen = sizeof(entry->d_name); + } + } else { + nameptr = (u_int8_t *)&catent.d_name[0]; + maxnamelen = sizeof(catent.d_name); + } + + if ((state->cbs_flags & VNODE_READDIR_EXTENDED) && stop_after_pack) { + /* The last item returns a non-zero invalid cookie */ + cnid = INT_MAX; + } else { + switch(crp->recordType) { + case kHFSPlusFolderRecord: + type = DT_DIR; + cnid = crp->hfsPlusFolder.folderID; + /* Hide our private system directories. */ + if (curID == kHFSRootFolderID) { + if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + hide = 1; + } + } + break; + case kHFSPlusFileRecord: + itime = to_bsd_time(crp->hfsPlusFile.createDate); + type = MODE_TO_DT(crp->hfsPlusFile.bsdInfo.fileMode); + cnid = crp->hfsPlusFile.fileID; + /* + * When a hardlink link is encountered save its link ref. + */ + if ((SWAP_BE32(crp->hfsPlusFile.userInfo.fdType) == kHardLinkFileType) && + (SWAP_BE32(crp->hfsPlusFile.userInfo.fdCreator) == kHFSPlusCreator) && + ((itime == (time_t)hfsmp->hfs_itime) || + (itime == (time_t)hfsmp->hfs_metadata_createdate))) { + /* If link ref is inode's file id then use it directly. */ + if (crp->hfsPlusFile.flags & kHFSHasLinkChainMask) { + cnid = crp->hfsPlusFile.hl_linkReference; + } else { + ilinkref = crp->hfsPlusFile.hl_linkReference; + } + is_link =1; + } else if ((SWAP_BE32(crp->hfsPlusFile.userInfo.fdType) == kHFSAliasType) && + (SWAP_BE32(crp->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator) && + (crp->hfsPlusFile.flags & kHFSHasLinkChainMask) && + (crp->hfsPlusFile.hl_linkReference >= kHFSFirstUserCatalogNodeID) && + ((itime == (time_t)hfsmp->hfs_itime) || + (itime == (time_t)hfsmp->hfs_metadata_createdate))) { + /* A directory's link resolves to a directory. */ + type = DT_DIR; + /* A directory's link ref is always inode's file id. */ + cnid = crp->hfsPlusFile.hl_linkReference; + is_link = 1; + } + /* Hide the journal files */ + if ((curID == kHFSRootFolderID) && + ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY)))) && + ((cnid == hfsmp->hfs_jnlfileid) || + (cnid == hfsmp->hfs_jnlinfoblkid))) { + hide = 1; + } + break; + default: + return (0); /* stop */ + }; + + cnp = (const CatalogName*) &ckp->hfsPlus.nodeName; + + namelen = cnp->ustr.length; + /* + * For MacRoman encoded names (textEncoding == 0), assume that it's ascii + * and convert it directly in an attempt to avoid the more + * expensive utf8_encodestr conversion. 
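+		 * For example, an all-ASCII name such as "Read Me" is copied one
+		 * code unit at a time (with '/' mapped to ':'), while any character
+		 * above U+007F, or an embedded NUL, sends us to the utf8_encodestr
+		 * path below.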
+ */ + if ((namelen < maxnamelen) && (crp->hfsPlusFile.textEncoding == 0)) { + int i; + u_int16_t ch; + const u_int16_t *chp; + + chp = &cnp->ustr.unicode[0]; + for (i = 0; i < (int)namelen; ++i) { + ch = *chp++; + if (ch > 0x007f || ch == 0x0000) { + /* Perform expensive utf8_encodestr conversion */ + goto encodestr; + } + nameptr[i] = (ch == '/') ? ':' : (u_int8_t)ch; + } + nameptr[namelen] = '\0'; + result = 0; + } else { +encodestr: + result = utf8_encodestr(cnp->ustr.unicode, namelen * sizeof(UniChar), + nameptr, &namelen, maxnamelen, ':', 0); + } + + /* Check result returned from encoding the filename to utf8 */ + if (result == ENAMETOOLONG) { + /* + * If we were looking at a catalog record for a hardlink (not the inode), + * then we want to use its link ID as opposed to the inode ID for + * a mangled name. For all other cases, they are the same. Note that + * due to the way directory hardlinks are implemented, the actual link + * is going to be counted as a file record, so we can catch both + * with is_link. + */ + cnid_t linkid = cnid; + if (is_link) { + linkid = crp->hfsPlusFile.fileID; + } + + result = ConvertUnicodeToUTF8Mangled(cnp->ustr.length * sizeof(UniChar), + cnp->ustr.unicode, maxnamelen, + (ByteCount*)&namelen, nameptr, linkid); + is_mangled = 1; + } + } + + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { + /* + * The index is 1 relative and includes "." and ".." + * + * Also stuff the cnid in the upper 32 bits of the cookie. + * The cookie is stored to the previous entry, which will + * be packed and copied this time + */ + state->cbs_prevdirentry->d_seekoff = (state->cbs_index + 3) | ((u_int64_t)cnid << 32); + uiosize = state->cbs_prevdirentry->d_reclen; + uioaddr = (caddr_t) state->cbs_prevdirentry; + } else { + catent.d_type = type; + catent.d_namlen = namelen; + catent.d_reclen = uiosize = STD_DIRENT_LEN(namelen); + if (hide) + catent.d_fileno = 0; /* file number = 0 means skip entry */ + else + catent.d_fileno = cnid; + uioaddr = (caddr_t) &catent; + } + + /* Save current base address for post processing of hard-links. 
*/ + if (ilinkref || state->cbs_previlinkref) { + uiobase = uio_curriovbase(state->cbs_uio); + } + /* If this entry won't fit then we're done */ + if ((uiosize > (user_size_t)uio_resid(state->cbs_uio)) || + (ilinkref != 0 && state->cbs_nlinks == state->cbs_maxlinks)) { + return (0); /* stop */ + } + + if (!(state->cbs_flags & VNODE_READDIR_EXTENDED) || state->cbs_hasprevdirentry) { + state->cbs_result = uiomove(uioaddr, uiosize, state->cbs_uio); + if (state->cbs_result == 0) { + ++state->cbs_index; + + /* Remember previous entry */ + state->cbs_desc->cd_cnid = cnid; + if (type == DT_DIR) { + state->cbs_desc->cd_flags |= CD_ISDIR; + } else { + state->cbs_desc->cd_flags &= ~CD_ISDIR; + } + if (state->cbs_desc->cd_nameptr != NULL) { + state->cbs_desc->cd_namelen = 0; + } +#if 0 + state->cbs_desc->cd_encoding = xxxx; +#endif + if (!is_mangled) { + state->cbs_desc->cd_namelen = namelen; + bcopy(nameptr, state->cbs_namebuf, namelen + 1); + } else { + /* Store unmangled name for the directory hint else it will + * restart readdir at the last location again + */ + u_int8_t *new_nameptr; + size_t bufsize; + size_t tmp_namelen = 0; + + cnp = (const CatalogName *)&ckp->hfsPlus.nodeName; + bufsize = 1 + utf8_encodelen(cnp->ustr.unicode, + cnp->ustr.length * sizeof(UniChar), + ':', 0); + new_nameptr = hfs_malloc(bufsize); + result = utf8_encodestr(cnp->ustr.unicode, + cnp->ustr.length * sizeof(UniChar), + new_nameptr, &tmp_namelen, bufsize, ':', 0); + + state->cbs_desc->cd_namelen = tmp_namelen; + bcopy(new_nameptr, state->cbs_namebuf, tmp_namelen + 1); + + hfs_free(new_nameptr, bufsize); + } + } + if (state->cbs_hasprevdirentry) { + curlinkref = ilinkref; /* save current */ + ilinkref = state->cbs_previlinkref; /* use previous */ + } + /* + * Record any hard links for post processing. + */ + if ((ilinkref != 0) && + (state->cbs_result == 0) && + (state->cbs_nlinks < state->cbs_maxlinks)) { + state->cbs_linkinfo[state->cbs_nlinks].dirent_addr = uiobase; + state->cbs_linkinfo[state->cbs_nlinks].link_ref = ilinkref; + state->cbs_nlinks++; + } + if (state->cbs_hasprevdirentry) { + ilinkref = curlinkref; /* restore current */ + } + } + + /* Fill the direntry to be used the next time */ + if (state->cbs_flags & VNODE_READDIR_EXTENDED) { + if (stop_after_pack) { + state->cbs_eof = true; + return (0); /* stop */ + } + entry->d_type = type; + entry->d_namlen = namelen; + entry->d_reclen = EXT_DIRENT_LEN(namelen); + if (hide) { + /* File number = 0 means skip entry */ + entry->d_fileno = 0; + } else { + entry->d_fileno = cnid; + } + /* swap the current and previous entry */ + struct direntry * tmp; + tmp = state->cbs_direntry; + state->cbs_direntry = state->cbs_prevdirentry; + state->cbs_prevdirentry = tmp; + state->cbs_hasprevdirentry = true; + state->cbs_previlinkref = ilinkref; + } + + /* Continue iteration if there's room */ + return (state->cbs_result == 0 && + uio_resid(state->cbs_uio) >= SMALL_DIRENTRY_SIZE); +} + +#if CONFIG_HFS_STD +/* + * getdirentries callback for standard HFS (non HFS+) directories. 
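+ * Unlike the HFS Plus callback above, this path never sees hard links
+ * or extended (NFS) direntries, so it needs no previous-entry buffering
+ * and no link post-processing.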
+ */ +static int +getdirentries_std_callback(const CatalogKey *ckp, const CatalogRecord *crp, + struct packdirentry_state *state) +{ + struct hfsmount *hfsmp; + const CatalogName *cnp; + cnid_t curID; + OSErr result; + struct dirent catent; + cnid_t cnid; + u_int8_t type = DT_UNKNOWN; + u_int8_t *nameptr; + size_t namelen = 0; + size_t maxnamelen; + size_t uiosize = 0; + caddr_t uioaddr; + + hfsmp = state->cbs_hfsmp; + + curID = ckp->hfs.parentID; + + /* We're done when parent directory changes */ + if (state->cbs_parentID != curID) { + state->cbs_result = ENOENT; + return (0); /* stop */ + } + + nameptr = (u_int8_t *)&catent.d_name[0]; + maxnamelen = sizeof(catent.d_name); + + switch(crp->recordType) { + case kHFSFolderRecord: + type = DT_DIR; + cnid = crp->hfsFolder.folderID; + break; + case kHFSFileRecord: + type = DT_REG; + cnid = crp->hfsFile.fileID; + break; + default: + return (0); /* stop */ + }; + + cnp = (const CatalogName*) ckp->hfs.nodeName; + result = hfs_to_utf8(hfsmp, cnp->pstr, maxnamelen, (ByteCount *)&namelen, nameptr); + /* + * When an HFS name cannot be encoded with the current + * volume encoding we use MacRoman as a fallback. + */ + if (result) { + result = mac_roman_to_utf8(cnp->pstr, maxnamelen, (ByteCount *)&namelen, nameptr); + } + catent.d_type = type; + catent.d_namlen = namelen; + catent.d_reclen = uiosize = STD_DIRENT_LEN(namelen); + catent.d_fileno = cnid; + uioaddr = (caddr_t) &catent; + + /* If this entry won't fit then we're done */ + if (uiosize > (user_size_t)uio_resid(state->cbs_uio)) { + return (0); /* stop */ + } + + state->cbs_result = uiomove(uioaddr, uiosize, state->cbs_uio); + if (state->cbs_result == 0) { + ++state->cbs_index; + + /* Remember previous entry */ + state->cbs_desc->cd_cnid = cnid; + if (type == DT_DIR) { + state->cbs_desc->cd_flags |= CD_ISDIR; + } else { + state->cbs_desc->cd_flags &= ~CD_ISDIR; + } + if (state->cbs_desc->cd_nameptr != NULL) { + state->cbs_desc->cd_namelen = 0; + } + state->cbs_desc->cd_namelen = namelen; + bcopy(nameptr, state->cbs_namebuf, namelen + 1); + } + + /* Continue iteration if there's room */ + return (state->cbs_result == 0 && uio_resid(state->cbs_uio) >= SMALL_DIRENTRY_SIZE); +} +#endif + +/* + * Pack a uio buffer with directory entries from the catalog + */ +int +cat_getdirentries(struct hfsmount *hfsmp, u_int32_t entrycnt, directoryhint_t *dirhint, + uio_t uio, int flags, int * items, int * eofflag) +{ + FCB* fcb; + BTreeIterator * iterator; + CatalogKey * key; + struct packdirentry_state state; + void * buffer; + int bufsize; + int maxlinks; + int result; + int index; + int have_key; + int extended; + + extended = flags & VNODE_READDIR_EXTENDED; + + if (extended && (hfsmp->hfs_flags & HFS_STANDARD)) { + return (ENOTSUP); + } + fcb = hfsmp->hfs_catalog_cp->c_datafork; + + #define MAX_LINKINFO_ENTRIES 275 + /* + * Get a buffer for link info array, btree iterator and a direntry. + * + * We impose an cap of 275 link entries when trying to compute + * the total number of hardlink entries that we'll allow in the + * linkinfo array, as this has been shown to noticeably impact performance. + * + * Note that in the case where there are very few hardlinks, + * this does not restrict or prevent us from vending out as many entries + * as we can to the uio_resid, because the getdirentries callback + * uiomoves the directory entries to the uio itself and does not use + * this MALLOC'd array. It also limits itself to maxlinks of hardlinks. 
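+	 *
+	 * As a rough illustration (assuming an LP64 kernel, where linkinfo_t
+	 * pads out to 16 bytes), the full 275-entry array adds about 4.3KB on
+	 * top of the MAXPATHLEN name buffer and the B-tree iterator that share
+	 * this allocation.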
+ */ + + // This value cannot underflow: both entrycnt and the rhs are unsigned 32-bit + // ints, so the worst-case MIN of them is 0. + maxlinks = MIN (entrycnt, (u_int32_t)(uio_resid(uio) / SMALL_DIRENTRY_SIZE)); + // Prevent overflow. + maxlinks = MIN (maxlinks, MAX_LINKINFO_ENTRIES); + bufsize = MAXPATHLEN + (maxlinks * sizeof(linkinfo_t)) + sizeof(*iterator); + + if (extended) { + bufsize += 2*sizeof(struct direntry); + } + buffer = hfs_mallocz(bufsize); + + state.cbs_flags = flags; + state.cbs_hasprevdirentry = false; + state.cbs_previlinkref = 0; + state.cbs_nlinks = 0; + state.cbs_maxlinks = maxlinks; + state.cbs_linkinfo = (linkinfo_t *)((char *)buffer + MAXPATHLEN); + /* + * We need to set cbs_eof to false regardless of whether or not the + * control flow is actually in the extended case, since we use this + * field to track whether or not we've returned EOF from the iterator function. + */ + state.cbs_eof = false; + + iterator = (BTreeIterator *) ((char *)state.cbs_linkinfo + (maxlinks * sizeof(linkinfo_t))); + key = (CatalogKey *)&iterator->key; + have_key = 0; + index = dirhint->dh_index + 1; + if (extended) { + state.cbs_direntry = (struct direntry *)((char *)iterator + sizeof(BTreeIterator)); + state.cbs_prevdirentry = state.cbs_direntry + 1; + } + /* + * Attempt to build a key from cached filename + */ + if (dirhint->dh_desc.cd_namelen != 0) { + if (buildkey(hfsmp, &dirhint->dh_desc, (HFSPlusCatalogKey *)key, 0) == 0) { + iterator->hint.nodeNum = dirhint->dh_desc.cd_hint; + have_key = 1; + } + } + + if (index == 0 && dirhint->dh_threadhint != 0) { + /* + * Position the iterator at the directory's thread record. + * (i.e. just before the first entry) + */ + buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); + iterator->hint.nodeNum = dirhint->dh_threadhint; + iterator->hint.index = 0; + have_key = 1; + } + + /* + * If the last entry wasn't cached then position the btree iterator + */ + if (!have_key) { + /* + * Position the iterator at the directory's thread record. + * (i.e. just before the first entry) + */ + buildthreadkey(dirhint->dh_desc.cd_parentcnid, (hfsmp->hfs_flags & HFS_STANDARD), key); + result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + if (result) { + result = MacToVFSError(result); + goto cleanup; + } + if (index == 0) { + dirhint->dh_threadhint = iterator->hint.nodeNum; + } + /* + * Iterate until we reach the entry just + * before the one we want to start with. + */ + if (index > 0) { + struct position_state ps; + + ps.error = 0; + ps.count = 0; + ps.index = index; + ps.parentID = dirhint->dh_desc.cd_parentcnid; + ps.hfsmp = hfsmp; + + result = BTIterateRecords(fcb, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)cat_findposition, &ps); + if (ps.error) + result = ps.error; + else + result = MacToVFSError(result); + if (result) { + result = MacToVFSError(result); + if (result == ENOENT) { + /* + * ENOENT means we've hit the EOF. + * suppress the error, and set the eof flag. + */ + result = 0; + dirhint->dh_desc.cd_flags |= CD_EOF; + *eofflag = 1; + } + goto cleanup; + } + } + } + + state.cbs_index = index; + state.cbs_hfsmp = hfsmp; + state.cbs_uio = uio; + state.cbs_desc = &dirhint->dh_desc; + state.cbs_namebuf = (u_int8_t *)buffer; + state.cbs_result = 0; + state.cbs_parentID = dirhint->dh_desc.cd_parentcnid; + + /* Use a temporary buffer to hold intermediate descriptor names. 
*/ + if (dirhint->dh_desc.cd_namelen > 0 && dirhint->dh_desc.cd_nameptr != NULL) { + bcopy(dirhint->dh_desc.cd_nameptr, buffer, dirhint->dh_desc.cd_namelen+1); + if (dirhint->dh_desc.cd_flags & CD_HASBUF) { + dirhint->dh_desc.cd_flags &= ~CD_HASBUF; + vfs_removename((const char *)dirhint->dh_desc.cd_nameptr); + } + } + dirhint->dh_desc.cd_nameptr = (u_int8_t *)buffer; + + enum BTreeIterationOperations op; + if (extended && index != 0 && have_key) + op = kBTreeCurrentRecord; + else + op = kBTreeNextRecord; + + /* + * Process as many entries as possible starting at iterator->key. + */ + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { + /* HFS+ */ + result = BTIterateRecords(fcb, op, iterator, + (IterateCallBackProcPtr)getdirentries_callback, &state); + + /* For extended calls, every call to getdirentries_callback() + * transfers the previous directory entry found to the user + * buffer. Therefore when BTIterateRecords reaches the end of + * Catalog BTree, call getdirentries_callback() again with + * dummy values to copy the last directory entry stored in + * packdirentry_state + */ + if (extended && (result == fsBTRecordNotFoundErr)) { + CatalogKey ckp; + CatalogRecord crp; + + bzero(&ckp, sizeof(ckp)); + bzero(&crp, sizeof(crp)); + + result = getdirentries_callback(&ckp, &crp, &state); + } + } +#if CONFIG_HFS_STD + else { + /* HFS (standard) */ + result = BTIterateRecords(fcb, op, iterator, + (IterateCallBackProcPtr)getdirentries_std_callback, &state); + } +#endif + + /* Note that state.cbs_index is still valid on errors */ + *items = state.cbs_index - index; + index = state.cbs_index; + + /* + * Also note that cbs_eof is set in all cases if we ever hit EOF + * during the enumeration by the catalog callback. Mark the directory's hint + * descriptor as having hit EOF. + */ + + if (state.cbs_eof) { + dirhint->dh_desc.cd_flags |= CD_EOF; + *eofflag = 1; + } + + /* Finish updating the catalog iterator. */ + dirhint->dh_desc.cd_hint = iterator->hint.nodeNum; + dirhint->dh_desc.cd_flags |= CD_DECOMPOSED; + dirhint->dh_index = index - 1; + + /* Fix up the name. */ + if (dirhint->dh_desc.cd_namelen > 0) { + dirhint->dh_desc.cd_nameptr = (const u_int8_t *)vfs_addname((char *)buffer, dirhint->dh_desc.cd_namelen, 0, 0); + dirhint->dh_desc.cd_flags |= CD_HASBUF; + } else { + dirhint->dh_desc.cd_nameptr = NULL; + dirhint->dh_desc.cd_namelen = 0; + } + + /* + * Post process any hard links to get the real file id. + */ + if (state.cbs_nlinks > 0) { + ino_t fileid = 0; + user_addr_t address; + int i; + + for (i = 0; i < state.cbs_nlinks; ++i) { + if (resolvelinkid(hfsmp, state.cbs_linkinfo[i].link_ref, &fileid) != 0) + continue; + /* This assumes that d_ino is always first field. */ + address = state.cbs_linkinfo[i].dirent_addr; + if (address == (user_addr_t)0) + continue; + if (uio_isuserspace(uio)) { + if (extended) { + ino64_t fileid_64 = (ino64_t)fileid; + (void) copyout(&fileid_64, address, sizeof(fileid_64)); + } else { + (void) copyout(&fileid, address, sizeof(fileid)); + } + } else /* system space */ { + if (extended) { + ino64_t fileid_64 = (ino64_t)fileid; + bcopy(&fileid_64, (void*) CAST_DOWN(caddr_t, address), sizeof(fileid_64)); + } else { + bcopy(&fileid, (void*) CAST_DOWN(caddr_t, address), sizeof(fileid)); + } + } + } + } + + if (state.cbs_result) + result = state.cbs_result; + else + result = MacToVFSError(result); + + if (result == ENOENT) { + result = 0; + } + +cleanup: + hfs_free(buffer, bufsize); + + return (result); +} + + +/* + * Callback to establish directory position. 
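+ * (Typically used when an enumeration resumes at a non-zero index and
+ * no cached name was available to rebuild the key directly.)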
+ * Called with position_state for each item in a directory. + */ +static int +cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, + struct position_state *state) +{ + cnid_t curID = 0; + + if ((state->hfsmp->hfs_flags & HFS_STANDARD) == 0) { + curID = ckp->hfsPlus.parentID; + } +#if CONFIG_HFS_STD + else { + curID = ckp->hfs.parentID; + } +#endif + + /* Make sure parent directory didn't change */ + if (state->parentID != curID) { + /* + * The parent ID is different from curID this means we've hit + * the EOF for the directory. + */ + state->error = ENOENT; + return (0); /* stop */ + } + + /* Count this entry */ + switch(crp->recordType) { + case kHFSPlusFolderRecord: + case kHFSPlusFileRecord: +#if CONFIG_HFS_STD + case kHFSFolderRecord: + case kHFSFileRecord: +#endif + ++state->count; + break; + default: + printf("hfs: cat_findposition: invalid record type %d in dir %d\n", + crp->recordType, curID); + state->error = EINVAL; + return (0); /* stop */ + }; + + return (state->count < state->index); +} + + +/* + * cat_binarykeycompare - compare two HFS Plus catalog keys. + + * The name portion of the key is compared using a 16-bit binary comparison. + * This is called from the b-tree code. + */ +int +cat_binarykeycompare(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) +{ + u_int32_t searchParentID, trialParentID; + int result; + + searchParentID = searchKey->parentID; + trialParentID = trialKey->parentID; + result = 0; + + if (searchParentID > trialParentID) { + ++result; + } else if (searchParentID < trialParentID) { + --result; + } else { + u_int16_t * str1 = &searchKey->nodeName.unicode[0]; + u_int16_t * str2 = &trialKey->nodeName.unicode[0]; + int length1 = searchKey->nodeName.length; + int length2 = trialKey->nodeName.length; + + result = UnicodeBinaryCompare (str1, length1, str2, length2); + } + + return result; +} + + +#if CONFIG_HFS_STD +/* + * Compare two standard HFS catalog keys + * + * Result: +n search key > trial key + * 0 search key = trial key + * -n search key < trial key + */ +int +CompareCatalogKeys(HFSCatalogKey *searchKey, HFSCatalogKey *trialKey) +{ + cnid_t searchParentID, trialParentID; + int result; + + searchParentID = searchKey->parentID; + trialParentID = trialKey->parentID; + + if (searchParentID > trialParentID) + result = 1; + else if (searchParentID < trialParentID) + result = -1; + else /* parent dirID's are equal, compare names */ + result = FastRelString(searchKey->nodeName, trialKey->nodeName); + + return result; +} +#endif + + +/* + * Compare two HFS+ catalog keys + * + * Result: +n search key > trial key + * 0 search key = trial key + * -n search key < trial key + */ +int +CompareExtendedCatalogKeys(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) +{ + cnid_t searchParentID, trialParentID; + int result; + + searchParentID = searchKey->parentID; + trialParentID = trialKey->parentID; + + if (searchParentID > trialParentID) { + result = 1; + } + else if (searchParentID < trialParentID) { + result = -1; + } else { + /* parent node ID's are equal, compare names */ + if ( searchKey->nodeName.length == 0 || trialKey->nodeName.length == 0 ) + result = searchKey->nodeName.length - trialKey->nodeName.length; + else + result = FastUnicodeCompare(&searchKey->nodeName.unicode[0], + searchKey->nodeName.length, + &trialKey->nodeName.unicode[0], + trialKey->nodeName.length); + } + + return result; +} + + +/* + * buildkey - build a Catalog b-tree key from a cnode descriptor + */ +static int +buildkey(struct hfsmount *hfsmp, struct 
cat_desc *descp, + HFSPlusCatalogKey *key, int retry) +{ + int std_hfs = (hfsmp->hfs_flags & HFS_STANDARD); + int utf8_flags = UTF_ESCAPE_ILLEGAL; + int result = 0; + size_t unicodeBytes = 0; + + if (std_hfs == 0) { + retry = 0; + } + + if (descp->cd_namelen == 0 || descp->cd_nameptr[0] == '\0') + return (EINVAL); /* invalid name */ + + key->parentID = descp->cd_parentcnid; + key->nodeName.length = 0; + /* + * Convert filename from UTF-8 into Unicode + */ + + if ((descp->cd_flags & CD_DECOMPOSED) == 0) + utf8_flags |= UTF_DECOMPOSED; + result = utf8_decodestr(descp->cd_nameptr, descp->cd_namelen, + key->nodeName.unicode, &unicodeBytes, + sizeof(key->nodeName.unicode), ':', utf8_flags); + key->nodeName.length = unicodeBytes / sizeof(UniChar); + key->keyLength = kHFSPlusCatalogKeyMinimumLength + unicodeBytes; + if (result) { + if (result != ENAMETOOLONG) + result = EINVAL; /* name has invalid characters */ + return (result); + } + +#if CONFIG_HFS_STD + /* + * For HFS volumes convert to an HFS compatible key + * + * XXX need to save the encoding that succeeded + */ + if (std_hfs) { + HFSCatalogKey hfskey; + + bzero(&hfskey, sizeof(hfskey)); + hfskey.keyLength = kHFSCatalogKeyMinimumLength; + hfskey.parentID = key->parentID; + hfskey.nodeName[0] = 0; + if (key->nodeName.length > 0) { + int res; + if ((res = unicode_to_hfs(HFSTOVCB(hfsmp), + key->nodeName.length * 2, + key->nodeName.unicode, + &hfskey.nodeName[0], retry)) != 0) { + if (res != ENAMETOOLONG) + res = EINVAL; + + return res; + } + hfskey.keyLength += hfskey.nodeName[0]; + } + bcopy(&hfskey, key, sizeof(hfskey)); + } +#endif + + return (0); + } + + +/* + * Resolve hard link reference to obtain the inode record. + */ +int +cat_resolvelink(struct hfsmount *hfsmp, u_int32_t linkref, int isdirlink, struct HFSPlusCatalogFile *recp) +{ + FSBufferDescriptor btdata; + struct BTreeIterator *iterator; + struct cat_desc idesc; + char inodename[32]; + cnid_t parentcnid; + int result = 0; + + BDINIT(btdata, recp); + + if (isdirlink) { + MAKE_DIRINODE_NAME(inodename, sizeof(inodename), (unsigned int)linkref); + parentcnid = hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid; + } else { + MAKE_INODE_NAME(inodename, sizeof(inodename), (unsigned int)linkref); + parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + } + + /* Get space for iterator */ + iterator = hfs_mallocz(sizeof(*iterator)); + + /* Build a descriptor for private dir. */ + idesc.cd_parentcnid = parentcnid; + idesc.cd_nameptr = (const u_int8_t *)inodename; + idesc.cd_namelen = strlen(inodename); + idesc.cd_flags = 0; + idesc.cd_hint = 0; + idesc.cd_encoding = 0; + (void) buildkey(hfsmp, &idesc, (HFSPlusCatalogKey *)&iterator->key, 0); + + result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, + &btdata, NULL, NULL); + + if (result == 0) { + /* Make sure there's a reference */ + if (recp->hl_linkCount == 0) + recp->hl_linkCount = 2; + } else { + printf("hfs: cat_resolvelink: can't find inode=%s on vol=%s\n", inodename, hfsmp->vcbVN); + } + + hfs_free(iterator, sizeof(*iterator)); + + return (result ? ENOENT : 0); +} + +/* + * Resolve hard link reference to obtain the inode number. + */ +static int +resolvelinkid(struct hfsmount *hfsmp, u_int32_t linkref, ino_t *ino) +{ + struct HFSPlusCatalogFile record; + int error; + + /* + * Since we know resolvelinkid is only called from + * cat_getdirentries, we can assume that only file + * hardlinks need to be resolved (cat_getdirentries + * can resolve directory hardlinks in place). 
+ */ + error = cat_resolvelink(hfsmp, linkref, 0, &record); + if (error == 0) { + if (record.fileID == 0) + error = ENOENT; + else + *ino = record.fileID; + } + return (error); +} + +/* + * getkey - get a key from id by doing a thread lookup + */ +static int +getkey(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key) +{ + struct BTreeIterator * iterator; + FSBufferDescriptor btdata; + u_int16_t datasize; + CatalogKey * keyp; + CatalogRecord * recp; + int result; + int std_hfs; + + std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); + + iterator = hfs_mallocz(sizeof(*iterator)); + buildthreadkey(cnid, std_hfs, (CatalogKey *)&iterator->key); + + recp = hfs_malloc(sizeof(CatalogRecord)); + BDINIT(btdata, recp); + + result = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, + &btdata, &datasize, iterator); + if (result) + goto exit; + + /* Turn thread record into a cnode key (in place) */ + switch (recp->recordType) { + +#if CONFIG_HFS_STD + case kHFSFileThreadRecord: + case kHFSFolderThreadRecord: + keyp = (CatalogKey *)((char *)&recp->hfsThread.reserved + 6); + keyp->hfs.keyLength = kHFSCatalogKeyMinimumLength + keyp->hfs.nodeName[0]; + bcopy(keyp, key, keyp->hfs.keyLength + 1); + break; +#endif + + case kHFSPlusFileThreadRecord: + case kHFSPlusFolderThreadRecord: + keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; + keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + + (keyp->hfsPlus.nodeName.length * 2); + bcopy(keyp, key, keyp->hfsPlus.keyLength + 2); + break; + + default: + result = ENOENT; + break; + } + +exit: + hfs_free(iterator, sizeof(*iterator)); + hfs_free(recp, sizeof(*recp)); + + return MacToVFSError(result); +} + +/* + * getkeyplusattr - From id, fetch the key and the bsd attrs for a file/dir (could pass + * null arguments to cat_idlookup instead, but we save around 10% by not building the + * cat_desc here). Both key and attrp must point to real structures. + * + * The key's parent id is the only part of the key expected to be used by the caller. + * The name portion of the key may not always be valid (ie in the case of a hard link). + */ +int +cat_getkeyplusattr(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key, struct cat_attr *attrp) +{ + int result; + + result = getkey(hfsmp, cnid, key); + + if (result == 0) { + result = cat_lookupbykey(hfsmp, key, 0, 0, 0, NULL, attrp, NULL, NULL); + } + /* + * Check for a raw file hardlink inode. + * Fix up the parent id in the key if necessary. + * Only hard links created by Mac OS X 10.5 or later can be resolved here. + */ + if ((result == 0) && + (key->hfsPlus.parentID == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + (attrp->ca_recflags & kHFSHasLinkChainMask)) { + cnid_t nextlinkid = 0; + cnid_t prevlinkid = 0; + struct cat_desc linkdesc; + + /* + * Pick up the first link in the chain and get a descriptor for it. + * This allows blind bulk access checks to work for hardlinks. 
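+	 * The first link's parent then becomes the parent ID reported in the
+	 * key, which is the only part of the key the caller is expected to use.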
+ */ + if ((cat_lookup_siblinglinks(hfsmp, cnid, &prevlinkid, &nextlinkid) == 0) && + (nextlinkid != 0)) { + if (cat_findname(hfsmp, nextlinkid, &linkdesc) == 0) { + key->hfsPlus.parentID = linkdesc.cd_parentcnid; + cat_releasedesc(&linkdesc); + } + } + } + return MacToVFSError(result); +} + + +/* + * buildrecord - build a default catalog directory or file record + */ +static void +buildrecord(struct cat_attr *attrp, cnid_t cnid, int std_hfs, u_int32_t encoding, + CatalogRecord *crp, u_int32_t *recordSize) +{ + int type = attrp->ca_mode & S_IFMT; + u_int32_t createtime = to_hfs_time(attrp->ca_itime); + + if (std_hfs == 0) { + struct HFSPlusBSDInfo * bsdp = NULL; + + if (type == S_IFDIR) { + crp->recordType = kHFSPlusFolderRecord; + crp->hfsPlusFolder.flags = attrp->ca_recflags; + crp->hfsPlusFolder.valence = 0; + crp->hfsPlusFolder.folderID = cnid; + crp->hfsPlusFolder.createDate = createtime; + crp->hfsPlusFolder.contentModDate = createtime; + crp->hfsPlusFolder.attributeModDate = createtime; + crp->hfsPlusFolder.accessDate = createtime; + crp->hfsPlusFolder.backupDate = 0; + crp->hfsPlusFolder.textEncoding = encoding; + crp->hfsPlusFolder.folderCount = 0; + bcopy(attrp->ca_finderinfo, &crp->hfsPlusFolder.userInfo, 32); + bsdp = &crp->hfsPlusFolder.bsdInfo; + bsdp->special.linkCount = 1; + *recordSize = sizeof(HFSPlusCatalogFolder); + } else { + crp->recordType = kHFSPlusFileRecord; + crp->hfsPlusFile.flags = attrp->ca_recflags; + crp->hfsPlusFile.reserved1 = 0; + crp->hfsPlusFile.fileID = cnid; + crp->hfsPlusFile.createDate = createtime; + crp->hfsPlusFile.contentModDate = createtime; + crp->hfsPlusFile.accessDate = createtime; + crp->hfsPlusFile.attributeModDate = createtime; + crp->hfsPlusFile.backupDate = 0; + crp->hfsPlusFile.textEncoding = encoding; + crp->hfsPlusFile.reserved2 = 0; + bcopy(attrp->ca_finderinfo, &crp->hfsPlusFile.userInfo, 32); + bsdp = &crp->hfsPlusFile.bsdInfo; + /* BLK/CHR need to save the device info */ + if (type == S_IFBLK || type == S_IFCHR) { + bsdp->special.rawDevice = attrp->ca_rdev; + } else { + bsdp->special.linkCount = 1; + } + bzero(&crp->hfsPlusFile.dataFork, 2*sizeof(HFSPlusForkData)); + *recordSize = sizeof(HFSPlusCatalogFile); + } + bsdp->ownerID = attrp->ca_uid; + bsdp->groupID = attrp->ca_gid; + bsdp->fileMode = attrp->ca_mode; + bsdp->adminFlags = attrp->ca_flags >> 16; + bsdp->ownerFlags = attrp->ca_flags & 0x000000FF; + } +#if CONFIG_HFS_STD + else { + createtime = UTCToLocal(createtime); + if (type == S_IFDIR) { + bzero(crp, sizeof(HFSCatalogFolder)); + crp->recordType = kHFSFolderRecord; + crp->hfsFolder.folderID = cnid; + crp->hfsFolder.createDate = createtime; + crp->hfsFolder.modifyDate = createtime; + bcopy(attrp->ca_finderinfo, &crp->hfsFolder.userInfo, 32); + *recordSize = sizeof(HFSCatalogFolder); + } else { + bzero(crp, sizeof(HFSCatalogFile)); + crp->recordType = kHFSFileRecord; + crp->hfsFile.fileID = cnid; + crp->hfsFile.createDate = createtime; + crp->hfsFile.modifyDate = createtime; + bcopy(attrp->ca_finderinfo, &crp->hfsFile.userInfo, 16); + bcopy(&attrp->ca_finderinfo[16], &crp->hfsFile.finderInfo, 16); + *recordSize = sizeof(HFSCatalogFile); + } + } +#endif + +} + + +/* + * builddesc - build a cnode descriptor from an HFS+ key + */ +static int +builddesc(const HFSPlusCatalogKey *key, cnid_t cnid, u_int32_t hint, u_int32_t encoding, + int isdir, struct cat_desc *descp) +{ + int result = 0; + unsigned char * nameptr; + size_t bufsize; + size_t utf8len; + unsigned char tmpbuff[128]; + + /* guess a size... 
*/ + bufsize = (3 * key->nodeName.length) + 1; + if (bufsize >= sizeof(tmpbuff) - 1) { + nameptr = hfs_malloc(bufsize); + } else { + nameptr = &tmpbuff[0]; + } + + result = utf8_encodestr(key->nodeName.unicode, + key->nodeName.length * sizeof(UniChar), + nameptr, (size_t *)&utf8len, + bufsize, ':', 0); + + if (result == ENAMETOOLONG) { + if (nameptr != &tmpbuff[0]) + hfs_free(nameptr, bufsize); + bufsize = 1 + utf8_encodelen(key->nodeName.unicode, + key->nodeName.length * sizeof(UniChar), + ':', 0); + nameptr = hfs_malloc(bufsize); + + result = utf8_encodestr(key->nodeName.unicode, + key->nodeName.length * sizeof(UniChar), + nameptr, (size_t *)&utf8len, + bufsize, ':', 0); + } + descp->cd_parentcnid = key->parentID; + descp->cd_nameptr = (const u_int8_t *)vfs_addname((char *)nameptr, utf8len, 0, 0); + descp->cd_namelen = utf8len; + descp->cd_cnid = cnid; + descp->cd_hint = hint; + descp->cd_flags = CD_DECOMPOSED | CD_HASBUF; + if (isdir) + descp->cd_flags |= CD_ISDIR; + descp->cd_encoding = encoding; + if (nameptr != &tmpbuff[0]) { + hfs_free(nameptr, bufsize); + } + return result; +} + + +/* + * getbsdattr - get attributes in bsd format + * + */ +static void +getbsdattr(struct hfsmount *hfsmp, const struct HFSPlusCatalogFile *crp, struct cat_attr * attrp) +{ + int isDirectory = (crp->recordType == kHFSPlusFolderRecord); + const struct HFSPlusBSDInfo *bsd = &crp->bsdInfo; + + attrp->ca_recflags = crp->flags; + attrp->ca_atime = to_bsd_time(crp->accessDate); + attrp->ca_atimeondisk = attrp->ca_atime; + attrp->ca_mtime = to_bsd_time(crp->contentModDate); + attrp->ca_ctime = to_bsd_time(crp->attributeModDate); + attrp->ca_itime = to_bsd_time(crp->createDate); + attrp->ca_btime = to_bsd_time(crp->backupDate); + + if ((bsd->fileMode & S_IFMT) == 0) { + attrp->ca_flags = 0; + attrp->ca_uid = hfsmp->hfs_uid; + attrp->ca_gid = hfsmp->hfs_gid; + if (isDirectory) { + attrp->ca_mode = S_IFDIR | (hfsmp->hfs_dir_mask & ACCESSPERMS); + } else { + attrp->ca_mode = S_IFREG | (hfsmp->hfs_file_mask & ACCESSPERMS); + } + attrp->ca_linkcount = 1; + attrp->ca_rdev = 0; + } else { + attrp->ca_linkcount = 1; /* may be overridden below */ + attrp->ca_rdev = 0; + attrp->ca_uid = bsd->ownerID; + attrp->ca_gid = bsd->groupID; + attrp->ca_flags = bsd->ownerFlags | (bsd->adminFlags << 16); + attrp->ca_mode = (mode_t)bsd->fileMode; + switch (attrp->ca_mode & S_IFMT) { + case S_IFCHR: /* fall through */ + case S_IFBLK: + attrp->ca_rdev = bsd->special.rawDevice; + break; + case S_IFIFO: + case S_IFSOCK: + case S_IFDIR: + case S_IFREG: + /* Pick up the hard link count */ + if (bsd->special.linkCount > 0) + attrp->ca_linkcount = bsd->special.linkCount; + break; + } + + /* + * Override the permissions as determined by the mount auguments + * in ALMOST the same way unset permissions are treated but keep + * track of whether or not the file or folder is hfs locked + * by leaving the h_pflags field unchanged from what was unpacked + * out of the catalog. + */ + /* + * This code was used to do UID translation with MNT_IGNORE_OWNERS + * (aka MNT_UNKNOWNPERMISSIONS) at the HFS layer. It's largely done + * at the VFS layer, so there is no need to do it here now; this also + * allows VFS to let root see the real UIDs. 
+ * + * if (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) { + * attrp->ca_uid = hfsmp->hfs_uid; + * attrp->ca_gid = hfsmp->hfs_gid; + * } + */ + } + + if (isDirectory) { + if (!S_ISDIR(attrp->ca_mode)) { + attrp->ca_mode &= ~S_IFMT; + attrp->ca_mode |= S_IFDIR; + } + attrp->ca_entries = ((const HFSPlusCatalogFolder *)crp)->valence; + attrp->ca_dircount = ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && (attrp->ca_recflags & kHFSHasFolderCountMask)) ? + ((const HFSPlusCatalogFolder *)crp)->folderCount : 0; + + /* Keep UF_HIDDEN bit in sync with Finder Info's invisible bit */ + if (((const HFSPlusCatalogFolder *)crp)->userInfo.frFlags & OSSwapHostToBigConstInt16(kFinderInvisibleMask)) + attrp->ca_flags |= UF_HIDDEN; + } else { + /* Keep IMMUTABLE bits in sync with HFS locked flag */ + if (crp->flags & kHFSFileLockedMask) { + /* The file's supposed to be locked: + Make sure at least one of the IMMUTABLE bits is set: */ + if ((attrp->ca_flags & (SF_IMMUTABLE | UF_IMMUTABLE)) == 0) + attrp->ca_flags |= UF_IMMUTABLE; + } else { + /* The file's supposed to be unlocked: */ + attrp->ca_flags &= ~(SF_IMMUTABLE | UF_IMMUTABLE); + } + /* Keep UF_HIDDEN bit in sync with Finder Info's invisible bit */ + if (crp->userInfo.fdFlags & OSSwapHostToBigConstInt16(kFinderInvisibleMask)) + attrp->ca_flags |= UF_HIDDEN; + /* get total blocks (both forks) */ + attrp->ca_blocks = crp->dataFork.totalBlocks + crp->resourceFork.totalBlocks; + + /* On HFS+ the ThreadExists flag must always be set. */ + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) + attrp->ca_recflags |= kHFSThreadExistsMask; + + /* Pick up the hardlink first link, if any. */ + attrp->ca_firstlink = (attrp->ca_recflags & kHFSHasLinkChainMask) ? crp->hl_firstLinkID : 0; + } + + attrp->ca_fileid = crp->fileID; + + bcopy(&crp->userInfo, attrp->ca_finderinfo, 32); +} + +#if CONFIG_HFS_STD +/* + * promotekey - promote hfs key to hfs plus key + * + */ +static void +promotekey(struct hfsmount *hfsmp, const HFSCatalogKey *hfskey, + HFSPlusCatalogKey *keyp, u_int32_t *encoding) +{ + hfs_to_unicode_func_t hfs_get_unicode = hfsmp->hfs_get_unicode; + u_int32_t uniCount; + int error; + + *encoding = hfsmp->hfs_encoding; + + error = hfs_get_unicode(hfskey->nodeName, keyp->nodeName.unicode, + kHFSPlusMaxFileNameChars, &uniCount); + /* + * When an HFS name cannot be encoded with the current + * encoding use MacRoman as a fallback. 
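+	 * (The returned encoding is forced to 0, i.e. MacRoman, so callers
+	 * such as builddesc record the table that actually produced the name.)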
+ */ + if (error && hfsmp->hfs_encoding != kTextEncodingMacRoman) { + *encoding = 0; + (void) mac_roman_to_unicode(hfskey->nodeName, + keyp->nodeName.unicode, + kHFSPlusMaxFileNameChars, + &uniCount); + } + + keyp->nodeName.length = uniCount; + keyp->parentID = hfskey->parentID; +} + +/* + * promotefork - promote hfs fork info to hfs plus + * + */ +static void +promotefork(struct hfsmount *hfsmp, const struct HFSCatalogFile *filep, + int resource, struct cat_fork * forkp) +{ + struct HFSPlusExtentDescriptor *xp; + u_int32_t blocksize = HFSTOVCB(hfsmp)->blockSize; + + bzero(forkp, sizeof(*forkp)); + xp = &forkp->cf_extents[0]; + if (resource) { + forkp->cf_size = filep->rsrcLogicalSize; + forkp->cf_blocks = filep->rsrcPhysicalSize / blocksize; + forkp->cf_bytesread = 0; + forkp->cf_vblocks = 0; + xp[0].startBlock = (u_int32_t)filep->rsrcExtents[0].startBlock; + xp[0].blockCount = (u_int32_t)filep->rsrcExtents[0].blockCount; + xp[1].startBlock = (u_int32_t)filep->rsrcExtents[1].startBlock; + xp[1].blockCount = (u_int32_t)filep->rsrcExtents[1].blockCount; + xp[2].startBlock = (u_int32_t)filep->rsrcExtents[2].startBlock; + xp[2].blockCount = (u_int32_t)filep->rsrcExtents[2].blockCount; + } else { + forkp->cf_size = filep->dataLogicalSize; + forkp->cf_blocks = filep->dataPhysicalSize / blocksize; + forkp->cf_bytesread = 0; + forkp->cf_vblocks = 0; + xp[0].startBlock = (u_int32_t)filep->dataExtents[0].startBlock; + xp[0].blockCount = (u_int32_t)filep->dataExtents[0].blockCount; + xp[1].startBlock = (u_int32_t)filep->dataExtents[1].startBlock; + xp[1].blockCount = (u_int32_t)filep->dataExtents[1].blockCount; + xp[2].startBlock = (u_int32_t)filep->dataExtents[2].startBlock; + xp[2].blockCount = (u_int32_t)filep->dataExtents[2].blockCount; + } +} + +/* + * promoteattr - promote standard hfs catalog attributes to hfs plus + * + */ +static void +promoteattr(struct hfsmount *hfsmp, const CatalogRecord *dataPtr, struct HFSPlusCatalogFile *crp) +{ + u_int32_t blocksize = HFSTOVCB(hfsmp)->blockSize; + + if (dataPtr->recordType == kHFSFolderRecord) { + const struct HFSCatalogFolder * folder; + + folder = (const struct HFSCatalogFolder *) dataPtr; + crp->recordType = kHFSPlusFolderRecord; + crp->flags = folder->flags; + crp->fileID = folder->folderID; + crp->createDate = LocalToUTC(folder->createDate); + crp->contentModDate = LocalToUTC(folder->modifyDate); + crp->backupDate = LocalToUTC(folder->backupDate); + crp->reserved1 = folder->valence; + crp->reserved2 = 0; + bcopy(&folder->userInfo, &crp->userInfo, 32); + } else /* file */ { + const struct HFSCatalogFile * file; + + file = (const struct HFSCatalogFile *) dataPtr; + crp->recordType = kHFSPlusFileRecord; + crp->flags = file->flags; + crp->fileID = file->fileID; + crp->createDate = LocalToUTC(file->createDate); + crp->contentModDate = LocalToUTC(file->modifyDate); + crp->backupDate = LocalToUTC(file->backupDate); + crp->reserved1 = 0; + crp->reserved2 = 0; + bcopy(&file->userInfo, &crp->userInfo, 16); + bcopy(&file->finderInfo, &crp->finderInfo, 16); + crp->dataFork.totalBlocks = file->dataPhysicalSize / blocksize; + crp->resourceFork.totalBlocks = file->rsrcPhysicalSize / blocksize; + } + crp->textEncoding = 0; + crp->attributeModDate = crp->contentModDate; + crp->accessDate = crp->contentModDate; + bzero(&crp->bsdInfo, sizeof(HFSPlusBSDInfo)); +} +#endif + +/* + * Build a catalog node thread record from a catalog key + * and return the size of the record. 
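+ * HFS Plus thread records are variable sized: only the Unicode units
+ * actually used by the name are counted, so an 8-character name yields
+ * a record (255 - 8) * sizeof(UniChar) bytes smaller than the full
+ * HFSPlusCatalogThread structure.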
+ */ +static int +buildthread(void *keyp, void *recp, int std_hfs, int directory) +{ + int size = 0; + + if (std_hfs == 0) { + HFSPlusCatalogKey *key = (HFSPlusCatalogKey *)keyp; + HFSPlusCatalogThread *rec = (HFSPlusCatalogThread *)recp; + + size = sizeof(HFSPlusCatalogThread); + if (directory) + rec->recordType = kHFSPlusFolderThreadRecord; + else + rec->recordType = kHFSPlusFileThreadRecord; + rec->reserved = 0; + rec->parentID = key->parentID; + bcopy(&key->nodeName, &rec->nodeName, + sizeof(UniChar) * (key->nodeName.length + 1)); + + /* HFS Plus has variable sized thread records */ + size -= (sizeof(rec->nodeName.unicode) - + (rec->nodeName.length * sizeof(UniChar))); + + } +#if CONFIG_HFS_STD + else { + HFSCatalogKey *key = (HFSCatalogKey *)keyp; + HFSCatalogThread *rec = (HFSCatalogThread *)recp; + + size = sizeof(HFSCatalogThread); + bzero(rec, size); + if (directory) + rec->recordType = kHFSFolderThreadRecord; + else + rec->recordType = kHFSFileThreadRecord; + rec->parentID = key->parentID; + bcopy(key->nodeName, rec->nodeName, key->nodeName[0]+1); + + } +#endif + + return (size); +} + +/* + * Build a catalog node thread key. + */ +static void +buildthreadkey(HFSCatalogNodeID parentID, int std_hfs, CatalogKey *key) +{ + if (std_hfs == 0) { + key->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength; + key->hfsPlus.parentID = parentID; + key->hfsPlus.nodeName.length = 0; + } +#if CONFIG_HFS_STD + else { + key->hfs.keyLength = kHFSCatalogKeyMinimumLength; + key->hfs.reserved = 0; + key->hfs.parentID = parentID; + key->hfs.nodeName[0] = 0; + } +#endif + +} + +/* + * Extract the text encoding from a catalog node record. + */ +static u_int32_t +getencoding(const CatalogRecord *crp) +{ + u_int32_t encoding; + + if (crp->recordType == kHFSPlusFolderRecord) + encoding = crp->hfsPlusFolder.textEncoding; + else if (crp->recordType == kHFSPlusFileRecord) + encoding = crp->hfsPlusFile.textEncoding; + else + encoding = 0; + + return (encoding); +} + +/* + * Extract the CNID from a catalog node record. + */ +static cnid_t +getcnid(const CatalogRecord *crp) +{ + cnid_t cnid = 0; + + switch (crp->recordType) { + +#if CONFIG_HFS_STD + case kHFSFolderRecord: + cnid = crp->hfsFolder.folderID; + break; + case kHFSFileRecord: + cnid = crp->hfsFile.fileID; + break; +#endif + + case kHFSPlusFolderRecord: + cnid = crp->hfsPlusFolder.folderID; + break; + case kHFSPlusFileRecord: + cnid = crp->hfsPlusFile.fileID; + break; + default: + printf("hfs: getcnid: unknown recordType=%d\n", crp->recordType); + break; + } + + return (cnid); +} + +/* + * Extract the parent ID from a catalog node record. + */ +static cnid_t +getparentcnid(const CatalogRecord *recp) +{ + cnid_t cnid = 0; + + switch (recp->recordType) { + +#if CONFIG_HFS_STD + case kHFSFileThreadRecord: + case kHFSFolderThreadRecord: + cnid = recp->hfsThread.parentID; + break; +#endif + + case kHFSPlusFileThreadRecord: + case kHFSPlusFolderThreadRecord: + cnid = recp->hfsPlusThread.parentID; + break; + default: + panic("hfs: getparentcnid: unknown recordType (crp @ %p)\n", recp); + break; + } + + return (cnid); +} + +/* + * Determine if a catalog node record is a directory. + */ +static int +isadir(const CatalogRecord *crp) +{ + if (crp->recordType == kHFSPlusFolderRecord) { + return 1; + } +#if CONFIG_HFS_STD + if (crp->recordType == kHFSFolderRecord) { + return 1; + } +#endif + + return 0; +} + +/* + * cat_lookup_dirlink - lookup a catalog record for directory hard link + * (not inode) using catalog record id. 
Note that this function does + * NOT resolve directory hard link to its directory inode and return + * the link record. + * + * Note: The caller is responsible for releasing the output catalog + * descriptor (when supplied outdescp is non-null). + */ +int +cat_lookup_dirlink(struct hfsmount *hfsmp, cnid_t dirlink_id, + u_int8_t forktype, struct cat_desc *outdescp, + struct cat_attr *attrp, struct cat_fork *forkp) +{ + struct BTreeIterator *iterator = NULL; + FSBufferDescriptor btdata; + u_int16_t datasize; + CatalogKey *keyp; + CatalogRecord *recp = NULL; + int error; + + /* No directory hard links on standard HFS */ + if (hfsmp->vcbSigWord == kHFSSigWord) { + return ENOTSUP; + } + + iterator = hfs_mallocz(sizeof(*iterator)); + buildthreadkey(dirlink_id, 1, (CatalogKey *)&iterator->key); + + recp = hfs_malloc(sizeof(CatalogRecord)); + BDINIT(btdata, recp); + + error = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, + &btdata, &datasize, iterator); + if (error) { + goto out; + } + /* Directory hard links are catalog file record */ + if (recp->recordType != kHFSPlusFileThreadRecord) { + error = ENOENT; + goto out; + } + + keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; + keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + + (keyp->hfsPlus.nodeName.length * 2); + if (forktype == kHFSResourceForkType) { + /* Lookup resource fork for directory hard link */ + error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, true, outdescp, attrp, forkp, NULL); + } else { + /* Lookup data fork, if any, for directory hard link */ + error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, false, outdescp, attrp, forkp, NULL); + } + if (error) { + printf ("hfs: cat_lookup_dirlink(): Error looking up file record for id=%u (error=%d)\n", dirlink_id, error); + hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); + goto out; + } + /* Just for sanity, make sure that id in catalog record and thread record match */ + if ((outdescp != NULL) && (dirlink_id != outdescp->cd_cnid)) { + printf ("hfs: cat_lookup_dirlink(): Requested cnid=%u != found_cnid=%u\n", dirlink_id, outdescp->cd_cnid); + hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); + error = ENOENT; + } + +out: + if (recp) { + hfs_free(recp, sizeof(*recp)); + } + hfs_free(iterator, sizeof(*iterator)); + + return MacToVFSError(error); +} + +/* + * cnode_update_dirlink - update the catalog node for directory hard link + * described by descp using the data from attrp and forkp. + */ +int +cat_update_dirlink(struct hfsmount *hfsmp, u_int8_t forktype, + struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp) +{ + if (forktype == kHFSResourceForkType) { + return cat_update_internal(hfsmp, true, descp, attrp, NULL, forkp); + } else { + return cat_update_internal(hfsmp, true, descp, attrp, forkp, NULL); + } +} + +void hfs_fork_copy(struct cat_fork *dst, const struct cat_fork *src, + HFSPlusExtentDescriptor *extents) +{ + /* Copy everything but the extents into the dest fork */ + memcpy(dst, src, offsetof(struct cat_fork, cf_extents)); + /* Then copy the supplied extents into the fork */ + memcpy(dst->cf_extents, extents, sizeof(HFSPlusExtentRecord)); +} diff --git a/core/hfs_catalog.h b/core/hfs_catalog.h new file mode 100644 index 0000000..0227dc2 --- /dev/null +++ b/core/hfs_catalog.h @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2002-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef __HFS_CATALOG__ +#define __HFS_CATALOG__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#include + +#include "hfs_format.h" + +/* HFS Catalog */ + + +/* + * Catalog ADTs + * + * The cat_desc, cat_attr, and cat_fork structures are + * use to import/export data to/from the Catalog file. + * The fields in these structures are always in BSD + * runtime format (e.g. dates and names). + */ + +typedef u_int32_t cnid_t; + +/* + * Catalog Node Descriptor (runtime) + */ +struct cat_desc { + u_int8_t cd_flags; /* see below (8 bits) */ + u_int8_t cd_encoding; /* name encoding */ + int16_t cd_namelen; /* length of cnode name */ + cnid_t cd_parentcnid; /* parent directory CNID */ + u_int32_t cd_hint; /* catalog file hint */ + cnid_t cd_cnid; /* cnode id (for getattrlist) */ + const u_int8_t * cd_nameptr; /* pointer to cnode name */ +}; + +/* cd_flags + * + * CD_EOF is used by hfs_vnop_readdir / cat_getdirentries to indicate EOF was + * encountered during a directory enumeration. When this flag is observed + * on the next call to hfs_vnop_readdir it tells the caller that there's no + * need to descend into the catalog as EOF was encountered during the last call. + * This flag should only be set on the descriptor embedded in the directoryhint. 
+ */ + +#define CD_HASBUF 0x01 /* allocated filename buffer */ +#define CD_DECOMPOSED 0x02 /* name is fully decomposed */ +#define CD_EOF 0x04 /* see above */ +#define CD_ISMETA 0x40 /* describes a metadata file */ +#define CD_ISDIR 0x80 /* describes a directory */ + +/* + * Catalog Node Attributes (runtime) + */ +struct cat_attr { + cnid_t ca_fileid; /* inode number (for stat) normally == cnid */ + mode_t ca_mode; /* file access mode and type (16 bits) */ + u_int16_t ca_recflags; /* catalog record flags (16 bit integer) */ + u_int32_t ca_linkcount; /* real hard link count */ + uid_t ca_uid; /* file owner */ + gid_t ca_gid; /* file group */ + union { + dev_t cau_rdev; /* special file device (VBLK or VCHAR only) */ + u_int32_t cau_linkref; /* hardlink reference number */ + } ca_union1; + time_t ca_atime; /* last access time */ + time_t ca_atimeondisk; /* access time value on disk */ + time_t ca_mtime; /* last data modification time */ + time_t ca_ctime; /* last file status change */ + time_t ca_itime; /* file initialization time */ + time_t ca_btime; /* last backup time */ + u_int32_t ca_flags; /* status flags (chflags) */ + union { + u_int32_t cau_blocks; /* total file blocks used (rsrc + data) */ + u_int32_t cau_entries; /* total directory entries (valence) */ + } ca_union2; + union { + u_int32_t cau_dircount; /* count of sub dirs (for posix nlink) */ + u_int32_t cau_firstlink; /* first hardlink link (files only) */ + } ca_union3; + union { + u_int8_t ca_finderinfo[32]; /* Opaque Finder information */ + struct { + FndrFileInfo ca_finderfileinfo; + struct FndrExtendedFileInfo ca_finderextendedfileinfo; + }; + struct { + FndrDirInfo ca_finderdirinfo; + struct FndrExtendedDirInfo ca_finderextendeddirinfo; + }; + }; +}; + +/* Aliases for common fields */ +#define ca_rdev ca_union1.cau_rdev +#define ca_linkref ca_union1.cau_linkref +#define ca_blocks ca_union2.cau_blocks +#define ca_entries ca_union2.cau_entries +#define ca_dircount ca_union3.cau_dircount +#define ca_firstlink ca_union3.cau_firstlink + +/* + * Catalog Node Fork (runtime) + * + * NOTE: this is not the same as a struct HFSPlusForkData + * + * NOTE: if cf_new_size > cf_size, then a write is in progress and is extending + * the EOF; the new EOF will be cf_new_size. Writes and pageouts may validly + * write up to cf_new_size, but reads should only read up to cf_size. When + * an extending write is not in progress, cf_new_size is zero. + */ +struct cat_fork { + off_t cf_size; /* fork's logical size in bytes */ + off_t cf_new_size; /* fork's logical size after write completes */ + union { + u_int32_t cfu_clump; /* fork's clump size in bytes (sys files only) */ + u_int64_t cfu_bytesread; /* bytes read from this fork */ + } cf_union; + u_int32_t cf_vblocks; /* virtual (unalloated) blocks */ + u_int32_t cf_blocks; /* total blocks used by this fork */ + struct HFSPlusExtentDescriptor cf_extents[8]; /* initial set of extents */ + + /* + * NOTE: If you change this structure, make sure you change you change + * hfs_fork_copy. + */ +}; + +#define cf_clump cf_union.cfu_clump +#define cf_bytesread cf_union.cfu_bytesread + +void hfs_fork_copy(struct cat_fork *dst, const struct cat_fork *src, + HFSPlusExtentDescriptor *extents); + +/* + * Directory Hint + * Used to hold state across directory enumerations. 
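+ *
+ * A hint remembers the index of the last entry returned, the B-tree
+ * node of the directory's thread record, and a descriptor for that
+ * entry, so the next call can resume the scan where the previous one
+ * stopped instead of re-walking the directory from the start.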
+ * + */ +struct directoryhint { + TAILQ_ENTRY(directoryhint) dh_link; /* chain */ + int dh_index; /* index into directory (zero relative) */ + u_int32_t dh_threadhint; /* node hint of a directory's thread record */ + u_int32_t dh_time; + struct cat_desc dh_desc; /* entry's descriptor */ +}; +typedef struct directoryhint directoryhint_t; + +/* + * HFS_MAXDIRHINTS cannot be larger than 63 without reducing + * HFS_INDEX_BITS, because given the 6-bit tag, at most 63 different + * tags can exist. When HFS_MAXDIRHINTS is larger than 63, the same + * list may contain dirhints of the same tag, and a staled dirhint may + * be returned. + */ +#define HFS_MAXDIRHINTS 32 +#define HFS_DIRHINT_TTL 45 + +#define HFS_INDEX_MASK 0x03ffffff +#define HFS_INDEX_BITS 26 + + +/* + * Catalog Node Entry + * + * A cat_entry is used for bulk enumerations (hfs_readdirattr). + */ +struct cat_entry { + struct cat_desc ce_desc; + struct cat_attr ce_attr; + off_t ce_datasize; + off_t ce_rsrcsize; + u_int32_t ce_datablks; + u_int32_t ce_rsrcblks; +}; + +/* + * Starting in 10.5, hfs_vnop_readdirattr() only makes one + * call to cat_getentriesattr(). So we increased MAXCATENTRIES + * while keeping the total size of the CE LIST buffer <= 8K + * (which works out to be 60 entries per call). The 8K limit + * keeps the memory coming from a kalloc zone instead of + * valuable/fragment-able kernel map space. + */ +#define MAXCATENTRIES \ + (1 + (8192 - sizeof (struct cat_entrylist)) / sizeof (struct cat_entry)) + +/* + * Catalog Node Entry List + * + * A cat_entrylist is a list of Catalog Node Entries. + */ +struct cat_entrylist { + u_int32_t maxentries; /* number of entries requested */ + u_int32_t realentries; /* number of valid entries returned */ + u_int32_t skipentries; /* number of entries skipped (reserved HFS+ files) */ + struct cat_entry entry[1]; /* array of entries */ +}; + +#define CE_LIST_SIZE(entries) \ + sizeof (*ce_list) + (((entries) - 1) * sizeof (struct cat_entry)) + +struct hfsmount; + +/* + * Catalog FileID/CNID Acquisition / Lookup + * + * Some use-cases require that we find a valid CNID + * before we may be ready to enter the item into the namespace. + * In order to resolve this, we support a hashtable attached to + * the mount that is secured by the catalog lock. + * + * Finding the next valid CNID is easy if the wraparound bit is + * not set -- you just pull from the hfsmp next pointer. + * If it is set then you must find a free entry in the catalog + * and also query the hashtable to see if the item is free or not. + * + * If you want to request a CNID before there is a backing item + * in the catalog, you must find one that is valid, then insert + * it into the hash table until such time that the item is + * inserted into the catalog. After successful catalog insertion, + * you must remove the item from the hashtable. 
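+ *
+ * Restating that rule as a sketch using the helpers declared below
+ * (error handling and locking omitted; 'chosen_cnid' stands for whatever
+ * valid id the caller settled on):
+ *
+ *    cat_preflightid_t pre = { .fileid = chosen_cnid };
+ *
+ *    error = cat_insert_idhash(hfsmp, &pre);   // reserve the id
+ *    ... insert the backing item into the catalog ...
+ *    error = cat_remove_idhash(&pre);          // catalog record owns it now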
+ */ + +typedef struct cat_preflightid { + cnid_t fileid; + LIST_ENTRY(cat_preflightid) id_hash; +} cat_preflightid_t; + +extern int cat_remove_idhash (cat_preflightid_t *preflight); +extern int cat_insert_idhash (struct hfsmount *hfsmp, cat_preflightid_t *preflight); +extern int cat_check_idhash (struct hfsmount *hfsmp, cnid_t test_fileid); + +/* initialize the id look up hashtable during mount */ +extern void hfs_idhash_init (struct hfsmount *hfsmp); + +/* release the id lookup hashtable during unmount */ +extern void hfs_idhash_destroy (struct hfsmount *hfsmp); + +/* Get a new CNID for use */ +extern int cat_acquire_cnid (struct hfsmount *hfsmp, cnid_t *new_cnid); + + +/* default size of ID hash is 64 entries */ +#define HFS_IDHASH_DEFAULT 64 + + +/* + * Catalog Operations Hint + * + * lower 16 bits: count of B-tree insert operations + * upper 16 bits: count of B-tree delete operations + * + */ +#define CAT_DELETE 0x00010000 +#define CAT_CREATE 0x00000002 +#define CAT_RENAME 0x00010002 +#define CAT_EXCHANGE 0x00010002 + +typedef u_int32_t catops_t; + +/* + * The size of cat_cookie_t much match the size of + * the nreserve struct (in BTreeNodeReserve.c). + */ +typedef struct cat_cookie_t { +#if defined(__LP64__) + char opaque[40]; +#else + char opaque[24]; +#endif +} cat_cookie_t; + +/* Universal catalog key */ +union CatalogKey { + HFSCatalogKey hfs; + HFSPlusCatalogKey hfsPlus; +}; +typedef union CatalogKey CatalogKey; + +/* Universal catalog data record */ +union CatalogRecord { + int16_t recordType; + HFSCatalogFolder hfsFolder; + HFSCatalogFile hfsFile; + HFSCatalogThread hfsThread; + HFSPlusCatalogFolder hfsPlusFolder; + HFSPlusCatalogFile hfsPlusFile; + HFSPlusCatalogThread hfsPlusThread; +}; +typedef union CatalogRecord CatalogRecord; + +/* Constants for HFS fork types */ +enum { + kHFSDataForkType = 0x0, /* data fork */ + kHFSResourceForkType = 0xff /* resource fork */ +}; + +/* + * Catalog Interface + * + * These functions perform a catalog transactions. The + * catalog b-tree is abstracted through this interface. 
+ * (please don't go around it) + */ + + +extern void cat_releasedesc(struct cat_desc *descp); + +extern int cat_create ( struct hfsmount *hfsmp, + cnid_t new_fileid, + struct cat_desc *descp, + struct cat_attr *attrp, + struct cat_desc *out_descp); + +extern int cat_delete ( struct hfsmount *hfsmp, + struct cat_desc *descp, + struct cat_attr *attrp); + +extern int cat_lookup ( struct hfsmount *hfsmp, + struct cat_desc *descp, + int wantrsrc, + int force_casesensitive_lookup, + struct cat_desc *outdescp, + struct cat_attr *attrp, + struct cat_fork *forkp, + cnid_t *desc_cnid); + +extern int cat_idlookup (struct hfsmount *hfsmp, + cnid_t cnid, + int allow_system_files, + int wantrsrc, + struct cat_desc *outdescp, + struct cat_attr *attrp, + struct cat_fork *forkp); + +extern int cat_findname (struct hfsmount *hfsmp, + cnid_t cnid, + struct cat_desc *outdescp); + +extern int cat_getentriesattr( + struct hfsmount *hfsmp, + directoryhint_t *dirhint, + struct cat_entrylist *ce_list, + int *reachedeof); + +extern int cat_rename ( struct hfsmount * hfsmp, + struct cat_desc * from_cdp, + struct cat_desc * todir_cdp, + struct cat_desc * to_cdp, + struct cat_desc * cdp); + +extern int cat_update ( struct hfsmount *hfsmp, + struct cat_desc *descp, + struct cat_attr *attrp, + const struct cat_fork *dataforkp, + const struct cat_fork *rsrcforkp); + +extern int cat_getdirentries( + struct hfsmount *hfsmp, + u_int32_t entrycnt, + directoryhint_t *dirhint, + uio_t uio, + int extended, + int * items, + int * eofflag); + +extern int cat_insertfilethread ( + struct hfsmount *hfsmp, + struct cat_desc *descp); + +extern int cat_preflight( + struct hfsmount *hfsmp, + catops_t ops, + cat_cookie_t *cookie, + struct proc *p); + +extern void cat_postflight( + struct hfsmount *hfsmp, + cat_cookie_t *cookie, + struct proc *p); + +extern int cat_binarykeycompare( + HFSPlusCatalogKey *searchKey, + HFSPlusCatalogKey *trialKey); + +extern int CompareCatalogKeys( + HFSCatalogKey *searchKey, + HFSCatalogKey *trialKey); + +extern int CompareExtendedCatalogKeys( + HFSPlusCatalogKey *searchKey, + HFSPlusCatalogKey *trialKey); + +extern void cat_convertattr( + struct hfsmount *hfsmp, + CatalogRecord * recp, + struct cat_attr *attrp, + struct cat_fork *datafp, + struct cat_fork *rsrcfp); + +extern int cat_convertkey( + struct hfsmount *hfsmp, + CatalogKey *key, + CatalogRecord * recp, + struct cat_desc *descp); + +extern int cat_getkeyplusattr( + struct hfsmount *hfsmp, + cnid_t cnid, + CatalogKey *key, + struct cat_attr *attrp); + +/* Hard link functions. 
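+ *
+ * Each hardlink keeps its place in a doubly linked chain of link file ids
+ * (see cat_update_siblinglinks / cat_lookup_siblinglinks below).  A rough
+ * sketch of walking a chain forward, assuming a next id of 0 marks the end
+ * of the chain ('first_link_id' is illustrative):
+ *
+ *    cnid_t prev, next, cur = first_link_id;
+ *
+ *    while (cur != 0) {
+ *        if (cat_lookup_siblinglinks(hfsmp, cur, &prev, &next) != 0)
+ *            break;
+ *        ... visit the link record for 'cur' ...
+ *        cur = next;
+ *    }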
*/ + +extern int cat_check_link_ancestry( + struct hfsmount *hfsmp, + cnid_t parentid, + cnid_t pointed_at_cnid); + +extern int cat_set_childlinkbit( + struct hfsmount *hfsmp, + cnid_t cnid); + +#define HFS_IGNORABLE_LINK 0x00000001 + +extern int cat_resolvelink( struct hfsmount *hfsmp, + u_int32_t linkref, + int isdirlink, + struct HFSPlusCatalogFile *recp); + +extern int cat_createlink( struct hfsmount *hfsmp, + struct cat_desc *descp, + struct cat_attr *attr, + cnid_t nextlinkid, + cnid_t *linkfileid); + +/* Finder Info's file type and creator for directory hard link alias */ +enum { + kHFSAliasType = 0x66647270, /* 'fdrp' */ + kHFSAliasCreator = 0x4D414353 /* 'MACS' */ +}; + +extern int cat_deletelink( struct hfsmount *hfsmp, + struct cat_desc *descp); + +extern int cat_update_siblinglinks( struct hfsmount *hfsmp, + cnid_t linkfileid, + cnid_t prevlinkid, + cnid_t nextlinkid); + +extern int cat_lookuplink( struct hfsmount *hfsmp, + struct cat_desc *descp, + cnid_t *linkfileid, + cnid_t *prevlinkid, + cnid_t *nextlinkid); + +extern int cat_lookup_siblinglinks( struct hfsmount *hfsmp, + cnid_t linkfileid, + cnid_t *prevlinkid, + cnid_t *nextlinkid); + +extern int cat_lookup_lastlink( struct hfsmount *hfsmp, + cnid_t startid, + cnid_t *nextlinkid, + struct cat_desc *cdesc); + +extern int cat_lookup_dirlink(struct hfsmount *hfsmp, + cnid_t dirlink_id, + u_int8_t forktype, + struct cat_desc *outdescp, + struct cat_attr *attrp, + struct cat_fork *forkp); + +extern int cat_update_dirlink(struct hfsmount *hfsmp, + u_int8_t forktype, + struct cat_desc *descp, + struct cat_attr *attrp, + struct cat_fork *rsrcforkp); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* __HFS_CATALOG__ */ diff --git a/core/hfs_chash.c b/core/hfs_chash.c new file mode 100644 index 0000000..5fe0bc3 --- /dev/null +++ b/core/hfs_chash.c @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2002-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hfs_chash.c + * derived from @(#)ufs_ihash.c 8.7 (Berkeley) 5/17/95 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" /* XXX bringup */ +#include "hfs_cnode.h" + +extern lck_attr_t * hfs_lock_attr; +extern lck_grp_t * hfs_mutex_group; +extern lck_grp_t * hfs_rwlock_group; + +lck_grp_t * chash_lck_grp; +lck_grp_attr_t * chash_lck_grp_attr; +lck_attr_t * chash_lck_attr; + +#define CNODEHASH(hfsmp, inum) (&hfsmp->hfs_cnodehashtbl[(inum) & hfsmp->hfs_cnodehash]) + +/* + * Initialize cnode hash table. + */ +void +hfs_chashinit() +{ + chash_lck_grp_attr= lck_grp_attr_alloc_init(); + chash_lck_grp = lck_grp_alloc_init("cnode_hash", chash_lck_grp_attr); + chash_lck_attr = lck_attr_alloc_init(); +} + +static void hfs_chash_lock(struct hfsmount *hfsmp) +{ + lck_mtx_lock(&hfsmp->hfs_chash_mutex); +} + +static void hfs_chash_lock_spin(struct hfsmount *hfsmp) +{ + lck_mtx_lock_spin(&hfsmp->hfs_chash_mutex); +} + +static void hfs_chash_lock_convert(struct hfsmount *hfsmp) +{ + lck_mtx_convert_spin(&hfsmp->hfs_chash_mutex); +} + +static void hfs_chash_unlock(struct hfsmount *hfsmp) +{ + lck_mtx_unlock(&hfsmp->hfs_chash_mutex); +} + +void +hfs_chashinit_finish(struct hfsmount *hfsmp) +{ + lck_mtx_init(&hfsmp->hfs_chash_mutex, chash_lck_grp, chash_lck_attr); + + hfsmp->hfs_cnodehashtbl = hashinit(desiredvnodes / 4, M_TEMP, &hfsmp->hfs_cnodehash); +} + +void +hfs_delete_chash(struct hfsmount *hfsmp) +{ + lck_mtx_destroy(&hfsmp->hfs_chash_mutex, chash_lck_grp); + + FREE(hfsmp->hfs_cnodehashtbl, M_TEMP); +} + + +/* + * Use the device, inum pair to find the incore cnode. + * + * If it is in core, but locked, wait for it. 
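+ *
+ * Callers get back either NULL (no matching cnode or fork vnode in the
+ * hash, the vnode's identity changed, or the item is open-unlinked and
+ * allow_deleted is 0) or a vnode carrying an iocount with, unless skiplock
+ * was set, the cnode locked exclusive.  A minimal caller sketch
+ * (illustrative only):
+ *
+ *    vp = hfs_chash_getvnode(hfsmp, inum, wantrsrc, skiplock, allow_deleted);
+ *    if (vp == NULL) {
+ *        ... not cached: fall back to a catalog lookup + hfs_getnewvnode ...
+ *    }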
+ */ +struct vnode * +hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock, int allow_deleted) +{ + struct cnode *cp; + struct vnode *vp; + int error; + u_int32_t vid; + + /* + * Go through the hash list + * If a cnode is in the process of being cleaned out or being + * allocated, wait for it to be finished and then try again. + */ +loop: + hfs_chash_lock_spin(hfsmp); + + for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { + if (cp->c_fileid != inum) + continue; + /* Wait if cnode is being created or reclaimed. */ + if (ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { + SET(cp->c_hflag, H_WAITING); + + (void) msleep(cp, &hfsmp->hfs_chash_mutex, PDROP | PINOD, + "hfs_chash_getvnode", 0); + goto loop; + } + /* Obtain the desired vnode. */ + vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; + if (vp == NULLVP) + goto exit; + + vid = vnode_vid(vp); + hfs_chash_unlock(hfsmp); + + if ((error = vnode_getwithvid(vp, vid))) { + /* + * If vnode is being reclaimed, or has + * already changed identity, no need to wait + */ + return (NULL); + } + if (!skiplock && hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + vnode_put(vp); + return (NULL); + } + + /* + * Skip cnodes that are not in the name space anymore + * we need to check with the cnode lock held because + * we may have blocked acquiring the vnode ref or the + * lock on the cnode which would allow the node to be + * unlinked + */ + if (!allow_deleted) { + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + if (!skiplock) { + hfs_unlock(cp); + } + vnode_put(vp); + return (NULL); + } + } + return (vp); + } +exit: + hfs_chash_unlock(hfsmp); + return (NULL); +} + + +/* + * Use the device, fileid pair to snoop an incore cnode. + * + * A cnode can exists in chash even after it has been + * deleted from the catalog, so this function returns + * ENOENT if C_NOEXIST is set in the cnode's flag. + * + */ +int +hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int existence_only, + int (*callout)(const cnode_t *cp, void *), void * arg) +{ + struct cnode *cp; + int result = ENOENT; + + /* + * Go through the hash list + * If a cnode is in the process of being cleaned out or being + * allocated, wait for it to be finished and then try again. + */ + hfs_chash_lock(hfsmp); + + for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { + if (cp->c_fileid != inum) + continue; + + /* + * Under normal circumstances, we would want to return ENOENT if a cnode is in + * the hash and it is marked C_NOEXISTS or C_DELETED. However, if the CNID + * namespace has wrapped around, then we have the possibility of collisions. + * In that case, we may use this function to validate whether or not we + * should trust the nextCNID value in the hfs mount point. + * + * If we didn't do this, then it would be possible for a cnode that is no longer backed + * by anything on-disk (C_NOEXISTS) to still exist in the hash along with its + * vnode. The cat_create routine could then create a new entry in the catalog + * re-using that CNID. Then subsequent hfs_getnewvnode calls will repeatedly fail + * trying to look it up/validate it because it is marked C_NOEXISTS. So we want + * to prevent that from happening as much as possible. + */ + if (existence_only) { + result = 0; + break; + } + + /* Skip cnodes that have been removed from the catalog */ + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + result = EACCES; + break; + } + + /* Skip cnodes being created or reclaimed. 
*/ + if (!ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { + result = callout(cp, arg); + } + break; + } + hfs_chash_unlock(hfsmp); + + return (result); +} + +/* + * Use the device, fileid pair to find the incore cnode. + * If no cnode if found one is created + * + * If it is in core, but locked, wait for it. + * + * If the cnode is C_DELETED, then return NULL since that + * inum is no longer valid for lookups (open-unlinked file). + * + * If the cnode is C_DELETED but also marked C_RENAMED, then that means + * the cnode was renamed over and a new entry exists in its place. The caller + * should re-drive the lookup to get the newer entry. In that case, we'll still + * return NULL for the cnode, but also return GNV_CHASH_RENAMED in the output flags + * of this function to indicate the caller that they should re-drive. + */ +struct cnode * +hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, + int wantrsrc, int skiplock, int *out_flags, int *hflags) +{ + struct cnode *cp; + struct cnode *ncp = NULL; + vnode_t vp; + u_int32_t vid; + + /* + * Go through the hash list + * If a cnode is in the process of being cleaned out or being + * allocated, wait for it to be finished and then try again. + */ +loop: + hfs_chash_lock_spin(hfsmp); + +loop_with_lock: + for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { + if (cp->c_fileid != inum) + continue; + /* + * Wait if cnode is being created, attached to or reclaimed. + */ + if (ISSET(cp->c_hflag, H_ALLOC | H_ATTACH | H_TRANSIT)) { + SET(cp->c_hflag, H_WAITING); + + (void) msleep(cp, &hfsmp->hfs_chash_mutex, PINOD, + "hfs_chash_getcnode", 0); + goto loop_with_lock; + } + vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; + if (vp == NULL) { + /* + * The desired vnode isn't there so tag the cnode. + */ + SET(cp->c_hflag, H_ATTACH); + *hflags |= H_ATTACH; + + hfs_chash_unlock(hfsmp); + } else { + vid = vnode_vid(vp); + + hfs_chash_unlock(hfsmp); + + if (vnode_getwithvid(vp, vid)) + goto loop; + } + if (ncp) { + /* + * someone else won the race to create + * this cnode and add it to the hash + * just dump our allocation + */ + hfs_zfree(ncp, HFS_CNODE_ZONE); + ncp = NULL; + } + + if (!skiplock) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + } + + /* + * Skip cnodes that are not in the name space anymore + * we need to check with the cnode lock held because + * we may have blocked acquiring the vnode ref or the + * lock on the cnode which would allow the node to be + * unlinked. + * + * Don't return a cnode in this case since the inum + * is no longer valid for lookups. + */ + if ((cp->c_flag & (C_NOEXISTS | C_DELETED)) && !wantrsrc) { + int renamed = 0; + if (cp->c_flag & C_RENAMED) { + renamed = 1; + } + if (!skiplock) + hfs_unlock(cp); + if (vp != NULLVP) { + vnode_put(vp); + } else { + hfs_chash_lock_spin(hfsmp); + CLR(cp->c_hflag, H_ATTACH); + *hflags &= ~H_ATTACH; + if (ISSET(cp->c_hflag, H_WAITING)) { + CLR(cp->c_hflag, H_WAITING); + wakeup((caddr_t)cp); + } + hfs_chash_unlock(hfsmp); + } + vp = NULL; + cp = NULL; + if (renamed) { + *out_flags = GNV_CHASH_RENAMED; + } + } + *vpp = vp; + return (cp); + } + + /* + * Allocate a new cnode + */ + if (skiplock && !wantrsrc) + panic("%s - should never get here when skiplock is set \n", __FUNCTION__); + + if (ncp == NULL) { + hfs_chash_unlock(hfsmp); + + ncp = hfs_zalloc(HFS_CNODE_ZONE); + + /* + * since we dropped the chash lock, + * we need to go back and re-verify + * that this node hasn't come into + * existence... 
+ */ + goto loop; + } + hfs_chash_lock_convert(hfsmp); + +#if HFS_MALLOC_DEBUG + bzero(ncp, __builtin_offsetof(struct cnode, magic)); +#else + bzero(ncp, sizeof(*ncp)); +#endif + + SET(ncp->c_hflag, H_ALLOC); + *hflags |= H_ALLOC; + ncp->c_fileid = inum; + TAILQ_INIT(&ncp->c_hintlist); /* make the list empty */ + TAILQ_INIT(&ncp->c_originlist); + + lck_rw_init(&ncp->c_rwlock, hfs_rwlock_group, hfs_lock_attr); + if (!skiplock) + (void) hfs_lock(ncp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + /* Insert the new cnode with it's H_ALLOC flag set */ + LIST_INSERT_HEAD(CNODEHASH(hfsmp, inum), ncp, c_hash); + hfs_chash_unlock(hfsmp); + + *vpp = NULL; + return (ncp); +} + + +void +hfs_chashwakeup(struct hfsmount *hfsmp, struct cnode *cp, int hflags) +{ + hfs_chash_lock_spin(hfsmp); + + CLR(cp->c_hflag, hflags); + + if (ISSET(cp->c_hflag, H_WAITING)) { + CLR(cp->c_hflag, H_WAITING); + wakeup((caddr_t)cp); + } + hfs_chash_unlock(hfsmp); +} + + +/* + * Re-hash two cnodes in the hash table. + */ +void +hfs_chash_rehash(struct hfsmount *hfsmp, struct cnode *cp1, struct cnode *cp2) +{ + hfs_chash_lock_spin(hfsmp); + + LIST_REMOVE(cp1, c_hash); + LIST_REMOVE(cp2, c_hash); + LIST_INSERT_HEAD(CNODEHASH(hfsmp, cp1->c_fileid), cp1, c_hash); + LIST_INSERT_HEAD(CNODEHASH(hfsmp, cp2->c_fileid), cp2, c_hash); + + hfs_chash_unlock(hfsmp); +} + + +/* + * Remove a cnode from the hash table. + */ +int +hfs_chashremove(struct hfsmount *hfsmp, struct cnode *cp) +{ + hfs_chash_lock_spin(hfsmp); + + /* Check if a vnode is getting attached */ + if (ISSET(cp->c_hflag, H_ATTACH)) { + hfs_chash_unlock(hfsmp); + return (EBUSY); + } + if (cp->c_hash.le_next || cp->c_hash.le_prev) { + LIST_REMOVE(cp, c_hash); + cp->c_hash.le_next = NULL; + cp->c_hash.le_prev = NULL; + } + hfs_chash_unlock(hfsmp); + + return (0); +} + +/* + * Remove a cnode from the hash table and wakeup any waiters. + */ +void +hfs_chash_abort(struct hfsmount *hfsmp, struct cnode *cp) +{ + hfs_chash_lock_spin(hfsmp); + + LIST_REMOVE(cp, c_hash); + cp->c_hash.le_next = NULL; + cp->c_hash.le_prev = NULL; + + CLR(cp->c_hflag, H_ATTACH | H_ALLOC); + if (ISSET(cp->c_hflag, H_WAITING)) { + CLR(cp->c_hflag, H_WAITING); + wakeup((caddr_t)cp); + } + hfs_chash_unlock(hfsmp); +} + + +/* + * mark a cnode as in transition + */ +void +hfs_chash_mark_in_transit(struct hfsmount *hfsmp, struct cnode *cp) +{ + hfs_chash_lock_spin(hfsmp); + + SET(cp->c_hflag, H_TRANSIT); + + hfs_chash_unlock(hfsmp); +} + +/* Search a cnode in the hash. This function does not return cnode which + * are getting created, destroyed or in transition. Note that this function + * does not acquire the cnode hash mutex, and expects the caller to acquire it. + * On success, returns pointer to the cnode found. On failure, returns NULL. + */ +static +struct cnode * +hfs_chash_search_cnid(struct hfsmount *hfsmp, cnid_t cnid) +{ + struct cnode *cp; + + for (cp = CNODEHASH(hfsmp, cnid)->lh_first; cp; cp = cp->c_hash.le_next) { + if (cp->c_fileid == cnid) { + break; + } + } + + /* If cnode is being created or reclaimed, return error. */ + if (cp && ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { + cp = NULL; + } + + return cp; +} + +/* Search a cnode corresponding to given device and ID in the hash. If the + * found cnode has kHFSHasChildLinkBit cleared, set it. If the cnode is not + * found, no new cnode is created and error is returned. + * + * Return values - + * -1 : The cnode was not found. + * 0 : The cnode was found, and the kHFSHasChildLinkBit was already set. 
+ * 1 : The cnode was found, the kHFSHasChildLinkBit was not set, and the + * function had to set that bit. + */ +int +hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid) +{ + int retval = -1; + struct cnode *cp; + + hfs_chash_lock_spin(hfsmp); + + cp = hfs_chash_search_cnid(hfsmp, cnid); + if (cp) { + if (cp->c_attr.ca_recflags & kHFSHasChildLinkMask) { + retval = 0; + } else { + cp->c_attr.ca_recflags |= kHFSHasChildLinkMask; + retval = 1; + } + } + hfs_chash_unlock(hfsmp); + + return retval; +} diff --git a/core/hfs_cnode.c b/core/hfs_cnode.c new file mode 100644 index 0000000..12b126c --- /dev/null +++ b/core/hfs_cnode.c @@ -0,0 +1,2561 @@ +/* + * Copyright (c) 2002-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" +#include "hfs_quota.h" +#include "hfs_format.h" +#include "hfs_kdebug.h" +#include "hfs_cprotect.h" + +extern int prtactive; + +extern lck_attr_t * hfs_lock_attr; +extern lck_grp_t * hfs_mutex_group; +extern lck_grp_t * hfs_rwlock_group; + +static void hfs_reclaim_cnode(hfsmount_t *hfsmp, struct cnode *); +static int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim); +static int hfs_isordered(struct cnode *, struct cnode *); + +extern int hfs_removefile_callback(struct buf *bp, void *hfsmp); + + +__inline__ int hfs_checkdeleted (struct cnode *cp) { + return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); +} + +/* + * Function used by a special fcntl() that decorates a cnode/vnode that + * indicates it is backing another filesystem, like a disk image. + * + * the argument 'val' indicates whether or not to set the bit in the cnode flags + * + * Returns non-zero on failure. 
0 on success + */ +int hfs_set_backingstore (struct vnode *vp, int val) { + struct cnode *cp = NULL; + int err = 0; + + cp = VTOC(vp); + if (!vnode_isreg(vp) && !vnode_isdir(vp)) { + return EINVAL; + } + + /* lock the cnode */ + err = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (err) { + return err; + } + + if (val) { + cp->c_flag |= C_BACKINGSTORE; + } + else { + cp->c_flag &= ~C_BACKINGSTORE; + } + + /* unlock everything */ + hfs_unlock (cp); + + return err; +} + +/* + * Function used by a special fcntl() that check to see if a cnode/vnode + * indicates it is backing another filesystem, like a disk image. + * + * the argument 'val' is an output argument for whether or not the bit is set + * + * Returns non-zero on failure. 0 on success + */ + +int hfs_is_backingstore (struct vnode *vp, int *val) { + struct cnode *cp = NULL; + int err = 0; + + if (!vnode_isreg(vp) && !vnode_isdir(vp)) { + *val = 0; + return 0; + } + + cp = VTOC(vp); + + /* lock the cnode */ + err = hfs_lock (cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + if (err) { + return err; + } + + if (cp->c_flag & C_BACKINGSTORE) { + *val = 1; + } + else { + *val = 0; + } + + /* unlock everything */ + hfs_unlock (cp); + + return err; +} + + +/* + * hfs_cnode_teardown + * + * This is an internal function that is invoked from both hfs_vnop_inactive + * and hfs_vnop_reclaim. As VNOP_INACTIVE is not necessarily called from vnodes + * being recycled and reclaimed, it is important that we do any post-processing + * necessary for the cnode in both places. Important tasks include things such as + * releasing the blocks from an open-unlinked file when all references to it have dropped, + * and handling resource forks separately from data forks. + * + * Note that we take only the vnode as an argument here (rather than the cnode). + * Recall that each cnode supports two forks (rsrc/data), and we can always get the right + * cnode from either of the vnodes, but the reverse is not true -- we can't determine which + * vnode we need to reclaim if only the cnode is supplied. + * + * This function is idempotent and safe to call from both hfs_vnop_inactive and hfs_vnop_reclaim + * if both are invoked right after the other. In the second call, most of this function's if() + * conditions will fail, since they apply generally to cnodes still marked with C_DELETED. + * As a quick check to see if this function is necessary, determine if the cnode is already + * marked C_NOEXISTS. If it is, then it is safe to skip this function. The only tasks that + * remain for cnodes marked in such a fashion is to teardown their fork references and + * release all directory hints and hardlink origins. However, both of those are done + * in hfs_vnop_reclaim. hfs_update, by definition, is not necessary if the cnode's catalog + * entry is no longer there. + * + * 'reclaim' argument specifies whether or not we were called from hfs_vnop_reclaim. If we are + * invoked from hfs_vnop_reclaim, we can not call functions that cluster_push since the UBC info + * is totally gone by that point. + * + * Assumes that both truncate and cnode locks for 'cp' are held. 
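+ *
+ * In sketch form, the calling convention (mirroring what hfs_vnop_inactive
+ * below does for regular files and symlinks; the reclaim path passes
+ * reclaim == 1 and does not take the truncate lock):
+ *
+ *    hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+ *    hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
+ *    error = hfs_cnode_teardown(vp, ctx, 0);
+ *    hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+ *    hfs_unlock(cp);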
+ */ +static +int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) +{ + int forkcount = 0; + enum vtype v_type; + struct cnode *cp; + int error = 0; + bool started_tr = false; + struct hfsmount *hfsmp = VTOHFS(vp); + struct proc *p = vfs_context_proc(ctx); + int truncated = 0; + cat_cookie_t cookie; + int cat_reserve = 0; + int lockflags; + int ea_error = 0; + + v_type = vnode_vtype(vp); + cp = VTOC(vp); + + if (cp->c_datafork) { + ++forkcount; + } + if (cp->c_rsrcfork) { + ++forkcount; + } + + /* + * Push file data out for normal files that haven't been evicted from + * the namespace. We only do this if this function was not called from reclaim, + * because by that point the UBC information has been totally torn down. + * + * There should also be no way that a normal file that has NOT been deleted from + * the namespace to skip INACTIVE and go straight to RECLAIM. That race only happens + * when the file becomes open-unlinked. + */ + if ((v_type == VREG) && + (!ISSET(cp->c_flag, C_DELETED)) && + (!ISSET(cp->c_flag, C_NOEXISTS)) && + (VTOF(vp)->ff_blocks) && + (reclaim == 0)) { + /* + * If we're called from hfs_vnop_inactive, all this means is at the time + * the logic for deciding to call this function, there were not any lingering + * mmap/fd references for this file. However, there is nothing preventing the system + * from creating a new reference in between the time that logic was checked + * and we entered hfs_vnop_inactive. As a result, the only time we can guarantee + * that there aren't any references is during vnop_reclaim. + */ + hfs_filedone(vp, ctx, 0); + } + + /* + * Remove any directory hints or cached origins + */ + if (v_type == VDIR) { + hfs_reldirhints(cp, 0); + } + if (cp->c_flag & C_HARDLINK) { + hfs_relorigins(cp); + } + + /* + * -- Handle open unlinked files -- + * + * If the vnode is in use, it means a force unmount is in progress + * in which case we defer cleaning up until either we come back + * through here via hfs_vnop_reclaim, at which point the UBC + * information will have been torn down and the vnode might no + * longer be in use, or if it's still in use, it will get cleaned + * up when next remounted. + */ + if (ISSET(cp->c_flag, C_DELETED) && !vnode_isinuse(vp, 0)) { + /* + * This check is slightly complicated. We should only truncate data + * in very specific cases for open-unlinked files. This is because + * we want to ensure that the resource fork continues to be available + * if the caller has the data fork open. However, this is not symmetric; + * someone who has the resource fork open need not be able to access the data + * fork once the data fork has gone inactive. + * + * If we're the last fork, then we have cleaning up to do. + * + * A) last fork, and vp == c_vp + * Truncate away own fork data. If rsrc fork is not in core, truncate it too. + * + * B) last fork, and vp == c_rsrc_vp + * Truncate ourselves, assume data fork has been cleaned due to C). + * + * If we're not the last fork, then things are a little different: + * + * C) not the last fork, vp == c_vp + * Truncate ourselves. Once the file has gone out of the namespace, + * it cannot be further opened. Further access to the rsrc fork may + * continue, however. + * + * D) not the last fork, vp == c_rsrc_vp + * Don't enter the block below, just clean up vnode and push it out of core. + */ + + if ((v_type == VREG || v_type == VLNK) && + ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { + + /* Truncate away our own fork data. 
(Case A, B, C above) */ + if (VTOF(vp)->ff_blocks != 0) { + /* + * SYMLINKS only: + * + * Encapsulate the entire change (including truncating the link) in + * nested transactions if we are modifying a symlink, because we know that its + * file length will be at most 4k, and we can fit both the truncation and + * any relevant bitmap changes into a single journal transaction. We also want + * the kill_block code to execute in the same transaction so that any dirty symlink + * blocks will not be written. Otherwise, rely on + * hfs_truncate doing its own transactions to ensure that we don't blow up + * the journal. + */ + if (!started_tr && (v_type == VLNK)) { + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + else { + started_tr = true; + } + } + + /* + * At this point, we have decided that this cnode is + * suitable for full removal. We are about to deallocate + * its blocks and remove its entry from the catalog. + * If it was a symlink, then it's possible that the operation + * which created it is still in the current transaction group + * due to coalescing. Take action here to kill the data blocks + * of the symlink out of the journal before moving to + * deallocate the blocks. We need to be in the middle of + * a transaction before calling buf_iterate like this. + * + * Note: we have to kill any potential symlink buffers out of + * the journal prior to deallocating their blocks. This is so + * that we don't race with another thread that may be doing an + * an allocation concurrently and pick up these blocks. It could + * generate I/O against them which could go out ahead of our journal + * transaction. + */ + + if (hfsmp->jnl && vnode_islnk(vp)) { + buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); + } + + + /* + * This truncate call (and the one below) is fine from VNOP_RECLAIM's + * context because we're only removing blocks, not zero-filling new + * ones. The C_DELETED check above makes things much simpler. + */ + error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 0, ctx); + if (error) { + goto out; + } + truncated = 1; + + /* (SYMLINKS ONLY): Close/End our transaction after truncating the file record */ + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = false; + } + + } + + /* + * Truncate away the resource fork, if we represent the data fork and + * it is the last fork. That means, by definition, the rsrc fork is not in + * core. To avoid bringing a vnode into core for the sole purpose of deleting the + * data in the resource fork, we call cat_lookup directly, then hfs_release_storage + * to get rid of the resource fork's data. Note that because we are holding the + * cnode lock, it is impossible for a competing thread to create the resource fork + * vnode from underneath us while we do this. + * + * This is invoked via case A above only. 
+ */ + if ((cp->c_blocks > 0) && (forkcount == 1) && (vp != cp->c_rsrc_vp)) { + struct cat_lookup_buffer *lookup_rsrc = NULL; + struct cat_desc *desc_ptr = NULL; + lockflags = 0; + + lookup_rsrc = hfs_mallocz(sizeof(*lookup_rsrc)); + + if (cp->c_desc.cd_namelen == 0) { + /* Initialize the rsrc descriptor for lookup if necessary*/ + MAKE_DELETED_NAME (lookup_rsrc->lookup_name, HFS_TEMPLOOKUP_NAMELEN, cp->c_fileid); + + lookup_rsrc->lookup_desc.cd_nameptr = (const uint8_t*) lookup_rsrc->lookup_name; + lookup_rsrc->lookup_desc.cd_namelen = strlen (lookup_rsrc->lookup_name); + lookup_rsrc->lookup_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + lookup_rsrc->lookup_desc.cd_cnid = cp->c_cnid; + + desc_ptr = &lookup_rsrc->lookup_desc; + } + else { + desc_ptr = &cp->c_desc; + } + + lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + error = cat_lookup (hfsmp, desc_ptr, 1, 0, (struct cat_desc *) NULL, + (struct cat_attr*) NULL, &lookup_rsrc->lookup_fork.ff_data, NULL); + + hfs_systemfile_unlock (hfsmp, lockflags); + + if (error) { + hfs_free(lookup_rsrc, sizeof(*lookup_rsrc)); + goto out; + } + + /* + * Make the filefork in our temporary struct look like a real + * filefork. Fill in the cp, sysfileinfo and rangelist fields.. + */ + rl_init (&lookup_rsrc->lookup_fork.ff_invalidranges); + lookup_rsrc->lookup_fork.ff_cp = cp; + + /* + * If there were no errors, then we have the catalog's fork information + * for the resource fork in question. Go ahead and delete the data in it now. + */ + + error = hfs_release_storage (hfsmp, NULL, &lookup_rsrc->lookup_fork, cp->c_fileid); + hfs_free(lookup_rsrc, sizeof(*lookup_rsrc)); + + if (error) { + goto out; + } + + /* + * This fileid's resource fork extents have now been fully deleted on-disk + * and this CNID is no longer valid. At this point, we should be able to + * zero out cp->c_blocks to indicate there is no data left in this file. + */ + cp->c_blocks = 0; + } + } + + /* + * If we represent the last fork (or none in the case of a dir), + * and the cnode has become open-unlinked... + * + * We check c_blocks here because it is possible in the force + * unmount case for the data fork to be in use but the resource + * fork to not be in use in which case we will truncate the + * resource fork, but not the data fork. It will get cleaned + * up upon next mount. + */ + if (forkcount <= 1 && !cp->c_blocks) { + /* + * If it has EA's, then we need to get rid of them. + * + * Note that this must happen outside of any other transactions + * because it starts/ends its own transactions and grabs its + * own locks. This is to prevent a file with a lot of attributes + * from creating a transaction that is too large (which panics). + */ + if (ISSET(cp->c_attr.ca_recflags, kHFSHasAttributesMask)) + ea_error = hfs_removeallattr(hfsmp, cp->c_fileid, &started_tr); + + /* + * Remove the cnode's catalog entry and release all blocks it + * may have been using. + */ + + /* + * Mark cnode in transit so that no one can get this + * cnode from cnode hash. + */ + // hfs_chash_mark_in_transit(hfsmp, cp); + // XXXdbg - remove the cnode from the hash table since it's deleted + // otherwise someone could go to sleep on the cnode and not + // be woken up until this vnode gets recycled which could be + // a very long time... 
+ hfs_chashremove(hfsmp, cp); + + cp->c_flag |= C_NOEXISTS; // XXXdbg + cp->c_rdev = 0; + + if (!started_tr) { + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + started_tr = true; + } + + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + goto out; + } + cat_reserve = 1; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + if (cp->c_blocks > 0) { + printf("hfs_inactive: deleting non-empty%sfile %d, " + "blks %d\n", VNODE_IS_RSRC(vp) ? " rsrc " : " ", + (int)cp->c_fileid, (int)cp->c_blocks); + } + + // + // release the name pointer in the descriptor so that + // cat_delete() will use the file-id to do the deletion. + // in the case of hard links this is imperative (in the + // case of regular files the fileid and cnid are the + // same so it doesn't matter). + // + cat_releasedesc(&cp->c_desc); + + /* + * The descriptor name may be zero, + * in which case the fileid is used. + */ + error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); + + if (error && truncated && (error != ENXIO)) { + printf("hfs_inactive: couldn't delete a truncated file!"); + } + + /* Update HFS Private Data dir */ + if (error == 0) { + hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; + if (vnode_isdir(vp)) { + DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + } + (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], + &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { + goto out; + } + + #if QUOTA + if (hfsmp->hfs_flags & HFS_QUOTAS) + (void)hfs_chkiq(cp, -1, NOCRED, 0); + #endif /* QUOTA */ + + /* Already set C_NOEXISTS at the beginning of this block */ + cp->c_flag &= ~C_DELETED; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + + if (error == 0) + hfs_volupdate(hfsmp, (v_type == VDIR) ? VOL_RMDIR : VOL_RMFILE, 0); + } + } // if + + hfs_update(vp, reclaim ? HFS_UPDATE_FORCE : 0); + + /* + * Since we are about to finish what might be an inactive call, propagate + * any remaining modified or touch bits from the cnode to the vnode. This + * serves as a hint to vnode recycling that we shouldn't recycle this vnode + * synchronously. + * + * For now, if the node *only* has a dirty atime, we don't mark + * the vnode as dirty. VFS's asynchronous recycling can actually + * lead to worse performance than having it synchronous. When VFS + * is fixed to be more performant, we can be more honest about + * marking vnodes as dirty when it's only the atime that's dirty. + */ + if (hfs_is_dirty(cp) == HFS_DIRTY || ISSET(cp->c_flag, C_DELETED)) { + vnode_setdirty(vp); + } else { + vnode_cleardirty(vp); + } + +out: + if (cat_reserve) + cat_postflight(hfsmp, &cookie, p); + + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = false; + } + + return error; +} + + +/* + * hfs_vnop_inactive + * + * The last usecount on the vnode has gone away, so we need to tear down + * any remaining data still residing in the cnode. If necessary, write out + * remaining blocks or delete the cnode's entry in the catalog. 
+ */ +int +hfs_vnop_inactive(struct vnop_inactive_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct hfsmount *hfsmp = VTOHFS(vp); + struct proc *p = vfs_context_proc(ap->a_context); + int error = 0; + int took_trunc_lock = 0; + enum vtype v_type; + + v_type = vnode_vtype(vp); + cp = VTOC(vp); + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || vnode_issystem(vp) || + (hfsmp->hfs_freezing_proc == p)) { + error = 0; + goto inactive_done; + } + + /* + * For safety, do NOT call vnode_recycle from inside this function. This can cause + * problems in the following scenario: + * + * vnode_create -> vnode_reclaim_internal -> vclean -> VNOP_INACTIVE + * + * If we're being invoked as a result of a reclaim that was already in-flight, then we + * cannot call vnode_recycle again. Being in reclaim means that there are no usecounts or + * iocounts by definition. As a result, if we were to call vnode_recycle, it would immediately + * try to re-enter reclaim again and panic. + * + * Currently, there are three things that can cause us (VNOP_INACTIVE) to get called. + * 1) last usecount goes away on the vnode (vnode_rele) + * 2) last iocount goes away on a vnode that previously had usecounts but didn't have + * vnode_recycle called (vnode_put) + * 3) vclean by way of reclaim + * + * In this function we would generally want to call vnode_recycle to speed things + * along to ensure that we don't leak blocks due to open-unlinked files. However, by + * virtue of being in this function already, we can call hfs_cnode_teardown, which + * will release blocks held by open-unlinked files, and mark them C_NOEXISTS so that + * there's no entry in the catalog and no backing store anymore. If that's the case, + * then we really don't care all that much when the vnode actually goes through reclaim. + * Further, the HFS VNOPs that manipulated the namespace in order to create the open- + * unlinked file in the first place should have already called vnode_recycle on the vnode + * to guarantee that it would go through reclaim in a speedy way. + */ + + if (cp->c_flag & C_NOEXISTS) { + /* + * If the cnode has already had its cat entry removed, then + * just skip to the end. We don't need to do anything here. + */ + error = 0; + goto inactive_done; + } + + if ((v_type == VREG || v_type == VLNK)) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + took_trunc_lock = 1; + } + + (void) hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + /* + * Call cnode_teardown to push out dirty blocks to disk, release open-unlinked + * files' blocks from being in use, and move the cnode from C_DELETED to C_NOEXISTS. + */ + error = hfs_cnode_teardown (vp, ap->a_context, 0); + + /* + * Drop the truncate lock before unlocking the cnode + * (which can potentially perform a vnode_put and + * recycle the vnode which in turn might require the + * truncate lock) + */ + if (took_trunc_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + + hfs_unlock(cp); + +inactive_done: + + return error; +} + + +/* + * File clean-up (zero fill and shrink peof). 
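+ *
+ * Here "leof" is the logical EOF (fp->ff_size) and "peof" the physical EOF,
+ * i.e. the space actually allocated to the fork.  The shrink step below is
+ * roughly equivalent to:
+ *
+ *    blks = howmany(leof, blocksize);
+ *    if (blks < fp->ff_blocks)
+ *        hfs_truncate(vp, leof, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES, context);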
+ */ + +int +hfs_filedone(struct vnode *vp, vfs_context_t context, + hfs_file_done_opts_t opts) +{ + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + off_t leof; + u_int32_t blks, blocksize; + + cp = VTOC(vp); + fp = VTOF(vp); + hfsmp = VTOHFS(vp); + leof = fp->ff_size; + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (fp->ff_blocks == 0)) + return (0); + + hfs_flush_invalid_ranges(vp); + + blocksize = VTOVCB(vp)->blockSize; + blks = leof / blocksize; + if (((off_t)blks * (off_t)blocksize) != leof) + blks++; + /* + * Shrink the peof to the smallest size neccessary to contain the leof. + */ + if (blks < fp->ff_blocks) { + (void) hfs_truncate(vp, leof, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES, context); + } + + if (!ISSET(opts, HFS_FILE_DONE_NO_SYNC)) { + hfs_unlock(cp); + cluster_push(vp, IO_CLOSE); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + /* + * If the hfs_truncate didn't happen to flush the vnode's + * information out to disk, force it to be updated now that + * all invalid ranges have been zero-filled and validated: + */ + hfs_update(vp, 0); + } + + return (0); +} + + +/* + * Reclaim a cnode so that it can be used for other purposes. + */ +int +hfs_vnop_reclaim(struct vnop_reclaim_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp = NULL; + struct filefork *altfp = NULL; + struct hfsmount *hfsmp = VTOHFS(vp); + vfs_context_t ctx = ap->a_context; + int reclaim_cnode = 0; + int err = 0; + enum vtype v_type; + + v_type = vnode_vtype(vp); + cp = VTOC(vp); + + /* + * We don't take the truncate lock since by the time reclaim comes along, + * all dirty pages have been synced and nobody should be competing + * with us for this thread. + */ + (void) hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + /* + * Sync to disk any remaining data in the cnode/vnode. This includes + * a call to hfs_update if the cnode has outbound data. + * + * If C_NOEXISTS is set on the cnode, then there's nothing teardown needs to do + * because the catalog entry for this cnode is already gone. + */ + if (!ISSET(cp->c_flag, C_NOEXISTS)) { + err = hfs_cnode_teardown(vp, ctx, 1); + } + + /* + * Keep track of an inactive hot file. Don't bother on ssd's since + * the tracking is done differently (it's done at read() time) + */ + if (!vnode_isdir(vp) && + !vnode_issystem(vp) && + !(cp->c_flag & (C_DELETED | C_NOEXISTS)) && + !(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { + (void) hfs_addhotfile(vp); + } + vnode_removefsref(vp); + + /* + * Find file fork for this vnode (if any) + * Also check if another fork is active + */ + if (cp->c_vp == vp) { + fp = cp->c_datafork; + altfp = cp->c_rsrcfork; + + cp->c_datafork = NULL; + cp->c_vp = NULL; + } else if (cp->c_rsrc_vp == vp) { + fp = cp->c_rsrcfork; + altfp = cp->c_datafork; + + cp->c_rsrcfork = NULL; + cp->c_rsrc_vp = NULL; + } else { + panic("hfs_vnop_reclaim: vp points to wrong cnode (vp=%p cp->c_vp=%p cp->c_rsrc_vp=%p)\n", vp, cp->c_vp, cp->c_rsrc_vp); + } + /* + * On the last fork, remove the cnode from its hash chain. + */ + if (altfp == NULL) { + /* If we can't remove it then the cnode must persist! 
*/ + if (hfs_chashremove(hfsmp, cp) == 0) + reclaim_cnode = 1; + /* + * Remove any directory hints + */ + if (vnode_isdir(vp)) { + hfs_reldirhints(cp, 0); + } + + if(cp->c_flag & C_HARDLINK) { + hfs_relorigins(cp); + } + } + /* Release the file fork and related data */ + if (fp) { + /* Dump cached symlink data */ + if (vnode_islnk(vp) && (fp->ff_symlinkptr != NULL)) { + hfs_free(fp->ff_symlinkptr, fp->ff_size); + } + rl_remove_all(&fp->ff_invalidranges); + hfs_zfree(fp, HFS_FILEFORK_ZONE); + } + + /* + * If there was only one active fork then we can release the cnode. + */ + if (reclaim_cnode) { + hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_TRANSIT); + hfs_unlock(cp); + hfs_reclaim_cnode(hfsmp, cp); + } + else { + /* + * cnode in use. If it is a directory, it could have + * no live forks. Just release the lock. + */ + hfs_unlock(cp); + } + + vnode_clearfsnode(vp); + return (0); +} + + +extern int (**hfs_vnodeop_p) (void *); +#if FIFO +extern int (**hfs_fifoop_p) (void *); +#endif + +#if CONFIG_HFS_STD +extern int (**hfs_std_vnodeop_p) (void *); +#endif + +/* + * hfs_getnewvnode - get new default vnode + * + * The vnode is returned with an iocount and the cnode locked. + * The cnode of the parent vnode 'dvp' may or may not be locked, depending on + * the circumstances. The cnode in question (if acquiring the resource fork), + * may also already be locked at the time we enter this function. + * + * Note that there are both input and output flag arguments to this function. + * If one of the input flags (specifically, GNV_USE_VP), is set, then + * hfs_getnewvnode will use the parameter *vpp, which is traditionally only + * an output parameter, as both an input and output parameter. It will use + * the vnode provided in the output, and pass it to vnode_create with the + * proper flavor so that a new vnode is _NOT_ created on our behalf when + * we dispatch to VFS. This may be important in various HFS vnode creation + * routines, such a create or get-resource-fork, because we risk deadlock if + * jetsam is involved. + * + * Deadlock potential exists if jetsam is synchronously invoked while we are waiting + * for a vnode to be recycled in order to give it the identity we want. If jetsam + * happens to target a process for termination that is blocked in-kernel, waiting to + * acquire the cnode lock on our parent 'dvp', while our current thread has it locked, + * neither side will make forward progress and the watchdog timer will eventually fire. + * To prevent this, a caller of hfs_getnewvnode may choose to proactively force + * any necessary vnode reclamation/recycling while it is not holding any locks and + * thus not prone to deadlock. If this is the case, GNV_USE_VP will be set and + * the parameter will be used as described above. + * + * !!! !!!! + * In circumstances when GNV_USE_VP is set, this function _MUST_ clean up and either consume + * or dispose of the provided vnode. We funnel all errors to a single return value so that + * if provided_vp is still non-NULL, then we will dispose of the vnode. This will occur in + * all error cases of this function -- anywhere we zero/NULL out the *vpp parameter. It may + * also occur if the current thread raced with another to create the same vnode, and we + * find the entry already present in the cnode hash. + * !!! !!! 
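+ *
+ * A caller-side sketch of the GNV_USE_VP convention described above
+ * ('spare_vp' stands for a vnode the caller obtained ahead of time,
+ * outside of any cnode locks):
+ *
+ *    vp = spare_vp;
+ *    error = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, flags | GNV_USE_VP,
+ *                            &attr, &fork, &vp, &out_flags);
+ *    // on success, vp is the attached vnode with an iocount held;
+ *    // on any error, vp is NULL and the spare vnode has already been
+ *    // consumed or disposed of by hfs_getnewvnode itself.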
+ */ +int +hfs_getnewvnode( + struct hfsmount *hfsmp, + struct vnode *dvp, + struct componentname *cnp, + struct cat_desc *descp, + int flags, + struct cat_attr *attrp, + struct cat_fork *forkp, + struct vnode **vpp, + int *out_flags) +{ + struct mount *mp = HFSTOVFS(hfsmp); + struct vnode *vp = NULL; + struct vnode **cvpp; + struct vnode *tvp = NULLVP; + struct cnode *cp = NULL; + struct filefork *fp = NULL; + int hfs_standard = 0; + int retval = 0; + int issystemfile; + int wantrsrc; + int hflags = 0; + int need_update_identity = 0; + struct vnode_fsparam vfsp; + enum vtype vtype; + + struct vnode *provided_vp = NULL; + + +#if QUOTA + int i; +#endif /* QUOTA */ + + hfs_standard = (hfsmp->hfs_flags & HFS_STANDARD); + + if (flags & GNV_USE_VP) { + /* Store the provided VP for later use */ + provided_vp = *vpp; + } + + /* Zero out the vpp regardless of provided input */ + *vpp = NULL; + + /* Zero out the out_flags */ + *out_flags = 0; + + if (attrp->ca_fileid == 0) { + retval = ENOENT; + goto gnv_exit; + } + +#if !FIFO + if (IFTOVT(attrp->ca_mode) == VFIFO) { + retval = ENOTSUP; + goto gnv_exit; + } +#endif /* !FIFO */ + vtype = IFTOVT(attrp->ca_mode); + issystemfile = (descp->cd_flags & CD_ISMETA) && (vtype == VREG); + wantrsrc = flags & GNV_WANTRSRC; + + /* Sanity checks: */ + if (vtype == VBAD || + (vtype != VDIR && forkp && + (attrp->ca_blocks < forkp->cf_blocks || + howmany((uint64_t)forkp->cf_size, hfsmp->blockSize) > forkp->cf_blocks || + (vtype == VLNK && (uint64_t)forkp->cf_size > MAXPATHLEN)))) { + /* Mark the FS as corrupt and bail out */ + hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED); + retval = EINVAL; + goto gnv_exit; + } + +#ifdef HFS_CHECK_LOCK_ORDER + /* + * The only case where it's permissible to hold the parent cnode + * lock is during a create operation (hfs_makenode) or when + * we don't need the cnode lock (GNV_SKIPLOCK). + */ + if ((dvp != NULL) && + (flags & (GNV_CREATE | GNV_SKIPLOCK)) == 0 && + VTOC(dvp)->c_lockowner == current_thread()) { + panic("hfs_getnewvnode: unexpected hold of parent cnode %p", VTOC(dvp)); + } +#endif /* HFS_CHECK_LOCK_ORDER */ + + /* + * Get a cnode (new or existing) + */ + cp = hfs_chash_getcnode(hfsmp, attrp->ca_fileid, vpp, wantrsrc, + (flags & GNV_SKIPLOCK), out_flags, &hflags); + + /* + * If the id is no longer valid for lookups we'll get back a NULL cp. + */ + if (cp == NULL) { + retval = ENOENT; + goto gnv_exit; + } + /* + * We may have been provided a vnode via + * GNV_USE_VP. In this case, we have raced with + * a 2nd thread to create the target vnode. The provided + * vnode that was passed in will be dealt with at the + * end of the function, as we don't zero out the field + * until we're ready to pass responsibility to VFS. + */ + + + /* + * If we get a cnode/vnode pair out of hfs_chash_getcnode, then update the + * descriptor in the cnode as needed if the cnode represents a hardlink. + * We want the caller to get the most up-to-date copy of the descriptor + * as possible. However, we only do anything here if there was a valid vnode. + * If there isn't a vnode, then the cnode is brand new and needs to be initialized + * as it doesn't have a descriptor or cat_attr yet. + * + * If we are about to replace the descriptor with the user-supplied one, then validate + * that the descriptor correctly acknowledges this item is a hardlink. We could be + * subject to a race where the calling thread invoked cat_lookup, got a valid lookup + * result but the file was not yet a hardlink. 
With sufficient delay between there + * and here, we might accidentally copy in the raw inode ID into the descriptor in the + * call below. If the descriptor's CNID is the same as the fileID then it must + * not yet have been a hardlink when the lookup occurred. + */ + + if (!(hfs_checkdeleted(cp))) { + // + // If the bytes of the filename in the descp do not match the bytes in the + // cnp (and we're not looking up the resource fork), then we want to update + // the vnode identity to contain the bytes that HFS stores so that when an + // fsevent gets generated, it has the correct filename. otherwise daemons + // that match filenames produced by fsevents with filenames they have stored + // elsewhere (e.g. bladerunner, backupd, mds), the filenames will not match. + // See: FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories + // for more details. + // +#ifdef CN_WANTSRSRCFORK + if (*vpp && cnp && cnp->cn_nameptr && !(cnp->cn_flags & CN_WANTSRSRCFORK) && descp && descp->cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)descp->cd_nameptr, descp->cd_namelen) != 0) { +#else + if (*vpp && cnp && cnp->cn_nameptr && descp && descp->cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)descp->cd_nameptr, descp->cd_namelen) != 0) { +#endif + vnode_update_identity (*vpp, dvp, (const char *)descp->cd_nameptr, descp->cd_namelen, 0, VNODE_UPDATE_NAME); + } + if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) { + /* If cnode is uninitialized, its c_attr will be zeroed out; cnids wont match. */ + if ((descp->cd_cnid == cp->c_attr.ca_fileid) && + (attrp->ca_linkcount != cp->c_attr.ca_linkcount)){ + + if ((flags & GNV_SKIPLOCK) == 0) { + /* + * Then we took the lock. Drop it before calling + * vnode_put, which may invoke hfs_vnop_inactive and need to take + * the cnode lock again. + */ + hfs_unlock(cp); + } + + /* + * Emit ERECYCLE and GNV_CAT_ATTRCHANGED to + * force a re-drive in the lookup routine. + * Drop the iocount on the vnode obtained from + * chash_getcnode if needed. + */ + if (*vpp != NULL) { + vnode_put (*vpp); + *vpp = NULL; + } + + /* + * If we raced with VNOP_RECLAIM for this vnode, the hash code could + * have observed it after the c_vp or c_rsrc_vp fields had been torn down; + * the hash code peeks at those fields without holding the cnode lock because + * it needs to be fast. As a result, we may have set H_ATTACH in the chash + * call above. Since we're bailing out, unset whatever flags we just set, and + * wake up all waiters for this cnode. + */ + if (hflags) { + hfs_chashwakeup(hfsmp, cp, hflags); + } + + *out_flags = GNV_CAT_ATTRCHANGED; + retval = ERECYCLE; + goto gnv_exit; + } + else { + /* + * Otherwise, CNID != fileid. Go ahead and copy in the new descriptor. + * + * Replacing the descriptor here is fine because we looked up the item without + * a vnode in hand before. If a vnode existed, its identity must be attached to this + * item. We are not susceptible to the lookup fastpath issue at this point. + */ + replace_desc(cp, descp); + + /* + * This item was a hardlink, and its name needed to be updated. By replacing the + * descriptor above, we've now updated the cnode's internal representation of + * its link ID/CNID, parent ID, and its name. However, VFS must now be alerted + * to the fact that this vnode now has a new parent, since we cannot guarantee + * that the new link lived in the same directory as the alternative name for + * this item. 
+ */ + if ((*vpp != NULL) && (cnp || cp->c_desc.cd_nameptr)) { + /* we could be requesting the rsrc of a hardlink file... */ +#ifdef CN_WANTSRSRCFORK + if (cp->c_desc.cd_nameptr && (cnp == NULL || !(cnp->cn_flags & CN_WANTSRSRCFORK))) { +#else + if (cp->c_desc.cd_nameptr) { +#endif + // + // Update the identity with what we have stored on disk as + // the name of this file. This is related to: + // FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories + // + vnode_update_identity (*vpp, dvp, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, 0, + (VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME)); + } else if (cnp) { + vnode_update_identity (*vpp, dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, + (VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME)); + } + } + } + } + } + + /* + * At this point, we have performed hardlink and open-unlinked checks + * above. We have now validated the state of the vnode that was given back + * to us from the cnode hash code and find it safe to return. + */ + if (*vpp != NULL) { + retval = 0; + goto gnv_exit; + } + + /* + * If this is a new cnode then initialize it. + */ + if (ISSET(cp->c_hflag, H_ALLOC)) { + lck_rw_init(&cp->c_truncatelock, hfs_rwlock_group, hfs_lock_attr); +#if HFS_COMPRESSION + cp->c_decmp = NULL; +#endif + + /* Make sure its still valid (ie exists on disk). */ + if (!(flags & GNV_CREATE)) { + int error = 0; + if (!hfs_valid_cnode (hfsmp, dvp, (wantrsrc ? NULL : cnp), cp->c_fileid, attrp, &error)) { + hfs_chash_abort(hfsmp, cp); + if ((flags & GNV_SKIPLOCK) == 0) { + hfs_unlock(cp); + } + hfs_reclaim_cnode(hfsmp, cp); + *vpp = NULL; + /* + * If we hit this case, that means that the entry was there in the catalog when + * we did a cat_lookup earlier. Think hfs_lookup. However, in between the time + * that we checked the catalog and the time we went to get a vnode/cnode for it, + * it had been removed from the namespace and the vnode totally reclaimed. As a result, + * it's not there in the catalog during the check in hfs_valid_cnode and we bubble out + * an ENOENT. To indicate to the caller that they should really double-check the + * entry (it could have been renamed over and gotten a new fileid), we mark a bit + * in the output flags. + */ + if (error == ENOENT) { + *out_flags = GNV_CAT_DELETED; + retval = ENOENT; + goto gnv_exit; + } + + /* + * Also, we need to protect the cat_attr acquired during hfs_lookup and passed into + * this function as an argument because the catalog may have changed w.r.t hardlink + * link counts and the firstlink field. If that validation check fails, then let + * lookup re-drive itself to get valid/consistent data with the same failure condition below. + */ + if (error == ERECYCLE) { + *out_flags = GNV_CAT_ATTRCHANGED; + retval = ERECYCLE; + goto gnv_exit; + } + } + } + bcopy(attrp, &cp->c_attr, sizeof(struct cat_attr)); + bcopy(descp, &cp->c_desc, sizeof(struct cat_desc)); + + /* The name was inherited so clear descriptor state... */ + descp->cd_namelen = 0; + descp->cd_nameptr = NULL; + descp->cd_flags &= ~CD_HASBUF; + + /* Tag hardlinks */ + if ((vtype == VREG || vtype == VDIR + || vtype == VSOCK || vtype == VFIFO) + && (descp->cd_cnid != attrp->ca_fileid + || ISSET(attrp->ca_recflags, kHFSHasLinkChainMask))) { + cp->c_flag |= C_HARDLINK; + } + /* + * Fix-up dir link counts. + * + * Earlier versions of Leopard used ca_linkcount for posix + * nlink support (effectively the sub-directory count + 2). 
+ * That is now accomplished using the ca_dircount field with + * the corresponding kHFSHasFolderCountMask flag. + * + * For directories the ca_linkcount is the true link count, + * tracking the number of actual hardlinks to a directory. + * + * We only do this if the mount has HFS_FOLDERCOUNT set; + * at the moment, we only set that for HFSX volumes. + */ + if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && + (vtype == VDIR) && + !(attrp->ca_recflags & kHFSHasFolderCountMask) && + (cp->c_attr.ca_linkcount > 1)) { + if (cp->c_attr.ca_entries == 0) + cp->c_attr.ca_dircount = 0; + else + cp->c_attr.ca_dircount = cp->c_attr.ca_linkcount - 2; + + cp->c_attr.ca_linkcount = 1; + cp->c_attr.ca_recflags |= kHFSHasFolderCountMask; + if ( !(hfsmp->hfs_flags & HFS_READ_ONLY) ) + cp->c_flag |= C_MODIFIED; + } +#if QUOTA + if (hfsmp->hfs_flags & HFS_QUOTAS) { + for (i = 0; i < MAXQUOTAS; i++) + cp->c_dquot[i] = NODQUOT; + } +#endif /* QUOTA */ + /* Mark the output flag that we're vending a new cnode */ + *out_flags |= GNV_NEW_CNODE; + } + + if (vtype == VDIR) { + if (cp->c_vp != NULL) + panic("hfs_getnewvnode: orphaned vnode (data)"); + cvpp = &cp->c_vp; + } else { + /* + * Allocate and initialize a file fork... + */ + fp = hfs_zalloc(HFS_FILEFORK_ZONE); + fp->ff_cp = cp; + if (forkp) + bcopy(forkp, &fp->ff_data, sizeof(struct cat_fork)); + else + bzero(&fp->ff_data, sizeof(struct cat_fork)); + rl_init(&fp->ff_invalidranges); + fp->ff_sysfileinfo = 0; + + if (wantrsrc) { + if (cp->c_rsrcfork != NULL) + panic("hfs_getnewvnode: orphaned rsrc fork"); + if (cp->c_rsrc_vp != NULL) + panic("hfs_getnewvnode: orphaned vnode (rsrc)"); + cp->c_rsrcfork = fp; + cvpp = &cp->c_rsrc_vp; + if ( (tvp = cp->c_vp) != NULLVP ) + cp->c_flag |= C_NEED_DVNODE_PUT; + } else { + if (cp->c_datafork != NULL) + panic("hfs_getnewvnode: orphaned data fork"); + if (cp->c_vp != NULL) + panic("hfs_getnewvnode: orphaned vnode (data)"); + cp->c_datafork = fp; + cvpp = &cp->c_vp; + if ( (tvp = cp->c_rsrc_vp) != NULLVP) + cp->c_flag |= C_NEED_RVNODE_PUT; + } + } + if (tvp != NULLVP) { + /* + * grab an iocount on the vnode we weren't + * interested in (i.e. we want the resource fork + * but the cnode already has the data fork) + * to prevent it from being + * recycled by us when we call vnode_create + * which will result in a deadlock when we + * try to take the cnode lock in hfs_vnop_fsync or + * hfs_vnop_reclaim... vnode_get can be called here + * because we already hold the cnode lock which will + * prevent the vnode from changing identity until + * we drop it.. vnode_get will not block waiting for + * a change of state... however, it will return an + * error if the current iocount == 0 and we've already + * started to terminate the vnode... we don't need/want to + * grab an iocount in the case since we can't cause + * the fileystem to be re-entered on this thread for this vp + * + * the matching vnode_put will happen in hfs_unlock + * after we've dropped the cnode lock + */ + if ( vnode_get(tvp) != 0) + cp->c_flag &= ~(C_NEED_RVNODE_PUT | C_NEED_DVNODE_PUT); + } + vfsp.vnfs_mp = mp; + vfsp.vnfs_vtype = vtype; + vfsp.vnfs_str = "hfs"; + if ((cp->c_flag & C_HARDLINK) && (vtype == VDIR)) { + vfsp.vnfs_dvp = NULL; /* no parent for me! */ + vfsp.vnfs_cnp = NULL; /* no name for me! 
*/ + } else { + vfsp.vnfs_dvp = dvp; + vfsp.vnfs_cnp = cnp; + } + + vfsp.vnfs_fsnode = cp; + + /* + * Special Case HFS Standard VNOPs from HFS+, since + * HFS standard is readonly/deprecated as of 10.6 + */ + +#if FIFO + if (vtype == VFIFO ) + vfsp.vnfs_vops = hfs_fifoop_p; + else +#endif + if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_vops = hfs_specop_p; +#if CONFIG_HFS_STD + else if (hfs_standard) + vfsp.vnfs_vops = hfs_std_vnodeop_p; +#endif + else + vfsp.vnfs_vops = hfs_vnodeop_p; + + if (vtype == VBLK || vtype == VCHR) + vfsp.vnfs_rdev = attrp->ca_rdev; + else + vfsp.vnfs_rdev = 0; + + if (forkp) + vfsp.vnfs_filesize = forkp->cf_size; + else + vfsp.vnfs_filesize = 0; + + vfsp.vnfs_flags = VNFS_ADDFSREF; +#ifdef CN_WANTSRSRCFORK + if (cnp && cnp->cn_nameptr && !(cnp->cn_flags & CN_WANTSRSRCFORK) && cp->c_desc.cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0) { +#else + if (cnp && cnp->cn_nameptr && cp->c_desc.cd_nameptr && strncmp((const char *)cnp->cn_nameptr, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0) { +#endif + // + // We don't want VFS to add an entry for this vnode because the name in the + // cnp does not match the bytes stored on disk for this file. Instead we'll + // update the identity later after the vnode is created and we'll do so with + // the correct bytes for this filename. For more details, see: + // FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories + // + vfsp.vnfs_flags |= VNFS_NOCACHE; + need_update_identity = 1; + } else if (dvp == NULLVP || cnp == NULL || !(cnp->cn_flags & MAKEENTRY) || (flags & GNV_NOCACHE)) { + vfsp.vnfs_flags |= VNFS_NOCACHE; + } + + /* Tag system files */ + vfsp.vnfs_marksystem = issystemfile; + + /* Tag root directory */ + if (descp->cd_cnid == kHFSRootFolderID) + vfsp.vnfs_markroot = 1; + else + vfsp.vnfs_markroot = 0; + + /* + * If provided_vp was non-NULL, then it is an already-allocated (but not + * initialized) vnode. We simply need to initialize it to this identity. + * If it was NULL, then assume that we need to call vnode_create with the + * normal arguments/types. + */ + if (provided_vp) { + vp = provided_vp; + /* + * After we assign the value of provided_vp into 'vp' (so that it can be + * mutated safely by vnode_initialize), we can NULL it out. At this point, the disposal + * and handling of the provided vnode will be the responsibility of VFS, which will + * clean it up and vnode_put it properly if vnode_initialize fails. + */ + provided_vp = NULL; + + retval = vnode_initialize (VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp); + /* See error handling below for resolving provided_vp */ + } + else { + /* Do a standard vnode_create */ + retval = vnode_create (VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &vp); + } + + /* + * We used a local variable to hold the result of vnode_create/vnode_initialize so that + * on error cases in vnode_create we won't accidentally harm the cnode's fields + */ + + if (retval) { + /* Clean up if we encountered an error */ + if (fp) { + if (fp == cp->c_datafork) + cp->c_datafork = NULL; + else + cp->c_rsrcfork = NULL; + + hfs_zfree(fp, HFS_FILEFORK_ZONE); + } + /* + * If this is a newly created cnode or a vnode reclaim + * occurred during the attachment, then cleanup the cnode. 
+ */ + if ((cp->c_vp == NULL) && (cp->c_rsrc_vp == NULL)) { + hfs_chash_abort(hfsmp, cp); + hfs_reclaim_cnode(hfsmp, cp); + } + else { + hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_ATTACH); + if ((flags & GNV_SKIPLOCK) == 0){ + hfs_unlock(cp); + } + } + *vpp = NULL; + goto gnv_exit; + } + + /* If no error, then assign the value into the cnode's fields */ + *cvpp = vp; + + vnode_settag(vp, VT_HFS); + if (cp->c_flag & C_HARDLINK) { + vnode_setmultipath(vp); + } + + if (cp->c_attr.ca_recflags & kHFSFastDevCandidateMask) { + vnode_setfastdevicecandidate(vp); + } + + if (cp->c_attr.ca_recflags & kHFSAutoCandidateMask) { + vnode_setautocandidate(vp); + } + + + + + if (vp && need_update_identity) { + // + // As above, update the name of the vnode if the bytes stored in hfs do not match + // the bytes in the cnp. See this radar: + // FSEvents doesn't always decompose diacritical unicode chars in the paths of the changed directories + // for more details. + // + vnode_update_identity (vp, dvp, (const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen, 0, VNODE_UPDATE_NAME); + } + + /* + * Tag resource fork vnodes as needing an VNOP_INACTIVE + * so that any deferred removes (open unlinked files) + * have the chance to process the resource fork. + */ + if (VNODE_IS_RSRC(vp)) { + int err; + + KDBG(HFSDBG_GETNEWVNODE, kdebug_vnode(cp->c_vp), kdebug_vnode(cp->c_rsrc_vp)); + + /* Force VL_NEEDINACTIVE on this vnode */ + err = vnode_ref(vp); + if (err == 0) { + vnode_rele(vp); + } + } + hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_ATTACH); + + /* + * Stop tracking an active hot file. + */ + if (!(flags & GNV_CREATE) && (vtype != VDIR) && !issystemfile && !(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { + (void) hfs_removehotfile(vp); + } + +#if CONFIG_PROTECT + /* Initialize the cp data structures. The key should be in place now. */ + if (!issystemfile && (*out_flags & GNV_NEW_CNODE)) { + cp_entry_init(cp, mp); + } +#endif + + *vpp = vp; + retval = 0; + +gnv_exit: + if (provided_vp) { + /* Release our empty vnode if it was not used */ + vnode_put (provided_vp); + } + return retval; +} + + +static void +hfs_reclaim_cnode(hfsmount_t *hfsmp, struct cnode *cp) +{ +#if QUOTA + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (cp->c_dquot[i] != NODQUOT) { + dqreclaim(cp->c_dquot[i]); + cp->c_dquot[i] = NODQUOT; + } + } +#endif /* QUOTA */ + + /* + * If the descriptor has a name then release it + */ + if ((cp->c_desc.cd_flags & CD_HASBUF) && (cp->c_desc.cd_nameptr != 0)) { + const char *nameptr; + + nameptr = (const char *) cp->c_desc.cd_nameptr; + cp->c_desc.cd_nameptr = 0; + cp->c_desc.cd_flags &= ~CD_HASBUF; + cp->c_desc.cd_namelen = 0; + vfs_removename(nameptr); + } + + /* + * We only call this function if we are in hfs_vnop_reclaim and + * attempting to reclaim a cnode with only one live fork. Because the vnode + * went through reclaim, any future attempts to use this item will have to + * go through lookup again, which will need to create a new vnode. Thus, + * destroying the locks below is safe. 
+ */ + + lck_rw_destroy(&cp->c_rwlock, hfs_rwlock_group); + lck_rw_destroy(&cp->c_truncatelock, hfs_rwlock_group); +#if HFS_COMPRESSION + if (cp->c_decmp) { + decmpfs_cnode_destroy(cp->c_decmp); + decmpfs_cnode_free(cp->c_decmp); + } +#endif +#if CONFIG_PROTECT + cp_entry_destroy(hfsmp, cp->c_cpentry); + cp->c_cpentry = NULL; +#else + (void)hfsmp; // Prevent compiler warning +#endif + + hfs_zfree(cp, HFS_CNODE_ZONE); +} + + +/* + * hfs_valid_cnode + * + * This function is used to validate data that is stored in-core against what is contained + * in the catalog. Common uses include validating that the parent-child relationship still exist + * for a specific directory entry (guaranteeing it has not been renamed into a different spot) at + * the point of the check. + */ +int +hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, + cnid_t cnid, struct cat_attr *cattr, int *error) +{ + struct cat_attr attr; + struct cat_desc cndesc; + int stillvalid = 0; + int lockflags; + + /* System files are always valid */ + if (cnid < kHFSFirstUserCatalogNodeID) { + *error = 0; + return (1); + } + + /* XXX optimization: check write count in dvp */ + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + if (dvp && cnp) { + int lookup = 0; + struct cat_fork fork; + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + cndesc.cd_namelen = cnp->cn_namelen; + cndesc.cd_parentcnid = VTOC(dvp)->c_fileid; + cndesc.cd_hint = VTOC(dvp)->c_childhint; + + /* + * We have to be careful when calling cat_lookup. The result argument + * 'attr' may get different results based on whether or not you ask + * for the filefork to be supplied as output. This is because cat_lookupbykey + * will attempt to do basic validation/smoke tests against the resident + * extents if there are no overflow extent records, but it needs someplace + * in memory to store the on-disk fork structures. + * + * Since hfs_lookup calls cat_lookup with a filefork argument, we should + * do the same here, to verify that block count differences are not + * due to calling the function with different styles. cat_lookupbykey + * will request the volume be fsck'd if there is true on-disk corruption + * where the number of blocks does not match the number generated by + * summing the number of blocks in the resident extents. + */ + + lookup = cat_lookup (hfsmp, &cndesc, 0, 0, NULL, &attr, &fork, NULL); + + if ((lookup == 0) && (cnid == attr.ca_fileid)) { + stillvalid = 1; + *error = 0; + } + else { + *error = ENOENT; + } + + /* + * In hfs_getnewvnode, we may encounter a time-of-check vs. time-of-vnode creation + * race. Specifically, if there is no vnode/cnode pair for the directory entry + * being looked up, we have to go to the catalog. But since we don't hold any locks (aside + * from the dvp in 'shared' mode) there is nothing to protect us against the catalog record + * changing in between the time we do the cat_lookup there and the time we re-grab the + * catalog lock above to do another cat_lookup. + * + * However, we need to check more than just the CNID and parent-child name relationships above. + * Hardlinks can suffer the same race in the following scenario: Suppose we do a + * cat_lookup, and find a leaf record and a raw inode for a hardlink. Now, we have + * the cat_attr in hand (passed in above). 
But in between then and now, the vnode was + * created by a competing hfs_getnewvnode call, and is manipulated and reclaimed before we get + * a chance to do anything. This is possible if there are a lot of threads thrashing around + * with the cnode hash. In this case, if we don't check/validate the cat_attr in-hand, we will + * blindly stuff it into the cnode, which will make the in-core data inconsistent with what is + * on disk. So validate the cat_attr below, if required. This race cannot happen if the cnode/vnode + * already exists, as it does in the case of rename and delete. + */ + if (stillvalid && cattr != NULL) { + if (cattr->ca_linkcount != attr.ca_linkcount) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union1.cau_linkref != attr.ca_union1.cau_linkref) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union3.cau_firstlink != attr.ca_union3.cau_firstlink) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union2.cau_blocks != attr.ca_union2.cau_blocks) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + } + } else { + if (cat_idlookup(hfsmp, cnid, 0, 0, NULL, NULL, NULL) == 0) { + stillvalid = 1; + *error = 0; + } + else { + *error = ENOENT; + } + } +notvalid: + hfs_systemfile_unlock(hfsmp, lockflags); + + return (stillvalid); +} + + +/* + * Per HI and Finder requirements, HFS should add in the + * date/time that a particular directory entry was added + * to the containing directory. + * This is stored in the extended Finder Info for the + * item in question. + * + * Note that this field is also set explicitly in the hfs_vnop_setxattr code. + * We must ignore user attempts to set this part of the finderinfo, and + * so we need to save a local copy of the date added, write in the user + * finderinfo, then stuff the value back in. + */ +void hfs_write_dateadded (struct cat_attr *attrp, u_int32_t dateadded) { + u_int8_t *finfo = NULL; + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = (u_int8_t*)attrp->ca_finderinfo; + finfo = finfo + 16; + + /* + * Make sure to write it out as big endian, since that's how + * finder info is defined. + * + * NOTE: This is a Unix-epoch timestamp, not a HFS/Traditional Mac timestamp. + */ + if (S_ISREG(attrp->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + attrp->ca_recflags |= kHFSHasDateAddedMask; + } + else if (S_ISDIR(attrp->ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + attrp->ca_recflags |= kHFSHasDateAddedMask; + } + /* If it were neither directory/file, then we'd bail out */ + return; +} + +static u_int32_t +hfs_get_dateadded_internal(const uint8_t *finderinfo, mode_t mode) +{ + const uint8_t *finfo = NULL; + u_int32_t dateadded = 0; + + + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = finderinfo + 16; + + /* + * FinderInfo is written out in big endian... make sure to convert it to host + * native before we use it. 
+ */ + if (S_ISREG(mode)) { + const struct FndrExtendedFileInfo *extinfo = (const struct FndrExtendedFileInfo *)finfo; + dateadded = OSSwapBigToHostInt32 (extinfo->date_added); + } + else if (S_ISDIR(mode)) { + const struct FndrExtendedDirInfo *extinfo = (const struct FndrExtendedDirInfo *)finfo; + dateadded = OSSwapBigToHostInt32 (extinfo->date_added); + } + + return dateadded; +} + +u_int32_t +hfs_get_dateadded(struct cnode *cp) +{ + if ((cp->c_attr.ca_recflags & kHFSHasDateAddedMask) == 0) { + /* Date added was never set. Return 0. */ + return (0); + } + + return (hfs_get_dateadded_internal((u_int8_t*)cp->c_finderinfo, + cp->c_attr.ca_mode)); +} + +u_int32_t +hfs_get_dateadded_from_blob(const uint8_t *finderinfo, mode_t mode) +{ + return (hfs_get_dateadded_internal(finderinfo, mode)); +} + +/* + * Per HI and Finder requirements, HFS maintains a "write/generation + * count" for each file that is incremented on any write & pageout. + * It should start at 1 to reserve "0" as a special value. If it + * should ever wrap around, it will skip using 0. + * + * Note that finderinfo is manipulated in hfs_vnop_setxattr and care + * is and should be taken to ignore user attempts to set the part of + * the finderinfo that records the generation counter. + * + * Any change to the generation counter *must* not be visible before + * the change that caused it (for obvious reasons), and given the + * limitations of our current architecture, the change to the + * generation counter may occur some time afterwards (particularly in + * the case where a file is mapped writable---more on that below). + * + * We make no guarantees about the consistency of a file. In other + * words, a reader that is operating concurrently with a writer might + * see some, but not all of writer's changes, and the generation + * counter will *not* necessarily tell you this has happened. To + * enforce consistency, clients must make their own arrangements + * e.g. use file locking. + * + * We treat files that are mapped writable as a special case: when + * that happens, clients requesting the generation count will be told + * it has a generation count of zero and they use that knowledge as a + * hint that the file is changing and it therefore might be prudent to + * wait until it is no longer mapped writable. Clients should *not* + * rely on this behaviour however; we might decide that it's better + * for us to publish the fact that a file is mapped writable via + * alternate means and return the generation counter when it is mapped + * writable as it still has some, albeit limited, use. We reserve the + * right to make this change. + * + * Lastly, it's important to realise that because data and metadata + * take different paths through the system, it's possible upon crash + * or sudden power loss and after a restart, that a change may be + * visible to the rest of the system without a corresponding change to + * the generation counter. The reverse may also be true, but for all + * practical applications this shouldn't be an issue. + */ +void hfs_write_gencount (struct cat_attr *attrp, uint32_t gencount) { + u_int8_t *finfo = NULL; + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = (u_int8_t*)attrp->ca_finderinfo; + finfo = finfo + 16; + + /* + * Make sure to write it out as big endian, since that's how + * finder info is defined. + * + * Generation count is only supported for files. 
+ */ + if (S_ISREG(attrp->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->write_gen_counter = OSSwapHostToBigInt32(gencount); + } + + /* If it were neither directory/file, then we'd bail out */ + return; +} + +/* + * Increase the gen count by 1; if it wraps around to 0, increment by + * two. The cnode *must* be locked exclusively by the caller. + * + * You may think holding the lock is unnecessary because we only need + * to change the counter, but consider this sequence of events: thread + * A calls hfs_incr_gencount and the generation counter is 2 upon + * entry. A context switch occurs and thread B increments the counter + * to 3, thread C now gets the generation counter (for whatever + * purpose), and then another thread makes another change and the + * generation counter is incremented again---it's now 4. Now thread A + * continues and it sets the generation counter back to 3. So you can + * see, thread C would miss the change that caused the generation + * counter to increment to 4 and for this reason the cnode *must* + * always be locked exclusively. + */ +uint32_t hfs_incr_gencount (struct cnode *cp) { + u_int8_t *finfo = NULL; + u_int32_t gcount = 0; + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = (u_int8_t*)cp->c_finderinfo; + finfo = finfo + 16; + + /* + * FinderInfo is written out in big endian... make sure to convert it to host + * native before we use it. + * + * NOTE: the write_gen_counter is stored in the same location in both the + * FndrExtendedFileInfo and FndrExtendedDirInfo structs (it's the + * last 32-bit word) so it is safe to have one code path here. + */ + if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + gcount = OSSwapBigToHostInt32 (extinfo->write_gen_counter); + + /* Was it zero to begin with (file originated in 10.8 or earlier?) */ + if (gcount == 0) { + gcount++; + } + + /* now bump it */ + gcount++; + + /* Did it wrap around ? */ + if (gcount == 0) { + gcount++; + } + extinfo->write_gen_counter = OSSwapHostToBigInt32 (gcount); + + SET(cp->c_flag, C_MINOR_MOD); + } + else { + gcount = 0; + } + + return gcount; +} + +/* + * There is no need for any locks here (other than an iocount on an + * associated vnode) because reading and writing an aligned 32 bit + * integer should be atomic on all platforms we support. + */ +static u_int32_t +hfs_get_gencount_internal(const uint8_t *finderinfo, mode_t mode) +{ + const uint8_t *finfo = NULL; + u_int32_t gcount = 0; + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = finderinfo; + finfo = finfo + 16; + + /* + * FinderInfo is written out in big endian... make sure to convert it to host + * native before we use it. + * + * NOTE: the write_gen_counter is stored in the same location in both the + * FndrExtendedFileInfo and FndrExtendedDirInfo structs (it's the + * last 32-bit word) so it is safe to have one code path here. + */ + if (S_ISDIR(mode) || S_ISREG(mode)) { + const struct FndrExtendedFileInfo *extinfo = (const struct FndrExtendedFileInfo *)finfo; + gcount = OSSwapBigToHostInt32 (extinfo->write_gen_counter); + + /* + * Is it zero? File might originate in 10.8 or earlier. We lie and bump it to 1, + * since the incrementer code is able to handle this case and will double-increment + * for us. 
+ */ + if (gcount == 0) { + gcount++; + } + } + + return gcount; +} + +/* Getter for the gen count */ +u_int32_t hfs_get_gencount (struct cnode *cp) { + return hfs_get_gencount_internal(cp->c_finderinfo, cp->c_attr.ca_mode); +} + +/* Getter for the gen count from a buffer (currently pointer to finderinfo)*/ +u_int32_t hfs_get_gencount_from_blob (const uint8_t *finfoblob, mode_t mode) { + return hfs_get_gencount_internal(finfoblob, mode); +} + +void hfs_clear_might_be_dirty_flag(cnode_t *cp) +{ + /* + * If we're about to touch both mtime and ctime, we can clear the + * C_MIGHT_BE_DIRTY_FROM_MAPPING since we can guarantee that + * subsequent page-outs can only be for data made dirty before + * now. + */ + CLR(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING); +} + +/* + * Touch cnode times based on c_touch_xxx flags + * + * cnode must be locked exclusive + * + * This will also update the volume modify time + */ +void +hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) +{ + vfs_context_t ctx; + + if (ISSET(hfsmp->hfs_flags, HFS_READ_ONLY) || ISSET(cp->c_flag, C_NOEXISTS)) { + cp->c_touch_acctime = FALSE; + cp->c_touch_chgtime = FALSE; + cp->c_touch_modtime = FALSE; + CLR(cp->c_flag, C_NEEDS_DATEADDED); + return; + } +#if CONFIG_HFS_STD + else if (hfsmp->hfs_flags & HFS_STANDARD) { + /* HFS Standard doesn't support access times */ + cp->c_touch_acctime = FALSE; + } +#endif + + ctx = vfs_context_current(); + /* + * Skip access time updates if: + * . MNT_NOATIME is set + * . a file system freeze is in progress + * . a file system resize is in progress + * . the vnode associated with this cnode is marked for rapid aging + */ + if (cp->c_touch_acctime) { + if ((vfs_flags(hfsmp->hfs_mp) & MNT_NOATIME) || + hfsmp->hfs_freeze_state != HFS_THAWED || + (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) || + (cp->c_vp && ((vnode_israge(cp->c_vp) || (vfs_ctx_skipatime(ctx)))))) { + + cp->c_touch_acctime = FALSE; + } + } + if (cp->c_touch_acctime || cp->c_touch_chgtime || + cp->c_touch_modtime || (cp->c_flag & C_NEEDS_DATEADDED)) { + struct timeval tv; + int touchvol = 0; + + if (cp->c_touch_modtime && cp->c_touch_chgtime) + hfs_clear_might_be_dirty_flag(cp); + + microtime(&tv); + + if (cp->c_touch_acctime) { + /* + * When the access time is the only thing changing, we + * won't necessarily write it to disk immediately. We + * only do the atime update at vnode recycle time, when + * fsync is called or when there's another reason to write + * to the metadata. 
+ */ + cp->c_atime = tv.tv_sec; + cp->c_touch_acctime = FALSE; + } + if (cp->c_touch_modtime) { + cp->c_touch_modtime = FALSE; + time_t new_time = tv.tv_sec; +#if CONFIG_HFS_STD + /* + * HFS dates that WE set must be adjusted for DST + */ + if ((hfsmp->hfs_flags & HFS_STANDARD) && gTimeZone.tz_dsttime) { + new_time += 3600; + } +#endif + if (cp->c_mtime != new_time) { + cp->c_mtime = new_time; + cp->c_flag |= C_MINOR_MOD; + touchvol = 1; + } + } + if (cp->c_touch_chgtime) { + cp->c_touch_chgtime = FALSE; + if (cp->c_ctime != tv.tv_sec) { + cp->c_ctime = tv.tv_sec; + cp->c_flag |= C_MINOR_MOD; + touchvol = 1; + } + } + + if (cp->c_flag & C_NEEDS_DATEADDED) { + hfs_write_dateadded (&(cp->c_attr), tv.tv_sec); + cp->c_flag |= C_MINOR_MOD; + /* untwiddle the bit */ + cp->c_flag &= ~C_NEEDS_DATEADDED; + touchvol = 1; + } + + /* Touch the volume modtime if needed */ + if (touchvol) { + hfs_note_header_minor_change(hfsmp); + HFSTOVCB(hfsmp)->vcbLsMod = tv.tv_sec; + } + } +} + +// Use this if you don't want to check the return code +void hfs_lock_always(cnode_t *cp, enum hfs_locktype locktype) +{ + hfs_lock(cp, locktype, HFS_LOCK_ALWAYS); +} + +/* + * Lock a cnode. + * N.B. If you add any failure cases, *make* sure hfs_lock_always works + */ +int +hfs_lock(struct cnode *cp, enum hfs_locktype locktype, enum hfs_lockflags flags) +{ + thread_t thread = current_thread(); + + if (cp->c_lockowner == thread) { + /* + * Only the extents and bitmap files support lock recursion + * here. The other system files support lock recursion in + * hfs_systemfile_lock. Eventually, we should change to + * handle recursion solely in hfs_systemfile_lock. + */ + if ((cp->c_fileid == kHFSExtentsFileID) || + (cp->c_fileid == kHFSAllocationFileID)) { + cp->c_syslockcount++; + } else { + panic("hfs_lock: locking against myself!"); + } + } else if (locktype == HFS_SHARED_LOCK) { + lck_rw_lock_shared(&cp->c_rwlock); + cp->c_lockowner = HFS_SHARED_OWNER; + } else { /* HFS_EXCLUSIVE_LOCK */ + lck_rw_lock_exclusive(&cp->c_rwlock); + cp->c_lockowner = thread; + + /* Only the extents and bitmap files support lock recursion. */ + if ((cp->c_fileid == kHFSExtentsFileID) || + (cp->c_fileid == kHFSAllocationFileID)) { + cp->c_syslockcount = 1; + } + } + +#ifdef HFS_CHECK_LOCK_ORDER + /* + * Regular cnodes (non-system files) cannot be locked + * while holding the journal lock or a system file lock. + */ + if (!(cp->c_desc.cd_flags & CD_ISMETA) && + ((cp->c_fileid > kHFSFirstUserCatalogNodeID) || (cp->c_fileid == kHFSRootFolderID))) { + vnode_t vp = NULLVP; + + /* Find corresponding vnode. */ + if (cp->c_vp != NULLVP && VTOC(cp->c_vp) == cp) { + vp = cp->c_vp; + } else if (cp->c_rsrc_vp != NULLVP && VTOC(cp->c_rsrc_vp) == cp) { + vp = cp->c_rsrc_vp; + } + if (vp != NULLVP) { + struct hfsmount *hfsmp = VTOHFS(vp); + + if (hfsmp->jnl && (journal_owner(hfsmp->jnl) == thread)) { + /* This will eventually be a panic here, but we need + to fix where we create the hot files BTree + first. 
*/ + printf("hfs_lock: bad lock order (cnode after journal)\n"); + } + if (hfsmp->hfs_catalog_cp && hfsmp->hfs_catalog_cp->c_lockowner == thread) { + panic("hfs_lock: bad lock order (cnode after catalog)"); + } + if (hfsmp->hfs_attribute_cp && hfsmp->hfs_attribute_cp->c_lockowner == thread) { + panic("hfs_lock: bad lock order (cnode after attribute)"); + } + if (hfsmp->hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == thread) { + panic("hfs_lock: bad lock order (cnode after extents)"); + } + } + } +#endif /* HFS_CHECK_LOCK_ORDER */ + + /* + * Skip cnodes for regular files that no longer exist + * (marked deleted, catalog entry gone). + */ + if (((flags & HFS_LOCK_ALLOW_NOEXISTS) == 0) && + ((cp->c_desc.cd_flags & CD_ISMETA) == 0) && + (cp->c_flag & C_NOEXISTS)) { + hfs_unlock(cp); + return (ENOENT); + } + return (0); +} + +bool hfs_lock_upgrade(cnode_t *cp) +{ + if (lck_rw_lock_shared_to_exclusive(&cp->c_rwlock)) { + cp->c_lockowner = current_thread(); + return true; + } else + return false; +} + +/* + * Lock a pair of cnodes. + */ +int +hfs_lockpair(struct cnode *cp1, struct cnode *cp2, enum hfs_locktype locktype) +{ + struct cnode *first, *last; + int error; + + /* + * If cnodes match then just lock one. + */ + if (cp1 == cp2) { + return hfs_lock(cp1, locktype, HFS_LOCK_DEFAULT); + } + + /* + * Lock in cnode address order. + */ + if (cp1 < cp2) { + first = cp1; + last = cp2; + } else { + first = cp2; + last = cp1; + } + + if ( (error = hfs_lock(first, locktype, HFS_LOCK_DEFAULT))) { + return (error); + } + if ( (error = hfs_lock(last, locktype, HFS_LOCK_DEFAULT))) { + hfs_unlock(first); + return (error); + } + return (0); +} + +/* + * Check ordering of two cnodes. Return true if they are are in-order. + */ +static int +hfs_isordered(struct cnode *cp1, struct cnode *cp2) +{ + if (cp1 == cp2) + return (0); + if (cp1 == NULL || cp2 == (struct cnode *)0xffffffff) + return (1); + if (cp2 == NULL || cp1 == (struct cnode *)0xffffffff) + return (0); + /* + * Locking order is cnode address order. + */ + return (cp1 < cp2); +} + +/* + * Acquire 4 cnode locks. + * - locked in cnode address order (lesser address first). + * - all or none of the locks are taken + * - only one lock taken per cnode (dup cnodes are skipped) + * - some of the cnode pointers may be null + */ +int +hfs_lockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, + struct cnode *cp4, enum hfs_locktype locktype, struct cnode **error_cnode) +{ + struct cnode * a[3]; + struct cnode * b[3]; + struct cnode * list[4]; + struct cnode * tmp; + int i, j, k; + int error; + if (error_cnode) { + *error_cnode = NULL; + } + + if (hfs_isordered(cp1, cp2)) { + a[0] = cp1; a[1] = cp2; + } else { + a[0] = cp2; a[1] = cp1; + } + if (hfs_isordered(cp3, cp4)) { + b[0] = cp3; b[1] = cp4; + } else { + b[0] = cp4; b[1] = cp3; + } + a[2] = (struct cnode *)0xffffffff; /* sentinel value */ + b[2] = (struct cnode *)0xffffffff; /* sentinel value */ + + /* + * Build the lock list, skipping over duplicates + */ + for (i = 0, j = 0, k = 0; (i < 2 || j < 2); ) { + tmp = hfs_isordered(a[i], b[j]) ? a[i++] : b[j++]; + if (k == 0 || tmp != list[k-1]) + list[k++] = tmp; + } + + /* + * Now we can lock using list[0 - k]. + * Skip over NULL entries. + */ + for (i = 0; i < k; ++i) { + if (list[i]) + if ((error = hfs_lock(list[i], locktype, HFS_LOCK_DEFAULT))) { + /* Only stuff error_cnode if requested */ + if (error_cnode) { + *error_cnode = list[i]; + } + /* Drop any locks we acquired. 
*/ + while (--i >= 0) { + if (list[i]) + hfs_unlock(list[i]); + } + return (error); + } + } + return (0); +} + + +/* + * Unlock a cnode. + */ +void +hfs_unlock(struct cnode *cp) +{ + vnode_t rvp = NULLVP; + vnode_t vp = NULLVP; + u_int32_t c_flag = 0; + + /* + * Only the extents and bitmap file's support lock recursion. + */ + if ((cp->c_fileid == kHFSExtentsFileID) || + (cp->c_fileid == kHFSAllocationFileID)) { + if (--cp->c_syslockcount > 0) { + return; + } + } + + const thread_t thread = current_thread(); + + if (cp->c_lockowner == thread) { + c_flag = cp->c_flag; + + // If we have the truncate lock, we must defer the puts + if (cp->c_truncatelockowner == thread) { + if (ISSET(c_flag, C_NEED_DVNODE_PUT) + && !cp->c_need_dvnode_put_after_truncate_unlock) { + CLR(c_flag, C_NEED_DVNODE_PUT); + cp->c_need_dvnode_put_after_truncate_unlock = true; + } + if (ISSET(c_flag, C_NEED_RVNODE_PUT) + && !cp->c_need_rvnode_put_after_truncate_unlock) { + CLR(c_flag, C_NEED_RVNODE_PUT); + cp->c_need_rvnode_put_after_truncate_unlock = true; + } + } + + CLR(cp->c_flag, (C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE + | C_NEED_DVNODE_PUT | C_NEED_RVNODE_PUT)); + + if (c_flag & (C_NEED_DVNODE_PUT | C_NEED_DATA_SETSIZE)) { + vp = cp->c_vp; + } + if (c_flag & (C_NEED_RVNODE_PUT | C_NEED_RSRC_SETSIZE)) { + rvp = cp->c_rsrc_vp; + } + + cp->c_lockowner = NULL; + lck_rw_unlock_exclusive(&cp->c_rwlock); + } else { + lck_rw_unlock_shared(&cp->c_rwlock); + } + + /* Perform any vnode post processing after cnode lock is dropped. */ + if (vp) { + if (c_flag & C_NEED_DATA_SETSIZE) { + ubc_setsize(vp, VTOF(vp)->ff_size); +#if HFS_COMPRESSION + /* + * If this is a compressed file, we need to reset the + * compression state. We will have set the size to zero + * above and it will get fixed up later (in exactly the + * same way that new vnodes are fixed up). Note that we + * should only be able to get here if the truncate lock is + * held exclusively and so we do the reset when that's + * unlocked. + */ + decmpfs_cnode *dp = VTOCMP(vp); + if (dp && decmpfs_cnode_get_vnode_state(dp) != FILE_TYPE_UNKNOWN) + cp->c_need_decmpfs_reset = true; +#endif + } + if (c_flag & C_NEED_DVNODE_PUT) + vnode_put(vp); + } + if (rvp) { + if (c_flag & C_NEED_RSRC_SETSIZE) + ubc_setsize(rvp, VTOF(rvp)->ff_size); + if (c_flag & C_NEED_RVNODE_PUT) + vnode_put(rvp); + } +} + +/* + * Unlock a pair of cnodes. + */ +void +hfs_unlockpair(struct cnode *cp1, struct cnode *cp2) +{ + hfs_unlock(cp1); + if (cp2 != cp1) + hfs_unlock(cp2); +} + +/* + * Unlock a group of cnodes. + */ +void +hfs_unlockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct cnode *cp4) +{ + struct cnode * list[4]; + int i, k = 0; + + if (cp1) { + hfs_unlock(cp1); + list[k++] = cp1; + } + if (cp2) { + for (i = 0; i < k; ++i) { + if (list[i] == cp2) + goto skip1; + } + hfs_unlock(cp2); + list[k++] = cp2; + } +skip1: + if (cp3) { + for (i = 0; i < k; ++i) { + if (list[i] == cp3) + goto skip2; + } + hfs_unlock(cp3); + list[k++] = cp3; + } +skip2: + if (cp4) { + for (i = 0; i < k; ++i) { + if (list[i] == cp4) + return; + } + hfs_unlock(cp4); + } +} + + +/* + * Protect a cnode against a truncation. + * + * Used mainly by read/write since they don't hold the + * cnode lock across calls to the cluster layer. + * + * The process doing a truncation must take the lock + * exclusive. The read/write processes can take it + * shared. The locktype argument is the same as supplied to + * hfs_lock. 
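+ *
+ * Illustrative usage sketch (editorial note, not part of the original source):
+ * a read/write path would typically bracket its calls into the cluster layer
+ * with a shared grab of this lock, e.g.
+ *
+ *	hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
+ *	... cluster I/O against the fork ...
+ *	hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+ *
+ * while a truncation path takes HFS_EXCLUSIVE_LOCK instead.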
+ */
+void
+hfs_lock_truncate(struct cnode *cp, enum hfs_locktype locktype, enum hfs_lockflags flags)
+{
+	thread_t thread = current_thread();
+
+	if (cp->c_truncatelockowner == thread) {
+		/*
+		 * Ignore grabbing the lock if the current thread already
+		 * holds exclusive lock.
+		 *
+		 * This is needed on the hfs_vnop_pagein path where we need to ensure
+		 * the file does not change sizes while we are paging in. However,
+		 * we may already hold the lock exclusive due to another
+		 * VNOP from earlier in the call stack. So if we already hold
+		 * the truncate lock exclusive, allow it to proceed, but ONLY if
+		 * it's in the recursive case.
+		 */
+		if ((flags & HFS_LOCK_SKIP_IF_EXCLUSIVE) == 0) {
+			panic("hfs_lock_truncate: cnode %p locked!", cp);
+		}
+	} else if (locktype == HFS_SHARED_LOCK) {
+		lck_rw_lock_shared(&cp->c_truncatelock);
+		cp->c_truncatelockowner = HFS_SHARED_OWNER;
+	} else { /* HFS_EXCLUSIVE_LOCK */
+		lck_rw_lock_exclusive(&cp->c_truncatelock);
+		cp->c_truncatelockowner = thread;
+	}
+}
+
+bool hfs_truncate_lock_upgrade(struct cnode *cp)
+{
+	hfs_assert(cp->c_truncatelockowner == HFS_SHARED_OWNER);
+	if (!lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock))
+		return false;
+	cp->c_truncatelockowner = current_thread();
+	return true;
+}
+
+void hfs_truncate_lock_downgrade(struct cnode *cp)
+{
+	hfs_assert(cp->c_truncatelockowner == current_thread());
+	lck_rw_lock_exclusive_to_shared(&cp->c_truncatelock);
+	cp->c_truncatelockowner = HFS_SHARED_OWNER;
+}
+
+/*
+ * Attempt to get the truncate lock. If it cannot be acquired, error out.
+ * This function is needed in the degenerate hfs_vnop_pagein during force unmount
+ * case. To prevent deadlocks while a VM copy object is moving pages, HFS vnop pagein will
+ * temporarily need to disable V2 semantics.
+ */
+int hfs_try_trunclock (struct cnode *cp, enum hfs_locktype locktype, enum hfs_lockflags flags)
+{
+	thread_t thread = current_thread();
+	boolean_t didlock = false;
+
+	if (cp->c_truncatelockowner == thread) {
+		/*
+		 * Ignore grabbing the lock if the current thread already
+		 * holds exclusive lock.
+		 *
+		 * This is needed on the hfs_vnop_pagein path where we need to ensure
+		 * the file does not change sizes while we are paging in. However,
+		 * we may already hold the lock exclusive due to another
+		 * VNOP from earlier in the call stack. So if we already hold
+		 * the truncate lock exclusive, allow it to proceed, but ONLY if
+		 * it's in the recursive case.
+		 */
+		if ((flags & HFS_LOCK_SKIP_IF_EXCLUSIVE) == 0) {
+			panic("hfs_lock_truncate: cnode %p locked!", cp);
+		}
+	} else if (locktype == HFS_SHARED_LOCK) {
+		didlock = lck_rw_try_lock(&cp->c_truncatelock, LCK_RW_TYPE_SHARED);
+		if (didlock) {
+			cp->c_truncatelockowner = HFS_SHARED_OWNER;
+		}
+	} else { /* HFS_EXCLUSIVE_LOCK */
+		didlock = lck_rw_try_lock (&cp->c_truncatelock, LCK_RW_TYPE_EXCLUSIVE);
+		if (didlock) {
+			cp->c_truncatelockowner = thread;
+		}
+	}
+
+	return didlock;
+}
+
+
+/*
+ * Unlock the truncate lock, which protects against size changes.
+ *
+ * If HFS_LOCK_SKIP_IF_EXCLUSIVE flag was set, it means that a previous
+ * hfs_lock_truncate() might have skipped grabbing a lock because
+ * the current thread was already holding the lock exclusive and
+ * we may need to return from this function without actually unlocking
+ * the truncate lock.
+ */ +void +hfs_unlock_truncate(struct cnode *cp, enum hfs_lockflags flags) +{ + thread_t thread = current_thread(); + + /* + * If HFS_LOCK_SKIP_IF_EXCLUSIVE is set in the flags AND the current + * lock owner of the truncate lock is our current thread, then + * we must have skipped taking the lock earlier by in + * hfs_lock_truncate() by setting HFS_LOCK_SKIP_IF_EXCLUSIVE in the + * flags (as the current thread was current lock owner). + * + * If HFS_LOCK_SKIP_IF_EXCLUSIVE is not set (most of the time) then + * we check the lockowner field to infer whether the lock was taken + * exclusively or shared in order to know what underlying lock + * routine to call. + */ + if (flags & HFS_LOCK_SKIP_IF_EXCLUSIVE) { + if (cp->c_truncatelockowner == thread) { + return; + } + } + + /* HFS_LOCK_EXCLUSIVE */ + if (thread == cp->c_truncatelockowner) { + vnode_t vp = NULL, rvp = NULL; + + /* + * If there are pending set sizes, the cnode lock should be dropped + * first. + */ + hfs_assert(!(cp->c_lockowner == thread + && ISSET(cp->c_flag, C_NEED_DATA_SETSIZE | C_NEED_RSRC_SETSIZE))); + + if (cp->c_need_dvnode_put_after_truncate_unlock) { + vp = cp->c_vp; + cp->c_need_dvnode_put_after_truncate_unlock = false; + } + if (cp->c_need_rvnode_put_after_truncate_unlock) { + rvp = cp->c_rsrc_vp; + cp->c_need_rvnode_put_after_truncate_unlock = false; + } + +#if HFS_COMPRESSION + bool reset_decmpfs = cp->c_need_decmpfs_reset; + cp->c_need_decmpfs_reset = false; +#endif + + cp->c_truncatelockowner = NULL; + lck_rw_unlock_exclusive(&cp->c_truncatelock); + +#if HFS_COMPRESSION + if (reset_decmpfs) { + decmpfs_cnode *dp = cp->c_decmp; + if (dp && decmpfs_cnode_get_vnode_state(dp) != FILE_TYPE_UNKNOWN) + decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); + } +#endif + + // Do the puts now + if (vp) + vnode_put(vp); + if (rvp) + vnode_put(rvp); + } else { /* HFS_LOCK_SHARED */ + lck_rw_unlock_shared(&cp->c_truncatelock); + } +} diff --git a/core/hfs_cnode.h b/core/hfs_cnode.h new file mode 100644 index 0000000..088c445 --- /dev/null +++ b/core/hfs_cnode.h @@ -0,0 +1,630 @@ +/* + * Copyright (c) 2002-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _HFS_CNODE_H_ +#define _HFS_CNODE_H_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#include +#include +#include +#include +#include +#include +#if HFS_COMPRESSION +#include +#endif +#if CONFIG_PROTECT +#include +#endif +#include + +#include "hfs_catalog.h" +#include "rangelist.h" +#include "hfs_dbg.h" + +/* + * The filefork is used to represent an HFS file fork (data or resource). + * Reading or writing any of these fields requires holding cnode lock. + */ +struct filefork { + struct cnode *ff_cp; /* cnode associated with this fork */ + struct rl_head ff_invalidranges; /* Areas of disk that should read back as zeroes */ + union { + void *ffu_sysfileinfo; /* additional info for system files */ + char *ffu_symlinkptr; /* symbolic link pathname */ + } ff_union; + struct cat_fork ff_data; /* fork data (size, extents) */ +}; +typedef struct filefork filefork_t; + + +#define HFS_TEMPLOOKUP_NAMELEN 32 + +/* + * Catalog Lookup struct (runtime) + * + * This is used so that when we need to malloc a container for a catalog + * lookup operation, we can acquire memory for everything in one fell swoop + * as opposed to putting many of these objects on the stack. The cat_fork + * data structure can take up 100+bytes easily, and that can add to stack + * overhead. + * + * As a result, we use this to easily pass around the memory needed for a + * lookup operation. + */ +struct cat_lookup_buffer { + struct cat_desc lookup_desc; + struct cat_attr lookup_attr; + struct filefork lookup_fork; + struct componentname lookup_cn; + char lookup_name[HFS_TEMPLOOKUP_NAMELEN]; /* for open-unlinked paths only */ +}; + + +/* Aliases for common fields */ +#define ff_size ff_data.cf_size +#define ff_new_size ff_data.cf_new_size +#define ff_clumpsize ff_data.cf_clump +#define ff_bytesread ff_data.cf_bytesread +#define ff_extents ff_data.cf_extents + +/* + * Note that the blocks fields are protected by the cnode lock, *not* + * the truncate lock. + */ +#define ff_blocks ff_data.cf_blocks +#define ff_unallocblocks ff_data.cf_vblocks +static inline uint32_t ff_allocblocks(filefork_t *ff) +{ + hfs_assert(ff->ff_blocks >= ff->ff_unallocblocks); + return ff->ff_blocks - ff->ff_unallocblocks; +} + +#define ff_symlinkptr ff_union.ffu_symlinkptr +#define ff_sysfileinfo ff_union.ffu_sysfileinfo + + +/* The btree code still needs these... */ +#define fcbEOF ff_size +#define fcbExtents ff_extents +#define fcbBTCBPtr ff_sysfileinfo + +typedef u_int8_t atomicflag_t; + + +/* + * Hardlink Origin (for hardlinked directories). + */ +struct linkorigin { + TAILQ_ENTRY(linkorigin) lo_link; /* chain */ + void * lo_thread; /* thread that performed the lookup */ + cnid_t lo_cnid; /* hardlink's cnid */ + cnid_t lo_parentcnid; /* hardlink's parent cnid */ +}; +typedef struct linkorigin linkorigin_t; + +#define MAX_CACHED_ORIGINS 10 +#define MAX_CACHED_FILE_ORIGINS 8 + +/* + * The cnode is used to represent each active (or recently active) + * file or directory in the HFS filesystem. + * + * Reading or writing any of these fields requires holding c_lock. 
+ */ +struct cnode { + lck_rw_t c_rwlock; /* cnode's lock */ + thread_t c_lockowner; /* cnode's lock owner (exclusive case only) */ + lck_rw_t c_truncatelock; /* protects file from truncation during read/write */ + thread_t c_truncatelockowner; /* truncate lock owner (exclusive case only) */ + LIST_ENTRY(cnode) c_hash; /* cnode's hash chain */ + u_int32_t c_flag; /* cnode's runtime flags */ + u_int32_t c_hflag; /* cnode's flags for maintaining hash - protected by global hash lock */ + struct vnode *c_vp; /* vnode for data fork or dir */ + struct vnode *c_rsrc_vp; /* vnode for resource fork */ + struct dquot *c_dquot[MAXQUOTAS]; /* cnode's quota info */ + u_int32_t c_childhint; /* catalog hint for children (small dirs only) */ + u_int32_t c_dirthreadhint; /* catalog hint for directory's thread rec */ + struct cat_desc c_desc; /* cnode's descriptor */ + struct cat_attr c_attr; /* cnode's attributes */ + TAILQ_HEAD(hfs_originhead, linkorigin) c_originlist; /* hardlink origin cache */ + TAILQ_HEAD(hfs_hinthead, directoryhint) c_hintlist; /* readdir directory hint list */ + int16_t c_dirhinttag; /* directory hint tag */ + union { + int16_t cu_dirhintcnt; /* directory hint count */ + int16_t cu_syslockcount; /* system file use only */ + } c_union; + u_int32_t c_dirchangecnt; /* changes each insert/delete (in-core only) */ + struct filefork *c_datafork; /* cnode's data fork */ + struct filefork *c_rsrcfork; /* cnode's rsrc fork */ + atomicflag_t c_touch_acctime; + atomicflag_t c_touch_chgtime; + atomicflag_t c_touch_modtime; + + // The following flags are protected by the truncate lock + union { + struct { + bool c_need_dvnode_put_after_truncate_unlock : 1; + bool c_need_rvnode_put_after_truncate_unlock : 1; +#if HFS_COMPRESSION + bool c_need_decmpfs_reset : 1; +#endif + }; + uint8_t c_tflags; + }; + + /* + * Where we're using a journal, we keep track of the last + * transaction that we did an update in. If a minor modification + * is made, we'll still push it if we're still on the same + * transaction. + */ + uint32_t c_update_txn; + +#if HFS_COMPRESSION + struct decmpfs_cnode *c_decmp; +#endif /* HFS_COMPRESSION */ +#if CONFIG_PROTECT + struct cprotect *c_cpentry; /* content protection data */ +#endif + +#if HFS_MALLOC_DEBUG + // N.B. 
— *must* always be last + uint64_t magic; +#endif +}; +typedef struct cnode cnode_t; + +/* Aliases for common cnode fields */ +#define c_cnid c_desc.cd_cnid +#define c_hint c_desc.cd_hint +#define c_parentcnid c_desc.cd_parentcnid +#define c_encoding c_desc.cd_encoding + +#define c_fileid c_attr.ca_fileid +#define c_mode c_attr.ca_mode +#define c_linkcount c_attr.ca_linkcount +#define c_uid c_attr.ca_uid +#define c_gid c_attr.ca_gid +#define c_rdev c_attr.ca_union1.cau_rdev +#define c_atime c_attr.ca_atime +#define c_mtime c_attr.ca_mtime +#define c_ctime c_attr.ca_ctime +#define c_itime c_attr.ca_itime +#define c_btime c_attr.ca_btime +#define c_bsdflags c_attr.ca_flags +#define c_finderinfo c_attr.ca_finderinfo +#define c_blocks c_attr.ca_union2.cau_blocks +#define c_entries c_attr.ca_union2.cau_entries +#define c_zftimeout c_childhint + +#define c_dirhintcnt c_union.cu_dirhintcnt +#define c_syslockcount c_union.cu_syslockcount + + +/* hash maintenance flags kept in c_hflag and protected by hfs_chash_mutex */ +#define H_ALLOC 0x00001 /* CNode is being allocated */ +#define H_ATTACH 0x00002 /* CNode is being attached to by another vnode */ +#define H_TRANSIT 0x00004 /* CNode is getting recycled */ +#define H_WAITING 0x00008 /* CNode is being waited for */ + + +/* + * Runtime cnode flags (kept in c_flag) + */ +#define C_NEED_RVNODE_PUT 0x0000001 /* Need to do a vnode_put on c_rsrc_vp after the unlock */ +#define C_NEED_DVNODE_PUT 0x0000002 /* Need to do a vnode_put on c_vp after the unlock */ +#define C_ZFWANTSYNC 0x0000004 /* fsync requested and file has holes */ +#define C_FROMSYNC 0x0000008 /* fsync was called from sync */ + +#define C_MODIFIED 0x0000010 /* CNode has been modified */ +#define C_NOEXISTS 0x0000020 /* CNode has been deleted, catalog entry is gone */ +#define C_DELETED 0x0000040 /* CNode has been marked to be deleted */ +#define C_HARDLINK 0x0000080 /* CNode is a hard link (file or dir) */ + +/* + * A minor modification is one where the volume would not be inconsistent if + * the change was not pushed to disk. For example, changes to times. + */ +#define C_MINOR_MOD 0x0000100 /* CNode has a minor modification */ + +#define C_HASXATTRS 0x0000200 /* cnode has extended attributes */ +#define C_NEG_ENTRIES 0x0000400 /* directory has negative name entries */ +/* + * For C_SSD_STATIC: SSDs may want to deal with the file payload data in a + * different manner knowing that the content is not likely to be modified. This is + * purely advisory at the HFS level, and is not maintained after the cnode goes out of core. + */ +#define C_SSD_STATIC 0x0000800 /* Assume future writes contain static content */ + +#define C_NEED_DATA_SETSIZE 0x0001000 /* Do a ubc_setsize(0) on c_rsrc_vp after the unlock */ +#define C_NEED_RSRC_SETSIZE 0x0002000 /* Do a ubc_setsize(0) on c_vp after the unlock */ +#define C_DIR_MODIFICATION 0x0004000 /* Directory is being modified, wait for lookups */ +#define C_ALWAYS_ZEROFILL 0x0008000 /* Always zero-fill the file on an fsync */ + +#define C_RENAMED 0x0010000 /* cnode was deleted as part of rename; C_DELETED should also be set */ +#define C_NEEDS_DATEADDED 0x0020000 /* cnode needs date-added written to the finderinfo bit */ +#define C_BACKINGSTORE 0x0040000 /* cnode is a backing store for an existing or currently-mounting filesystem */ + +/* + * This flag indicates the cnode might be dirty because it + * was mapped writable so if we get any page-outs, update + * the modification and change times. 
+ */ +#define C_MIGHT_BE_DIRTY_FROM_MAPPING 0x0080000 + +/* + * For C_SSD_GREEDY_MODE: SSDs may want to write the file payload data using the greedy mode knowing + * that the content needs to be written out to the disk quicker than normal at the expense of storage efficiency. + * This is purely advisory at the HFS level, and is not maintained after the cnode goes out of core. + */ +#define C_SSD_GREEDY_MODE 0x0100000 /* Assume future writes are recommended to be written in SLC mode */ + +/* 0x0200000 is currently unused */ + +#define C_IO_ISOCHRONOUS 0x0400000 /* device-specific isochronous throughput I/O */ + +#define ZFTIMELIMIT (5 * 60) + +/* + * The following is the "invisible" bit from the fdFlags field + * in the FndrFileInfo. + */ +enum { kFinderInvisibleMask = 1 << 14 }; + + +/* + * Convert between cnode pointers and vnode pointers + */ +#define VTOC(vp) ((struct cnode *)vnode_fsnode((vp))) + +#define CTOV(cp,rsrc) (((rsrc) && S_ISREG((cp)->c_mode)) ? \ + (cp)->c_rsrc_vp : (cp)->c_vp) + +/* + * Convert between vnode pointers and file forks + * + * Note: no CTOF since that is ambiguous + */ + +#define FTOC(fp) ((fp)->ff_cp) + +#define VTOF(vp) ((vp) == VTOC((vp))->c_rsrc_vp ? \ + VTOC((vp))->c_rsrcfork : \ + VTOC((vp))->c_datafork) + +#define VCTOF(vp, cp) ((vp) == (cp)->c_rsrc_vp ? \ + (cp)->c_rsrcfork : \ + (cp)->c_datafork) + +#define FTOV(fp) ((fp) == FTOC(fp)->c_rsrcfork ? \ + FTOC(fp)->c_rsrc_vp : \ + FTOC(fp)->c_vp) + +/* + * This is a helper function used for determining whether or not a cnode has become open + * unlinked in between the time we acquired its vnode and the time we acquire the cnode lock + * to start manipulating it. Due to the SMP nature of VFS, it is probably necessary to + * use this macro every time we acquire a cnode lock, as the content of the Cnode may have + * been modified in between the lookup and a VNOP. Whether or not to call this is dependent + * upon the VNOP in question. Sometimes it is OK to use an open-unlinked file, for example, when + * reading. But other times, such as on the source of a VNOP_RENAME, it should be disallowed. + */ +int hfs_checkdeleted(struct cnode *cp); + +/* + * Test for a resource fork + */ +#define FORK_IS_RSRC(fp) ((fp) == FTOC(fp)->c_rsrcfork) + +#define VNODE_IS_RSRC(vp) ((vp) == VTOC((vp))->c_rsrc_vp) + +#if HFS_COMPRESSION +/* + * VTOCMP(vp) returns a pointer to vp's decmpfs_cnode; this could be NULL + * if the file is not compressed or if hfs_file_is_compressed() hasn't + * yet been called on this file. + */ +#define VTOCMP(vp) (VTOC((vp))->c_decmp) +int hfs_file_is_compressed(struct cnode *cp, int skiplock); +int hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *vp, cnid_t fid, off_t *size, int skiplock); +int hfs_hides_rsrc(vfs_context_t ctx, struct cnode *cp, int skiplock); +int hfs_hides_xattr(vfs_context_t ctx, struct cnode *cp, const char *name, int skiplock); +#endif + +#define ATIME_ONDISK_ACCURACY 300 + +static inline bool hfs_should_save_atime(cnode_t *cp) +{ + /* + * We only write atime updates to disk if the delta is greater + * than ATIME_ONDISK_ACCURACY.
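+ * For example, with ATIME_ONDISK_ACCURACY at 300 seconds: if ca_atimeondisk
+ * holds time T and the in-core c_atime has only advanced to T + 120, the
+ * access is not considered worth flushing; once c_atime is more than 300
+ * seconds ahead of the on-disk copy, it is.  The first clause of the test
+ * below also forces a save whenever c_atime has moved backwards relative to
+ * ca_atimeondisk (e.g. the access time was explicitly set to an earlier
+ * value).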
+ */ + return (cp->c_atime < cp->c_attr.ca_atimeondisk + || cp->c_atime - cp->c_attr.ca_atimeondisk > ATIME_ONDISK_ACCURACY); +} + +typedef enum { + HFS_NOT_DIRTY = 0, + HFS_DIRTY = 1, + HFS_DIRTY_ATIME = 2 +} hfs_dirty_t; + +static inline hfs_dirty_t hfs_is_dirty(cnode_t *cp) +{ + if (ISSET(cp->c_flag, C_NOEXISTS)) + return HFS_NOT_DIRTY; + + if (ISSET(cp->c_flag, C_MODIFIED | C_MINOR_MOD | C_NEEDS_DATEADDED) + || cp->c_touch_chgtime || cp->c_touch_modtime) { + return HFS_DIRTY; + } + + if (cp->c_touch_acctime || hfs_should_save_atime(cp)) + return HFS_DIRTY_ATIME; + + return HFS_NOT_DIRTY; +} + +/* This overlays the FileID portion of NFS file handles. */ +struct hfsfid { + u_int32_t hfsfid_cnid; /* Catalog node ID. */ + u_int32_t hfsfid_gen; /* Generation number (create date). */ +}; + + +/* Get new default vnode */ +extern int hfs_getnewvnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, + struct cat_desc *descp, int flags, struct cat_attr *attrp, + struct cat_fork *forkp, struct vnode **vpp, int *out_flags); + +/* Input flags for hfs_getnewvnode */ + +#define GNV_WANTRSRC 0x01 /* Request the resource fork vnode. */ +#define GNV_SKIPLOCK 0x02 /* Skip taking the cnode lock (when getting resource fork). */ +#define GNV_CREATE 0x04 /* The vnode is for a newly created item. */ +#define GNV_NOCACHE 0x08 /* Delay entering this item in the name cache */ +#define GNV_USE_VP 0x10 /* Use the vnode provided in *vpp instead of creating a new one */ + +/* Output flags for hfs_getnewvnode */ +#define GNV_CHASH_RENAMED 0x01 /* The cnode was renamed in-flight */ +#define GNV_CAT_DELETED 0x02 /* The cnode was deleted from the catalog */ +#define GNV_NEW_CNODE 0x04 /* We are vending out a newly initialized cnode */ +#define GNV_CAT_ATTRCHANGED 0x08 /* Something in struct cat_attr changed in between cat_lookups */ + + +/* Touch cnode times based on c_touch_xxx flags */ +extern void hfs_touchtimes(struct hfsmount *, struct cnode *); +extern void hfs_write_dateadded (struct cat_attr *cattrp, u_int32_t dateadded); +extern u_int32_t hfs_get_dateadded (struct cnode *cp); +extern u_int32_t hfs_get_dateadded_from_blob(const uint8_t * /* finderinfo */, mode_t /* mode */); + +/* Gen counter methods */ +extern void hfs_write_gencount(struct cat_attr *cattrp, uint32_t gencount); +extern uint32_t hfs_get_gencount(struct cnode *cp); +extern uint32_t hfs_incr_gencount (struct cnode *cp); +extern uint32_t hfs_get_gencount_from_blob(const uint8_t * /* finderinfo */, mode_t /* mode */); + +/* Document id methods */ +extern uint32_t hfs_get_document_id(struct cnode * /* cp */); +extern uint32_t hfs_get_document_id_from_blob(const uint8_t * /* finderinfo */, mode_t /* mode */); + +/* Zero-fill file and push regions out to disk */ +enum { + // Use this flag if you're going to sync later + HFS_FILE_DONE_NO_SYNC = 1, +}; +typedef uint32_t hfs_file_done_opts_t; +extern int hfs_filedone(struct vnode *vp, vfs_context_t context, + hfs_file_done_opts_t opts); + +/* + * HFS cnode hash functions. 
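+ *
+ * These functions maintain the per-mount hash that maps file IDs to in-core
+ * cnodes.  As a hedged usage sketch (argument values are illustrative, not
+ * taken from a real call site), looking up an already-resident data-fork
+ * vnode for a given file ID could look like:
+ *
+ *     struct vnode *vp = hfs_chash_getvnode(hfsmp, fileid,
+ *                                           0,    // wantrsrc
+ *                                           0,    // skiplock
+ *                                           0);   // allow_deleted
+ *
+ * which is expected to return NULL when no matching cnode is in the hash; in
+ * that case the caller typically falls back to hfs_getnewvnode(), which uses
+ * hfs_chash_getcnode() to find or create the cnode (marked H_ALLOC while the
+ * allocation is in flight).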
+ */ +extern void hfs_chashinit(void); +extern void hfs_chashinit_finish(struct hfsmount *hfsmp); +extern void hfs_delete_chash(struct hfsmount *hfsmp); +extern int hfs_chashremove(struct hfsmount *hfsmp, struct cnode *cp); +extern void hfs_chash_abort(struct hfsmount *hfsmp, struct cnode *cp); +extern void hfs_chash_rehash(struct hfsmount *hfsmp, struct cnode *cp1, struct cnode *cp2); +extern void hfs_chashwakeup(struct hfsmount *hfsmp, struct cnode *cp, int flags); +extern void hfs_chash_mark_in_transit(struct hfsmount *hfsmp, struct cnode *cp); + +extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, + int skiplock, int allow_deleted); +extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, + int wantrsrc, int skiplock, int *out_flags, int *hflags); +extern int hfs_chash_snoop(struct hfsmount *, ino_t, int, int (*)(const cnode_t *, void *), void *); +extern int hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, + cnid_t cnid, struct cat_attr *cattr, int *error); + +extern int hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid); + +/* + * HFS cnode lock functions. + * + * HFS Locking Order: + * + * 1. cnode truncate lock (if needed) -- see below for more on this + * + * + hfs_vnop_pagein/out handles recursive use of this lock (by + * using flag option HFS_LOCK_SKIP_IF_EXCLUSIVE) although there + * are issues with this (see #16620278). + * + * + If locking multiple cnodes then the truncate lock must be taken on + * all (in address order), before taking the cnode locks. + * + * 2. Hot Files stage mutex (grabbed before manipulating individual vnodes/cnodes) + * + * 3. cnode locks in address order (if needed) + * + * 4. journal (if needed) + * + * 5. Hot Files B-Tree lock (not treated as a system file) + * + * 6. system files (as needed) + * + * A. Catalog B-tree file + * B. Attributes B-tree file + * C. Startup file (if there is one) + * D. Allocation Bitmap file (always exclusive, supports recursion) + * E. Overflow Extents B-tree file (always exclusive, supports recursion) + * + * 7. hfs mount point (always last) + * + * + * I. HFS cnode hash lock (must not acquire any new locks while holding this lock, always taken last) + */ + +/* + * -- The Truncate Lock -- + * + * The truncate lock is used for a few purposes (more than its name + * might suggest). The first thing to note is that the cnode lock + * cannot be held whilst issuing any I/O other than metadata changes, + * so the truncate lock, in either shared or exclusive form, must + * usually be held in these cases. This includes calls to ubc_setsize + * where the new size is less than the current size known to the VM + * subsystem (for two reasons: a) because reaping pages can block + * (e.g. on pages that are busy or being cleaned); b) reaping pages + * might require page-in for tasks that have that region mapped + * privately). The same applies to other calls into the VM subsystem. + * + * Here are some (but not necessarily all) cases that the truncate + * lock protects for: + * + * + When reading and writing a file, we hold the truncate lock + * shared to ensure that the underlying blocks cannot be deleted + * and on systems that use content protection, this also ensures + * the keys remain valid (which might be being used by the + * underlying layers). + * + * + We need to protect against the following sequence of events: + * + * A file is initially size X. A thread issues an append to that + * file. 
Another thread truncates the file and then extends it + * to a new size Y. Now the append can be applied at offset X + * and then the data is lost when the file is truncated; or it + * could be applied after the truncate, i.e. at offset 0; or it + * can be applied at offset Y. What we *cannot* do is apply the + * append at offset X and for the data to be visible at the end. + * (Note that we are free to choose when we apply the append + * operation.) + * + * To solve this, we keep things simple and take the truncate lock + * exclusively in order to sequence the append with other size + * changes. Therefore any size change must take the truncate lock + * exclusively. + * + * (N.B. we could do better and allow readers to run concurrently + * during the append and other size changes.) + * + * So here are the rules: + * + * + If you plan to change ff_size, you must take the truncate lock + * exclusively, *but* be careful what I/O you do whilst you have + * the truncate lock exclusively and try and avoid it if you can: + * if the VM subsystem tries to do something with some pages on a + * different thread and you try and do some I/O with those same + * pages, we will deadlock. (See #16620278.) + * + * + If you do anything that requires blocks to not be deleted or + * encryption keys to remain valid, you must take the truncate lock + * shared. + * + * + And it follows, therefore, that if you want to delete blocks or + * delete keys, you must take the truncate lock exclusively. Note + * that for asynchronous writes, the truncate lock will be dropped + * after issuing I/O but before the I/O has completed, which means + * that before manipulating keys, you *must* issue + * vnode_wait_for_writes in addition to holding the truncate lock. + * + * N.B. ff_size is actually protected by the cnode lock and so you + * must hold the cnode lock exclusively to change it and shared to + * read it.
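+ *
+ * As a hedged illustration of these rules (not an excerpt of any particular
+ * VNOP; error handling and the actual I/O are elided), a read-style path
+ * that needs blocks and keys to stay valid, but does not change ff_size,
+ * would follow the ordering like this:
+ *
+ *     hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
+ *     error = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
+ *     // ... examine ff_size and set up the transfer under the cnode lock ...
+ *     hfs_unlock(cp);             // cnode lock must be dropped before I/O
+ *     // ... issue the I/O while still holding the truncate lock shared ...
+ *     hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+ *
+ * Anything that will change ff_size (truncate, extend, append) instead takes
+ * the truncate lock with HFS_EXCLUSIVE_LOCK, and holds the cnode lock
+ * exclusively around the ff_size update itself.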
+ * + */ + +enum hfs_locktype { + HFS_SHARED_LOCK = 1, + HFS_EXCLUSIVE_LOCK = 2 +}; + +/* Option flags for cnode and truncate lock functions */ +enum hfs_lockflags { + HFS_LOCK_DEFAULT = 0x0, /* Default flag, no options provided */ + HFS_LOCK_ALLOW_NOEXISTS = 0x1, /* Allow locking of all cnodes, including cnode marked deleted with no catalog entry */ + HFS_LOCK_SKIP_IF_EXCLUSIVE = 0x2, /* Skip locking if the current thread already holds the lock exclusive */ + + // Used when you do not want to check return from hfs_lock + HFS_LOCK_ALWAYS = HFS_LOCK_ALLOW_NOEXISTS, +}; +#define HFS_SHARED_OWNER (void *)0xffffffff + +void hfs_lock_always(cnode_t *cnode, enum hfs_locktype); +int hfs_lock(struct cnode *, enum hfs_locktype, enum hfs_lockflags); +bool hfs_lock_upgrade(cnode_t *cp); +int hfs_lockpair(struct cnode *, struct cnode *, enum hfs_locktype); +int hfs_lockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *, + enum hfs_locktype, struct cnode **); +void hfs_unlock(struct cnode *); +void hfs_unlockpair(struct cnode *, struct cnode *); +void hfs_unlockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *); + +void hfs_lock_truncate(struct cnode *, enum hfs_locktype, enum hfs_lockflags); +bool hfs_truncate_lock_upgrade(struct cnode *cp); +void hfs_truncate_lock_downgrade(struct cnode *cp); +void hfs_unlock_truncate(struct cnode *, enum hfs_lockflags); +int hfs_try_trunclock(struct cnode *, enum hfs_locktype, enum hfs_lockflags); + +extern int hfs_systemfile_lock(struct hfsmount *, int, enum hfs_locktype); +extern void hfs_systemfile_unlock(struct hfsmount *, int); + +void hfs_clear_might_be_dirty_flag(cnode_t *cp); + +int hfs_set_bsd_flags(struct hfsmount *, struct cnode *, + u_int32_t, u_int32_t, vfs_context_t, int *); +bool hfs_is_journal_file(struct hfsmount *, struct cnode *); + +// cnode must be locked +static inline __attribute__((pure)) +bool hfs_has_rsrc(const cnode_t *cp) +{ + if (cp->c_rsrcfork) + return cp->c_rsrcfork->ff_blocks > 0; + else + return cp->c_datafork && cp->c_blocks > cp->c_datafork->ff_blocks; +} + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ + +#endif /* ! _HFS_CNODE_H_ */ diff --git a/core/hfs_cprotect.c b/core/hfs_cprotect.c new file mode 100644 index 0000000..3fa485b --- /dev/null +++ b/core/hfs_cprotect.c @@ -0,0 +1,2773 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#if CONFIG_PROTECT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_cnode.h" +#include "hfs_fsctl.h" +#include "hfs_cprotect.h" +#include "hfs_iokit.h" + +#if HFS_CONFIG_KEY_ROLL +#include "hfs_key_roll.h" +#endif + +#define PTR_ADD(type, base, offset) (type)((uintptr_t)(base) + (offset)) + +extern int (**hfs_vnodeop_p) (void *); + +/* + * CP private functions + */ +static int cp_root_major_vers(mount_t mp); +static int cp_getxattr(cnode_t *, struct hfsmount *hfsmp, struct cprotect **); +static void cp_entry_dealloc(hfsmount_t *hfsmp, struct cprotect *entry); +static int cp_restore_keys(struct cprotect *, struct hfsmount *hfsmp, struct cnode *); +static int cp_lock_vnode_callback(vnode_t, void *); +static int cp_vnode_is_eligible (vnode_t); +static int cp_check_access (cnode_t *cp, struct hfsmount *hfsmp, int vnop); +static int cp_unwrap(struct hfsmount *, struct cprotect *, struct cnode *); +static void cp_init_access(aks_cred_t access, struct cnode *cp); + +// -- cp_key_pair accessors -- + +void cpkp_init(cp_key_pair_t *cpkp, uint16_t max_pers_key_len, + uint16_t max_cached_key_len) +{ + cpkp->cpkp_max_pers_key_len = max_pers_key_len; + cpkp->cpkp_pers_key_len = 0; + cpx_init(cpkp_cpx(cpkp), max_cached_key_len); + + // Default to using offsets + cpx_set_use_offset_for_iv(cpkp_cpx(cpkp), true); +} + +uint16_t cpkp_max_pers_key_len(const cp_key_pair_t *cpkp) +{ + return cpkp->cpkp_max_pers_key_len; +} + +uint16_t cpkp_pers_key_len(const cp_key_pair_t *cpkp) +{ + return cpkp->cpkp_pers_key_len; +} + +static bool cpkp_has_pers_key(const cp_key_pair_t *cpkp) +{ + return cpkp->cpkp_pers_key_len > 0; +} + +static void *cpkp_pers_key(const cp_key_pair_t *cpkp) +{ + return PTR_ADD(void *, &cpkp->cpkp_cpx, cpx_sizex(cpkp_cpx(cpkp))); +} + +static void cpkp_set_pers_key_len(cp_key_pair_t *cpkp, uint16_t key_len) +{ + if (key_len > cpkp->cpkp_max_pers_key_len) + panic("hfs_cprotect: key too big!"); + cpkp->cpkp_pers_key_len = key_len; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-qual" +cpx_t cpkp_cpx(const cp_key_pair_t *cpkp) +{ + // Cast to remove const qualifier + return (cpx_t)&cpkp->cpkp_cpx; +} +#pragma clang diagnostic pop + +size_t cpkp_size(uint16_t pers_key_len, uint16_t cached_key_len) +{ + return sizeof(cp_key_pair_t) + pers_key_len + cpx_size(cached_key_len); +} + +size_t cpkp_sizex(const cp_key_pair_t *cpkp) +{ + return cpkp_size(cpkp->cpkp_max_pers_key_len, cpx_max_key_len(cpkp_cpx(cpkp))); +} + +void cpkp_flush(cp_key_pair_t *cpkp) +{ + cpx_flush(cpkp_cpx(cpkp)); + cpkp->cpkp_pers_key_len = 0; + bzero(cpkp_pers_key(cpkp), cpkp->cpkp_max_pers_key_len); +} + +bool cpkp_can_copy(const cp_key_pair_t *src, const cp_key_pair_t *dst) +{ + return (cpkp_pers_key_len(src) <= dst->cpkp_max_pers_key_len + && cpx_can_copy(cpkp_cpx(src), cpkp_cpx(dst))); +} + +void cpkp_copy(const cp_key_pair_t *src, cp_key_pair_t *dst) +{ + const uint16_t key_len = cpkp_pers_key_len(src); + cpkp_set_pers_key_len(dst, key_len); + memcpy(cpkp_pers_key(dst), cpkp_pers_key(src), key_len); + cpx_copy(cpkp_cpx(src), cpkp_cpx(dst)); +} + +// -- + +bool cp_is_supported_version(uint16_t vers) +{ + return vers == CP_VERS_4 || vers == CP_VERS_5; +} + +/* + * Return the appropriate key and, if requested, the physical offset and 
+ * maximum length for a particular I/O operation. + */ +void cp_io_params(__unused hfsmount_t *hfsmp, cprotect_t cpr, + __unused off_rsrc_t off_rsrc, + __unused int direction, cp_io_params_t *io_params) +{ +#if HFS_CONFIG_KEY_ROLL + hfs_cp_key_roll_ctx_t *ckr = cpr->cp_key_roll_ctx; + + if (ckr && off_rsrc < ckr->ckr_off_rsrc) { + /* + * When we're in the process of rolling an extent, ckr_off_rsrc will + * indicate the end of the extent. + */ + const off_rsrc_t roll_loc = ckr->ckr_off_rsrc + - hfs_blk_to_bytes(ckr->ckr_roll_extent.blockCount, + hfsmp->blockSize); + + if (off_rsrc < roll_loc) { + io_params->max_len = roll_loc - off_rsrc; + io_params->phys_offset = -1; + } else { + /* + * We should never get reads to the extent we're rolling + * because the pages should be locked in the UBC. If we + * did get reads it's not obvious what the right thing to + * do is either: we could read from the old location, but + * we might have written later data to the new location, + * or we could read from the new location, but data might + * not have been written there yet. + * + * Note that whilst raw encrypted reads don't lock any + * pages, or take a cluster_read_direct lock, the call to + * hfs_key_roll_up_to in hfs_vnop_read will have ensured + * that the file has been rolled beyond the offset being + * read so this path should never be taken in that case. + */ + hfs_assert(direction == VNODE_WRITE); + + // For release builds, just in case... + if (direction == VNODE_READ) { + // Use the old key and offset + goto old_key; + } + + io_params->max_len = ckr->ckr_off_rsrc - off_rsrc; + io_params->phys_offset = hfs_blk_to_bytes(ckr->ckr_roll_extent.startBlock, + hfsmp->blockSize) + off_rsrc - roll_loc; + } + + // Use new key + io_params->cpx = cpkp_cpx(&ckr->ckr_keys); + return; + } +old_key: + // Use old key... +#endif + + io_params->max_len = INT64_MAX; + io_params->phys_offset = -1; + io_params->cpx = cpkp_cpx(&cpr->cp_keys); +} + +static void cp_flush_cached_keys(cprotect_t cpr) +{ + cpx_flush(cpkp_cpx(&cpr->cp_keys)); +#if HFS_CONFIG_KEY_ROLL + if (cpr->cp_key_roll_ctx) + cpx_flush(cpkp_cpx(&cpr->cp_key_roll_ctx->ckr_keys)); +#endif +} + +static bool cp_needs_pers_key(cprotect_t cpr) +{ + if (CP_CLASS(cpr->cp_pclass) == PROTECTION_CLASS_F) + return !cpx_has_key(cpkp_cpx(&cpr->cp_keys)); + else + return !cpkp_has_pers_key(&cpr->cp_keys); +} + +static cp_key_revision_t cp_initial_key_revision(__unused hfsmount_t *hfsmp) +{ + return 1; +} + +cp_key_revision_t cp_next_key_revision(cp_key_revision_t rev) +{ + rev = (rev + 0x0100) ^ (mach_absolute_time() & 0xff); + if (!rev) + rev = 1; + return rev; +} + +/* + * Allocate and initialize a cprotect blob for a new cnode. + * Called from hfs_getnewvnode: cnode is locked exclusive. + * + * Read xattr data off the cnode. Then, if conditions permit, + * unwrap the file key and cache it in the cprotect blob. + */ +int +cp_entry_init(struct cnode *cp, struct mount *mp) +{ + struct cprotect *entry = NULL; + int error = 0; + struct hfsmount *hfsmp = VFSTOHFS(mp); + + /* + * The cnode should be locked at this point, regardless of whether or not + * we are creating a new item in the namespace or vending a vnode on behalf + * of lookup. The only time we tell getnewvnode to skip the lock is when + * constructing a resource fork vnode. But a resource fork vnode must come + * after the regular data fork cnode has already been constructed. 
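+ *
+ * A hedged, simplified sketch of the expected calling pattern (the real call
+ * site is hfs_getnewvnode, which already holds the lock; "mp" is the mount
+ * the cnode lives on):
+ *
+ *     hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
+ *     error = cp_entry_init(cp, mp);
+ *     hfs_unlock(cp);
+ *
+ * On a mount without MNT_CPROTECT, or for anything other than a regular file
+ * or directory, this leaves cp->c_cpentry NULL and returns 0; on failure the
+ * partially built entry is destroyed and c_cpentry is likewise left NULL.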
+ */ + if (!cp_fs_protected (mp)) { + cp->c_cpentry = NULL; + return 0; + } + + if (!S_ISREG(cp->c_mode) && !S_ISDIR(cp->c_mode)) { + cp->c_cpentry = NULL; + return 0; + } + + if (hfsmp->hfs_running_cp_major_vers == 0) { + panic ("hfs cp: no running mount point version! "); + } + + hfs_assert(cp->c_cpentry == NULL); + + error = cp_getxattr(cp, hfsmp, &entry); + if (error == ENOATTR) { + /* + * Normally, we should always have a CP EA for a file or directory that + * we are initializing here. However, there are some extenuating circumstances, + * such as the root directory immediately following a newfs_hfs. + * + * As a result, we leave code here to deal with an ENOATTR which will always + * default to a 'D/NONE' key, though we don't expect to use it much. + */ + cp_key_class_t target_class = PROTECTION_CLASS_D; + + if (S_ISDIR(cp->c_mode)) { + target_class = PROTECTION_CLASS_DIR_NONE; + } + + cp_key_revision_t key_revision = cp_initial_key_revision(hfsmp); + + /* allow keybag to override our class preferences */ + error = cp_new (&target_class, hfsmp, cp, cp->c_mode, CP_KEYWRAP_DIFFCLASS, + key_revision, (cp_new_alloc_fn)cp_entry_alloc, (void **)&entry); + if (error == 0) { + entry->cp_pclass = target_class; + entry->cp_key_os_version = cp_os_version(); + entry->cp_key_revision = key_revision; + error = cp_setxattr (cp, entry, hfsmp, cp->c_fileid, XATTR_CREATE); + } + } + + /* + * Bail out if: + * a) error was not ENOATTR (we got something bad from the getxattr call) + * b) we encountered an error setting the xattr above. + * c) we failed to generate a new cprotect data structure. + */ + if (error) { + goto out; + } + + cp->c_cpentry = entry; + +out: + if (error == 0) { + entry->cp_backing_cnode = cp; + } + else { + if (entry) { + cp_entry_destroy(hfsmp, entry); + } + cp->c_cpentry = NULL; + } + + return error; +} + +/* + * cp_setup_newentry + * + * Generate a keyless cprotect structure for use with the new AppleKeyStore kext. + * Since the kext is now responsible for vending us both wrapped/unwrapped keys + * we need to create a keyless xattr upon file / directory creation. When we have the inode value + * and the file/directory is established, then we can ask it to generate keys. Note that + * this introduces a potential race; If the device is locked and the wrapping + * keys are purged between the time we call this function and the time we ask it to generate + * keys for us, we could have to fail the open(2) call and back out the entry. + */ + +int cp_setup_newentry (struct hfsmount *hfsmp, struct cnode *dcp, + cp_key_class_t suppliedclass, mode_t cmode, + struct cprotect **tmpentry) +{ + int isdir = 0; + struct cprotect *entry = NULL; + uint32_t target_class = hfsmp->default_cp_class; + suppliedclass = CP_CLASS(suppliedclass); + + if (hfsmp->hfs_running_cp_major_vers == 0) { + panic ("CP: major vers not set in mount!"); + } + + if (S_ISDIR (cmode)) { + isdir = 1; + } + + /* Decide the target class. Input argument takes priority. */ + if (cp_is_valid_class (isdir, suppliedclass)) { + /* caller supplies -1 if it was not specified so we will default to the mount point value */ + target_class = suppliedclass; + /* + * One exception, F is never valid for a directory + * because its children may inherit and userland will be + * unable to read/write to the files. 
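+ *
+ * Concretely (illustrative cases, following the selection logic below):
+ *
+ *  + caller supplies class A for a new file: the file gets class A.
+ *  + caller supplies class F for a new directory: rejected with EINVAL.
+ *  + no valid class supplied, parent directory is class C: the new file or
+ *    directory inherits class C.
+ *  + no valid class supplied, parent is DIR_NONE: a new directory stays
+ *    DIR_NONE, but a new file falls back to the mount's default class.
+ *  + no valid class supplied and no usable parent class: the mount's default
+ *    class (hfsmp->default_cp_class) is used.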
+ */ + if (isdir) { + if (target_class == PROTECTION_CLASS_F) { + *tmpentry = NULL; + return EINVAL; + } + } + } + else { + /* + * If no valid class was supplied, behave differently depending on whether or not + * the item being created is a file or directory. + * + * for FILE: + * If parent directory has a non-zero class, use that. + * If parent directory has a zero class (not set), then attempt to + * apply the mount point default. + * + * for DIRECTORY: + * Directories always inherit from the parent; if the parent + * has a NONE class set, then we can continue to use that. + */ + if ((dcp) && (dcp->c_cpentry)) { + uint32_t parentclass = CP_CLASS(dcp->c_cpentry->cp_pclass); + /* If the parent class is not valid, default to the mount point value */ + if (cp_is_valid_class(1, parentclass)) { + if (isdir) { + target_class = parentclass; + } + else if (parentclass != PROTECTION_CLASS_DIR_NONE) { + /* files can inherit so long as it's not NONE */ + target_class = parentclass; + } + } + /* Otherwise, we already defaulted to the mount point's default */ + } + } + + /* Generate the cprotect to vend out */ + entry = cp_entry_alloc(NULL, 0, 0, NULL); + if (entry == NULL) { + *tmpentry = NULL; + return ENOMEM; + } + + /* + * We don't have keys yet, so fill in what we can. At this point + * this blob has no keys and it has no backing xattr. We just know the + * target class. + */ + entry->cp_flags = CP_NO_XATTR; + /* Note this is only the effective class */ + entry->cp_pclass = target_class; + *tmpentry = entry; + + return 0; +} + +/* + * Set up an initial key/class pair for a disassociated cprotect entry. + * This function is used to generate transient keys that will never be + * written to disk. We use class F for this since it provides the exact + * semantics that are needed here. Because we never attach this blob to + * a cnode directly, we take a pointer to the cprotect struct. + * + * This function is primarily used in the HFS FS truncation codepath + * where we may rely on AES symmetry to relocate encrypted data from + * one spot in the disk to another. + */ +int cpx_gentempkeys(cpx_t *pcpx, __unused struct hfsmount *hfsmp) +{ + cpx_t cpx = cpx_alloc(CP_MAX_KEYSIZE); + + cpx_set_key_len(cpx, CP_MAX_KEYSIZE); + read_random(cpx_key(cpx), CP_MAX_KEYSIZE); + cpx_set_use_offset_for_iv(cpx, true); + + *pcpx = cpx; + + return 0; +} + +/* + * Tear down and clear a cprotect blob for a closing file. + * Called at hfs_reclaim_cnode: cnode is locked exclusive. + */ +void +cp_entry_destroy(hfsmount_t *hfsmp, struct cprotect *entry_ptr) +{ + if (entry_ptr == NULL) { + /* nothing to clean up */ + return; + } + cp_entry_dealloc(hfsmp, entry_ptr); +} + + +int +cp_fs_protected (mount_t mnt) +{ + return (vfs_flags(mnt) & MNT_CPROTECT); +} + + +/* + * Return a pointer to underlying cnode if there is one for this vnode. + * Done without taking cnode lock, inspecting only vnode state. + */ +struct cnode * +cp_get_protected_cnode(struct vnode *vp) +{ + if (!cp_vnode_is_eligible(vp)) { + return NULL; + } + + if (!cp_fs_protected(VTOVFS(vp))) { + /* mount point doesn't support it */ + return NULL; + } + + return vnode_fsnode(vp); +} + + +/* + * Sets *class to persistent class associated with vnode, + * or returns error. + */ +int +cp_vnode_getclass(struct vnode *vp, cp_key_class_t *class) +{ + struct cprotect *entry; + int error = 0; + struct cnode *cp; + int took_truncate_lock = 0; + struct hfsmount *hfsmp = NULL; + + /* Is this an interesting vp? 
*/ + if (!cp_vnode_is_eligible (vp)) { + return EBADF; + } + + /* Is the mount point formatted for content protection? */ + if (!cp_fs_protected(VTOVFS(vp))) { + return ENOTSUP; + } + + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + + /* + * Take the truncate lock up-front in shared mode because we may need + * to manipulate the CP blob. Pend lock events until we're done here. + */ + hfs_lock_truncate (cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = 1; + + /* + * We take only the shared cnode lock up-front. If it turns out that + * we need to manipulate the CP blob to write a key out, drop the + * shared cnode lock and acquire an exclusive lock. + */ + error = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + if (error) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return error; + } + + /* pull the class from the live entry */ + entry = cp->c_cpentry; + + if (entry == NULL) { + panic("Content Protection: uninitialized cnode %p", cp); + } + + /* Note that we may not have keys yet, but we know the target class. */ + + if (error == 0) { + *class = CP_CLASS(entry->cp_pclass); + } + + if (took_truncate_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + + hfs_unlock(cp); + return error; +} + +/* + * Sets persistent class for this file or directory. + * If vnode cannot be protected (system file, non-regular file, non-hfs), EBADF. + * If the new class can't be accessed now, EPERM. + * Otherwise, record class and re-wrap key if the mount point is content-protected. + */ +int +cp_vnode_setclass(struct vnode *vp, cp_key_class_t newclass) +{ + struct cnode *cp; + struct cprotect *entry = 0; + int error = 0; + int took_truncate_lock = 0; + struct hfsmount *hfsmp = NULL; + int isdir = 0; + + if (vnode_isdir (vp)) { + isdir = 1; + } + + /* Ensure we only use the effective class here */ + newclass = CP_CLASS(newclass); + + if (!cp_is_valid_class(isdir, newclass)) { + printf("hfs: CP: cp_setclass called with invalid class %d\n", newclass); + return EINVAL; + } + + /* Is this an interesting vp? */ + if (!cp_vnode_is_eligible(vp)) { + return EBADF; + } + + /* Is the mount point formatted for content protection? */ + if (!cp_fs_protected(VTOVFS(vp))) { + return ENOTSUP; + } + + hfsmp = VTOHFS(vp); + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + + /* + * Take the cnode truncate lock exclusive because we want to manipulate the + * CP blob. The lock-event handling code is doing the same. This also forces + * all pending IOs to drain before we can re-write the persistent and cache keys. + */ + cp = VTOC(vp); + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = 1; + + /* + * The truncate lock is not sufficient to guarantee the CP blob + * isn't being used. We must wait for existing writes to finish. + */ + vnode_waitforwrites(vp, 0, 0, 0, "cp_vnode_setclass"); + + if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) { + return EINVAL; + } + + entry = cp->c_cpentry; + if (entry == NULL) { + error = EINVAL; + goto out; + } + + /* + * re-wrap per-file key with new class. + * Generate an entirely new key if switching to F. + */ + if (vnode_isreg(vp)) { + /* + * The vnode is a file. Before proceeding with the re-wrap, we need + * to unwrap the keys before proceeding. This is to ensure that + * the destination class's properties still work appropriately for the + * target class (since B allows I/O but an unwrap prior to the next unlock + * will not be allowed). 
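+ *
+ * For example (an illustrative reading of the code that follows): if the
+ * cached key is not resident, cp_restore_keys() is called first, so that
+ * even when the file ends up in a class whose key cannot be unwrapped again
+ * until the next unlock (class B while the device is locked being the
+ * canonical case), the in-core key survives the re-wrap and any I/O that is
+ * still permitted keeps working.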
+ */ + if (!cpx_has_key(cpkp_cpx(&entry->cp_keys))) { + error = cp_restore_keys (entry, hfsmp, cp); + if (error) { + goto out; + } + } + + if (newclass == PROTECTION_CLASS_F) { + /* Verify that file is blockless if switching to class F */ + if (cp->c_datafork->ff_size > 0) { + error = EINVAL; + goto out; + } + + cp_key_pair_t *cpkp; + cprotect_t new_entry = cp_entry_alloc(NULL, 0, CP_MAX_KEYSIZE, &cpkp); + + if (!new_entry) { + error = ENOMEM; + goto out; + } + + /* newclass is only the effective class */ + new_entry->cp_pclass = newclass; + new_entry->cp_key_os_version = cp_os_version(); + new_entry->cp_key_revision = cp_next_key_revision(entry->cp_key_revision); + + cpx_t cpx = cpkp_cpx(cpkp); + + /* Class F files are not wrapped, so they continue to use MAX_KEYSIZE */ + cpx_set_key_len(cpx, CP_MAX_KEYSIZE); + read_random (cpx_key(cpx), CP_MAX_KEYSIZE); + + cp_replace_entry(hfsmp, cp, new_entry); + + error = 0; + goto out; + } + + /* Deny the setclass if file is to be moved from F to something else */ + if (entry->cp_pclass == PROTECTION_CLASS_F) { + error = EPERM; + goto out; + } + + if (!cpkp_has_pers_key(&entry->cp_keys)) { + struct cprotect *new_entry = NULL; + /* + * We want to fail if we can't wrap to the target class. By not setting + * CP_KEYWRAP_DIFFCLASS, we tell keygeneration that if it can't wrap + * to 'newclass' then error out. + */ + uint32_t flags = 0; + error = cp_generate_keys (hfsmp, cp, newclass, flags, &new_entry); + if (error == 0) { + cp_replace_entry (hfsmp, cp, new_entry); + } + /* Bypass the setxattr code below since generate_keys does it for us */ + goto out; + } + + cprotect_t new_entry; + error = cp_rewrap(cp, hfsmp, &newclass, &entry->cp_keys, entry, + (cp_new_alloc_fn)cp_entry_alloc, (void **)&new_entry); + if (error) { + /* we didn't have perms to set this class. leave file as-is and error out */ + goto out; + } + +#if HFS_CONFIG_KEY_ROLL + hfs_cp_key_roll_ctx_t *new_key_roll_ctx = NULL; + if (entry->cp_key_roll_ctx) { + error = cp_rewrap(cp, hfsmp, &newclass, &entry->cp_key_roll_ctx->ckr_keys, + entry->cp_key_roll_ctx, + (cp_new_alloc_fn)hfs_key_roll_ctx_alloc, + (void **)&new_key_roll_ctx); + + if (error) { + cp_entry_dealloc(hfsmp, new_entry); + goto out; + } + + new_entry->cp_key_roll_ctx = new_key_roll_ctx; + } +#endif + + new_entry->cp_pclass = newclass; + + cp_replace_entry(hfsmp, cp, new_entry); + entry = new_entry; + } + else if (vnode_isdir(vp)) { + /* For directories, just update the pclass. newclass is only effective class */ + entry->cp_pclass = newclass; + error = 0; + } + else { + /* anything else, just error out */ + error = EINVAL; + goto out; + } + + /* + * We get here if the new class was F, or if we were re-wrapping a cprotect that already + * existed. If the keys were never generated, then they'll skip the setxattr calls. + */ + + error = cp_setxattr(cp, cp->c_cpentry, VTOHFS(vp), 0, XATTR_REPLACE); + if (error == ENOATTR) { + error = cp_setxattr(cp, cp->c_cpentry, VTOHFS(vp), 0, XATTR_CREATE); + } + +out: + + if (took_truncate_lock) { + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + } + hfs_unlock(cp); + return error; +} + + +int cp_vnode_transcode(vnode_t vp, cp_key_t *k) +{ + struct cnode *cp; + struct cprotect *entry = 0; + int error = 0; + int took_truncate_lock = 0; + struct hfsmount *hfsmp = NULL; + + /* Structures passed between HFS and AKS */ + struct aks_cred_s access_in; + struct aks_wrapped_key_s wrapped_key_in, wrapped_key_out; + + /* Is this an interesting vp? 
*/ + if (!cp_vnode_is_eligible(vp)) { + return EBADF; + } + + /* Is the mount point formatted for content protection? */ + if (!cp_fs_protected(VTOVFS(vp))) { + return ENOTSUP; + } + + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + + /* + * Take the cnode truncate lock exclusive because we want to manipulate the + * CP blob. The lock-event handling code is doing the same. This also forces + * all pending IOs to drain before we can re-write the persistent and cache keys. + */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = 1; + + if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) { + return EINVAL; + } + + entry = cp->c_cpentry; + if (entry == NULL) { + error = EINVAL; + goto out; + } + + /* Send the per-file key in wrapped form for re-wrap with the current class information + * Send NULLs in the output parameters of the wrapper() and AKS will do the rest. + * Don't need to process any outputs, so just clear the locks and pass along the error. */ + if (vnode_isreg(vp)) { + + /* Picked up the following from cp_wrap(). + * If needed, more comments available there. */ + + if (CP_CLASS(entry->cp_pclass) == PROTECTION_CLASS_F) { + error = EINVAL; + goto out; + } + + cp_init_access(&access_in, cp); + + bzero(&wrapped_key_in, sizeof(wrapped_key_in)); + bzero(&wrapped_key_out, sizeof(wrapped_key_out)); + + cp_key_pair_t *cpkp = &entry->cp_keys; + +#if HFS_CONFIG_KEY_ROLL + if (entry->cp_key_roll_ctx) + cpkp = &entry->cp_key_roll_ctx->ckr_keys; +#endif + + wrapped_key_in.key = cpkp_pers_key(cpkp); + wrapped_key_in.key_len = cpkp_pers_key_len(cpkp); + + if (!wrapped_key_in.key_len) { + error = EINVAL; + goto out; + } + + /* Use the actual persistent class when talking to AKS */ + wrapped_key_in.dp_class = entry->cp_pclass; + wrapped_key_out.key = k->key; + wrapped_key_out.key_len = k->len; + + error = hfs_backup_key(&access_in, + &wrapped_key_in, + &wrapped_key_out); + + if(error) + error = EPERM; + else + k->len = wrapped_key_out.key_len; + } + +out: + if (took_truncate_lock) { + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + } + hfs_unlock(cp); + return error; +} + + +/* + * Check permission for the given operation (read, write) on this node. + * Additionally, if the node needs work, do it: + * - create a new key for the file if one hasn't been set before + * - write out the xattr if it hasn't already been saved + * - unwrap the key if needed + * + * Takes cnode lock, and upgrades to exclusive if modifying cprotect. + * + * Note that this function does *NOT* take the cnode truncate lock. This is because + * the thread calling us may already have the truncate lock. It is not necessary + * because either we successfully finish this function before the keys are tossed + * and the IO will fail, or the keys are tossed and then this function will fail. + * Either way, the cnode lock still ultimately guards the keys. We only rely on the + * truncate lock to protect us against tossing the keys as a cluster call is in-flight. + */ +int +cp_handle_vnop(struct vnode *vp, int vnop, int ioflag) +{ + struct cprotect *entry; + int error = 0; + struct hfsmount *hfsmp = NULL; + struct cnode *cp = NULL; + + /* + * First, do validation against the vnode before proceeding any further: + * Is this vnode originating from a valid content-protected filesystem ? + */ + if (cp_vnode_is_eligible(vp) == 0) { + /* + * It is either not HFS or not a file/dir. Just return success. 
This is a valid + * case if servicing i/o against another filesystem type from VFS + */ + return 0; + } + + if (cp_fs_protected (VTOVFS(vp)) == 0) { + /* + * The underlying filesystem does not support content protection. This is also + * a valid case. Simply return success. + */ + return 0; + } + + /* + * At this point, we know we have a HFS vnode that backs a file or directory on a + * filesystem that supports content protection + */ + cp = VTOC(vp); + + if ((error = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { + return error; + } + + entry = cp->c_cpentry; + + if (entry == NULL) { + /* + * If this cnode is not content protected, simply return success. + * Note that this function is called by all I/O-based call sites + * when CONFIG_PROTECT is enabled during XNU building. + */ + + /* + * All files should have cprotect structs. It's possible to encounter + * a directory from a V2.0 CP system but all files should have protection + * EAs + */ + if (vnode_isreg(vp)) { + error = EPERM; + } + + goto out; + } + + vp = CTOV(cp, 0); + if (vp == NULL) { + /* is it a rsrc */ + vp = CTOV(cp,1); + if (vp == NULL) { + error = EINVAL; + goto out; + } + } + hfsmp = VTOHFS(vp); + + if ((error = cp_check_access(cp, hfsmp, vnop))) { + /* check for raw encrypted access before bailing out */ + if ((ioflag & IO_ENCRYPTED) +#if HFS_CONFIG_KEY_ROLL + // If we're rolling, we need the keys + && !hfs_is_key_rolling(cp) +#endif + && (vnop == CP_READ_ACCESS)) { + /* + * read access only + asking for the raw encrypted bytes + * is legitimate, so reset the error value to 0 + */ + error = 0; + } + else { + goto out; + } + } + + if (!ISSET(entry->cp_flags, CP_NO_XATTR)) { + if (!S_ISREG(cp->c_mode)) + goto out; + + // If we have a persistent key and the cached key, we're done + if (!cp_needs_pers_key(entry) + && cpx_has_key(cpkp_cpx(&entry->cp_keys))) { + goto out; + } + } + + /* upgrade to exclusive lock */ + if (lck_rw_lock_shared_to_exclusive(&cp->c_rwlock) == FALSE) { + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return error; + } + } else { + cp->c_lockowner = current_thread(); + } + + /* generate new keys if none have ever been saved */ + if (cp_needs_pers_key(entry)) { + struct cprotect *newentry = NULL; + /* + * It's ok if this ends up being wrapped in a different class than 'pclass'. + * class modification is OK here. 
+ */ + uint32_t flags = CP_KEYWRAP_DIFFCLASS; + + error = cp_generate_keys (hfsmp, cp, CP_CLASS(cp->c_cpentry->cp_pclass), flags, &newentry); + if (error == 0) { + cp_replace_entry (hfsmp, cp, newentry); + entry = newentry; + } + else { + goto out; + } + } + + /* unwrap keys if needed */ + if (!cpx_has_key(cpkp_cpx(&entry->cp_keys))) { + if ((vnop == CP_READ_ACCESS) && (ioflag & IO_ENCRYPTED)) { + /* no need to try to restore keys; they are not going to be used */ + error = 0; + } + else { + error = cp_restore_keys(entry, hfsmp, cp); + if (error) { + goto out; + } + } + } + + /* write out the xattr if it's new */ + if (entry->cp_flags & CP_NO_XATTR) + error = cp_setxattr(cp, entry, VTOHFS(cp->c_vp), 0, XATTR_CREATE); + +out: + + hfs_unlock(cp); + return error; +} + +#if HFS_TMPDBG +#if !SECURE_KERNEL +static void cp_log_eperm (struct vnode* vp, int pclass, boolean_t create) { + char procname[256] = {}; + const char *fname = "unknown"; + const char *dbgop = "open"; + + int ppid = proc_selfpid(); + /* selfname does a strlcpy so we're OK */ + proc_selfname(procname, sizeof(procname)); + if (vp && vp->v_name) { + /* steal from the namecache */ + fname = vp->v_name; + } + + if (create) { + dbgop = "create"; + } + + printf("proc %s (pid %d) class %d, op: %s failure @ file %s\n", procname, ppid, pclass, dbgop, fname); +} +#endif +#endif + + +int +cp_handle_open(struct vnode *vp, int mode) +{ + struct cnode *cp = NULL ; + struct cprotect *entry = NULL; + struct hfsmount *hfsmp; + int error = 0; + + /* If vnode not eligible, just return success */ + if (!cp_vnode_is_eligible(vp)) { + return 0; + } + + /* If mount point not properly set up, then also return success */ + if (!cp_fs_protected(VTOVFS(vp))) { + return 0; + } + + cp = VTOC(vp); + + // Allow if raw encrypted mode requested + if (ISSET(mode, FENCRYPTED)) { +#if HFS_CONFIG_KEY_ROLL + // If we're rolling, we need the keys + hfs_lock_always(cp, HFS_SHARED_LOCK); + bool rolling = hfs_is_key_rolling(cp); + hfs_unlock(cp); + if (!rolling) + return 0; +#else + return 0; +#endif + } + if (ISSET(mode, FUNENCRYPTED)) { + return 0; + } + + /* We know the vnode is in a valid state. Acquire cnode and validate */ + hfsmp = VTOHFS(vp); + + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return error; + } + + entry = cp->c_cpentry; + if (entry == NULL) { + /* + * If the mount is protected and we couldn't get a cprotect for this vnode, + * then it's not valid for opening. + */ + if (vnode_isreg(vp)) { + error = EPERM; + } + goto out; + } + + if (!S_ISREG(cp->c_mode)) + goto out; + + /* + * Does the cnode have keys yet? If not, then generate them. + */ + if (cp_needs_pers_key(entry)) { + struct cprotect *newentry = NULL; + /* Allow the keybag to override our class preferences */ + uint32_t flags = CP_KEYWRAP_DIFFCLASS; + error = cp_generate_keys (hfsmp, cp, CP_CLASS(cp->c_cpentry->cp_pclass), flags, &newentry); + if (error == 0) { + cp_replace_entry (hfsmp, cp, newentry); + entry = newentry; + } + else { + goto out; + } + } + + /* + * We want to minimize the number of unwraps that we'll have to do since + * the cost can vary, depending on the platform we're running. + */ + switch (CP_CLASS(entry->cp_pclass)) { + case PROTECTION_CLASS_B: + if (mode & O_CREAT) { + /* + * Class B always allows creation. Since O_CREAT was passed through + * we infer that this was a newly created vnode/cnode. Even though a potential + * race exists when multiple threads attempt to create/open a particular + * file, only one can "win" and actually create it. 
VFS will unset the + * O_CREAT bit on the loser. + * + * Note that skipping the unwrap check here is not a security issue -- + * we have to unwrap the key permanently upon the first I/O. + */ + break; + } + + if (cpx_has_key(cpkp_cpx(&entry->cp_keys)) && !ISSET(mode, FENCRYPTED)) { + /* + * For a class B file, attempt the unwrap if we have the key in + * core already. + * The device could have just transitioned into the lock state, and + * this vnode may not yet have been purged from the vnode cache (which would + * remove the keys). + */ + struct aks_cred_s access_in; + struct aks_wrapped_key_s wrapped_key_in; + + cp_init_access(&access_in, cp); + bzero(&wrapped_key_in, sizeof(wrapped_key_in)); + wrapped_key_in.key = cpkp_pers_key(&entry->cp_keys); + wrapped_key_in.key_len = cpkp_pers_key_len(&entry->cp_keys); + /* Use the persistent class when talking to AKS */ + wrapped_key_in.dp_class = entry->cp_pclass; + error = hfs_unwrap_key(&access_in, &wrapped_key_in, NULL); + if (error) { + error = EPERM; + } + break; + } + /* otherwise, fall through to attempt the unwrap/restore */ + case PROTECTION_CLASS_A: + case PROTECTION_CLASS_C: + /* + * At this point, we know that we need to attempt an unwrap if needed; we want + * to make sure that open(2) fails properly if the device is either just-locked + * or never made it past first unlock. Since the keybag serializes access to the + * unwrapping keys for us and only calls our VFS callback once they've been purged, + * we will get here in two cases: + * + * A) we're in a window before the wrapping keys are purged; this is OK since when they get + * purged, the vnode will get flushed if needed. + * + * B) The keys are already gone. In this case, the restore_keys call below will fail. + * + * Since this function is bypassed entirely if we're opening a raw encrypted file, + * we can always attempt the restore. + */ + if (!cpx_has_key(cpkp_cpx(&entry->cp_keys))) { + error = cp_restore_keys(entry, hfsmp, cp); + } + + if (error) { + error = EPERM; + } + + break; + + case PROTECTION_CLASS_D: + default: + break; + } + +out: + +#if HFS_TMPDBG +#if !SECURE_KERNEL + if ((hfsmp->hfs_cp_verbose) && (error == EPERM)) { + cp_log_eperm (vp, CP_CLASS(entry->cp_pclass), false); + } +#endif +#endif + + hfs_unlock(cp); + return error; +} + + +/* + * cp_getrootxattr: + * Gets the EA we set on the root folder (fileid 1) to get information about the + * version of Content Protection that was used to write to this filesystem. + * Note that all multi-byte fields are written to disk little endian so they must be + * converted to native endian-ness as needed. + */ +int +cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) +{ + void *buf; + + /* + * We allow for an extra 64 bytes to cater for upgrades. This wouldn't + * be necessary if the xattr routines just returned what we asked for.
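+ *
+ * As a worked example (the 40-byte figure is hypothetical, not the actual
+ * sizeof): if sizeof(struct cp_root_xattr) were 40 bytes, the allocation
+ * below would be roundup(40 + 64, 64) = 128 bytes, i.e. the requested size
+ * rounded up with 64 bytes of headroom for newer, larger on-disk layouts.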
+ */ + size_t bufsize = roundup(sizeof(struct cp_root_xattr) + 64, 64); + + int error = 0; + + hfs_assert(outxattr); + + buf = hfs_malloc(bufsize); + + uio_t uio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + + uio_addiov(uio, CAST_USER_ADDR_T(buf), bufsize); + + size_t attrsize = bufsize; + + struct vnop_getxattr_args args = { + .a_uio = uio, + .a_name = CONTENT_PROTECTION_XATTR_NAME, + .a_size = &attrsize + }; + + error = hfs_getxattr_internal(NULL, &args, hfsmp, 1); + + uio_free(uio); + + if (error != 0) { + goto out; + } + + if (attrsize < CP_ROOT_XATTR_MIN_LEN) { + error = HFS_EINCONSISTENT; + goto out; + } + + const struct cp_root_xattr *xattr = buf; + + bzero(outxattr, sizeof(*outxattr)); + + /* Now convert the multi-byte fields to native endianness */ + outxattr->major_version = OSSwapLittleToHostInt16(xattr->major_version); + outxattr->minor_version = OSSwapLittleToHostInt16(xattr->minor_version); + outxattr->flags = OSSwapLittleToHostInt64(xattr->flags); + + if (outxattr->major_version >= CP_VERS_5) { + if (attrsize < sizeof(struct cp_root_xattr)) { + error = HFS_EINCONSISTENT; + goto out; + } +#if HFS_CONFIG_KEY_ROLL + outxattr->auto_roll_min_version = OSSwapLittleToHostInt32(xattr->auto_roll_min_version); + outxattr->auto_roll_max_version = OSSwapLittleToHostInt32(xattr->auto_roll_max_version); +#endif + } + +out: + hfs_free(buf, bufsize); + return error; +} + +/* + * cp_setrootxattr: + * Sets the EA we set on the root folder (fileid 1) to get information about the + * version of Content Protection that was used to write to this filesystem. + * Note that all multi-byte fields are written to disk little endian so they must be + * converted to little endian as needed. + * + * This will be written to the disk when it detects the EA is not there, or when we need + * to make a modification to the on-disk version that can be done in-place. + */ +int +cp_setrootxattr(struct hfsmount *hfsmp, struct cp_root_xattr *newxattr) +{ + int error = 0; + struct vnop_setxattr_args args; + + args.a_desc = NULL; + args.a_vp = NULL; + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = NULL; //pass data ptr instead + args.a_options = 0; + args.a_context = NULL; //no context needed, only done from mount. + + const uint64_t flags = newxattr->flags; + + /* Now convert the multi-byte fields to little endian before writing to disk. 
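+ * For example, a major_version of 5 is stored on disk as the 16-bit
+ * little-endian byte sequence 05 00 regardless of host byte order; on a
+ * big-endian host OSSwapHostToLittleInt16() performs the byte swap and on a
+ * little-endian host it is a no-op.  cp_getrootxattr() undoes the conversion
+ * with the matching OSSwapLittleToHostInt* calls when the xattr is read back.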
*/ + newxattr->flags = OSSwapHostToLittleInt64(newxattr->flags); + + int xattr_size = sizeof(struct cp_root_xattr); + +#if HFS_CONFIG_KEY_ROLL + bool upgraded = false; + + if (newxattr->auto_roll_min_version || newxattr->auto_roll_max_version) { + if (newxattr->major_version < CP_VERS_5) { + printf("hfs: upgrading to cp version %u\n", CP_CURRENT_VERS); + + newxattr->major_version = CP_CURRENT_VERS; + newxattr->minor_version = CP_MINOR_VERS; + + upgraded = true; + } + + newxattr->auto_roll_min_version = OSSwapHostToLittleInt32(newxattr->auto_roll_min_version); + newxattr->auto_roll_max_version = OSSwapHostToLittleInt32(newxattr->auto_roll_max_version); + } else if (newxattr->major_version == CP_VERS_4) + xattr_size = offsetof(struct cp_root_xattr, auto_roll_min_version); +#endif + + newxattr->major_version = OSSwapHostToLittleInt16(newxattr->major_version); + newxattr->minor_version = OSSwapHostToLittleInt16(newxattr->minor_version); + + error = hfs_setxattr_internal(NULL, (caddr_t)newxattr, + xattr_size, &args, hfsmp, 1); + + if (!error) { + hfsmp->cproot_flags = flags; +#if HFS_CONFIG_KEY_ROLL + if (upgraded) + hfsmp->hfs_running_cp_major_vers = CP_CURRENT_VERS; +#endif + } + + return error; +} + + +/* + * Stores new xattr data on the cnode. + * cnode lock held exclusive (if available). + * + * This function is also invoked during file creation. + */ +int cp_setxattr(struct cnode *cp, struct cprotect *entry, struct hfsmount *hfsmp, + uint32_t fileid, int options) +{ + int error = 0; + cp_key_pair_t *cpkp = &entry->cp_keys; +#if HFS_CONFIG_KEY_ROLL + bool rolling = entry->cp_key_roll_ctx != NULL; + + if (rolling && entry->cp_key_roll_ctx->ckr_off_rsrc == INT64_MAX) { + // We've finished rolling, but we still have the context + rolling = false; + cpkp = &entry->cp_key_roll_ctx->ckr_keys; + } +#endif + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + + if (hfsmp->hfs_running_cp_major_vers < CP_CURRENT_VERS) { + // Upgrade + printf("hfs: upgrading to cp version %u\n", CP_CURRENT_VERS); + + struct cp_root_xattr root_xattr; + + error = cp_getrootxattr(hfsmp, &root_xattr); + if (error) + return error; + + root_xattr.major_version = CP_CURRENT_VERS; + root_xattr.minor_version = CP_MINOR_VERS; + + error = cp_setrootxattr(hfsmp, &root_xattr); + if (error) + return error; + + hfsmp->hfs_running_cp_major_vers = CP_CURRENT_VERS; + } + + struct cp_xattr_v5 *xattr; + xattr = hfs_malloc(sizeof(*xattr)); + + xattr->xattr_major_version = OSSwapHostToLittleConstInt16(CP_VERS_5); + xattr->xattr_minor_version = OSSwapHostToLittleConstInt16(CP_MINOR_VERS); + xattr->flags = 0; +#if HFS_CONFIG_KEY_ROLL + if (rolling) + xattr->flags |= CP_XAF_KEY_ROLLING; +#endif + xattr->persistent_class = OSSwapHostToLittleInt32(entry->cp_pclass); + xattr->key_os_version = OSSwapHostToLittleInt32(entry->cp_key_os_version); + xattr->key_revision = OSSwapHostToLittleInt16(entry->cp_key_revision); + + uint16_t key_len = cpkp_pers_key_len(cpkp); + xattr->key_len = OSSwapHostToLittleInt16(key_len); + memcpy(xattr->persistent_key, cpkp_pers_key(cpkp), key_len); + + size_t xattr_len = offsetof(struct cp_xattr_v5, persistent_key) + key_len; + +#if HFS_CONFIG_KEY_ROLL + if (rolling) { + struct cp_roll_info *roll_info = PTR_ADD(struct cp_roll_info *, xattr, xattr_len); + + roll_info->off_rsrc = OSSwapHostToLittleInt64(entry->cp_key_roll_ctx->ckr_off_rsrc); + + key_len = cpkp_pers_key_len(&entry->cp_key_roll_ctx->ckr_keys); + roll_info->key_len = OSSwapHostToLittleInt16(key_len); + + memcpy(roll_info->key, 
cpkp_pers_key(&entry->cp_key_roll_ctx->ckr_keys), key_len); + + xattr_len += offsetof(struct cp_roll_info, key) + key_len; + } +#endif + + struct vnop_setxattr_args args = { + .a_vp = cp ? cp->c_vp : NULL, + .a_name = CONTENT_PROTECTION_XATTR_NAME, + .a_options = options, + .a_context = vfs_context_current(), + }; + + error = hfs_setxattr_internal(cp, xattr, xattr_len, &args, hfsmp, fileid); + + hfs_free(xattr, sizeof(*xattr)); + + if (error == 0 ) { + entry->cp_flags &= ~CP_NO_XATTR; + } + + return error; +} + +/* + * Used by an fcntl to query the underlying FS for its content protection version # + */ + +int +cp_get_root_major_vers(vnode_t vp, uint32_t *level) +{ + int err = 0; + struct hfsmount *hfsmp = NULL; + struct mount *mp = NULL; + + mp = VTOVFS(vp); + + /* check if it supports content protection */ + if (cp_fs_protected(mp) == 0) { + return ENOTSUP; + } + + hfsmp = VFSTOHFS(mp); + /* figure out the level */ + + err = cp_root_major_vers(mp); + + if (err == 0) { + *level = hfsmp->hfs_running_cp_major_vers; + } + /* in error case, cp_root_major_vers will just return EINVAL. Use that */ + + return err; +} + +/* Used by fcntl to query default protection level of FS */ +int cp_get_default_level (struct vnode *vp, uint32_t *level) { + int err = 0; + struct hfsmount *hfsmp = NULL; + struct mount *mp = NULL; + + mp = VTOVFS(vp); + + /* check if it supports content protection */ + if (cp_fs_protected(mp) == 0) { + return ENOTSUP; + } + + hfsmp = VFSTOHFS(mp); + /* figure out the default */ + + *level = hfsmp->default_cp_class; + return err; +} + +/******************** + * Private Functions + *******************/ + +static int +cp_root_major_vers(mount_t mp) +{ + int err = 0; + struct cp_root_xattr xattr; + struct hfsmount *hfsmp = NULL; + + hfsmp = vfs_fsprivate(mp); + err = cp_getrootxattr (hfsmp, &xattr); + + if (err == 0) { + hfsmp->hfs_running_cp_major_vers = xattr.major_version; + } + else { + return EINVAL; + } + + return 0; +} + +static int +cp_vnode_is_eligible(struct vnode *vp) +{ + return !vnode_issystem(vp) && (vnode_isreg(vp) || vnode_isdir(vp)); +} + +#if DEBUG +static const uint32_t cp_magic1 = 0x7b727063; // cpr{ +static const uint32_t cp_magic2 = 0x7270637d; // }cpr +#endif + +struct cprotect * +cp_entry_alloc(cprotect_t old, uint16_t pers_key_len, + uint16_t cached_key_len, cp_key_pair_t **pcpkp) +{ + struct cprotect *cp_entry; + + if (pers_key_len > CP_MAX_WRAPPEDKEYSIZE) + return (NULL); + + size_t size = (sizeof(struct cprotect) - sizeof(cp_key_pair_t) + + cpkp_size(pers_key_len, cached_key_len)); + +#if DEBUG + size += 4; // Extra for magic2 +#endif + + cp_entry = hfs_malloc(size); + + if (old) { + memcpy(cp_entry, old, offsetof(struct cprotect, cp_keys)); + +#if HFS_CONFIG_KEY_ROLL + // We don't copy the key roll context + cp_entry->cp_key_roll_ctx = NULL; +#endif + } else { + bzero(cp_entry, offsetof(struct cprotect, cp_keys)); + } + +#if DEBUG + cp_entry->cp_magic1 = cp_magic1; + *PTR_ADD(uint32_t *, cp_entry, size - 4) = cp_magic2; +#endif + + cpkp_init(&cp_entry->cp_keys, pers_key_len, cached_key_len); + + /* + * If we've been passed the old entry, then we are in the process of + * rewrapping in which case we need to copy the cached key. This is + * important for class B files when the device is locked because we + * won't be able to unwrap whilst in this state, yet we still need the + * unwrapped key. 
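+ *
+ * A typical re-wrap therefore passes the old entry so the cached key is
+ * carried across; as a sketch (lengths illustrative, names hypothetical):
+ *
+ *     cp_key_pair_t *cpkp;
+ *     cprotect_t new_entry = cp_entry_alloc(old_entry, new_pers_key_len,
+ *                                           CP_MAX_CACHEBUFLEN, &cpkp);
+ *
+ * whereas a brand-new, keyless entry is allocated as
+ * cp_entry_alloc(NULL, 0, 0, NULL), as cp_setup_newentry() does above.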
+	 */
+	if (old)
+		cpx_copy(cpkp_cpx(&old->cp_keys), cpkp_cpx(&cp_entry->cp_keys));
+
+	if (pcpkp)
+		*pcpkp = &cp_entry->cp_keys;
+
+	return cp_entry;
+}
+
+static void
+cp_entry_dealloc(__unused hfsmount_t *hfsmp, struct cprotect *entry)
+{
+#if HFS_CONFIG_KEY_ROLL
+	hfs_release_key_roll_ctx(hfsmp, entry);
+#endif
+
+	cpkp_flush(&entry->cp_keys);
+
+	size_t entry_size = (sizeof(struct cprotect) - sizeof(cp_key_pair_t)
+						 + cpkp_sizex(&entry->cp_keys));
+
+#if DEBUG
+	hfs_assert(entry->cp_magic1 == cp_magic1);
+	hfs_assert(*PTR_ADD(uint32_t *, entry, sizeof(struct cprotect) - sizeof(cp_key_pair_t)
+		 + cpkp_sizex(&entry->cp_keys)) == cp_magic2);
+
+	entry_size += 4; // Extra for magic2
+#endif
+
+	hfs_free(entry, entry_size);
+}
+
+static int cp_read_xattr_v4(__unused hfsmount_t *hfsmp, struct cp_xattr_v4 *xattr,
+							size_t xattr_len, cprotect_t *pcpr, cp_getxattr_options_t options)
+{
+	/* Endian swap the multi-byte fields into host endianness from L.E. */
+	xattr->xattr_major_version = OSSwapLittleToHostInt16(xattr->xattr_major_version);
+	xattr->xattr_minor_version = OSSwapLittleToHostInt16(xattr->xattr_minor_version);
+	xattr->key_size = OSSwapLittleToHostInt32(xattr->key_size);
+	xattr->flags = OSSwapLittleToHostInt32(xattr->flags);
+	xattr->persistent_class = OSSwapLittleToHostInt32(xattr->persistent_class);
+	xattr->key_os_version = OSSwapLittleToHostInt32(xattr->key_os_version);
+
+	/*
+	 * Prevent a buffer overflow, and validate the key length obtained from the
+	 * EA. If it's too big, then bail out, because the EA can't be trusted at this
+	 * point.
+	 */
+	if (xattr->key_size > CP_MAX_WRAPPEDKEYSIZE)
+		return HFS_EINCONSISTENT;
+
+	size_t min_len = offsetof(struct cp_xattr_v4, persistent_key) + xattr->key_size;
+	if (xattr_len < min_len)
+		return HFS_EINCONSISTENT;
+
+	/*
+	 * Class F files have no backing key; their keylength should be 0,
+	 * though they should have the proper flags set.
+	 *
+	 * A request to instantiate a CP for a class F file should result
+	 * in a bzero'd cp that just says class F, with key_flushed set.
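+	 *
+	 * (That is why key_size is forced to 0 below for class F files and for
+	 * files marked CP_XAF_NEEDS_KEYS: no key bytes are copied out of the EA.)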
+ */ + if (CP_CLASS(xattr->persistent_class) == PROTECTION_CLASS_F + || ISSET(xattr->flags, CP_XAF_NEEDS_KEYS)) { + xattr->key_size = 0; + } + + /* set up entry with information from xattr */ + cp_key_pair_t *cpkp; + cprotect_t entry; + + if (ISSET(options, CP_GET_XATTR_BASIC_INFO)) { + /* caller passed in a pre-allocated structure to get the basic info */ + entry = *pcpr; + bzero(entry, offsetof(struct cprotect, cp_keys)); + } + else { + entry = cp_entry_alloc(NULL, xattr->key_size, CP_MAX_CACHEBUFLEN, &cpkp); + } + + entry->cp_pclass = xattr->persistent_class; + entry->cp_key_os_version = xattr->key_os_version; + + + if (!ISSET(options, CP_GET_XATTR_BASIC_INFO)) { + if (xattr->key_size) { + cpkp_set_pers_key_len(cpkp, xattr->key_size); + memcpy(cpkp_pers_key(cpkp), xattr->persistent_key, xattr->key_size); + } + + *pcpr = entry; + } + else if (xattr->key_size) { + SET(entry->cp_flags, CP_HAS_A_KEY); + } + + return 0; +} + +int cp_read_xattr_v5(hfsmount_t *hfsmp, struct cp_xattr_v5 *xattr, + size_t xattr_len, cprotect_t *pcpr, cp_getxattr_options_t options) +{ + if (xattr->xattr_major_version == OSSwapHostToLittleConstInt16(CP_VERS_4)) { + return cp_read_xattr_v4(hfsmp, (struct cp_xattr_v4 *)xattr, xattr_len, pcpr, options); + } + + xattr->xattr_major_version = OSSwapLittleToHostInt16(xattr->xattr_major_version); + + if (xattr->xattr_major_version != CP_VERS_5) { + printf("hfs: cp_getxattr: unsupported xattr version %d\n", + xattr->xattr_major_version); + return ENOTSUP; + } + + size_t min_len = offsetof(struct cp_xattr_v5, persistent_key); + + if (xattr_len < min_len) + return HFS_EINCONSISTENT; + + xattr->xattr_minor_version = OSSwapLittleToHostInt16(xattr->xattr_minor_version); + xattr->flags = OSSwapLittleToHostInt32(xattr->flags); + xattr->persistent_class = OSSwapLittleToHostInt32(xattr->persistent_class); + xattr->key_os_version = OSSwapLittleToHostInt32(xattr->key_os_version); + xattr->key_revision = OSSwapLittleToHostInt16(xattr->key_revision); + xattr->key_len = OSSwapLittleToHostInt16(xattr->key_len); + + uint16_t pers_key_len = xattr->key_len; + + min_len += pers_key_len; + if (xattr_len < min_len) + return HFS_EINCONSISTENT; + +#if HFS_CONFIG_KEY_ROLL + struct cp_roll_info *roll_info = NULL; + + if (ISSET(xattr->flags, CP_XAF_KEY_ROLLING)) { + roll_info = PTR_ADD(struct cp_roll_info *, xattr, min_len); + + min_len += offsetof(struct cp_roll_info, key); + + if (xattr_len < min_len) + return HFS_EINCONSISTENT; + + roll_info->off_rsrc = OSSwapLittleToHostInt64(roll_info->off_rsrc); + + if (roll_info->off_rsrc % hfsmp->blockSize) + return HFS_EINCONSISTENT; + + roll_info->key_len = OSSwapLittleToHostInt16(roll_info->key_len); + + min_len += roll_info->key_len; + if (xattr_len < min_len) + return HFS_EINCONSISTENT; + } +#endif + + cp_key_pair_t *cpkp; + cprotect_t entry; + + /* + * If option CP_GET_XATTR_BASIC_INFO is set, we only return basic + * information about the file's protection (and not the key) and + * we store the result in the structure the caller passed to us. 
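+	 *
+	 * (In that mode only cp_pclass, cp_key_os_version, cp_key_revision and
+	 * the CP_HAS_A_KEY / CP_KEY_IS_ROLLING flags are filled in; no key
+	 * material is copied into the caller's structure.)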
+ */ + if (ISSET(options, CP_GET_XATTR_BASIC_INFO)) { + entry = *pcpr; + bzero(entry, offsetof(struct cprotect, cp_keys)); +#if HFS_CONFIG_KEY_ROLL + if (ISSET(xattr->flags, CP_XAF_KEY_ROLLING)) { + SET(entry->cp_flags, CP_KEY_IS_ROLLING); + } +#endif + } else { + entry = cp_entry_alloc(NULL, xattr->key_len, CP_MAX_CACHEBUFLEN, &cpkp); + } + + entry->cp_pclass = xattr->persistent_class; + entry->cp_key_os_version = xattr->key_os_version; + entry->cp_key_revision = xattr->key_revision; + + if (!ISSET(options, CP_GET_XATTR_BASIC_INFO)) { + if (xattr->key_len) { + cpkp_set_pers_key_len(cpkp, xattr->key_len); + memcpy(cpkp_pers_key(cpkp), xattr->persistent_key, xattr->key_len); + } + +#if HFS_CONFIG_KEY_ROLL + if (roll_info) { + entry->cp_key_roll_ctx = hfs_key_roll_ctx_alloc(NULL, roll_info->key_len, + CP_MAX_CACHEBUFLEN, &cpkp); + + entry->cp_key_roll_ctx->ckr_off_rsrc = roll_info->off_rsrc; + + if (roll_info->key_len) { + cpkp_set_pers_key_len(cpkp, roll_info->key_len); + memcpy(cpkp_pers_key(cpkp), roll_info->key, roll_info->key_len); + } + } +#endif + + *pcpr = entry; + } + else if (xattr->key_len) { + SET(entry->cp_flags, CP_HAS_A_KEY); + } + + return 0; +} + +/* + * Initializes a new cprotect entry with xattr data from the cnode. + * cnode lock held shared + */ +static int +cp_getxattr(struct cnode *cp, struct hfsmount *hfsmp, cprotect_t *outentry) +{ + size_t xattr_len; + struct cp_xattr_v5 *xattr; + + xattr = hfs_malloc(xattr_len = sizeof(*xattr)); + + int error = hfs_xattr_read(cp->c_vp, CONTENT_PROTECTION_XATTR_NAME, + xattr, &xattr_len); + + if (!error) { + if (xattr_len < CP_XATTR_MIN_LEN) + error = HFS_EINCONSISTENT; + else + error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, outentry, 0); + } + +#if DEBUG + if (error && error != ENOATTR) { + printf("cp_getxattr: bad cp xattr (%d):\n", error); + for (size_t i = 0; i < xattr_len; ++i) + printf("%02x ", ((uint8_t *)xattr)[i]); + printf("\n"); + } +#endif + + hfs_free(xattr, sizeof(*xattr)); + + return error; +} + +/* + * If permitted, restore entry's unwrapped key from the persistent key. + * If not, clear key and set CP_KEY_FLUSHED. + * cnode lock held exclusive + */ +static int +cp_restore_keys(struct cprotect *entry, struct hfsmount *hfsmp, struct cnode *cp) +{ + int error = 0; + + error = cp_unwrap(hfsmp, entry, cp); + if (error) { + cp_flush_cached_keys(entry); + error = EPERM; + } + return error; +} + +void cp_device_locked_callback(mount_t mp, cp_lock_state_t state) +{ + struct hfsmount *hfsmp; + + /* + * When iterating the various mount points that may + * be present on a content-protected device, we need to skip + * those that do not have it enabled. + */ + if (!cp_fs_protected(mp)) { + return; + } + + hfsmp = VFSTOHFS(mp); + + hfsmp->hfs_cp_lock_state = state; + + if (state == CP_LOCKED_STATE) { + /* + * We respond only to lock events. Since cprotect structs + * decrypt/restore keys lazily, the unlock events don't + * actually cause anything to happen. + */ + vnode_iterate(mp, 0, cp_lock_vnode_callback, (void *)(uintptr_t)state); + } +} + +/* + * Deny access to protected files if keys have been locked. + */ +static int +cp_check_access(struct cnode *cp, struct hfsmount *hfsmp, int vnop __unused) +{ + int error = 0; + + /* + * For now it's OK to examine the state variable here without + * holding the HFS lock. This is only a short-circuit; if the state + * transitions (or is in transition) after we examine this field, we'd + * have to handle that anyway. 
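+	 *
+	 * (Also note that only regular files in class A are actually denied
+	 * below; every other class falls through to the default case and is
+	 * allowed.)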
+ */ + if (hfsmp->hfs_cp_lock_state == CP_UNLOCKED_STATE) { + return 0; + } + + if (!cp->c_cpentry) { + /* unprotected node */ + return 0; + } + + if (!S_ISREG(cp->c_mode)) { + return 0; + } + + /* Deny all access for class A files */ + switch (CP_CLASS(cp->c_cpentry->cp_pclass)) { + case PROTECTION_CLASS_A: { + error = EPERM; + break; + } + default: + error = 0; + break; + } + + return error; +} + +/* + * Respond to a lock or unlock event. + * On lock: clear out keys from memory, then flush file contents. + * On unlock: nothing (function not called). + */ +static int +cp_lock_vnode_callback(struct vnode *vp, void *arg) +{ + cnode_t *cp = NULL; + struct cprotect *entry = NULL; + int error = 0; + int locked = 1; + unsigned long action = 0; + int took_truncate_lock = 0; + + error = vnode_getwithref (vp); + if (error) { + return error; + } + + cp = VTOC(vp); + + /* + * When cleaning cnodes due to a lock event, we must + * take the truncate lock AND the cnode lock. By taking + * the truncate lock here, we force (nearly) all pending IOs + * to drain before we can acquire the truncate lock. All HFS cluster + * io calls except for swapfile IO need to acquire the truncate lock + * prior to calling into the cluster layer. + */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = 1; + + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + entry = cp->c_cpentry; + if (!entry) { + /* unprotected vnode: not a regular file */ + goto out; + } + + action = (unsigned long) arg; + switch (action) { + case CP_LOCKED_STATE: { + vfs_context_t ctx; + if (CP_CLASS(entry->cp_pclass) != PROTECTION_CLASS_A || + vnode_isdir(vp)) { + /* + * There is no change at lock for other classes than A. + * B is kept in memory for writing, and class F (for VM) does + * not have a wrapped key, so there is no work needed for + * wrapping/unwrapping. + * + * Note that 'class F' is relevant here because if + * hfs_vnop_strategy does not take the cnode lock + * to protect the cp blob across IO operations, we rely + * implicitly on the truncate lock to be held when doing IO. + * The only case where the truncate lock is not held is during + * swapfile IO because HFS just funnels the VNOP_PAGEOUT + * directly to cluster_pageout. + */ + goto out; + } + + /* Before doing anything else, zero-fill sparse ranges as needed */ + ctx = vfs_context_current(); + (void) hfs_filedone (vp, ctx, 0); + + /* first, sync back dirty pages */ + hfs_unlock (cp); + ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); + hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + /* flush keys: + * There was a concern here(9206856) about flushing keys before nand layer is done using them. + * But since we are using ubc_msync with UBC_SYNC, it blocks until all IO is completed. + * Once IOFS caches or is done with these keys, it calls the completion routine in IOSF. + * Which in turn calls buf_biodone() and eventually unblocks ubc_msync() + * Also verified that the cached data in IOFS is overwritten by other data, and there + * is no key leakage in that layer. + */ + + cp_flush_cached_keys(entry); + + /* some write may have arrived in the mean time. 
dump those pages */ + hfs_unlock(cp); + locked = 0; + + ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_INVALIDATE | UBC_SYNC); + break; + } + case CP_UNLOCKED_STATE: { + /* no-op */ + break; + } + default: + panic("Content Protection: unknown lock action %lu\n", action); + } + +out: + if (locked) { + hfs_unlock(cp); + } + + if (took_truncate_lock) { + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + } + + vnode_put (vp); + return error; +} + + +/* + * cp_rewrap: + * + * Generate a new wrapped key based on the existing cache key. + */ + +int +cp_rewrap(struct cnode *cp, __unused hfsmount_t *hfsmp, + cp_key_class_t *newclass, cp_key_pair_t *cpkp, const void *old_holder, + cp_new_alloc_fn alloc_fn, void **pholder) +{ + struct cprotect *entry = cp->c_cpentry; + + uint8_t new_persistent_key[CP_MAX_WRAPPEDKEYSIZE]; + unsigned keylen = CP_MAX_WRAPPEDKEYSIZE; + int error = 0; + const cp_key_class_t key_class = CP_CLASS(*newclass); + + /* Structures passed between HFS and AKS */ + struct aks_cred_s access_in; + struct aks_wrapped_key_s wrapped_key_in; + struct aks_wrapped_key_s wrapped_key_out; + + /* + * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient + * key that is only good as long as the file is open. There is no + * wrapped key, so there isn't anything to wrap. + */ + if (key_class == PROTECTION_CLASS_F) { + return EINVAL; + } + + cp_init_access(&access_in, cp); + + bzero(&wrapped_key_in, sizeof(wrapped_key_in)); + wrapped_key_in.key = cpkp_pers_key(cpkp); + wrapped_key_in.key_len = cpkp_pers_key_len(cpkp); + /* Use the persistent class when talking to AKS */ + wrapped_key_in.dp_class = entry->cp_pclass; + + bzero(&wrapped_key_out, sizeof(wrapped_key_out)); + wrapped_key_out.key = new_persistent_key; + wrapped_key_out.key_len = keylen; + + /* + * inode is passed here to find the backup bag wrapped blob + * from userspace. This lookup will occur shortly after creation + * and only if the file still exists. Beyond this lookup the + * inode is not used. Technically there is a race, we practically + * don't lose. + */ + error = hfs_rewrap_key(&access_in, + key_class, /* new class */ + &wrapped_key_in, + &wrapped_key_out); + + keylen = wrapped_key_out.key_len; + + if (error == 0) { + /* + * Verify that AKS returned to us a wrapped key of the + * target class requested. + */ + /* Get the effective class here */ + cp_key_class_t effective = CP_CLASS(wrapped_key_out.dp_class); + if (effective != key_class) { + /* + * Fail the operation if defaults or some other enforcement + * dictated that the class be wrapped differently. 
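+			 * We return EPERM rather than silently keeping the class that
+			 * AKS chose.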
+ */ + + /* TODO: Invalidate the key when 12170074 unblocked */ + return EPERM; + } + + /* Allocate a new cpentry */ + cp_key_pair_t *new_cpkp; + *pholder = alloc_fn(old_holder, keylen, CP_MAX_CACHEBUFLEN, &new_cpkp); + + /* copy the new key into the entry */ + cpkp_set_pers_key_len(new_cpkp, keylen); + memcpy(cpkp_pers_key(new_cpkp), new_persistent_key, keylen); + + /* Actually record/store what AKS reported back, not the effective class stored in newclass */ + *newclass = wrapped_key_out.dp_class; + } + else { + error = EPERM; + } + + return error; +} + +static int cpkp_unwrap(cnode_t *cp, cp_key_class_t key_class, cp_key_pair_t *cpkp) +{ + int error = 0; + uint8_t iv_key[CP_IV_KEYSIZE]; + cpx_t cpx = cpkp_cpx(cpkp); + + /* Structures passed between HFS and AKS */ + struct aks_cred_s access_in; + struct aks_wrapped_key_s wrapped_key_in; + struct aks_raw_key_s key_out; + + cp_init_access(&access_in, cp); + + bzero(&wrapped_key_in, sizeof(wrapped_key_in)); + wrapped_key_in.key = cpkp_pers_key(cpkp); + wrapped_key_in.key_len = cpkp_max_pers_key_len(cpkp); + /* Use the persistent class when talking to AKS */ + wrapped_key_in.dp_class = key_class; + + bzero(&key_out, sizeof(key_out)); + key_out.iv_key = iv_key; + key_out.key = cpx_key(cpx); + /* + * The unwrapper should validate/set the key length for + * the IV key length and the cache key length, however we need + * to supply the correct buffer length so that AKS knows how + * many bytes it has to work with. + */ + key_out.iv_key_len = CP_IV_KEYSIZE; + key_out.key_len = cpx_max_key_len(cpx); + + error = hfs_unwrap_key(&access_in, &wrapped_key_in, &key_out); + if (!error) { + if (key_out.key_len == 0 || key_out.key_len > CP_MAX_CACHEBUFLEN) { + panic ("cp_unwrap: invalid key length! (%ul)\n", key_out.key_len); + } + + if (key_out.iv_key_len != CP_IV_KEYSIZE) + panic ("cp_unwrap: invalid iv key length! (%ul)\n", key_out.iv_key_len); + + cpx_set_key_len(cpx, key_out.key_len); + + cpx_set_aes_iv_key(cpx, iv_key); + cpx_set_is_sep_wrapped_key(cpx, ISSET(key_out.flags, AKS_RAW_KEY_WRAPPEDKEY)); + } else { + error = EPERM; + } + + return error; +} + +static int +cp_unwrap(__unused struct hfsmount *hfsmp, struct cprotect *entry, struct cnode *cp) +{ + /* + * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient + * key that is only good as long as the file is open. There is no + * wrapped key, so there isn't anything to unwrap. + */ + if (CP_CLASS(entry->cp_pclass) == PROTECTION_CLASS_F) { + return EPERM; + } + + int error = cpkp_unwrap(cp, entry->cp_pclass, &entry->cp_keys); + +#if HFS_CONFIG_KEY_ROLL + if (!error && entry->cp_key_roll_ctx) { + error = cpkp_unwrap(cp, entry->cp_pclass, &entry->cp_key_roll_ctx->ckr_keys); + if (error) + cpx_flush(cpkp_cpx(&entry->cp_keys)); + } +#endif + + return error; +} + +/* + * cp_generate_keys + * + * Take a cnode that has already been initialized and establish persistent and + * cache keys for it at this time. Note that at the time this is called, the + * directory entry has already been created and we are holding the cnode lock + * on 'cp'. + * + */ +int cp_generate_keys (struct hfsmount *hfsmp, struct cnode *cp, cp_key_class_t targetclass, + uint32_t keyflags, struct cprotect **newentry) +{ + + int error = 0; + struct cprotect *newcp = NULL; + *newentry = NULL; + + /* Target class must be an effective class only */ + targetclass = CP_CLASS(targetclass); + + /* Validate that it has a cprotect already */ + if (cp->c_cpentry == NULL) { + /* We can't do anything if it shouldn't be protected. 
*/ + return 0; + } + + /* Asserts for the underlying cprotect */ + if (cp->c_cpentry->cp_flags & CP_NO_XATTR) { + /* should already have an xattr by this point. */ + error = EINVAL; + goto out; + } + + if (S_ISREG(cp->c_mode)) { + if (!cp_needs_pers_key(cp->c_cpentry)) { + error = EINVAL; + goto out; + } + } + + cp_key_revision_t key_revision = cp_initial_key_revision(hfsmp); + + error = cp_new (&targetclass, hfsmp, cp, cp->c_mode, keyflags, key_revision, + (cp_new_alloc_fn)cp_entry_alloc, (void **)&newcp); + if (error) { + /* + * Key generation failed. This is not necessarily fatal + * since the device could have transitioned into the lock + * state before we called this. + */ + error = EPERM; + goto out; + } + + newcp->cp_pclass = targetclass; + newcp->cp_key_os_version = cp_os_version(); + newcp->cp_key_revision = key_revision; + + /* + * If we got here, then we have a new cprotect. + * Attempt to write the new one out. + */ + error = cp_setxattr (cp, newcp, hfsmp, cp->c_fileid, XATTR_REPLACE); + + if (error) { + /* Tear down the new cprotect; Tell MKB that it's invalid. Bail out */ + /* TODO: rdar://12170074 needs to be fixed before we can tell MKB */ + if (newcp) { + cp_entry_destroy(hfsmp, newcp); + } + goto out; + } + + /* + * If we get here then we can assert that: + * 1) generated wrapped/unwrapped keys. + * 2) wrote the new keys to disk. + * 3) cprotect is ready to go. + */ + + *newentry = newcp; + +out: + return error; + +} + +void cp_replace_entry (hfsmount_t *hfsmp, struct cnode *cp, struct cprotect *newentry) +{ + if (cp->c_cpentry) { +#if HFS_CONFIG_KEY_ROLL + // Transfer the tentative reservation + if (cp->c_cpentry->cp_key_roll_ctx && newentry->cp_key_roll_ctx) { + newentry->cp_key_roll_ctx->ckr_tentative_reservation + = cp->c_cpentry->cp_key_roll_ctx->ckr_tentative_reservation; + + cp->c_cpentry->cp_key_roll_ctx->ckr_tentative_reservation = NULL; + } +#endif + + cp_entry_destroy (hfsmp, cp->c_cpentry); + } + cp->c_cpentry = newentry; + newentry->cp_backing_cnode = cp; + + return; +} + + +/* + * cp_new + * + * Given a double-pointer to a cprotect, generate keys (either in-kernel or from keystore), + * allocate a cprotect, and vend it back to the caller. + * + * Additionally, decide if keys are even needed -- directories get cprotect data structures + * but they do not have keys. + * + */ + +int +cp_new(cp_key_class_t *newclass_eff, __unused struct hfsmount *hfsmp, struct cnode *cp, + mode_t cmode, int32_t keyflags, cp_key_revision_t key_revision, + cp_new_alloc_fn alloc_fn, void **pholder) +{ + int error = 0; + uint8_t new_key[CP_MAX_CACHEBUFLEN]; + unsigned new_key_len = CP_MAX_CACHEBUFLEN; /* AKS tell us the proper key length, how much of this is used */ + uint8_t new_persistent_key[CP_MAX_WRAPPEDKEYSIZE]; + unsigned new_persistent_len = CP_MAX_WRAPPEDKEYSIZE; + uint8_t iv_key[CP_IV_KEYSIZE]; + unsigned iv_key_len = CP_IV_KEYSIZE; + int iswrapped = 0; + cp_key_class_t key_class = CP_CLASS(*newclass_eff); + + /* Structures passed between HFS and AKS */ + struct aks_cred_s access_in; + struct aks_wrapped_key_s wrapped_key_out; + struct aks_raw_key_s key_out; + + /* Sanity check that it's a file or directory here */ + if (!(S_ISREG(cmode)) && !(S_ISDIR(cmode))) { + return EPERM; + } + + /* + * Step 1: Generate Keys if needed. + * + * For class F files, the kernel provides the key. + * PROTECTION_CLASS_F is in-use by VM swapfile; it represents a transient + * key that is only good as long as the file is open. There is no + * wrapped key, so there isn't anything to wrap. 
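+	 * (Below, the class F key is simply filled from the kernel RNG via
+	 * read_random() and new_persistent_len stays 0.)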
+ * + * For class A->D files, the key store provides the key + * + * For Directories, we only give them a class ; no keys. + */ + if (S_ISDIR (cmode)) { + /* Directories */ + new_persistent_len = 0; + new_key_len = 0; + + error = 0; + } + else { + /* Must be a file */ + if (key_class == PROTECTION_CLASS_F) { + /* class F files are not wrapped; they can still use the max key size */ + new_key_len = CP_MAX_KEYSIZE; + read_random (&new_key[0], new_key_len); + new_persistent_len = 0; + + error = 0; + } + else { + /* + * The keystore is provided the file ID so that it can associate + * the wrapped backup blob with this key from userspace. This + * lookup occurs after successful file creation. Beyond this, the + * file ID is not used. Note that there is a potential race here if + * the file ID is re-used. + */ + cp_init_access(&access_in, cp); + + bzero(&key_out, sizeof(key_out)); + key_out.key = new_key; + key_out.iv_key = iv_key; + /* + * AKS will override our key length fields, but we need to supply + * the length of the buffer in those length fields so that + * AKS knows hoa many bytes it has to work with. + */ + key_out.key_len = new_key_len; + key_out.iv_key_len = iv_key_len; + + bzero(&wrapped_key_out, sizeof(wrapped_key_out)); + wrapped_key_out.key = new_persistent_key; + wrapped_key_out.key_len = new_persistent_len; + + access_in.key_revision = key_revision; + + error = hfs_new_key(&access_in, + key_class, + &key_out, + &wrapped_key_out); + + if (error) { + /* keybag returned failure */ + error = EPERM; + goto cpnew_fail; + } + + /* Now sanity-check the output from new_key */ + if (key_out.key_len == 0 || key_out.key_len > CP_MAX_CACHEBUFLEN) { + panic ("cp_new: invalid key length! (%ul) \n", key_out.key_len); + } + + if (key_out.iv_key_len != CP_IV_KEYSIZE) { + panic ("cp_new: invalid iv key length! (%ul) \n", key_out.iv_key_len); + } + + /* + * AKS is allowed to override our preferences and wrap with a + * different class key for policy reasons. If we were told that + * any class other than the one specified is unacceptable then error out + * if that occurred. Check that the effective class returned by + * AKS is the same as our effective new class + */ + if (CP_CLASS(wrapped_key_out.dp_class) != key_class) { + if (!ISSET(keyflags, CP_KEYWRAP_DIFFCLASS)) { + error = EPERM; + /* TODO: When 12170074 fixed, release/invalidate the key! */ + goto cpnew_fail; + } + } + + *newclass_eff = wrapped_key_out.dp_class; + new_key_len = key_out.key_len; + iv_key_len = key_out.iv_key_len; + new_persistent_len = wrapped_key_out.key_len; + + /* Is the key a SEP wrapped key? */ + if (key_out.flags & AKS_RAW_KEY_WRAPPEDKEY) { + iswrapped = 1; + } + } + } + + /* + * Step 2: allocate cprotect and initialize it. + */ + + cp_key_pair_t *cpkp; + *pholder = alloc_fn(NULL, new_persistent_len, new_key_len, &cpkp); + if (*pholder == NULL) { + return ENOMEM; + } + + /* Copy the cache key & IV keys into place if needed. 
*/ + if (new_key_len > 0) { + cpx_t cpx = cpkp_cpx(cpkp); + + cpx_set_key_len(cpx, new_key_len); + memcpy(cpx_key(cpx), new_key, new_key_len); + + /* Initialize the IV key */ + if (key_class != PROTECTION_CLASS_F) + cpx_set_aes_iv_key(cpx, iv_key); + + cpx_set_is_sep_wrapped_key(cpx, iswrapped); + } + if (new_persistent_len > 0) { + cpkp_set_pers_key_len(cpkp, new_persistent_len); + memcpy(cpkp_pers_key(cpkp), new_persistent_key, new_persistent_len); + } + +cpnew_fail: + +#if HFS_TMPDBG +#if !SECURE_KERNEL + if ((hfsmp->hfs_cp_verbose) && (error == EPERM)) { + /* Only introspect the data fork */ + cp_log_eperm (cp->c_vp, *newclass_eff, true); + } +#endif +#endif + + return error; +} + + +/* Initialize the aks_cred_t structure passed to AKS */ +static void cp_init_access(aks_cred_t access, struct cnode *cp) +{ + vfs_context_t context = vfs_context_current(); + kauth_cred_t cred = vfs_context_ucred(context); + proc_t proc = vfs_context_proc(context); + struct hfsmount *hfsmp; + struct vnode *vp; + uuid_t hfs_uuid; + + bzero(access, sizeof(*access)); + + vp = CTOV(cp, 0); + if (vp == NULL) { + /* is it a rsrc */ + vp = CTOV(cp,1); + if (vp == NULL) { + //leave the struct bzeroed. + return; + } + } + + hfsmp = VTOHFS(vp); + hfs_getvoluuid(hfsmp, hfs_uuid); + + /* Note: HFS uses 32-bit fileID, even though inode is a 64-bit value */ + access->inode = cp->c_fileid; + access->pid = proc_pid(proc); + access->uid = kauth_cred_getuid(cred); + uuid_copy (access->volume_uuid, hfs_uuid); + + if (cp->c_cpentry) + access->key_revision = cp->c_cpentry->cp_key_revision; + + return; +} + +#if HFS_CONFIG_KEY_ROLL + +errno_t cp_set_auto_roll(hfsmount_t *hfsmp, + const hfs_key_auto_roll_args_t *args) +{ + // 64 bytes should be OK on the stack + _Static_assert(sizeof(struct cp_root_xattr) < 64, "cp_root_xattr too big!"); + + struct cp_root_xattr xattr; + errno_t ret; + + ret = cp_getrootxattr(hfsmp, &xattr); + if (ret) + return ret; + + ret = hfs_start_transaction(hfsmp); + if (ret) + return ret; + + xattr.auto_roll_min_version = args->min_key_os_version; + xattr.auto_roll_max_version = args->max_key_os_version; + + bool roll_old_class_gen = ISSET(args->flags, HFS_KEY_AUTO_ROLL_OLD_CLASS_GENERATION); + + if (roll_old_class_gen) + SET(xattr.flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION); + else + CLR(xattr.flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION); + + ret = cp_setrootxattr(hfsmp, &xattr); + + errno_t ret2 = hfs_end_transaction(hfsmp); + + if (!ret) + ret = ret2; + + if (ret) + return ret; + + hfs_lock_mount(hfsmp); + hfsmp->hfs_auto_roll_min_key_os_version = args->min_key_os_version; + hfsmp->hfs_auto_roll_max_key_os_version = args->max_key_os_version; + hfs_unlock_mount(hfsmp); + + return ret; +} + +bool cp_should_auto_roll(hfsmount_t *hfsmp, cprotect_t cpr) +{ + if (cpr->cp_key_roll_ctx) { + // Already rolling + return false; + } + + // Only automatically roll class A, B & C + if (CP_CLASS(cpr->cp_pclass) < PROTECTION_CLASS_A + || CP_CLASS(cpr->cp_pclass) > PROTECTION_CLASS_C) { + return false; + } + + if (!cpkp_has_pers_key(&cpr->cp_keys)) + return false; + + /* + * Remember, the class generation stored in HFS+ is updated at the *end*, + * so it's old if it matches the generation we have stored. 
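+	 *
+	 * (So a match below means the file still has a key from the old
+	 * generation and should be rolled.)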
+ */ + if (ISSET(hfsmp->cproot_flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION) + && cp_get_crypto_generation(cpr->cp_pclass) == hfsmp->cp_crypto_generation) { + return true; + } + + if (!hfsmp->hfs_auto_roll_min_key_os_version + && !hfsmp->hfs_auto_roll_max_key_os_version) { + // No minimum or maximum set + return false; + } + + if (hfsmp->hfs_auto_roll_min_key_os_version + && cpr->cp_key_os_version < hfsmp->hfs_auto_roll_min_key_os_version) { + // Before minimum + return false; + } + + if (hfsmp->hfs_auto_roll_max_key_os_version + && cpr->cp_key_os_version >= hfsmp->hfs_auto_roll_max_key_os_version) { + // Greater than maximum + return false; + } + + return true; +} + +#endif // HFS_CONFIG_KEY_ROLL + +errno_t cp_handle_strategy(buf_t bp) +{ + vnode_t vp = buf_vnode(bp); + cnode_t *cp = NULL; + + if (bufattr_rawencrypted(buf_attr(bp)) + || !(cp = cp_get_protected_cnode(vp)) + || !cp->c_cpentry) { + // Nothing to do + return 0; + } + + /* + * For filesystem resize, we may not have access to the underlying + * file's cache key for whatever reason (device may be locked). + * However, we do not need it since we are going to use the + * temporary HFS-wide resize key which is generated once we start + * relocating file content. If this file's I/O should be done + * using the resize key, it will have been supplied already, so do + * not attach the file's cp blob to the buffer. + */ + if (ISSET(cp->c_cpentry->cp_flags, CP_RELOCATION_INFLIGHT)) + return 0; + +#if HFS_CONFIG_KEY_ROLL + /* + * We don't require any locks here. Pages will be locked so no + * key rolling can take place until this I/O has completed. + */ + if (!cp->c_cpentry->cp_key_roll_ctx) +#endif + { + // Fast path + cpx_t cpx = cpkp_cpx(&cp->c_cpentry->cp_keys); + + if (cpx_has_key(cpx)) { + bufattr_setcpx(buf_attr(bp), cpx); + return 0; + } + } + + /* + * We rely mostly (see note below) upon the truncate lock to + * protect the CP cache key from getting tossed prior to our IO + * finishing here. Nearly all cluster io calls to manipulate file + * payload from HFS take the truncate lock before calling into the + * cluster layer to ensure the file size does not change, or that + * they have exclusive right to change the EOF of the file. That + * same guarantee protects us here since the code that deals with + * CP lock events must now take the truncate lock before doing + * anything. + * + * If you want to change content protection structures, then the + * truncate lock is not sufficient; you must take the truncate + * lock and then wait for outstanding writes to complete. This is + * necessary because asynchronous I/O only holds the truncate lock + * whilst I/O is being queued. + * + * One exception should be the VM swapfile IO, because HFS will + * funnel the VNOP_PAGEOUT directly into a cluster_pageout call + * for the swapfile code only without holding the truncate lock. + * This is because individual swapfiles are maintained at + * fixed-length sizes by the VM code. In non-swapfile IO we use + * PAGEOUT_V2 semantics which allow us to create our own UPL and + * thus take the truncate lock before calling into the cluster + * layer. In that case, however, we are not concerned with the CP + * blob being wiped out in the middle of the IO because there + * isn't anything to toss; the VM swapfile key stays in-core as + * long as the file is open. 
+ */ + + off_rsrc_t off_rsrc = off_rsrc_make(buf_lblkno(bp) * GetLogicalBlockSize(vp), + VNODE_IS_RSRC(vp)); + cp_io_params_t io_params; + + + /* + * We want to take the cnode lock here and because the vnode write + * count is a pseudo-lock, we need to do something to preserve + * lock ordering; the cnode lock comes before the write count. + * Ideally, the write count would be incremented after the + * strategy routine returns, but that becomes complicated if the + * strategy routine where to call buf_iodone before returning. + * For now, we drop the write count here and then pick it up again + * later. + */ + if (!ISSET(buf_flags(bp), B_READ) && !ISSET(buf_flags(bp), B_RAW)) + vnode_writedone(vp); + + hfs_lock_always(cp, HFS_SHARED_LOCK); + cp_io_params(VTOHFS(vp), cp->c_cpentry, off_rsrc, + ISSET(buf_flags(bp), B_READ) ? VNODE_READ : VNODE_WRITE, + &io_params); + hfs_unlock(cp); + + /* + * Last chance: If this data protected I/O does not have unwrapped + * keys present, then try to get them. We already know that it + * should, by this point. + */ + if (!cpx_has_key(io_params.cpx)) { + int io_op = ( (buf_flags(bp) & B_READ) ? CP_READ_ACCESS : CP_WRITE_ACCESS); + errno_t error = cp_handle_vnop(vp, io_op, 0); + if (error) { + /* + * We have to be careful here. By this point in the I/O + * path, VM or the cluster engine has prepared a buf_t + * with the proper file offsets and all the rest, so + * simply erroring out will result in us leaking this + * particular buf_t. We need to properly decorate the + * buf_t just as buf_strategy would so as to make it + * appear that the I/O errored out with the particular + * error code. + */ + if (!ISSET(buf_flags(bp), B_READ) && !ISSET(buf_flags(bp), B_RAW)) + vnode_startwrite(vp); + buf_seterror (bp, error); + buf_biodone(bp); + return error; + } + + hfs_lock_always(cp, HFS_SHARED_LOCK); + cp_io_params(VTOHFS(vp), cp->c_cpentry, off_rsrc, + ISSET(buf_flags(bp), B_READ) ? VNODE_READ : VNODE_WRITE, + &io_params); + hfs_unlock(cp); + } + + hfs_assert(buf_count(bp) <= io_params.max_len); + bufattr_setcpx(buf_attr(bp), io_params.cpx); + + if (!ISSET(buf_flags(bp), B_READ) && !ISSET(buf_flags(bp), B_RAW)) + vnode_startwrite(vp); + + return 0; +} + +#endif /* CONFIG_PROTECT */ diff --git a/core/hfs_cprotect.h b/core/hfs_cprotect.h new file mode 100644 index 0000000..1b409e8 --- /dev/null +++ b/core/hfs_cprotect.h @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2009-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef HFS_CPROTECT_H_ +#define HFS_CPROTECT_H_ + +#if CONFIG_PROTECT + +#include + +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_fsctl.h" + +__BEGIN_DECLS + +#define CP_IV_KEYSIZE 16 /* 16x8 = 128 */ +#define CP_MAX_KEYSIZE 32 /* 8x4 = 32, 32x8 = 256 */ +#define CP_MAX_CACHEBUFLEN 64 /* Maximum size of cp cache buffer/array */ + +#define CP_INITIAL_WRAPPEDKEYSIZE 40 +#define CP_V2_WRAPPEDKEYSIZE 40 /* Size of the wrapped key in a v2 EA */ +#define CP_V4_RESERVEDBYTES 16 /* Number of reserved bytes in EA still present */ + +#define CP_LOCKED_KEYCHAIN 0 +#define CP_UNLOCKED_KEYCHAIN 1 + +#define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" +#define CONTENT_PROTECTION_XATTR_NAME_CHARS \ + { 'c', 'o', 'm', '.', 'a', 'p', 'p', 'l', 'e', \ + '.', 's', 'y', 's', 't', 'e', 'm', \ + '.', 'c', 'p', 'r', 'o', 't', 'e', 'c', 't' } +#define CP_CURRENT_VERS CP_VERS_5 +#define CP_VERS_5 5 // iOS 8.1 +#define CP_VERS_4 4 // iOS 5 +#define CP_VERS_2 2 // iOS 4 +#define CP_MINOR_VERS 0 + +/* the class occupies the lowest 5 bits, so there are 32 values (0-31) */ +#define CP_EFFECTIVE_CLASSMASK 0x0000001f + +/* macros for quick access/typing to mask out the classmask */ +#define CP_CLASS(x) ((cp_key_class_t)(CP_EFFECTIVE_CLASSMASK & (x))) + +#define CP_CRYPTO_G1 0x00000020 + +typedef struct cp_xattr *cp_xattr_t; +typedef struct cnode * cnode_ptr_t; +//forward declare the struct. +struct hfsmount; + +/* + * Flags for Key Generation Behavior + * + * These are passed to cp_generate_keys() and cp_new() in the + * flags arguments + */ +#define CP_KEYWRAP_DIFFCLASS 0x00000001 /* wrapping with a different class bag is OK */ + +/* + * off_rsrc_t: this structure represents an offset and whether or not it's + * the resource fork. It's done this way so that we can easily do comparisons + * i.e. + * + * { 0, data-fork } < { 100, rsrc-fork } + */ + +enum { + OFF_RSRC_BIT = 0x4000000000000000, +}; + +typedef int64_t off_rsrc_t; + +static inline bool off_rsrc_is_rsrc(off_rsrc_t off_rsrc) +{ + return off_rsrc & OFF_RSRC_BIT; +} + +static inline off_t off_rsrc_get_off(off_rsrc_t off_rsrc) +{ + return off_rsrc & (OFF_RSRC_BIT - 1); +} + +static inline off_rsrc_t off_rsrc_make(off_t offset, bool is_rsrc) +{ + return offset | (is_rsrc ? OFF_RSRC_BIT : 0); +} + +// -- struct cp_key_pair -- + +/* + * This structure maintains the pair of keys; the persistent, wrapped key that + * is written to disk, and the unwrapped key (cpx_t) that we pass to lower + * layers. + */ + +typedef struct cp_key_pair { + uint16_t cpkp_max_pers_key_len; + uint16_t cpkp_pers_key_len; + uint8_t cpkp_cpx[]; + + // cpkp_cpx is variable length so the location of the persistent key varies + // uint8_t cpkp_persistent_key[]; +} cp_key_pair_t; + +// -- struct cprotect -- + +/* + * Runtime-only structure containing the content protection status for + * the given file. This is referenced by the cnode. It has the + * variable length key pair at the end. 
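+ *
+ * (See cp_entry_alloc()/cp_entry_dealloc() in hfs_cprotect.c: the structure
+ * is allocated with just enough space for cp_keys appended, and DEBUG builds
+ * bracket it with the cp_magic1/cp_magic2 guard words.)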
+ */ + +typedef uint32_t cp_flags_t; +enum { + CP_NO_XATTR = 0x01, /* Key info has not been saved as EA to the FS */ + CP_RELOCATION_INFLIGHT = 0x02, /* File with offset IVs is in the process of being relocated. */ + +#if HFS_CONFIG_KEY_ROLL + // These flags are only set if you ask for basic info from read_xattr_v5 + CP_KEY_IS_ROLLING = 0x04, /* File is in the middle of key rolling */ +#endif + CP_HAS_A_KEY = 0x08, /* File has a non-zero length key */ +}; + +typedef struct cprotect { +#if DEBUG + uint32_t cp_magic1; +#endif + cp_flags_t cp_flags; + cp_key_class_t cp_pclass; /* persistent class stored on-disk */ + void* cp_backing_cnode; + cp_key_os_version_t cp_key_os_version; + cp_key_revision_t cp_key_revision; + uint16_t cp_raw_open_count; +#if HFS_CONFIG_KEY_ROLL + struct hfs_cp_key_roll_ctx *cp_key_roll_ctx; +#endif + cp_key_pair_t cp_keys; // Variable length +} *cprotect_t; + +// -- On-Disk Structures -- + +typedef uint32_t cp_xattr_flags_t; +enum { + /* + * Be careful about using flags 0x02 to 0x20. Older code used to write + * flags that were used for in-memory purposes to disk and therefore + * they might be used in V4 structures. Here's what they were: + * + * CP_KEY_FLUSHED 0x02 Should never have made it to disk + * CP_NO_XATTR 0x04 Should never have made it to disk + * CP_OFF_IV_ENABLED 0x08 Probably made it to disk + * CP_RELOCATION_INFLIGHT 0x10 Should never have made it to disk + * CP_SEP_WRAPPEDKEY 0x20 Probably made it to disk + * + */ + + CP_XAF_NEEDS_KEYS = 0x0001, /* V4 only: file needs persistent keys */ + +}; + +/* + * V2 structure written as the per-file EA payload + * All on-disk multi-byte fields for the CP XATTR must be stored + * little-endian on-disk. This means they must be endian swapped to + * L.E on getxattr() and converted to LE on setxattr(). + * + * This structure is a fixed length and is tightly packed. + * 56 bytes total. + */ +struct cp_xattr_v2 { + u_int16_t xattr_major_version; + u_int16_t xattr_minor_version; + cp_xattr_flags_t flags; + u_int32_t persistent_class; + u_int32_t key_size; + uint8_t persistent_key[CP_V2_WRAPPEDKEYSIZE]; +} __attribute__((aligned(2), packed)); + + +/* + * V4 Content Protection EA On-Disk Layout. + * + * This structure must be tightly packed, but the *size can vary* + * depending on the length of the key. At MOST, the key length will be + * CP_MAX_WRAPPEDKEYSIZE, but the length is defined by the key_size field. + * + * Either way, the packing must be applied to ensure that the key data is + * retrievable in the right location relative to the start of the struct. + * + * Fully packed, this structure can range from : + * MIN: 36 bytes (no key -- used with directories) + * MAX: 164 bytes (with 128 byte key) + * + * During runtime we always allocate with the full 128 byte key, but only + * use as much of the key buffer as needed. It must be tightly packed, though. 
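+ *
+ * For example, a v4 EA carrying a 40-byte wrapped key is laid out as:
+ *
+ *   bytes  0..3    xattr_major_version, xattr_minor_version
+ *   bytes  4..7    flags
+ *   bytes  8..11   persistent_class
+ *   bytes 12..15   key_size (= 40)
+ *   bytes 16..19   key_os_version
+ *   bytes 20..35   reserved
+ *   bytes 36..75   persistent_key (40 of the possible 128 bytes)
+ *
+ * i.e. 36 fixed bytes + 40 key bytes = 76 bytes on disk.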
+ */ + +struct cp_xattr_v4 { + u_int16_t xattr_major_version; + u_int16_t xattr_minor_version; + cp_xattr_flags_t flags; + cp_key_class_t persistent_class; + u_int32_t key_size; + // This field will be zero on older systems + cp_key_os_version_t key_os_version; + /* CP V4 Reserved Bytes == 16 */ + u_int8_t reserved[CP_V4_RESERVEDBYTES]; + /* All above fields are fixed regardless of key length (36 bytes) */ + /* Max Wrapped Size == 128 */ + uint8_t persistent_key[CP_MAX_WRAPPEDKEYSIZE]; +} __attribute__((aligned(2), packed)); + +// -- Version 5 -- + +#if HFS_CONFIG_KEY_ROLL +struct cp_roll_info { + off_rsrc_t off_rsrc; + uint16_t key_len; + uint8_t key[CP_MAX_WRAPPEDKEYSIZE]; +} __attribute__((aligned(2), packed)); +#endif + +struct cp_xattr_v5 { + uint16_t xattr_major_version; + uint16_t xattr_minor_version; + cp_xattr_flags_t flags; + cp_key_class_t persistent_class; + cp_key_os_version_t key_os_version; + cp_key_revision_t key_revision; + uint16_t key_len; + + // 20 bytes to here + + // Variable length from here + uint8_t persistent_key[CP_MAX_WRAPPEDKEYSIZE]; + +#if HFS_CONFIG_KEY_ROLL + // NOTE: data not necessarily here because preceding is variable + uint8_t roll_key_[sizeof(struct cp_roll_info)]; +#endif + + // Wouldn't be necessary if xattr routines returned just what we ask for + uint8_t spare[512]; +} __attribute__((aligned(2), packed)); + +enum { + CP_XATTR_MIN_LEN = 20, // Minimum length for all versions +}; + +/* + * The Root Directory's EA (fileid 1) is special; it defines information about + * what capabilities the filesystem is using. + * + * The data is still stored little endian. + */ +struct cp_root_xattr { + u_int16_t major_version; + u_int16_t minor_version; + u_int64_t flags; +#if HFS_CONFIG_KEY_ROLL + cp_key_os_version_t auto_roll_min_version; + cp_key_os_version_t auto_roll_max_version; +#endif +} __attribute__((aligned(2), packed)); + +enum { + CP_ROOT_XATTR_MIN_LEN = 12, +}; + + +// -- Function Prototypes -- + +int cp_entry_init(cnode_ptr_t, struct mount *); +int cpx_gentempkeys(cpx_t *pcpx, struct hfsmount *hfsmp); +void cp_entry_destroy(struct hfsmount *hfsmp, struct cprotect *entry_ptr); +void cp_replace_entry (struct hfsmount *hfsmp, struct cnode *cp, struct cprotect *newentry); +cnode_ptr_t cp_get_protected_cnode(vnode_t); +int cp_fs_protected (mount_t); +int cp_getrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *outxattr); +int cp_setrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *newxattr); +int cp_generate_keys (struct hfsmount *hfsmp, struct cnode *cp, + cp_key_class_t targetclass, uint32_t flags, + struct cprotect **newentry); +int cp_setup_newentry (struct hfsmount *hfsmp, struct cnode *dcp, + cp_key_class_t suppliedclass, mode_t cmode, + struct cprotect **tmpentry); +int cp_is_valid_class (int isdir, int32_t protectionclass); +int cp_set_trimmed(struct hfsmount*); +int cp_set_rewrapped(struct hfsmount *); +int cp_flop_generation (struct hfsmount*); +bool cp_is_supported_version(uint16_t version); +int cp_vnode_getclass(struct vnode *vp, cp_key_class_t *class); +int cp_vnode_setclass(struct vnode *vp, cp_key_class_t newclass); +int cp_get_root_major_vers(vnode_t vp, uint32_t *level); +int cp_vnode_transcode(vnode_t vp, cp_key_t *k); +int cp_get_default_level (struct vnode *vp, uint32_t *level); +void cp_device_locked_callback(mount_t mp, cp_lock_state_t state); + +#if HFS_CONFIG_KEY_ROLL +bool cp_should_auto_roll(struct hfsmount *hfsmp, cprotect_t cpr); +errno_t cp_set_auto_roll(struct hfsmount *hfsmp, + const hfs_key_auto_roll_args_t 
*args); +#endif + +typedef struct cp_io_params { + // The key to use + cpx_t cpx; + + /* + * The physical offset for this I/O or -1 if unknown (i.e. caller must + * do a regular look up). + */ + off_t phys_offset; + + // The maximum length allowed for this I/O + off_t max_len; +} cp_io_params_t; + +// Return the I/O parameters for this I/O +void cp_io_params(struct hfsmount *hfsmp, cprotect_t cpr, off_rsrc_t off_rsrc, + int direction, cp_io_params_t *io_params); + +int cp_setxattr(struct cnode *cp, struct cprotect *entry, struct hfsmount *hfsmp, + uint32_t fileid, int xattr_opts); + +typedef void * (* cp_new_alloc_fn)(const void *old, uint16_t pers_key_len, + uint16_t cached_key_len, + cp_key_pair_t **pcpkp); + +int cp_new(cp_key_class_t *newclass_eff, struct hfsmount *hfsmp, + struct cnode *cp, mode_t cmode, int32_t keyflags, + cp_key_revision_t key_revision, + cp_new_alloc_fn alloc_fn, void **pholder); + +int cp_rewrap(struct cnode *cp, __unused struct hfsmount *hfsmp, + cp_key_class_t *newclass, cp_key_pair_t *cpkp, const void *old_holder, + cp_new_alloc_fn alloc_fn, void **pholder); + +cprotect_t cp_entry_alloc(cprotect_t old, uint16_t pers_keylen, + uint16_t cached_key_len, cp_key_pair_t **pcpkp); + +cp_key_os_version_t cp_os_version(void); + +cp_key_revision_t cp_next_key_revision(cp_key_revision_t rev); + +typedef uint32_t cp_getxattr_options_t; +enum { + // Return just basic information (not the key) + CP_GET_XATTR_BASIC_INFO = 1, +}; + +int cp_read_xattr_v5(struct hfsmount *hfsmp, struct cp_xattr_v5 *xattr, + size_t xattr_len, cprotect_t *pcpr, cp_getxattr_options_t options); + + +errno_t cp_handle_strategy(buf_t bp); + +typedef enum { + CP_READ_ACCESS = 0x1, + CP_WRITE_ACCESS = 0x2 +} cp_mode_t; + +int cp_handle_open(struct vnode *vp, int mode); +int cp_handle_vnop(struct vnode *vp, int mode, int ioflag); + +// -- cp_key_pair_t functions -- + +size_t cpkp_size(uint16_t pers_key_len, uint16_t cached_key_len); +size_t cpkp_sizex(const cp_key_pair_t *cpkp); +void cpkp_init(cp_key_pair_t *cpkp, uint16_t max_pers_key_len, + uint16_t max_cached_key_len); +void cpkp_flush(cp_key_pair_t *cpkp); +void cpkp_copy(const cp_key_pair_t *src, cp_key_pair_t *dst); +uint16_t cpkp_max_pers_key_len(const cp_key_pair_t *cpkp); +uint16_t cpkp_pers_key_len(const cp_key_pair_t *cpkp); +bool cpkp_can_copy(const cp_key_pair_t *src, const cp_key_pair_t *dst); +cpx_t cpkp_cpx(const cp_key_pair_t *cpkp) __attribute__((pure)); + +// -- Helper Functions -- + +static inline int cp_get_crypto_generation (cp_key_class_t protclass) { + if (protclass & CP_CRYPTO_G1) { + return 1; + } + else return 0; +} + +__END_DECLS + +#endif // CONFIG_PROTECT + +#endif /* !HFS_CPROTECT_H_ */ diff --git a/core/hfs_dbg.h b/core/hfs_dbg.h new file mode 100644 index 0000000..eada538 --- /dev/null +++ b/core/hfs_dbg.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000, 2005 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef HFS_DBG_H_ +#define HFS_DBG_H_ + +#include + +__BEGIN_DECLS + +#include + +// So that the analyzer acknowledges assertions... +#if defined(__clang_analyzer__) || DEBUG +#define panic_on_assert true +#else +extern bool panic_on_assert; +#endif + +#if DEBUG +extern bool hfs_corruption_panics; +#else +#define hfs_corruption_panics false +#endif + +__attribute__((noreturn)) +void hfs_assert_fail(const char *file, unsigned line, const char *expr); + +#define hfs_assert(expr) \ + do { \ + if (__builtin_expect(panic_on_assert, false) \ + && __builtin_expect(!(expr), false)) { \ + hfs_assert_fail(__FILE__, __LINE__, #expr); \ + } \ + } while (0) + +// On production, will printf rather than assert +#define hfs_warn(format, ...) \ + do { \ + if (__builtin_expect(panic_on_assert, false)) { \ + panic(format, ## __VA_ARGS__); \ + __builtin_unreachable(); \ + } else \ + printf(format, ## __VA_ARGS__); \ + } while (0) + +// Quiet on production +#define hfs_debug(format, ...) \ + do { \ + if (__builtin_expect(panic_on_assert, false)) \ + printf(format, ## __VA_ARGS__); \ + } while (0) + +// Panic on debug unless boot-arg tells us not to +#define hfs_corruption_debug(format, ...) \ + do { \ + if (__builtin_expect(hfs_corruption_panics, false)) { \ + panic(format, ## __VA_ARGS__); \ + __builtin_unreachable(); \ + } \ + else \ + printf(format, ## __VA_ARGS__); \ + } while (0) + +__END_DECLS + +#endif // HFS_DBG_H_ diff --git a/core/hfs_endian.c b/core/hfs_endian.c new file mode 100644 index 0000000..bdc9c7a --- /dev/null +++ b/core/hfs_endian.c @@ -0,0 +1,1227 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * hfs_endian.c + * + * This file implements endian swapping routines for the HFS/HFS Plus + * volume format. + */ + +#include "hfs_endian.h" +#include "hfs_dbg.h" +#include "BTreesPrivate.h" + +#undef ENDIAN_DEBUG + +/* + * Internal swapping routines + * + * These routines handle swapping the records of leaf and index nodes. The + * layout of the keys and records varies depending on the kind of B-tree + * (determined by fileID). + * + * The direction parameter must be kSwapBTNodeBigToHost or kSwapBTNodeHostToBig. + * The kSwapBTNodeHeaderRecordOnly "direction" is not valid for these routines. + */ +int hfs_swap_HFSPlusBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +void hfs_swap_HFSPlusForkData (HFSPlusForkData *src); + +#if CONFIG_HFS_STD +int hfs_swap_HFSBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +#endif + +/* + * hfs_swap_HFSPlusForkData + */ +void +hfs_swap_HFSPlusForkData ( + HFSPlusForkData *src +) +{ + int i; + + src->logicalSize = SWAP_BE64 (src->logicalSize); + + src->clumpSize = SWAP_BE32 (src->clumpSize); + src->totalBlocks = SWAP_BE32 (src->totalBlocks); + + for (i = 0; i < kHFSPlusExtentDensity; i++) { + src->extents[i].startBlock = SWAP_BE32 (src->extents[i].startBlock); + src->extents[i].blockCount = SWAP_BE32 (src->extents[i].blockCount); + } +} + +/* + * hfs_swap_BTNode + * + * NOTE: This operation is not naturally symmetric. + * We have to determine which way we're swapping things. + */ +int +hfs_swap_BTNode ( + BlockDescriptor *src, + vnode_t vp, + enum HFSBTSwapDirection direction, + u_int8_t allow_empty_node +) +{ + BTNodeDescriptor *srcDesc = src->buffer; + u_int16_t *srcOffs = NULL; + BTreeControlBlockPtr btcb = (BTreeControlBlockPtr)VTOF(vp)->fcbBTCBPtr; + u_int16_t i; /* index to match srcDesc->numRecords */ + int error = 0; + +#ifdef ENDIAN_DEBUG + if (direction == kSwapBTNodeBigToHost) { + printf ("hfs: BE -> Native Swap\n"); + } else if (direction == kSwapBTNodeHostToBig) { + printf ("hfs: Native -> BE Swap\n"); + } else if (direction == kSwapBTNodeHeaderRecordOnly) { + printf ("hfs: Not swapping descriptors\n"); + } else { + panic ("hfs_swap_BTNode: This is impossible"); + } +#endif + + /* + * If we are doing a swap from on-disk to in-memory, then swap the node + * descriptor and record offsets before we need to use them. + */ + if (direction == kSwapBTNodeBigToHost) { + srcDesc->fLink = SWAP_BE32 (srcDesc->fLink); + srcDesc->bLink = SWAP_BE32 (srcDesc->bLink); + + /* + * When first opening a BTree, we have to read the header node before the + * control block is initialized. In this case, totalNodes will be zero, + * so skip the bounds checking. Also, we should ignore the header node when + * checking for invalid forwards and backwards links, since the header node's + * links can point back to itself legitimately. 
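+		 *
+		 * (That is why all of the fLink/bLink sanity checks below are
+		 * wrapped in the totalNodes != 0 test.)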
+ */ + if (btcb->totalNodes != 0) { + if (srcDesc->fLink >= btcb->totalNodes) { + hfs_corruption_debug("hfs_swap_BTNode: invalid forward link (0x%08x >= 0x%08x)\n", srcDesc->fLink, btcb->totalNodes); + error = fsBTInvalidHeaderErr; + goto fail; + } + if (srcDesc->bLink >= btcb->totalNodes) { + hfs_corruption_debug("hfs_swap_BTNode: invalid backward link (0x%08x >= 0x%08x)\n", srcDesc->bLink, btcb->totalNodes); + error = fsBTInvalidHeaderErr; + goto fail; + } + + if ((src->blockNum != 0) && (srcDesc->fLink == (u_int32_t) src->blockNum)) { + hfs_corruption_debug("hfs_swap_BTNode: invalid forward link (0x%08x == 0x%08x)\n", srcDesc->fLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + if ((src->blockNum != 0) && (srcDesc->bLink == (u_int32_t) src->blockNum)) { + hfs_corruption_debug("hfs_swap_BTNode: invalid backward link (0x%08x == 0x%08x)\n", srcDesc->bLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + + + } + + /* + * Check srcDesc->kind. Don't swap it because it's only one byte. + */ + if (srcDesc->kind < kBTLeafNode || srcDesc->kind > kBTMapNode) { + printf("hfs_swap_BTNode: invalid node kind (%d)\n", srcDesc->kind); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* + * Check srcDesc->height. Don't swap it because it's only one byte. + */ + if (srcDesc->height > kMaxTreeDepth) { + printf("hfs_swap_BTNode: invalid node height (%d)\n", srcDesc->height); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* Don't swap srcDesc->reserved */ + + srcDesc->numRecords = SWAP_BE16 (srcDesc->numRecords); + + /* + * Swap the node offsets (including the free space one!). + */ + srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - ((srcDesc->numRecords + 1) * sizeof (u_int16_t)))); + + /* + * Sanity check that the record offsets are within the node itself. + */ + if ((char *)srcOffs > ((char *)src->buffer + src->blockSize) || + (char *)srcOffs < ((char *)src->buffer + sizeof(BTNodeDescriptor))) { + printf("hfs_swap_BTNode: invalid record count (0x%04X)\n", srcDesc->numRecords); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* + * Swap and sanity check each of the record offsets. + */ + for (i = 0; i <= srcDesc->numRecords; i++) { + srcOffs[i] = SWAP_BE16 (srcOffs[i]); + + /* + * Sanity check: must be even, and within the node itself. + * + * We may be called to swap an unused node, which contains all zeroes. + * Unused nodes are expected only when allow_empty_node is true. + * If it is false and record offset is zero, return error. + */ + if ((srcOffs[i] & 1) || ( + (allow_empty_node == false) && (srcOffs[i] == 0)) || + (srcOffs[i] < sizeof(BTNodeDescriptor) && srcOffs[i] != 0) || + (srcOffs[i] > (src->blockSize - 2 * (srcDesc->numRecords + 1)))) { + printf("hfs_swap_BTNode: offset #%d invalid (0x%04X) (blockSize 0x%x numRecords %d)\n", + i, srcOffs[i], src->blockSize, srcDesc->numRecords); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* + * Make sure the offsets are strictly increasing. Note that we're looping over + * them backwards, hence the order in the comparison. 
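+			 *
+			 * (In memory srcOffs[0] is the free-space offset and
+			 * srcOffs[numRecords] is the offset of record 0, so the values
+			 * must strictly decrease as i increases.)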
+ */ + if ((i != 0) && (srcOffs[i] >= srcOffs[i-1])) { + printf("hfs_swap_BTNode: offsets %d and %d out of order (0x%04X, 0x%04X)\n", + i, i-1, srcOffs[i], srcOffs[i-1]); + error = fsBTInvalidHeaderErr; + goto fail; + } + } + } + + /* + * Swap the records (ordered by frequency of access) + */ + if ((srcDesc->kind == kBTIndexNode) || + (srcDesc-> kind == kBTLeafNode)) { + + if (VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) { + error = hfs_swap_HFSPlusBTInternalNode (src, VTOC(vp)->c_fileid, direction); + } +#if CONFIG_HFS_STD + else { + error = hfs_swap_HFSBTInternalNode (src, VTOC(vp)->c_fileid, direction); + } +#endif + + if (error) goto fail; + + } else if (srcDesc-> kind == kBTMapNode) { + /* Don't swap the bitmaps, they'll be done in the bitmap routines */ + + } else if (srcDesc-> kind == kBTHeaderNode) { + /* The header's offset is hard-wired because we cannot trust the offset pointers. */ + BTHeaderRec *srcHead = (BTHeaderRec *)((char *)src->buffer + sizeof(BTNodeDescriptor)); + + srcHead->treeDepth = SWAP_BE16 (srcHead->treeDepth); + + srcHead->rootNode = SWAP_BE32 (srcHead->rootNode); + srcHead->leafRecords = SWAP_BE32 (srcHead->leafRecords); + srcHead->firstLeafNode = SWAP_BE32 (srcHead->firstLeafNode); + srcHead->lastLeafNode = SWAP_BE32 (srcHead->lastLeafNode); + + srcHead->nodeSize = SWAP_BE16 (srcHead->nodeSize); + srcHead->maxKeyLength = SWAP_BE16 (srcHead->maxKeyLength); + + srcHead->totalNodes = SWAP_BE32 (srcHead->totalNodes); + srcHead->freeNodes = SWAP_BE32 (srcHead->freeNodes); + + srcHead->clumpSize = SWAP_BE32 (srcHead->clumpSize); + srcHead->attributes = SWAP_BE32 (srcHead->attributes); + + /* Don't swap srcHead->reserved1 */ + /* Don't swap srcHead->btreeType; it's only one byte */ + /* Don't swap srcHead->reserved2 */ + /* Don't swap srcHead->reserved3 */ + /* Don't swap bitmap */ + } + + /* + * If we are doing a swap from in-memory to on-disk, then swap the node + * descriptor and record offsets after we're done using them. + */ + if (direction == kSwapBTNodeHostToBig) { + /* + * Sanity check and swap the forward and backward links. + * Ignore the header node since its forward and backwards links can legitimately + * point to itself. + */ + if (srcDesc->fLink >= btcb->totalNodes) { + panic("hfs_UNswap_BTNode: invalid forward link (0x%08X)\n", srcDesc->fLink); + error = fsBTInvalidHeaderErr; + goto fail; + } + if ((src->blockNum != 0) && (srcDesc->fLink == (u_int32_t) src->blockNum)) { + panic ("hfs_UNswap_BTNode: invalid forward link (0x%08x == 0x%08x)\n", + srcDesc->fLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + + if (srcDesc->bLink >= btcb->totalNodes) { + panic("hfs_UNswap_BTNode: invalid backward link (0x%08X)\n", srcDesc->bLink); + error = fsBTInvalidHeaderErr; + goto fail; + } + if ((src->blockNum != 0) && (srcDesc->bLink == (u_int32_t) src->blockNum)) { + panic ("hfs_UNswap_BTNode: invalid backward link (0x%08x == 0x%08x)\n", + srcDesc->bLink, (u_int32_t) src->blockNum); + error = fsBTInvalidHeaderErr; + goto fail; + } + + + srcDesc->fLink = SWAP_BE32 (srcDesc->fLink); + srcDesc->bLink = SWAP_BE32 (srcDesc->bLink); + + /* + * Check srcDesc->kind. Don't swap it because it's only one byte. + */ + if (srcDesc->kind < kBTLeafNode || srcDesc->kind > kBTMapNode) { + panic("hfs_UNswap_BTNode: invalid node kind (%d)\n", srcDesc->kind); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* + * Check srcDesc->height. Don't swap it because it's only one byte. 
+ */ + if (srcDesc->height > kMaxTreeDepth) { + panic("hfs_UNswap_BTNode: invalid node height (%d)\n", srcDesc->height); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* Don't swap srcDesc->reserved */ + + /* + * Swap the node offsets (including the free space one!). + */ + srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - ((srcDesc->numRecords + 1) * sizeof (u_int16_t)))); + + /* + * Sanity check that the record offsets are within the node itself. + */ + if ((char *)srcOffs > ((char *)src->buffer + src->blockSize) || + (char *)srcOffs < ((char *)src->buffer + sizeof(BTNodeDescriptor))) { + panic("hfs_UNswap_BTNode: invalid record count (0x%04X)\n", srcDesc->numRecords); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* + * Swap and sanity check each of the record offsets. + */ + for (i = 0; i <= srcDesc->numRecords; i++) { + /* + * Sanity check: must be even, and within the node itself. + * + * We may be called to swap an unused node, which contains all zeroes. + * This can happen when the last record from a node gets deleted. + * This is why we allow the record offset to be zero. + * Unused nodes are expected only when allow_empty_node is true + * (the caller should set it to true for kSwapBTNodeBigToHost). + */ + if ((srcOffs[i] & 1) || + ((allow_empty_node == false) && (srcOffs[i] == 0)) || + (srcOffs[i] < sizeof(BTNodeDescriptor) && srcOffs[i] != 0) || + (srcOffs[i] > (src->blockSize - 2 * (srcDesc->numRecords + 1)))) { + panic("hfs_UNswap_BTNode: offset #%d invalid (0x%04X) (blockSize 0x%x numRecords %d)\n", + i, srcOffs[i], src->blockSize, srcDesc->numRecords); + error = fsBTInvalidHeaderErr; + goto fail; + } + + /* + * Make sure the offsets are strictly increasing. Note that we're looping over + * them backwards, hence the order in the comparison. + */ + if ((i < srcDesc->numRecords) && (srcOffs[i+1] >= srcOffs[i])) { + panic("hfs_UNswap_BTNode: offsets %d and %d out of order (0x%04X, 0x%04X)\n", + i+1, i, srcOffs[i+1], srcOffs[i]); + error = fsBTInvalidHeaderErr; + goto fail; + } + + srcOffs[i] = SWAP_BE16 (srcOffs[i]); + } + + srcDesc->numRecords = SWAP_BE16 (srcDesc->numRecords); + } + +fail: + if (error) { + /* + * Log some useful information about where the corrupt node is. + */ + printf("hfs: node=%lld fileID=%u volume=%s device=%s\n", src->blockNum, VTOC(vp)->c_fileid, + VTOVCB(vp)->vcbVN, vfs_statfs(vnode_mount(vp))->f_mntfromname); + hfs_mark_inconsistent(VTOVCB(vp), HFS_INCONSISTENCY_DETECTED); + } + + return (error); +} + +int +hfs_swap_HFSPlusBTInternalNode ( + BlockDescriptor *src, + HFSCatalogNodeID fileID, + enum HFSBTSwapDirection direction +) +{ + BTNodeDescriptor *srcDesc = src->buffer; + u_int16_t *srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - (srcDesc->numRecords * sizeof (u_int16_t)))); + char *nextRecord; /* Points to start of record following current one */ + + /* + * i is an int32 because it needs to be negative to index the offset to free space. + * srcDesc->numRecords is a u_int16_t and is unlikely to become 32-bit so this should be ok. + */ + + int32_t i; + u_int32_t j; + + if (fileID == kHFSExtentsFileID) { + HFSPlusExtentKey *srcKey; + HFSPlusExtentDescriptor *srcRec; + size_t recordSize; /* Size of the data part of the record, or node number for index nodes */ + + if (srcDesc->kind == kBTIndexNode) + recordSize = sizeof(u_int32_t); + else + recordSize = sizeof(HFSPlusExtentDescriptor); + + for (i = 0; i < srcDesc->numRecords; i++) { + /* Point to the start of the record we're currently checking. 
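+			 * (Note that this loop visits records in reverse: i == 0 is the
+			 * last record in the node, and srcOffs[i-1] is the offset of the
+			 * record that follows it; for i == 0 the negative index lands on
+			 * the free space offset.)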
*/ + srcKey = (HFSPlusExtentKey *)((char *)src->buffer + srcOffs[i]); + + /* + * Point to start of next (larger offset) record. We'll use this + * to be sure the current record doesn't overflow into the next + * record. + */ + nextRecord = (char *)src->buffer + srcOffs[i-1]; + + /* + * Make sure the key and data are within the buffer. Since both key + * and data are fixed size, this is relatively easy. Note that this + * relies on the keyLength being a constant; we verify the keyLength + * below. + */ + if ((char *)srcKey + sizeof(HFSPlusExtentKey) + recordSize > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } + return fsBTInvalidNodeErr; + } + + if (direction == kSwapBTNodeBigToHost) + srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); + if (srcKey->keyLength != sizeof(*srcKey) - sizeof(srcKey->keyLength)) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } + return fsBTInvalidNodeErr; + } + srcRec = (HFSPlusExtentDescriptor *)((char *)srcKey + srcKey->keyLength + sizeof(srcKey->keyLength)); + if (direction == kSwapBTNodeHostToBig) + srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); + + /* Don't swap srcKey->forkType; it's only one byte */ + /* Don't swap srcKey->pad */ + + srcKey->fileID = SWAP_BE32 (srcKey->fileID); + srcKey->startBlock = SWAP_BE32 (srcKey->startBlock); + + if (srcDesc->kind == kBTIndexNode) { + /* For index nodes, the record data is just a child node number. */ + *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); + } else { + /* Swap the extent data */ + for (j = 0; j < kHFSPlusExtentDensity; j++) { + srcRec[j].startBlock = SWAP_BE32 (srcRec[j].startBlock); + srcRec[j].blockCount = SWAP_BE32 (srcRec[j].blockCount); + } + } + } + + } else if (fileID == kHFSCatalogFileID) { + HFSPlusCatalogKey *srcKey; + int16_t *srcPtr; + u_int16_t keyLength; + + for (i = 0; i < srcDesc->numRecords; i++) { + /* Point to the start of the record we're currently checking. */ + srcKey = (HFSPlusCatalogKey *)((char *)src->buffer + srcOffs[i]); + + /* + * Point to start of next (larger offset) record. We'll use this + * to be sure the current record doesn't overflow into the next + * record. + */ + nextRecord = (char *)src->buffer + (uintptr_t)(srcOffs[i-1]); + + /* + * Make sure we can safely dereference the keyLength and parentID fields. 
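+			 * (The offsetof(HFSPlusCatalogKey, nodeName.unicode[0]) bound
+			 * used below covers keyLength, parentID and the nodeName length
+			 * field.)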
+ */ + if ((char *)srcKey + offsetof(HFSPlusCatalogKey, nodeName.unicode[0]) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } + return fsBTInvalidNodeErr; + } + + /* + * Swap and sanity check the key length + */ + if (direction == kSwapBTNodeBigToHost) + srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); + keyLength = srcKey->keyLength; /* Put it in a local (native order) because we use it several times */ + if (direction == kSwapBTNodeHostToBig) + srcKey->keyLength = SWAP_BE16 (keyLength); + + /* Sanity check the key length */ + if (keyLength < kHFSPlusCatalogKeyMinimumLength || keyLength > kHFSPlusCatalogKeyMaximumLength) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, keyLength); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, keyLength); + } + return fsBTInvalidNodeErr; + } + + /* + * Make sure that we can safely dereference the record's type field or + * an index node's child node number. + */ + srcPtr = (int16_t *)((char *)srcKey + keyLength + sizeof(srcKey->keyLength)); + if ((char *)srcPtr + sizeof(u_int32_t) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + srcKey->parentID = SWAP_BE32 (srcKey->parentID); + + /* + * Swap and sanity check the key's node name + */ + if (direction == kSwapBTNodeBigToHost) + srcKey->nodeName.length = SWAP_BE16 (srcKey->nodeName.length); + /* Make sure name length is consistent with key length */ + if (keyLength < sizeof(srcKey->parentID) + sizeof(srcKey->nodeName.length) + + srcKey->nodeName.length*sizeof(srcKey->nodeName.unicode[0])) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog record #%d keyLength=%d expected=%lu\n", + srcDesc->numRecords-i, keyLength, sizeof(srcKey->parentID) + sizeof(srcKey->nodeName.length) + + srcKey->nodeName.length*sizeof(srcKey->nodeName.unicode[0])); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog record #%d keyLength=%d expected=%lu\n", + srcDesc->numRecords-i, keyLength, sizeof(srcKey->parentID) + sizeof(srcKey->nodeName.length) + + srcKey->nodeName.length*sizeof(srcKey->nodeName.unicode[0])); + } + return fsBTInvalidNodeErr; + } + for (j = 0; j < srcKey->nodeName.length; j++) { + srcKey->nodeName.unicode[j] = SWAP_BE16 (srcKey->nodeName.unicode[j]); + } + if (direction == kSwapBTNodeHostToBig) + srcKey->nodeName.length = SWAP_BE16 (srcKey->nodeName.length); + + /* + * For index nodes, the record data is just the child's node number. + * Skip over swapping the various types of catalog record. + */ + if (srcDesc->kind == kBTIndexNode) { + *((u_int32_t *)srcPtr) = SWAP_BE32 (*((u_int32_t *)srcPtr)); + continue; + } + + /* Make sure the recordType is in native order before using it. 
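+			 * (For host-to-big swaps it is still in native order at this
+			 * point; it is swapped back to big endian at the bottom of the
+			 * loop, once the per-type fields have been handled.)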
*/ + if (direction == kSwapBTNodeBigToHost) + srcPtr[0] = SWAP_BE16 (srcPtr[0]); + + if (srcPtr[0] == kHFSPlusFolderRecord) { + HFSPlusCatalogFolder *srcRec = (HFSPlusCatalogFolder *)srcPtr; + if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + srcRec->flags = SWAP_BE16 (srcRec->flags); + srcRec->valence = SWAP_BE32 (srcRec->valence); + srcRec->folderID = SWAP_BE32 (srcRec->folderID); + srcRec->createDate = SWAP_BE32 (srcRec->createDate); + srcRec->contentModDate = SWAP_BE32 (srcRec->contentModDate); + srcRec->attributeModDate = SWAP_BE32 (srcRec->attributeModDate); + srcRec->accessDate = SWAP_BE32 (srcRec->accessDate); + srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); + + srcRec->bsdInfo.ownerID = SWAP_BE32 (srcRec->bsdInfo.ownerID); + srcRec->bsdInfo.groupID = SWAP_BE32 (srcRec->bsdInfo.groupID); + + /* Don't swap srcRec->bsdInfo.adminFlags; it's only one byte */ + /* Don't swap srcRec->bsdInfo.ownerFlags; it's only one byte */ + + srcRec->bsdInfo.fileMode = SWAP_BE16 (srcRec->bsdInfo.fileMode); + srcRec->bsdInfo.special.iNodeNum = SWAP_BE32 (srcRec->bsdInfo.special.iNodeNum); + + srcRec->textEncoding = SWAP_BE32 (srcRec->textEncoding); + + /* Don't swap srcRec->userInfo */ + /* Don't swap srcRec->finderInfo */ + srcRec->folderCount = SWAP_BE32 (srcRec->folderCount); + + } else if (srcPtr[0] == kHFSPlusFileRecord) { + HFSPlusCatalogFile *srcRec = (HFSPlusCatalogFile *)srcPtr; + if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog file record #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog file record #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + srcRec->flags = SWAP_BE16 (srcRec->flags); + + srcRec->fileID = SWAP_BE32 (srcRec->fileID); + + srcRec->createDate = SWAP_BE32 (srcRec->createDate); + srcRec->contentModDate = SWAP_BE32 (srcRec->contentModDate); + srcRec->attributeModDate = SWAP_BE32 (srcRec->attributeModDate); + srcRec->accessDate = SWAP_BE32 (srcRec->accessDate); + srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); + + srcRec->bsdInfo.ownerID = SWAP_BE32 (srcRec->bsdInfo.ownerID); + srcRec->bsdInfo.groupID = SWAP_BE32 (srcRec->bsdInfo.groupID); + + /* Don't swap srcRec->bsdInfo.adminFlags; it's only one byte */ + /* Don't swap srcRec->bsdInfo.ownerFlags; it's only one byte */ + + srcRec->bsdInfo.fileMode = SWAP_BE16 (srcRec->bsdInfo.fileMode); + srcRec->bsdInfo.special.iNodeNum = SWAP_BE32 (srcRec->bsdInfo.special.iNodeNum); + + srcRec->textEncoding = SWAP_BE32 (srcRec->textEncoding); + + /* If kHFSHasLinkChainBit is set, reserved1 is hl_FirstLinkID. + * In all other context, it is expected to be zero. + */ + srcRec->reserved1 = SWAP_BE32 (srcRec->reserved1); + + /* Don't swap srcRec->userInfo */ + /* Don't swap srcRec->finderInfo */ + /* Don't swap srcRec->reserved2 */ + + hfs_swap_HFSPlusForkData (&srcRec->dataFork); + hfs_swap_HFSPlusForkData (&srcRec->resourceFork); + + } else if ((srcPtr[0] == kHFSPlusFolderThreadRecord) || + (srcPtr[0] == kHFSPlusFileThreadRecord)) { + + /* + * Make sure there is room for parentID and name length. 
+ */ + HFSPlusCatalogThread *srcRec = (HFSPlusCatalogThread *)srcPtr; + if ((char *) &srcRec->nodeName.unicode[0] > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* Don't swap srcRec->reserved */ + + srcRec->parentID = SWAP_BE32 (srcRec->parentID); + + if (direction == kSwapBTNodeBigToHost) + srcRec->nodeName.length = SWAP_BE16 (srcRec->nodeName.length); + + /* + * Make sure there is room for the name in the buffer. + * Then swap the characters of the name itself. + */ + if ((char *) &srcRec->nodeName.unicode[srcRec->nodeName.length] > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + for (j = 0; j < srcRec->nodeName.length; j++) { + srcRec->nodeName.unicode[j] = SWAP_BE16 (srcRec->nodeName.unicode[j]); + } + + if (direction == kSwapBTNodeHostToBig) + srcRec->nodeName.length = SWAP_BE16 (srcRec->nodeName.length); + + } else { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", srcPtr[0], srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", srcPtr[0], srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* We can swap the record type now that we're done using it. */ + if (direction == kSwapBTNodeHostToBig) + srcPtr[0] = SWAP_BE16 (srcPtr[0]); + } + + } else if (fileID == kHFSAttributesFileID) { + HFSPlusAttrKey *srcKey; + HFSPlusAttrRecord *srcRec; + u_int16_t keyLength; + u_int32_t attrSize = 0; + + for (i = 0; i < srcDesc->numRecords; i++) { + /* Point to the start of the record we're currently checking. */ + srcKey = (HFSPlusAttrKey *)((char *)src->buffer + srcOffs[i]); + + /* + * Point to start of next (larger offset) record. We'll use this + * to be sure the current record doesn't overflow into the next + * record. + */ + nextRecord = (char *)src->buffer + srcOffs[i-1]; + + /* Make sure there is room in the buffer for a minimal key */ + if ((char *) &srcKey->attrName[1] > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: attr key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: attr key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } + return fsBTInvalidNodeErr; + } + + /* Swap the key length field */ + if (direction == kSwapBTNodeBigToHost) + srcKey->keyLength = SWAP_BE16(srcKey->keyLength); + keyLength = srcKey->keyLength; /* Keep a copy in native order */ + if (direction == kSwapBTNodeHostToBig) + srcKey->keyLength = SWAP_BE16(srcKey->keyLength); + + /* + * Make sure that we can safely dereference the record's type field or + * an index node's child node number. 
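+			 * (Both are u_int32_t, hence the sizeof(u_int32_t) bound below.)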
+ */ + srcRec = (HFSPlusAttrRecord *)((char *)srcKey + keyLength + sizeof(srcKey->keyLength)); + if ((char *)srcRec + sizeof(u_int32_t) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: attr key #%d too big (%d)\n", srcDesc->numRecords-i-1, keyLength); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: attr key #%d too big (%d)\n", srcDesc->numRecords-i-1, keyLength); + } + return fsBTInvalidNodeErr; + } + + srcKey->fileID = SWAP_BE32(srcKey->fileID); + srcKey->startBlock = SWAP_BE32(srcKey->startBlock); + + /* + * Swap and check the attribute name + */ + if (direction == kSwapBTNodeBigToHost) + srcKey->attrNameLen = SWAP_BE16(srcKey->attrNameLen); + /* Sanity check the attribute name length */ + if (srcKey->attrNameLen > kHFSMaxAttrNameLen || keyLength < (kHFSPlusAttrKeyMinimumLength + sizeof(u_int16_t)*srcKey->attrNameLen)) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: attr key #%d keyLength=%d attrNameLen=%d\n", srcDesc->numRecords-i-1, keyLength, srcKey->attrNameLen); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: attr key #%d keyLength=%d attrNameLen=%d\n", srcDesc->numRecords-i-1, keyLength, srcKey->attrNameLen); + } + return fsBTInvalidNodeErr; + } + for (j = 0; j < srcKey->attrNameLen; j++) + srcKey->attrName[j] = SWAP_BE16(srcKey->attrName[j]); + if (direction == kSwapBTNodeHostToBig) + srcKey->attrNameLen = SWAP_BE16(srcKey->attrNameLen); + + /* + * For index nodes, the record data is just the child's node number. + * Skip over swapping the various types of attribute record. + */ + if (srcDesc->kind == kBTIndexNode) { + *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); + continue; + } + + /* Swap the record data */ + if (direction == kSwapBTNodeBigToHost) + srcRec->recordType = SWAP_BE32(srcRec->recordType); + switch (srcRec->recordType) { + case kHFSPlusAttrInlineData: + /* Is there room for the inline data header? */ + if ((char *) &srcRec->attrData.attrData[0] > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* We're not swapping the reserved fields */ + + /* Swap the attribute size */ + if (direction == kSwapBTNodeHostToBig) + attrSize = srcRec->attrData.attrSize; + srcRec->attrData.attrSize = SWAP_BE32(srcRec->attrData.attrSize); + if (direction == kSwapBTNodeBigToHost) + attrSize = srcRec->attrData.attrSize; + + /* Is there room for the inline attribute data? */ + if ((char *) &srcRec->attrData.attrData[attrSize] > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big (attrSize=%u)\n", srcDesc->numRecords-i-1, attrSize); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: attr inline #%d too big (attrSize=%u)\n", srcDesc->numRecords-i-1, attrSize); + } + return fsBTInvalidNodeErr; + } + + /* Not swapping the attribute data itself */ + break; + + case kHFSPlusAttrForkData: + /* Is there room for the fork data record? 
*/ + if ((char *)srcRec + sizeof(HFSPlusAttrForkData) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: attr fork data #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: attr fork data #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* We're not swapping the reserved field */ + + hfs_swap_HFSPlusForkData(&srcRec->forkData.theFork); + break; + + case kHFSPlusAttrExtents: + /* Is there room for an extent record? */ + if ((char *)srcRec + sizeof(HFSPlusAttrExtents) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: attr extents #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: attr extents #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* We're not swapping the reserved field */ + + for (j = 0; j < kHFSPlusExtentDensity; j++) { + srcRec->overflowExtents.extents[j].startBlock = + SWAP_BE32(srcRec->overflowExtents.extents[j].startBlock); + srcRec->overflowExtents.extents[j].blockCount = + SWAP_BE32(srcRec->overflowExtents.extents[j].blockCount); + } + break; + } + if (direction == kSwapBTNodeHostToBig) + srcRec->recordType = SWAP_BE32(srcRec->recordType); + } + } else if (fileID > kHFSFirstUserCatalogNodeID) { + /* The only B-tree with a non-system CNID that we use is the hotfile B-tree */ + HotFileKey *srcKey; + u_int32_t *srcRec; + + for (i = 0; i < srcDesc->numRecords; i++) { + /* Point to the start of the record we're currently checking. */ + srcKey = (HotFileKey *)((char *)src->buffer + srcOffs[i]); + + /* + * Point to start of next (larger offset) record. We'll use this + * to be sure the current record doesn't overflow into the next + * record. 
+ */ + nextRecord = (char *)src->buffer + srcOffs[i-1]; + + /* Make sure there is room for the key (HotFileKey) and data (u_int32_t) */ + if ((char *)srcKey + sizeof(HotFileKey) + sizeof(u_int32_t) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: hotfile #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: hotfile #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } + return fsBTInvalidNodeErr; + } + + /* Swap and sanity check the key length field */ + if (direction == kSwapBTNodeBigToHost) + srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); + if (srcKey->keyLength != sizeof(*srcKey) - sizeof(srcKey->keyLength)) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSPlusBTInternalNode: hotfile #%d incorrect keyLength %d\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } else { + printf("hfs_swap_HFSPlusBTInternalNode: hotfile #%d incorrect keyLength %d\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } + return fsBTInvalidNodeErr; + } + srcRec = (u_int32_t *)((char *)srcKey + srcKey->keyLength + sizeof(srcKey->keyLength)); + if (direction == kSwapBTNodeHostToBig) + srcKey->keyLength = SWAP_BE16 (srcKey->keyLength); + + /* Don't swap srcKey->forkType */ + /* Don't swap srcKey->pad */ + + srcKey->temperature = SWAP_BE32 (srcKey->temperature); + srcKey->fileID = SWAP_BE32 (srcKey->fileID); + + *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); + } + } else { + panic ("hfs_swap_HFSPlusBTInternalNode: fileID %u is not a system B-tree\n", fileID); + } + + + return (0); +} + +#if CONFIG_HFS_STD +int +hfs_swap_HFSBTInternalNode ( + BlockDescriptor *src, + HFSCatalogNodeID fileID, + enum HFSBTSwapDirection direction +) +{ + BTNodeDescriptor *srcDesc = src->buffer; + u_int16_t *srcOffs = (u_int16_t *)((char *)src->buffer + (src->blockSize - (srcDesc->numRecords * sizeof (u_int16_t)))); + char *nextRecord; /* Points to start of record following current one */ + + /* + * i is an int32 because it needs to be negative to index the offset to free space. + * srcDesc->numRecords is a u_int16_t and is unlikely to become 32-bit so this should be ok. + */ + int32_t i; + u_int32_t j; + + if (fileID == kHFSExtentsFileID) { + HFSExtentKey *srcKey; + HFSExtentDescriptor *srcRec; + size_t recordSize; /* Size of the data part of the record, or node number for index nodes */ + + if (srcDesc->kind == kBTIndexNode) + recordSize = sizeof(u_int32_t); + else + recordSize = sizeof(HFSExtentDescriptor); + + for (i = 0; i < srcDesc->numRecords; i++) { + /* Point to the start of the record we're currently checking. */ + srcKey = (HFSExtentKey *)((char *)src->buffer + srcOffs[i]); + + /* + * Point to start of next (larger offset) record. We'll use this + * to be sure the current record doesn't overflow into the next + * record. + */ + nextRecord = (char *)src->buffer + srcOffs[i-1]; + + /* + * Make sure the key and data are within the buffer. Since both key + * and data are fixed size, this is relatively easy. Note that this + * relies on the keyLength being a constant; we verify the keyLength + * below. 
+ */ + if ((char *)srcKey + sizeof(HFSExtentKey) + recordSize > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } else { + printf("hfs_swap_HFSBTInternalNode: extents key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } + return fsBTInvalidNodeErr; + } + + /* Don't swap srcKey->keyLength (it's only one byte), but do sanity check it */ + if (srcKey->keyLength != sizeof(*srcKey) - sizeof(srcKey->keyLength)) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } else { + printf("hfs_swap_HFSBTInternalNode: extents key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } + return fsBTInvalidNodeErr; + } + + /* Don't swap srcKey->forkType; it's only one byte */ + + srcKey->fileID = SWAP_BE32 (srcKey->fileID); + srcKey->startBlock = SWAP_BE16 (srcKey->startBlock); + + /* Point to record data (round up to even byte boundary) */ + srcRec = (HFSExtentDescriptor *)((char *)srcKey + ((srcKey->keyLength + 2) & ~1)); + + if (srcDesc->kind == kBTIndexNode) { + /* For index nodes, the record data is just a child node number. */ + *((u_int32_t *)srcRec) = SWAP_BE32 (*((u_int32_t *)srcRec)); + } else { + /* Swap the extent data */ + for (j = 0; j < kHFSExtentDensity; j++) { + srcRec[j].startBlock = SWAP_BE16 (srcRec[j].startBlock); + srcRec[j].blockCount = SWAP_BE16 (srcRec[j].blockCount); + } + } + } + + } else if (fileID == kHFSCatalogFileID) { + HFSCatalogKey *srcKey; + int16_t *srcPtr; + unsigned expectedKeyLength; + + for (i = 0; i < srcDesc->numRecords; i++) { + /* Point to the start of the record we're currently checking. */ + srcKey = (HFSCatalogKey *)((char *)src->buffer + srcOffs[i]); + + /* + * Point to start of next (larger offset) record. We'll use this + * to be sure the current record doesn't overflow into the next + * record. + */ + nextRecord = (char *)src->buffer + srcOffs[i-1]; + + /* + * Make sure we can safely dereference the keyLength and parentID fields. + * The value 8 below is 1 bytes for keyLength + 1 byte reserved + 4 bytes + * for parentID + 1 byte for nodeName's length + 1 byte to round up the + * record start to an even offset, which forms a minimal key. 
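+			 * (That is, 1 + 1 + 4 + 1 + 1 = 8 bytes, matching the bound below.)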
+ */ + if ((char *)srcKey + 8 > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } else { + printf("hfs_swap_HFSBTInternalNode: catalog key #%d offset too big (0x%04X)\n", srcDesc->numRecords-i-1, srcOffs[i]); + } + return fsBTInvalidNodeErr; + } + + /* Don't swap srcKey->keyLength (it's only one byte), but do sanity check it */ + if (srcKey->keyLength < kHFSCatalogKeyMinimumLength || srcKey->keyLength > kHFSCatalogKeyMaximumLength) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } else { + printf("hfs_swap_HFSBTInternalNode: catalog key #%d invalid length (%d)\n", srcDesc->numRecords-i-1, srcKey->keyLength); + } + return fsBTInvalidNodeErr; + } + + /* Don't swap srcKey->reserved */ + + srcKey->parentID = SWAP_BE32 (srcKey->parentID); + + /* Don't swap srcKey->nodeName */ + + /* Make sure the keyLength is big enough for the key's content */ + if (srcDesc->kind == kBTIndexNode) + expectedKeyLength = sizeof(*srcKey) - sizeof(srcKey->keyLength); + else + expectedKeyLength = srcKey->nodeName[0] + kHFSCatalogKeyMinimumLength; + if (srcKey->keyLength < expectedKeyLength) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog record #%d keyLength=%u expected=%u\n", + srcDesc->numRecords-i, srcKey->keyLength, expectedKeyLength); + } else { + printf("hfs_swap_HFSBTInternalNode: catalog record #%d keyLength=%u expected=%u\n", + srcDesc->numRecords-i, srcKey->keyLength, expectedKeyLength); + } + return fsBTInvalidNodeErr; + } + + /* Point to record data (round up to even byte boundary) */ + srcPtr = (int16_t *)((char *)srcKey + ((srcKey->keyLength + 2) & ~1)); + + /* + * Make sure that we can safely dereference the record's type field or + * and index node's child node number. + */ + if ((char *)srcPtr + sizeof(u_int32_t) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSBTInternalNode: catalog key #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* + * For index nodes, the record data is just the child's node number. + * Skip over swapping the various types of catalog record. + */ + if (srcDesc->kind == kBTIndexNode) { + *((u_int32_t *)srcPtr) = SWAP_BE32 (*((u_int32_t *)srcPtr)); + continue; + } + + /* Make sure the recordType is in native order before using it. 
*/ + if (direction == kSwapBTNodeBigToHost) + srcPtr[0] = SWAP_BE16 (srcPtr[0]); + + if (srcPtr[0] == kHFSFolderRecord) { + HFSCatalogFolder *srcRec = (HFSCatalogFolder *)srcPtr; + if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSBTInternalNode: catalog folder record #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + srcRec->flags = SWAP_BE16 (srcRec->flags); + srcRec->valence = SWAP_BE16 (srcRec->valence); + + srcRec->folderID = SWAP_BE32 (srcRec->folderID); + srcRec->createDate = SWAP_BE32 (srcRec->createDate); + srcRec->modifyDate = SWAP_BE32 (srcRec->modifyDate); + srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); + + /* Don't swap srcRec->userInfo */ + /* Don't swap srcRec->finderInfo */ + /* Don't swap resserved array */ + + } else if (srcPtr[0] == kHFSFileRecord) { + HFSCatalogFile *srcRec = (HFSCatalogFile *)srcPtr; + if ((char *)srcRec + sizeof(*srcRec) > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog file record #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSBTInternalNode: catalog file record #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + srcRec->flags = srcRec->flags; + srcRec->fileType = srcRec->fileType; + + /* Don't swap srcRec->userInfo */ + + srcRec->fileID = SWAP_BE32 (srcRec->fileID); + + srcRec->dataStartBlock = SWAP_BE16 (srcRec->dataStartBlock); + srcRec->dataLogicalSize = SWAP_BE32 (srcRec->dataLogicalSize); + srcRec->dataPhysicalSize = SWAP_BE32 (srcRec->dataPhysicalSize); + + srcRec->rsrcStartBlock = SWAP_BE16 (srcRec->rsrcStartBlock); + srcRec->rsrcLogicalSize = SWAP_BE32 (srcRec->rsrcLogicalSize); + srcRec->rsrcPhysicalSize = SWAP_BE32 (srcRec->rsrcPhysicalSize); + + srcRec->createDate = SWAP_BE32 (srcRec->createDate); + srcRec->modifyDate = SWAP_BE32 (srcRec->modifyDate); + srcRec->backupDate = SWAP_BE32 (srcRec->backupDate); + + /* Don't swap srcRec->finderInfo */ + + srcRec->clumpSize = SWAP_BE16 (srcRec->clumpSize); + + /* Swap the two sets of extents as an array of six (three each) u_int16_t */ + for (j = 0; j < kHFSExtentDensity * 2; j++) { + srcRec->dataExtents[j].startBlock = SWAP_BE16 (srcRec->dataExtents[j].startBlock); + srcRec->dataExtents[j].blockCount = SWAP_BE16 (srcRec->dataExtents[j].blockCount); + } + + /* Don't swap srcRec->reserved */ + + } else if ((srcPtr[0] == kHFSFolderThreadRecord) || + (srcPtr[0] == kHFSFileThreadRecord)) { + HFSCatalogThread *srcRec = (HFSCatalogThread *)srcPtr; + + /* Make sure there is room for parentID and name length */ + if ((char *) &srcRec->nodeName[1] > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSBTInternalNode: catalog thread record #%d too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* Don't swap srcRec->reserved array */ + + srcRec->parentID = SWAP_BE32 (srcRec->parentID); + + /* Don't swap srcRec->nodeName */ + + /* Make sure there is room for the name in the buffer */ + if ((char *) &srcRec->nodeName[srcRec->nodeName[0]] > nextRecord) { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); + } else { + 
printf("hfs_swap_HFSBTInternalNode: catalog thread record #%d name too big\n", srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + } else { + if (direction == kSwapBTNodeHostToBig) { + panic("hfs_swap_HFSBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", srcPtr[0], srcDesc->numRecords-i-1); + } else { + printf("hfs_swap_HFSBTInternalNode: unrecognized catalog record type (0x%04X; record #%d)\n", srcPtr[0], srcDesc->numRecords-i-1); + } + return fsBTInvalidNodeErr; + } + + /* We can swap the record type now that we're done using it */ + if (direction == kSwapBTNodeHostToBig) + srcPtr[0] = SWAP_BE16 (srcPtr[0]); + } + + } else { + panic ("hfs_swap_HFSBTInternalNode: fileID %u is not a system B-tree\n", fileID); + } + + return (0); +} +#endif + diff --git a/core/hfs_endian.h b/core/hfs_endian.h new file mode 100644 index 0000000..a916319 --- /dev/null +++ b/core/hfs_endian.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2000, 2002-2003, 2005-2008 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef __HFS_ENDIAN_H__ +#define __HFS_ENDIAN_H__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +/* + * hfs_endian.h + * + * This file prototypes endian swapping routines for the HFS/HFS Plus + * volume format. + */ +#include "hfs.h" +#include "BTreesInternal.h" +#include + +/*********************/ +/* BIG ENDIAN Macros */ +/*********************/ +#define SWAP_BE16(__a) OSSwapBigToHostInt16 (__a) +#define SWAP_BE32(__a) OSSwapBigToHostInt32 (__a) +#define SWAP_BE64(__a) OSSwapBigToHostInt64 (__a) + +#if BYTE_ORDER == BIG_ENDIAN + + /* HFS is always big endian, no swapping needed */ + #define SWAP_HFS_PLUS_FORK_DATA(__a) + +/************************/ +/* LITTLE ENDIAN Macros */ +/************************/ +#elif BYTE_ORDER == LITTLE_ENDIAN + + #define SWAP_HFS_PLUS_FORK_DATA(__a) hfs_swap_HFSPlusForkData ((__a)) + +#else +#warning Unknown byte order +#error +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Constants for the "unswap" argument to hfs_swap_BTNode: + */ +enum HFSBTSwapDirection { + kSwapBTNodeBigToHost = 0, + kSwapBTNodeHostToBig = 1, + + /* + * kSwapBTNodeHeaderRecordOnly is used to swap just the header record + * of a header node from big endian (on disk) to host endian (in memory). 
+ * It does not swap the node descriptor (forward/backward links, record + * count, etc.). It assumes the header record is at offset 0x000E. + * + * Since HFS Plus doesn't have fixed B-tree node sizes, we have to read + * the header record to determine the actual node size for that tree + * before we can set up the B-tree control block. We read it initially + * as 512 bytes, then re-read it once we know the correct node size. Since + * we may not have read the entire header node the first time, we can't + * swap the record offsets, other records, or do most sanity checks. + */ + kSwapBTNodeHeaderRecordOnly = 3 +}; + +int hfs_swap_BTNode (BlockDescriptor *src, vnode_t vp, enum HFSBTSwapDirection direction, + u_int8_t allow_empty_node); + +#ifdef __cplusplus +} +#endif + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* __HFS_FORMAT__ */ diff --git a/core/hfs_extents.c b/core/hfs_extents.c new file mode 100644 index 0000000..ce4154d --- /dev/null +++ b/core/hfs_extents.c @@ -0,0 +1,771 @@ +/* + * Copyright (c) 2014-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if HFS_EXTENTS_TEST + +#include "../tests/hfs_extents_test.h" +#include "hfs_extents.h" + +#else + +#include "hfs_extents.h" + +// In this file, group refers to a set of 8 extents + +static uint32_t hfs_total_blocks(const HFSPlusExtentDescriptor *ext, int count); +static errno_t hfs_ext_iter_next_group(struct hfs_ext_iter *iter); +static errno_t hfs_ext_iter_update(struct hfs_ext_iter *iter, + HFSPlusExtentDescriptor *extents, + int count, + HFSPlusExtentRecord cat_extents); +static errno_t hfs_ext_iter_check_group(hfs_ext_iter_t *iter); + +#endif + +#define CHECK(x, var, goto_label) \ + do { \ + var = (x); \ + if (var) { \ + printf("%s:%u error: %d\n", __func__, __LINE__, var); \ + goto goto_label; \ + } \ + } while (0) + +#define min(a,b) \ + ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? 
_a : _b; }) + +static __attribute__((pure)) +const HFSPlusExtentKey *hfs_ext_iter_key(const hfs_ext_iter_t *iter) +{ + return (const HFSPlusExtentKey *)&iter->bt_iter.key; +} + +static __attribute__((pure)) +HFSPlusExtentKey *hfs_ext_iter_key_mut(hfs_ext_iter_t *iter) +{ + return (HFSPlusExtentKey *)&iter->bt_iter.key; +} + +// Returns the total number of blocks for the @count extents provided +uint32_t hfs_total_blocks(const HFSPlusExtentDescriptor *extents, int count) +{ + uint32_t block_count = 0; + for (int i = 0; i < count; ++i) + block_count += extents[i].blockCount; + return block_count; +} + +/* + * Checks a group of extents: makes sure that if it's the last group + * for a fork, that all the remaining extents are properly zeroed and + * if it's not then checks that all extents are set. This also sets + * @group_block_count and @last_in_fork. Returns ESTALE if + * inconsistent. + */ +errno_t hfs_ext_iter_check_group(hfs_ext_iter_t *iter) +{ + filefork_t *ff = VTOF(iter->vp); + const HFSPlusExtentKey *key = hfs_ext_iter_key(iter); + uint32_t count = 0; + int i; + + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + if (!iter->group[i].blockCount) + break; + count += iter->group[i].blockCount; + } + + if (i < kHFSPlusExtentDensity) { + iter->last_in_fork = true; + if (key->startBlock + count != ff_allocblocks(ff)) + goto bad; + + // Check remainder of extents + for (++i; i < kHFSPlusExtentDensity; ++i) { + if (iter->group[i].blockCount) + goto bad; + } + } else { + if (key->startBlock + count > ff_allocblocks(ff)) + goto bad; + + iter->last_in_fork = (key->startBlock + count == ff_allocblocks(ff)); + } + + iter->group_block_count = count; + + return 0; + +bad: + +#if DEBUG + printf("hfs_ext_iter_check_group: bad group; start: %u, total blocks: %u\n", + key->startBlock, ff_allocblocks(ff)); + + for (int j = 0; j < kHFSPlusExtentDensity; ++j) { + printf("%s<%u, %u>", j ? ", " : "", + iter->group[j].startBlock, iter->group[j].blockCount); + } + + printf("\n"); +#endif + + return ESTALE; +} + +// NOTE: doesn't copy group data +static void hfs_ext_iter_copy(const hfs_ext_iter_t *src, hfs_ext_iter_t *dst) +{ + dst->vp = src->vp; + memcpy(&dst->bt_iter.key, &src->bt_iter.key, sizeof(HFSPlusExtentKey)); + + dst->file_block = src->file_block; + dst->ndx = src->ndx; + + dst->bt_iter.hint = src->bt_iter.hint; + dst->bt_iter.version = 0; + dst->bt_iter.reserved = 0; + dst->bt_iter.hitCount = 0; + dst->bt_iter.maxLeafRecs = 0; +} + +bool hfs_ext_iter_is_catalog_extents(hfs_ext_iter_t *iter) +{ + return hfs_ext_iter_key(iter)->startBlock == 0; +} + +#if !HFS_EXTENTS_TEST + +/* + * Finds the extent for offset. It might be in the catalog or the extents + * file. + */ +errno_t hfs_ext_find(vnode_t vp, off_t offset, hfs_ext_iter_t *iter) +{ + errno_t ret; + hfsmount_t *hfsmp = VTOHFS(vp); + + iter->vp = vp; + + uint32_t end_block, index; + HFSPlusExtentKey *key = hfs_ext_iter_key_mut(iter); + + filefork_t *ff = VTOF(vp); + + CHECK(SearchExtentFile(hfsmp, ff, offset, + key, iter->group, &index, + &iter->bt_iter.hint.nodeNum, &end_block), ret, exit); + + iter->ndx = index; + iter->file_block = end_block - iter->group[index].blockCount; + + if (!key->keyLength) { + // We're pointing at the catalog record extents so fix up the key + key->keyLength = kHFSPlusExtentKeyMaximumLength; + key->forkType = (VNODE_IS_RSRC(iter->vp) + ? 
kHFSResourceForkType : kHFSDataForkType); + key->pad = 0; + key->fileID = VTOC(iter->vp)->c_fileid; + key->startBlock = 0; + } + + CHECK(hfs_ext_iter_check_group(iter), ret, exit); + + ret = 0; + +exit: + + return MacToVFSError(ret); +} + +static uint32_t hfs_ext_iter_next_group_block(const hfs_ext_iter_t *iter) +{ + const HFSPlusExtentKey *key = hfs_ext_iter_key(iter); + + return key->startBlock + iter->group_block_count; +} + +/* + * Move the iterator to the next group. Don't call if there's a chance + * there is no entry; the caller should check last_in_fork instead. + */ +static errno_t hfs_ext_iter_next_group(hfs_ext_iter_t *iter) +{ + errno_t ret; + hfsmount_t *hfsmp = VTOHFS(iter->vp); + filefork_t * const tree = hfsmp->hfs_extents_cp->c_datafork; + HFSPlusExtentKey *key = hfs_ext_iter_key_mut(iter); + const bool catalog_extents = hfs_ext_iter_is_catalog_extents(iter); + const uint32_t next_block = hfs_ext_iter_next_group_block(iter); + + FSBufferDescriptor fbd = { + .bufferAddress = &iter->group, + .itemCount = 1, + .itemSize = sizeof(iter->group) + }; + + if (catalog_extents) { + key->startBlock = next_block; + + CHECK(BTSearchRecord(tree, &iter->bt_iter, &fbd, NULL, + &iter->bt_iter), ret, exit); + } else { + const uint32_t file_id = key->fileID; + const uint8_t fork_type = key->forkType; + + CHECK(BTIterateRecord(tree, kBTreeNextRecord, &iter->bt_iter, + &fbd, NULL), ret, exit); + + if (key->fileID != file_id + || key->forkType != fork_type + || key->startBlock != next_block) { + // This indicates an inconsistency + ret = ESTALE; + goto exit; + } + } + + iter->file_block = key->startBlock; + iter->ndx = 0; + + CHECK(hfs_ext_iter_check_group(iter), ret, exit); + + ret = 0; + +exit: + + return MacToVFSError(ret); +} + +/* + * Updates with the extents provided and sets the key up for the next group. + * It is assumed that any previous record that might collide has been deleted. + * NOTE: @extents must point to a buffer that can be zero padded to multiple + * of 8 extents. + */ +errno_t hfs_ext_iter_update(hfs_ext_iter_t *iter, + HFSPlusExtentDescriptor *extents, + int count, + HFSPlusExtentRecord cat_extents) +{ + errno_t ret; + hfsmount_t *hfsmp = VTOHFS(iter->vp); + cnode_t *cp = VTOC(iter->vp); + HFSPlusExtentKey *key = hfs_ext_iter_key_mut(iter); + int ndx = 0; + + if (!extents) + extents = iter->group; + + if (count % kHFSPlusExtentDensity) { + // Zero out last group + bzero(&extents[count], (kHFSPlusExtentDensity + - (count % 8)) * sizeof(*extents)); + } + + if (hfs_ext_iter_is_catalog_extents(iter)) { + // Caller is responsible for in-memory updates + + if (cat_extents) + hfs_ext_copy_rec(extents, cat_extents); + + struct cat_fork fork; + + hfs_fork_copy(&fork, &VTOF(iter->vp)->ff_data, extents); + hfs_prepare_fork_for_update(VTOF(iter->vp), &fork, &fork, hfsmp->blockSize); + + bool is_rsrc = VNODE_IS_RSRC(iter->vp); + CHECK(cat_update(hfsmp, &cp->c_desc, &cp->c_attr, + is_rsrc ? NULL : &fork, + is_rsrc ? 
&fork : NULL), ret, exit); + + // Set the key to the next group + key->startBlock = hfs_total_blocks(extents, kHFSPlusExtentDensity); + + ndx += 8; + } + + // Deal with the remainder which must be overflow extents + for (; ndx < count; ndx += 8) { + filefork_t * const tree = hfsmp->hfs_extents_cp->c_datafork; + + FSBufferDescriptor fbd = { + .bufferAddress = &extents[ndx], + .itemCount = 1, + .itemSize = sizeof(HFSPlusExtentRecord) + }; + + CHECK(BTInsertRecord(tree, &iter->bt_iter, &fbd, + sizeof(HFSPlusExtentRecord)), ret, exit); + + // Set the key to the next group + key->startBlock += hfs_total_blocks(&extents[ndx], kHFSPlusExtentDensity); + } + + ret = 0; + +exit: + + return ret; +} + +#endif // !HFS_EXTENTS_TEST + +static void push_ext(HFSPlusExtentDescriptor *extents, int *count, + const HFSPlusExtentDescriptor *ext) +{ + if (!ext->blockCount) + return; + + if (*count && hfs_ext_end(&extents[*count - 1]) == ext->startBlock) + extents[*count - 1].blockCount += ext->blockCount; + else + extents[(*count)++] = *ext; +} + +/* + * NOTE: Here we rely on the replacement extents not being too big as + * otherwise the number of BTree records that we have to delete could be + * too large. + */ +errno_t hfs_ext_replace(hfsmount_t *hfsmp, vnode_t vp, + uint32_t file_block, + const HFSPlusExtentDescriptor *repl, + int repl_count, + HFSPlusExtentRecord catalog_extents) +{ + errno_t ret; + filefork_t * const tree = hfsmp->hfs_extents_cp->c_datafork; + hfs_ext_iter_t *iter_in = NULL, *iter_out; + HFSPlusExtentDescriptor *extents = NULL; + int buffered_extents = 0; + const int max_roll_back_extents = 16384; // 128k + HFSPlusExtentDescriptor *roll_back_extents = NULL; + int roll_back_count = 0; + const uint32_t end_file_block = file_block + hfs_total_blocks(repl, repl_count); + filefork_t *ff = VTOF(vp); + uint32_t start_group_block = 0, block = 0; + + // Indicate we haven't touched catalog extents + catalog_extents[0].blockCount = 0; + + if (end_file_block > ff_allocblocks(ff)) + return EINVAL; + + iter_in = hfs_malloc(sizeof(*iter_in) * 2); + iter_out = iter_in + 1; + HFSPlusExtentKey *key_in = hfs_ext_iter_key_mut(iter_in); + + // Get to where we want to start + off_t offset = hfs_blk_to_bytes(file_block, hfsmp->blockSize); + + /* + * If the replacement is at the start of a group, we want to pull in the + * group before so that we tidy up any padding that we might have done + * in a prior hfs_ext_replace call. + */ + if (offset > 0) + --offset; + + CHECK(hfs_ext_find(vp, offset, iter_in), ret, exit); + + start_group_block = key_in->startBlock; + + roll_back_extents = hfs_malloc(max_roll_back_extents + * sizeof(HFSPlusExtentDescriptor)); + + // Move to the first extent in this group + iter_in->ndx = 0; + + hfs_ext_iter_copy(iter_in, iter_out); + + // Create a buffer for our extents + buffered_extents = roundup(3 * kHFSPlusExtentDensity + repl_count, + kHFSPlusExtentDensity); + extents = hfs_malloc(sizeof(*extents) * buffered_extents); + int count = 0; + + /* + * Iterate through the extents that are affected by this replace operation. + * We cannot push more than 16 + repl_count extents here; 8 for the group + * containing the replacement start, repl_count for the replacements and 8 + * for the group containing the end. If we went back a group due to + * decrementing the offset above, it's still the same because we know in + * that case the replacement starts at the beginning of the next group. 
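+	 * (Note that @extents above is sized
+	 * roundup(3 * kHFSPlusExtentDensity + repl_count, kHFSPlusExtentDensity),
+	 * i.e. the 16 + repl_count worst case plus one spare group of headroom.)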
+ */ + block = start_group_block; + for (;;) { + if (!iter_in->ndx) { + hfs_ext_copy_rec(iter_in->group, &roll_back_extents[roll_back_count]); + roll_back_count += kHFSPlusExtentDensity; + + if (!hfs_ext_iter_is_catalog_extents(iter_in)) { + // Delete this extent group; we're going to replace it + CHECK(BTDeleteRecord(tree, &iter_in->bt_iter), ret, exit); + } + } + + HFSPlusExtentDescriptor *ext = &iter_in->group[iter_in->ndx]; + if (!ext->blockCount) { + /* + * We ran out of existing extents so we just write the + * extents and we're done. + */ + goto finish; + } + + // If the current extent does not overlap replacement... + if (block + ext->blockCount <= file_block || block >= end_file_block) { + // Keep the current extent exactly as it is + push_ext(extents, &count, ext); + } else { + HFSPlusExtentDescriptor dealloc_ext = *ext; + + if (block <= file_block) { + /* + * The middle or tail of the current extent overlaps + * the replacement extents. Keep the non-overlapping + * head of the current extent. + */ + uint32_t trimmed_len = file_block - block; + + if (trimmed_len) { + // Push (keep) non-overlapping head of current extent + push_ext(extents, &count, + &(HFSPlusExtentDescriptor){ ext->startBlock, + trimmed_len }); + + /* + * Deallocate the part of the current extent that + * overlaps the replacement extents. That starts + * at @file_block. For now, assume it goes + * through the end of the current extent. (If the + * current extent extends beyond the end of the + * replacement extents, we'll update the + * blockCount below.) + */ + dealloc_ext.startBlock += trimmed_len; + dealloc_ext.blockCount -= trimmed_len; + } + + // Insert the replacements + for (int i = 0; i < repl_count; ++i) + push_ext(extents, &count, &repl[i]); + } + + if (block + ext->blockCount > end_file_block) { + /* + * The head or middle of the current extent overlaps + * the replacement extents. Keep the non-overlapping + * tail of the current extent. + */ + uint32_t overlap = end_file_block - block; + + // Push (keep) non-overlapping tail of current extent + push_ext(extents, &count, + &(HFSPlusExtentDescriptor){ ext->startBlock + overlap, + ext->blockCount - overlap }); + + /* + * Deallocate the part of current extent that overlaps + * the replacements. + */ + dealloc_ext.blockCount = (ext->startBlock + overlap + - dealloc_ext.startBlock); + } + + CHECK(BlockDeallocate(hfsmp, dealloc_ext.startBlock, + dealloc_ext.blockCount, 0), ret, exit); + } + + // Move to next (existing) extent from iterator + block += ext->blockCount; + + if (++iter_in->ndx >= kHFSPlusExtentDensity) { + if (block >= end_file_block) { + if (iter_in->last_in_fork || !(count % kHFSPlusExtentDensity)) { + /* + * This is the easy case. We've hit the end or we have a + * multiple of 8, so we can just write out the extents we + * have and it should all fit within a transaction. + */ + + goto finish; + } + + if (count + kHFSPlusExtentDensity > buffered_extents + || (roll_back_count + + kHFSPlusExtentDensity > max_roll_back_extents)) { + /* + * We've run out of room for the next group, so drop out + * and take a different strategy. + */ + break; + } + } + + CHECK(hfs_ext_iter_next_group(iter_in), ret, exit); + } + } // for (;;) + + /* + * We're not at the end so we need to try and pad to a multiple of 8 + * so that we don't have to touch all the subsequent records. We pad + * by stealing single blocks. 
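+	 * (Ending on a multiple of kHFSPlusExtentDensity means the startBlock
+	 * keys of the untouched groups that follow remain correct, so those
+	 * records never have to be rewritten.)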
+ */ + + int stop_at = 0; + + for (;;) { + // @in points to the record we're stealing from + int in = count - 1; + + count = roundup(count, kHFSPlusExtentDensity); + + // @out is where we put the stolen single blocks + int out = count - 1; + + do { + if (out <= in) { + // We suceeded in padding; we're done + goto finish; + } + + /* + * "Steal" a block, or move a one-block extent within the + * @extents array. + * + * If the extent we're "stealing" from (@in) is only one + * block long, we'll end up copying it to @out, setting + * @in's blockCount to zero, and decrementing @in. So, we + * either split a multi-block extent; or move it within + * the @extents array. + */ + extents[out].blockCount = 1; + extents[out].startBlock = (extents[in].startBlock + + extents[in].blockCount - 1); + --out; + } while (--extents[in].blockCount || --in >= stop_at); + + // We ran out of extents + if (roll_back_count + kHFSPlusExtentDensity > max_roll_back_extents) { + ret = ENOSPC; + goto exit; + } + + // Need to shift extents starting at out + 1 + ++out; + memmove(&extents[stop_at], &extents[out], + (count - out) * sizeof(*extents)); + count -= out - stop_at; + + // Pull in the next group + CHECK(hfs_ext_iter_next_group(iter_in), ret, exit); + + // Take a copy of these extents for roll back purposes + hfs_ext_copy_rec(iter_in->group, &roll_back_extents[roll_back_count]); + roll_back_count += kHFSPlusExtentDensity; + + // Delete this group; we're going to replace it + CHECK(BTDeleteRecord(tree, &iter_in->bt_iter), ret, exit); + + if (iter_in->last_in_fork) { + // Great! We've hit the end. Coalesce and write out. + int old_count = count; + count = 0; + + /* + * First coalesce the extents we already have. Takes + * advantage of push_ext coalescing the input extent with + * the last extent in @extents. If the extents are not + * contiguous, then this just copies the extents over + * themselves and sets @count back to @old_count. + */ + for (int i = 0; i < old_count; ++i) + push_ext(extents, &count, &extents[i]); + + // Make room if necessary + const int flush_count = buffered_extents - kHFSPlusExtentDensity; + if (count > flush_count) { + CHECK(hfs_ext_iter_update(iter_out, extents, + flush_count, catalog_extents), ret, exit); + + memmove(&extents[0], &extents[flush_count], + (count - flush_count) * sizeof(*extents)); + + count -= flush_count; + } + + // Add in the extents we just read in + for (int i = 0; i < kHFSPlusExtentDensity; ++i) { + HFSPlusExtentDescriptor *ext = &iter_in->group[i]; + if (!ext->blockCount) + break; + push_ext(extents, &count, ext); + } + + goto finish; + } // if (iter_in->last_in_fork) + + /* + * Otherwise, we're not at the end, so we add these extents and then + * try and pad out again to a multiple of 8. We start by making room. 
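+		 * (Making room means flushing one full group through iter_out, just
+		 * as in the last_in_fork path above, before the next group is copied
+		 * into @extents.)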
+ */ + if (count > buffered_extents - kHFSPlusExtentDensity) { + // Only write out one group here + CHECK(hfs_ext_iter_update(iter_out, extents, + kHFSPlusExtentDensity, + catalog_extents), ret, exit); + + memmove(&extents[0], &extents[kHFSPlusExtentDensity], + (count - kHFSPlusExtentDensity) * sizeof(*extents)); + + count -= kHFSPlusExtentDensity; + } + + // Record where to stop when padding above + stop_at = count; + + // Copy in the new extents + hfs_ext_copy_rec(iter_in->group, &extents[count]); + count += kHFSPlusExtentDensity; + } // for (;;) + +finish: + + // Write the remaining extents + CHECK(hfs_ext_iter_update(iter_out, extents, count, + catalog_extents), ret, exit); + + CHECK(BTFlushPath(hfsmp->hfs_catalog_cp->c_datafork), ret, exit); + CHECK(BTFlushPath(hfsmp->hfs_extents_cp->c_datafork), ret, exit); + +exit: + + if (ret && roll_back_count) { + +#define RB_FAILED \ + do { \ + printf("hfs_ext_replace:%u: roll back failed\n", __LINE__); \ + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); \ + goto roll_back_failed; \ + } while (0) + + // First delete any groups we inserted + HFSPlusExtentKey *key_out = hfs_ext_iter_key_mut(iter_out); + + key_in->startBlock = start_group_block; + if (!key_in->startBlock && key_out->startBlock > key_in->startBlock) { + key_in->startBlock += hfs_total_blocks(catalog_extents, + kHFSPlusExtentDensity); + } + + if (key_out->startBlock > key_in->startBlock) { + FSBufferDescriptor fbd = { + .bufferAddress = &iter_in->group, + .itemCount = 1, + .itemSize = sizeof(iter_in->group) + }; + + if (BTSearchRecord(tree, &iter_in->bt_iter, &fbd, NULL, + &iter_in->bt_iter)) { + RB_FAILED; + } + + for (;;) { + if (BTDeleteRecord(tree, &iter_in->bt_iter)) + RB_FAILED; + + key_in->startBlock += hfs_total_blocks(iter_in->group, + kHFSPlusExtentDensity); + + if (key_in->startBlock >= key_out->startBlock) + break; + + if (BTSearchRecord(tree, &iter_in->bt_iter, &fbd, NULL, + &iter_in->bt_iter)) { + RB_FAILED; + } + } + } + + // Position iter_out + key_out->startBlock = start_group_block; + + // Roll back all the extents + if (hfs_ext_iter_update(iter_out, roll_back_extents, roll_back_count, + catalog_extents)) { + RB_FAILED; + } + + // And we need to reallocate the blocks we deallocated + const uint32_t end_block = min(block, end_file_block); + block = start_group_block; + for (int i = 0; i < roll_back_count && block < end_block; ++i) { + HFSPlusExtentDescriptor *ext = &roll_back_extents[i]; + + if (block + ext->blockCount <= file_block) + continue; + + HFSPlusExtentDescriptor alloc_ext = *ext; + + if (block <= file_block) { + uint32_t trimmed_len = file_block - block; + + alloc_ext.startBlock += trimmed_len; + alloc_ext.blockCount -= trimmed_len; + } + + if (block + ext->blockCount > end_file_block) { + uint32_t overlap = end_file_block - block; + + alloc_ext.blockCount = (ext->startBlock + overlap + - alloc_ext.startBlock); + } + + if (hfs_block_alloc(hfsmp, &alloc_ext, HFS_ALLOC_ROLL_BACK, NULL)) + RB_FAILED; + + block += ext->blockCount; + } + + if (BTFlushPath(hfsmp->hfs_catalog_cp->c_datafork) + || BTFlushPath(hfsmp->hfs_extents_cp->c_datafork)) { + RB_FAILED; + } + } // if (ret && roll_back_count) + +roll_back_failed: + + hfs_free(iter_in, sizeof(*iter_in) * 2); + hfs_free(extents, sizeof(*extents) * buffered_extents); + hfs_free(roll_back_extents, (max_roll_back_extents + * sizeof(HFSPlusExtentDescriptor))); + + return MacToVFSError(ret); +} diff --git a/core/hfs_extents.h b/core/hfs_extents.h new file mode 100644 index 0000000..fce2e49 --- /dev/null +++ 
b/core/hfs_extents.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2014-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef HFS_EXTENTS_H_ +#define HFS_EXTENTS_H_ + +#include +#include + +#include "hfs_format.h" + +#if !HFS_EXTENTS_TEST && !HFS_ALLOC_TEST +#include "hfs_cnode.h" +#include "hfs.h" +#include "BTreesInternal.h" +#endif + +typedef struct hfs_ext_iter { + struct vnode *vp; // If NULL, this is an xattr extent + BTreeIterator bt_iter; + uint8_t ndx; // Index in group + bool last_in_fork; + uint32_t file_block; + uint32_t group_block_count; + HFSPlusExtentRecord group; +} hfs_ext_iter_t; + +errno_t hfs_ext_find(vnode_t vp, off_t offset, hfs_ext_iter_t *iter); + +errno_t hfs_ext_replace(hfsmount_t *hfsmp, vnode_t vp, + uint32_t file_block, + const HFSPlusExtentDescriptor *repl, + int count, + HFSPlusExtentRecord catalog_extents); + +bool hfs_ext_iter_is_catalog_extents(hfs_ext_iter_t *iter); + +static inline void hfs_ext_copy_rec(const HFSPlusExtentRecord src, + HFSPlusExtentRecord dst) +{ + memcpy(dst, src, sizeof(HFSPlusExtentRecord)); +} + +static inline uint32_t hfs_ext_end(const HFSPlusExtentDescriptor *ext) +{ + return ext->startBlock + ext->blockCount; +} + +#endif // HFS_EXTENTS_H_ diff --git a/core/hfs_format.h b/core/hfs_format.h new file mode 100644 index 0000000..89df0dc --- /dev/null +++ b/core/hfs_format.h @@ -0,0 +1,818 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef __HFS_FORMAT__
+#define __HFS_FORMAT__
+
+#include
+#include
+#include "hfs_unistr.h"
+
+/*
+ * hfs_format.h
+ *
+ * This file describes the on-disk format for HFS and HFS Plus volumes.
+ *
+ * Note: Starting with 10.9, the definition of struct HFSUniStr255 exists in hfs_unistr.h
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* some on-disk hfs structures have 68K alignment (misaligned) */
+
+/* Signatures used to differentiate between HFS and HFS Plus volumes */
+enum {
+ kHFSSigWord = 0x4244, /* 'BD' in ASCII */
+ kHFSPlusSigWord = 0x482B, /* 'H+' in ASCII */
+ kHFSXSigWord = 0x4858, /* 'HX' in ASCII */
+
+ kHFSPlusVersion = 0x0004, /* 'H+' volumes are version 4 only */
+ kHFSXVersion = 0x0005, /* 'HX' volumes start with version 5 */
+
+ kHFSPlusMountVersion = 0x31302E30, /* '10.0' for Mac OS X */
+ kHFSJMountVersion = 0x4846534a, /* 'HFSJ' for journaled HFS+ on OS X */
+ kFSKMountVersion = 0x46534b21 /* 'FSK!' for failed journal replay */
+};
+
+
+#ifdef __APPLE_API_PRIVATE
+/*
+ * Mac OS X has two special directories on HFS+ volumes for hardlinked files
+ * and hardlinked directories as well as for open-unlinked files.
+ *
+ * These directories and their contents are not exported from the filesystem
+ * under Mac OS X.
+ */
+#define HFSPLUSMETADATAFOLDER "\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80\xE2\x90\x80HFS+ Private Data"
+#define HFSPLUS_DIR_METADATA_FOLDER ".HFS+ Private Directory Data\xd"
+
+/*
+ * Files in the "HFS+ Private Data" folder have one of the following prefixes
+ * followed by a decimal number (no leading zeros) for the file ID.
+ *
+ * Note: Earlier versions of Mac OS X used a 32 bit random number for the link
+ * ref number instead of the file id.
+ *
+ * e.g. iNode7182000 and temp3296
+ */
+#define HFS_INODE_PREFIX "iNode"
+#define HFS_DELETE_PREFIX "temp"
+
+/*
+ * Files in the ".HFS+ Private Directory Data" folder have the following
+ * prefix followed by a decimal number (no leading zeros) for the file ID.
+ *
+ * e.g. dir_555
+ */
+#define HFS_DIRINODE_PREFIX "dir_"
+
+/*
+ * Hardlink inodes save the head of the link chain in
+ * an extended attribute named FIRST_LINK_XATTR_NAME.
+ * The attribute data is the decimal value in ASCII
+ * of the cnid for the first link in the chain.
+ *
+ * This extended attribute is private (i.e. it's not
+ * exported in the getxattr/listxattr POSIX APIs).
+ */
+#define FIRST_LINK_XATTR_NAME "com.apple.system.hfs.firstlink"
+#define FIRST_LINK_XATTR_REC_SIZE (sizeof(HFSPlusAttrData) - 2 + 12)
+
+/*
+ * The name space ID for generating an HFS volume UUID
+ *
+ * B3E20F39-F292-11D6-97A4-00306543ECAC
+ */
+#define HFS_UUID_NAMESPACE_ID "\xB3\xE2\x0F\x39\xF2\x92\x11\xD6\x97\xA4\x00\x30\x65\x43\xEC\xAC"
+
+#endif /* __APPLE_API_PRIVATE */
+
+/*
+ * Indirect link files (hard links) have the following type/creator.
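+ * These values are stored in the fdType and fdCreator fields of the link
+ * file's Finder information, which is what identifies a catalog file
+ * record as a hard link.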
+ */ +enum { + kHardLinkFileType = 0x686C6E6B, /* 'hlnk' */ + kHFSPlusCreator = 0x6866732B /* 'hfs+' */ +}; + + +/* + * File type and creator for symbolic links + */ +enum { + kSymLinkFileType = 0x736C6E6B, /* 'slnk' */ + kSymLinkCreator = 0x72686170 /* 'rhap' */ +}; + + +enum { + kHFSMaxVolumeNameChars = 27, + kHFSMaxFileNameChars = 31, + kHFSPlusMaxFileNameChars = 255 +}; + + +/* Extent overflow file data structures */ + +/* HFS Extent key */ +struct HFSExtentKey { + u_int8_t keyLength; /* length of key, excluding this field */ + u_int8_t forkType; /* 0 = data fork, FF = resource fork */ + u_int32_t fileID; /* file ID */ + u_int16_t startBlock; /* first file allocation block number in this extent */ +} __attribute__((aligned(2), packed)); +typedef struct HFSExtentKey HFSExtentKey; + +/* HFS Plus Extent key */ +struct HFSPlusExtentKey { + u_int16_t keyLength; /* length of key, excluding this field */ + u_int8_t forkType; /* 0 = data fork, FF = resource fork */ + u_int8_t pad; /* make the other fields align on 32-bit boundary */ + u_int32_t fileID; /* file ID */ + u_int32_t startBlock; /* first file allocation block number in this extent */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusExtentKey HFSPlusExtentKey; + +/* Number of extent descriptors per extent record */ +enum { + kHFSExtentDensity = 3, + kHFSPlusExtentDensity = 8 +}; + +/* HFS extent descriptor */ +struct HFSExtentDescriptor { + u_int16_t startBlock; /* first allocation block */ + u_int16_t blockCount; /* number of allocation blocks */ +} __attribute__((aligned(2), packed)); +typedef struct HFSExtentDescriptor HFSExtentDescriptor; + +/* HFS Plus extent descriptor */ +struct HFSPlusExtentDescriptor { + u_int32_t startBlock; /* first allocation block */ + u_int32_t blockCount; /* number of allocation blocks */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusExtentDescriptor HFSPlusExtentDescriptor; + +/* HFS extent record */ +typedef HFSExtentDescriptor HFSExtentRecord[3]; + +/* HFS Plus extent record */ +typedef HFSPlusExtentDescriptor HFSPlusExtentRecord[8]; + + +/* Finder information */ +struct FndrFileInfo { + u_int32_t fdType; /* file type */ + u_int32_t fdCreator; /* file creator */ + u_int16_t fdFlags; /* Finder flags */ + struct { + int16_t v; /* file's location */ + int16_t h; + } fdLocation; + int16_t opaque; +} __attribute__((aligned(2), packed)); +typedef struct FndrFileInfo FndrFileInfo; + +struct FndrDirInfo { + struct { /* folder's window rectangle */ + int16_t top; + int16_t left; + int16_t bottom; + int16_t right; + } frRect; + unsigned short frFlags; /* Finder flags */ + struct { + u_int16_t v; /* folder's location */ + u_int16_t h; + } frLocation; + int16_t opaque; +} __attribute__((aligned(2), packed)); +typedef struct FndrDirInfo FndrDirInfo; + +struct FndrOpaqueInfo { + int8_t opaque[16]; +} __attribute__((aligned(2), packed)); +typedef struct FndrOpaqueInfo FndrOpaqueInfo; + +struct FndrExtendedDirInfo { + u_int32_t document_id; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved3; + u_int32_t write_gen_counter; +} __attribute__((aligned(2), packed)); + +struct FndrExtendedFileInfo { + u_int32_t document_id; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved2; + u_int32_t write_gen_counter; +} __attribute__((aligned(2), packed)); + +/* HFS Plus Fork data info - 80 bytes */ +struct HFSPlusForkData { + u_int64_t logicalSize; /* fork's logical size in bytes */ + u_int32_t clumpSize; /* fork's clump size in bytes */ + u_int32_t 
totalBlocks; /* total blocks used by this fork */ + HFSPlusExtentRecord extents; /* initial set of extents */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusForkData HFSPlusForkData; + + +/* Mac OS X has 16 bytes worth of "BSD" info. + * + * Note: Mac OS 9 implementations and applications + * should preserve, but not change, this information. + */ +struct HFSPlusBSDInfo { + u_int32_t ownerID; /* user-id of owner or hard link chain previous link */ + u_int32_t groupID; /* group-id of owner or hard link chain next link */ + u_int8_t adminFlags; /* super-user changeable flags */ + u_int8_t ownerFlags; /* owner changeable flags */ + u_int16_t fileMode; /* file type and permission bits */ + union { + u_int32_t iNodeNum; /* indirect node number (hard links only) */ + u_int32_t linkCount; /* links that refer to this indirect node */ + u_int32_t rawDevice; /* special file device (FBLK and FCHR only) */ + } special; +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusBSDInfo HFSPlusBSDInfo; + +/* + * Hardlink "links" resolve to an inode + * and the actual uid/gid comes from that + * inode. + * + * We repurpose the links's uid/gid fields + * for the hardlink link chain. The chain + * consists of a doubly linked list of file + * ids. + */ + +#define hl_firstLinkID reserved1 /* Valid only if HasLinkChain flag is set (indirect nodes only) */ + +#define hl_prevLinkID bsdInfo.ownerID /* Valid only if HasLinkChain flag is set */ +#define hl_nextLinkID bsdInfo.groupID /* Valid only if HasLinkChain flag is set */ + +#define hl_linkReference bsdInfo.special.iNodeNum +#define hl_linkCount bsdInfo.special.linkCount + + +/* Catalog file data structures */ + +enum { + kHFSRootParentID = 1, /* Parent ID of the root folder */ + kHFSRootFolderID = 2, /* Folder ID of the root folder */ + kHFSExtentsFileID = 3, /* File ID of the extents file */ + kHFSCatalogFileID = 4, /* File ID of the catalog file */ + kHFSBadBlockFileID = 5, /* File ID of the bad allocation block file */ + kHFSAllocationFileID = 6, /* File ID of the allocation file (HFS Plus only) */ + kHFSStartupFileID = 7, /* File ID of the startup file (HFS Plus only) */ + kHFSAttributesFileID = 8, /* File ID of the attribute file (HFS Plus only) */ + kHFSAttributeDataFileID = 13, /* Used in Mac OS X runtime for extent based attributes */ + /* kHFSAttributeDataFileID is never stored on disk. 
*/ + kHFSRepairCatalogFileID = 14, /* Used when rebuilding Catalog B-tree */ + kHFSBogusExtentFileID = 15, /* Used for exchanging extents in extents file */ + kHFSFirstUserCatalogNodeID = 16 +}; + +/* HFS catalog key */ +struct HFSCatalogKey { + u_int8_t keyLength; /* key length (in bytes) */ + u_int8_t reserved; /* reserved (set to zero) */ + u_int32_t parentID; /* parent folder ID */ + u_int8_t nodeName[kHFSMaxFileNameChars + 1]; /* catalog node name */ +} __attribute__((aligned(2), packed)); +typedef struct HFSCatalogKey HFSCatalogKey; + +/* HFS Plus catalog key */ +struct HFSPlusCatalogKey { + u_int16_t keyLength; /* key length (in bytes) */ + u_int32_t parentID; /* parent folder ID */ + HFSUniStr255 nodeName; /* catalog node name */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusCatalogKey HFSPlusCatalogKey; + +/* Catalog record types */ +enum { + /* HFS Catalog Records */ + kHFSFolderRecord = 0x0100, /* Folder record */ + kHFSFileRecord = 0x0200, /* File record */ + kHFSFolderThreadRecord = 0x0300, /* Folder thread record */ + kHFSFileThreadRecord = 0x0400, /* File thread record */ + + /* HFS Plus Catalog Records */ + kHFSPlusFolderRecord = 1, /* Folder record */ + kHFSPlusFileRecord = 2, /* File record */ + kHFSPlusFolderThreadRecord = 3, /* Folder thread record */ + kHFSPlusFileThreadRecord = 4 /* File thread record */ +}; + + +/* Catalog file record flags */ +enum { + kHFSFileLockedBit = 0x0000, /* file is locked and cannot be written to */ + kHFSFileLockedMask = 0x0001, + + kHFSThreadExistsBit = 0x0001, /* a file thread record exists for this file */ + kHFSThreadExistsMask = 0x0002, + + kHFSHasAttributesBit = 0x0002, /* object has extended attributes */ + kHFSHasAttributesMask = 0x0004, + + kHFSHasSecurityBit = 0x0003, /* object has security data (ACLs) */ + kHFSHasSecurityMask = 0x0008, + + kHFSHasFolderCountBit = 0x0004, /* only for HFSX, folder maintains a separate sub-folder count */ + kHFSHasFolderCountMask = 0x0010, /* (sum of folder records and directory hard links) */ + + kHFSHasLinkChainBit = 0x0005, /* has hardlink chain (inode or link) */ + kHFSHasLinkChainMask = 0x0020, + + kHFSHasChildLinkBit = 0x0006, /* folder has a child that's a dir link */ + kHFSHasChildLinkMask = 0x0040, + + kHFSHasDateAddedBit = 0x0007, /* File/Folder has the date-added stored in the finder info. 
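+                                                           (i.e. in the date_added field of FndrExtendedFileInfo / FndrExtendedDirInfo)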
*/
+ kHFSHasDateAddedMask = 0x0080,
+
+ kHFSFastDevPinnedBit = 0x0008, /* this file has been pinned to the fast-device by the hot-file code on cooperative fusion */
+ kHFSFastDevPinnedMask = 0x0100,
+
+ kHFSDoNotFastDevPinBit = 0x0009, /* this file cannot be pinned to the fast-device */
+ kHFSDoNotFastDevPinMask = 0x0200,
+
+ kHFSFastDevCandidateBit = 0x000a, /* this item is a potential candidate for fast-dev pinning (as are any of its descendants) */
+ kHFSFastDevCandidateMask = 0x0400,
+
+ kHFSAutoCandidateBit = 0x000b, /* this item was automatically marked as a fast-dev candidate by the kernel */
+ kHFSAutoCandidateMask = 0x0800
+
+ // There are only 4 flag bits remaining: 0x1000, 0x2000, 0x4000, 0x8000
+
+};
+
+
+/* HFS catalog folder record - 70 bytes */
+struct HFSCatalogFolder {
+ int16_t recordType; /* == kHFSFolderRecord */
+ u_int16_t flags; /* folder flags */
+ u_int16_t valence; /* folder valence */
+ u_int32_t folderID; /* folder ID */
+ u_int32_t createDate; /* date and time of creation */
+ u_int32_t modifyDate; /* date and time of last modification */
+ u_int32_t backupDate; /* date and time of last backup */
+ FndrDirInfo userInfo; /* Finder information */
+ FndrOpaqueInfo finderInfo; /* additional Finder information */
+ u_int32_t reserved[4]; /* reserved - initialized as zero */
+} __attribute__((aligned(2), packed));
+typedef struct HFSCatalogFolder HFSCatalogFolder;
+
+/* HFS Plus catalog folder record - 88 bytes */
+struct HFSPlusCatalogFolder {
+ int16_t recordType; /* == kHFSPlusFolderRecord */
+ u_int16_t flags; /* file flags */
+ u_int32_t valence; /* folder's item count */
+ u_int32_t folderID; /* folder ID */
+ u_int32_t createDate; /* date and time of creation */
+ u_int32_t contentModDate; /* date and time of last content modification */
+ u_int32_t attributeModDate; /* date and time of last attribute modification */
+ u_int32_t accessDate; /* date and time of last access (MacOS X only) */
+ u_int32_t backupDate; /* date and time of last backup */
+ HFSPlusBSDInfo bsdInfo; /* permissions (for MacOS X) */
+ FndrDirInfo userInfo; /* Finder information */
+ FndrOpaqueInfo finderInfo; /* additional Finder information */
+ u_int32_t textEncoding; /* hint for name conversions */
+ u_int32_t folderCount; /* number of enclosed folders, active when HasFolderCount is set */
+} __attribute__((aligned(2), packed));
+typedef struct HFSPlusCatalogFolder HFSPlusCatalogFolder;
+
+/* HFS catalog file record - 102 bytes */
+struct HFSCatalogFile {
+ int16_t recordType; /* == kHFSFileRecord */
+ u_int8_t flags; /* file flags */
+ int8_t fileType; /* file type (unused ?)
*/ + FndrFileInfo userInfo; /* Finder information */ + u_int32_t fileID; /* file ID */ + u_int16_t dataStartBlock; /* not used - set to zero */ + int32_t dataLogicalSize; /* logical EOF of data fork */ + int32_t dataPhysicalSize; /* physical EOF of data fork */ + u_int16_t rsrcStartBlock; /* not used - set to zero */ + int32_t rsrcLogicalSize; /* logical EOF of resource fork */ + int32_t rsrcPhysicalSize; /* physical EOF of resource fork */ + u_int32_t createDate; /* date and time of creation */ + u_int32_t modifyDate; /* date and time of last modification */ + u_int32_t backupDate; /* date and time of last backup */ + FndrOpaqueInfo finderInfo; /* additional Finder information */ + u_int16_t clumpSize; /* file clump size (not used) */ + HFSExtentRecord dataExtents; /* first data fork extent record */ + HFSExtentRecord rsrcExtents; /* first resource fork extent record */ + u_int32_t reserved; /* reserved - initialized as zero */ +} __attribute__((aligned(2), packed)); +typedef struct HFSCatalogFile HFSCatalogFile; + +/* HFS Plus catalog file record - 248 bytes */ +struct HFSPlusCatalogFile { + int16_t recordType; /* == kHFSPlusFileRecord */ + u_int16_t flags; /* file flags */ + u_int32_t reserved1; /* reserved - initialized as zero */ + u_int32_t fileID; /* file ID */ + u_int32_t createDate; /* date and time of creation */ + u_int32_t contentModDate; /* date and time of last content modification */ + u_int32_t attributeModDate; /* date and time of last attribute modification */ + u_int32_t accessDate; /* date and time of last access (MacOS X only) */ + u_int32_t backupDate; /* date and time of last backup */ + HFSPlusBSDInfo bsdInfo; /* permissions (for MacOS X) */ + FndrFileInfo userInfo; /* Finder information */ + FndrOpaqueInfo finderInfo; /* additional Finder information */ + u_int32_t textEncoding; /* hint for name conversions */ + u_int32_t reserved2; /* reserved - initialized as zero */ + + /* Note: these start on double long (64 bit) boundary */ + HFSPlusForkData dataFork; /* size and block data for data fork */ + HFSPlusForkData resourceFork; /* size and block data for resource fork */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusCatalogFile HFSPlusCatalogFile; + +/* HFS catalog thread record - 46 bytes */ +struct HFSCatalogThread { + int16_t recordType; /* == kHFSFolderThreadRecord or kHFSFileThreadRecord */ + int32_t reserved[2]; /* reserved - initialized as zero */ + u_int32_t parentID; /* parent ID for this catalog node */ + u_int8_t nodeName[kHFSMaxFileNameChars + 1]; /* name of this catalog node */ +} __attribute__((aligned(2), packed)); +typedef struct HFSCatalogThread HFSCatalogThread; + +/* HFS Plus catalog thread record -- 264 bytes */ +struct HFSPlusCatalogThread { + int16_t recordType; /* == kHFSPlusFolderThreadRecord or kHFSPlusFileThreadRecord */ + int16_t reserved; /* reserved - initialized as zero */ + u_int32_t parentID; /* parent ID for this catalog node */ + HFSUniStr255 nodeName; /* name of this catalog node (variable length) */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusCatalogThread HFSPlusCatalogThread; + +#ifdef __APPLE_API_UNSTABLE +/* + * These are the types of records in the attribute B-tree. The values were + * chosen so that they wouldn't conflict with the catalog record types. 
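+ * (The catalog record types above use 1 through 4 for HFS Plus and
+ * 0x0100 through 0x0400 for HFS, so starting the attribute record types
+ * at 0x10 keeps the two namespaces disjoint.)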
+ */ +enum { + kHFSPlusAttrInlineData = 0x10, /* attributes whose data fits in a b-tree node */ + kHFSPlusAttrForkData = 0x20, /* extent based attributes (data lives in extents) */ + kHFSPlusAttrExtents = 0x30 /* overflow extents for large attributes */ +}; + + +/* + * HFSPlusAttrForkData + * For larger attributes, whose value is stored in allocation blocks. + * If the attribute has more than 8 extents, there will be additional + * records (of type HFSPlusAttrExtents) for this attribute. + */ +struct HFSPlusAttrForkData { + u_int32_t recordType; /* == kHFSPlusAttrForkData*/ + u_int32_t reserved; + HFSPlusForkData theFork; /* size and first extents of value*/ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusAttrForkData HFSPlusAttrForkData; + +/* + * HFSPlusAttrExtents + * This record contains information about overflow extents for large, + * fragmented attributes. + */ +struct HFSPlusAttrExtents { + u_int32_t recordType; /* == kHFSPlusAttrExtents*/ + u_int32_t reserved; + HFSPlusExtentRecord extents; /* additional extents*/ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusAttrExtents HFSPlusAttrExtents; + +/* + * Atrributes B-tree Data Record + * + * For small attributes, whose entire value is stored + * within a single B-tree record. + */ +struct HFSPlusAttrData { + u_int32_t recordType; /* == kHFSPlusAttrInlineData */ + u_int32_t reserved[2]; + u_int32_t attrSize; /* size of attribute data in bytes */ + u_int8_t attrData[2]; /* variable length */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusAttrData HFSPlusAttrData; + + +/* HFSPlusAttrInlineData is obsolete use HFSPlusAttrData instead */ +struct HFSPlusAttrInlineData { + u_int32_t recordType; + u_int32_t reserved; + u_int32_t logicalSize; + u_int8_t userData[2]; +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusAttrInlineData HFSPlusAttrInlineData; + + +/* A generic Attribute Record */ +union HFSPlusAttrRecord { + u_int32_t recordType; + HFSPlusAttrInlineData inlineData; /* NOT USED */ + HFSPlusAttrData attrData; + HFSPlusAttrForkData forkData; + HFSPlusAttrExtents overflowExtents; +}; +typedef union HFSPlusAttrRecord HFSPlusAttrRecord; + +/* Attribute key */ +enum { kHFSMaxAttrNameLen = 127 }; +struct HFSPlusAttrKey { + u_int16_t keyLength; /* key length (in bytes) */ + u_int16_t pad; /* set to zero */ + u_int32_t fileID; /* file associated with attribute */ + u_int32_t startBlock; /* first allocation block number for extents */ + u_int16_t attrNameLen; /* number of unicode characters */ + u_int16_t attrName[kHFSMaxAttrNameLen]; /* attribute name (Unicode) */ +} __attribute__((aligned(2), packed)); +typedef struct HFSPlusAttrKey HFSPlusAttrKey; + +#define kHFSPlusAttrKeyMaximumLength (sizeof(HFSPlusAttrKey) - sizeof(u_int16_t)) +#define kHFSPlusAttrKeyMinimumLength (kHFSPlusAttrKeyMaximumLength - kHFSMaxAttrNameLen*sizeof(u_int16_t)) + +#endif /* __APPLE_API_UNSTABLE */ + + +/* Key and node lengths */ +enum { + kHFSPlusExtentKeyMaximumLength = sizeof(HFSPlusExtentKey) - sizeof(u_int16_t), + kHFSExtentKeyMaximumLength = sizeof(HFSExtentKey) - sizeof(u_int8_t), + kHFSPlusCatalogKeyMaximumLength = sizeof(HFSPlusCatalogKey) - sizeof(u_int16_t), + kHFSPlusCatalogKeyMinimumLength = kHFSPlusCatalogKeyMaximumLength - sizeof(HFSUniStr255) + sizeof(u_int16_t), + kHFSCatalogKeyMaximumLength = sizeof(HFSCatalogKey) - sizeof(u_int8_t), + kHFSCatalogKeyMinimumLength = kHFSCatalogKeyMaximumLength - (kHFSMaxFileNameChars + 1) + sizeof(u_int8_t), + kHFSPlusCatalogMinNodeSize = 4096, + 
kHFSPlusExtentMinNodeSize = 512, + kHFSPlusAttrMinNodeSize = 4096 +}; + +/* HFS and HFS Plus volume attribute bits */ +enum { + /* Bits 0-6 are reserved (always cleared by MountVol call) */ + kHFSVolumeHardwareLockBit = 7, /* volume is locked by hardware */ + kHFSVolumeUnmountedBit = 8, /* volume was successfully unmounted */ + kHFSVolumeSparedBlocksBit = 9, /* volume has bad blocks spared */ + kHFSVolumeNoCacheRequiredBit = 10, /* don't cache volume blocks (i.e. RAM or ROM disk) */ + kHFSBootVolumeInconsistentBit = 11, /* boot volume is inconsistent (System 7.6 and later) */ + kHFSCatalogNodeIDsReusedBit = 12, + kHFSVolumeJournaledBit = 13, /* this volume has a journal on it */ + kHFSVolumeInconsistentBit = 14, /* serious inconsistencies detected at runtime */ + kHFSVolumeSoftwareLockBit = 15, /* volume is locked by software */ + /* + * HFS only has 16 bits of attributes in the MDB, but HFS Plus has 32 bits. + * Therefore, bits 16-31 can only be used on HFS Plus. + */ + kHFSUnusedNodeFixBit = 31, /* Unused nodes in the Catalog B-tree have been zero-filled. See Radar #6947811. */ + kHFSContentProtectionBit = 30, /* Volume has per-file content protection */ + + /*** Keep these in sync with the bits above ! ****/ + kHFSVolumeHardwareLockMask = 0x00000080, + kHFSVolumeUnmountedMask = 0x00000100, + kHFSVolumeSparedBlocksMask = 0x00000200, + kHFSVolumeNoCacheRequiredMask = 0x00000400, + kHFSBootVolumeInconsistentMask = 0x00000800, + kHFSCatalogNodeIDsReusedMask = 0x00001000, + kHFSVolumeJournaledMask = 0x00002000, + kHFSVolumeInconsistentMask = 0x00004000, + kHFSVolumeSoftwareLockMask = 0x00008000, + + /* Bits 16-31 are allocated from high to low */ + + kHFSContentProtectionMask = 0x40000000, + kHFSUnusedNodeFixMask = 0x80000000, + + kHFSMDBAttributesMask = 0x8380 +}; + +enum { + kHFSUnusedNodesFixDate = 0xc5ef2480 /* March 25, 2009 */ +}; + +/* HFS Master Directory Block - 162 bytes */ +/* Stored at sector #2 (3rd sector) and second-to-last sector. 
*/ +struct HFSMasterDirectoryBlock { + u_int16_t drSigWord; /* == kHFSSigWord */ + u_int32_t drCrDate; /* date and time of volume creation */ + u_int32_t drLsMod; /* date and time of last modification */ + u_int16_t drAtrb; /* volume attributes */ + u_int16_t drNmFls; /* number of files in root folder */ + u_int16_t drVBMSt; /* first block of volume bitmap */ + u_int16_t drAllocPtr; /* start of next allocation search */ + u_int16_t drNmAlBlks; /* number of allocation blocks in volume */ + u_int32_t drAlBlkSiz; /* size (in bytes) of allocation blocks */ + u_int32_t drClpSiz; /* default clump size */ + u_int16_t drAlBlSt; /* first allocation block in volume */ + u_int32_t drNxtCNID; /* next unused catalog node ID */ + u_int16_t drFreeBks; /* number of unused allocation blocks */ + u_int8_t drVN[kHFSMaxVolumeNameChars + 1]; /* volume name */ + u_int32_t drVolBkUp; /* date and time of last backup */ + u_int16_t drVSeqNum; /* volume backup sequence number */ + u_int32_t drWrCnt; /* volume write count */ + u_int32_t drXTClpSiz; /* clump size for extents overflow file */ + u_int32_t drCTClpSiz; /* clump size for catalog file */ + u_int16_t drNmRtDirs; /* number of directories in root folder */ + u_int32_t drFilCnt; /* number of files in volume */ + u_int32_t drDirCnt; /* number of directories in volume */ + u_int32_t drFndrInfo[8]; /* information used by the Finder */ + u_int16_t drEmbedSigWord; /* embedded volume signature (formerly drVCSize) */ + HFSExtentDescriptor drEmbedExtent; /* embedded volume location and size (formerly drVBMCSize and drCtlCSize) */ + u_int32_t drXTFlSize; /* size of extents overflow file */ + HFSExtentRecord drXTExtRec; /* extent record for extents overflow file */ + u_int32_t drCTFlSize; /* size of catalog file */ + HFSExtentRecord drCTExtRec; /* extent record for catalog file */ +} __attribute__((aligned(2), packed)); +typedef struct HFSMasterDirectoryBlock HFSMasterDirectoryBlock; + + +#ifdef __APPLE_API_UNSTABLE +#define SET_HFS_TEXT_ENCODING(hint) \ + (0x656e6300 | ((hint) & 0xff)) +#define GET_HFS_TEXT_ENCODING(hint) \ + (((hint) & 0xffffff00) == 0x656e6300 ? (hint) & 0x000000ff : 0xffffffffU) +#endif /* __APPLE_API_UNSTABLE */ + + +/* HFS Plus Volume Header - 512 bytes */ +/* Stored at sector #2 (3rd sector) and second-to-last sector. 
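+ In other words, the primary copy sits 1024 bytes from the start of the
+ volume and the alternate copy 1024 bytes before the end.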
*/
+struct HFSPlusVolumeHeader {
+ u_int16_t signature; /* == kHFSPlusSigWord */
+ u_int16_t version; /* == kHFSPlusVersion */
+ u_int32_t attributes; /* volume attributes */
+ u_int32_t lastMountedVersion; /* implementation version which last mounted volume */
+ u_int32_t journalInfoBlock; /* block addr of journal info (if volume is journaled, zero otherwise) */
+
+ u_int32_t createDate; /* date and time of volume creation */
+ u_int32_t modifyDate; /* date and time of last modification */
+ u_int32_t backupDate; /* date and time of last backup */
+ u_int32_t checkedDate; /* date and time of last disk check */
+
+ u_int32_t fileCount; /* number of files in volume */
+ u_int32_t folderCount; /* number of directories in volume */
+
+ u_int32_t blockSize; /* size (in bytes) of allocation blocks */
+ u_int32_t totalBlocks; /* number of allocation blocks in volume (includes this header and VBM) */
+ u_int32_t freeBlocks; /* number of unused allocation blocks */
+
+ u_int32_t nextAllocation; /* start of next allocation search */
+ u_int32_t rsrcClumpSize; /* default resource fork clump size */
+ u_int32_t dataClumpSize; /* default data fork clump size */
+ u_int32_t nextCatalogID; /* next unused catalog node ID */
+
+ u_int32_t writeCount; /* volume write count */
+ u_int64_t encodingsBitmap; /* which encodings have been used on this volume */
+
+ u_int8_t finderInfo[32]; /* information used by the Finder */
+
+ HFSPlusForkData allocationFile; /* allocation bitmap file */
+ HFSPlusForkData extentsFile; /* extents B-tree file */
+ HFSPlusForkData catalogFile; /* catalog B-tree file */
+ HFSPlusForkData attributesFile; /* extended attributes B-tree file */
+ HFSPlusForkData startupFile; /* boot file (secondary loader) */
+} __attribute__((aligned(2), packed));
+typedef struct HFSPlusVolumeHeader HFSPlusVolumeHeader;
+
+
+/* B-tree structures */
+
+enum BTreeKeyLimits{
+ kMaxKeyLength = 520
+};
+
+union BTreeKey{
+ u_int8_t length8;
+ u_int16_t length16;
+ u_int8_t rawData [kMaxKeyLength+2];
+};
+typedef union BTreeKey BTreeKey;
+
+/* BTNodeDescriptor -- Every B-tree node starts with these fields.
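+ The offsets of the records within a node are stored as u_int16_t values
+ packed at the end of the node, growing backward from its last two bytes.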
*/ +struct BTNodeDescriptor { + u_int32_t fLink; /* next node at this level*/ + u_int32_t bLink; /* previous node at this level*/ + int8_t kind; /* kind of node (leaf, index, header, map)*/ + u_int8_t height; /* zero for header, map; child is one more than parent*/ + u_int16_t numRecords; /* number of records in this node*/ + u_int16_t reserved; /* reserved - initialized as zero */ +} __attribute__((aligned(2), packed)); +typedef struct BTNodeDescriptor BTNodeDescriptor; + +/* Constants for BTNodeDescriptor kind */ +enum { + kBTLeafNode = -1, + kBTIndexNode = 0, + kBTHeaderNode = 1, + kBTMapNode = 2 +}; + +/* BTHeaderRec -- The first record of a B-tree header node */ +struct BTHeaderRec { + u_int16_t treeDepth; /* maximum height (usually leaf nodes) */ + u_int32_t rootNode; /* node number of root node */ + u_int32_t leafRecords; /* number of leaf records in all leaf nodes */ + u_int32_t firstLeafNode; /* node number of first leaf node */ + u_int32_t lastLeafNode; /* node number of last leaf node */ + u_int16_t nodeSize; /* size of a node, in bytes */ + u_int16_t maxKeyLength; /* reserved */ + u_int32_t totalNodes; /* total number of nodes in tree */ + u_int32_t freeNodes; /* number of unused (free) nodes in tree */ + u_int16_t reserved1; /* unused */ + u_int32_t clumpSize; /* reserved */ + u_int8_t btreeType; /* reserved */ + u_int8_t keyCompareType; /* Key string Comparison Type */ + u_int32_t attributes; /* persistent attributes about the tree */ + u_int32_t reserved3[16]; /* reserved */ +} __attribute__((aligned(2), packed)); +typedef struct BTHeaderRec BTHeaderRec; + +/* Constants for BTHeaderRec attributes */ +enum { + kBTBadCloseMask = 0x00000001, /* reserved */ + kBTBigKeysMask = 0x00000002, /* key length field is 16 bits */ + kBTVariableIndexKeysMask = 0x00000004 /* keys in index nodes are variable length */ +}; + + +/* Catalog Key Name Comparison Type */ +enum { + kHFSCaseFolding = 0xCF, /* case folding (case-insensitive) */ + kHFSBinaryCompare = 0xBC /* binary compare (case-sensitive) */ +}; + +#include + +/* JournalInfoBlock - Structure that describes where our journal lives */ + +// the original size of the reserved field in the JournalInfoBlock was +// 32*sizeof(u_int32_t). To keep the total size of the structure the +// same we subtract the size of new fields (currently: ext_jnl_uuid and +// machine_uuid). If you add additional fields, place them before the +// reserved field and subtract their size in this macro. +// +#define JIB_RESERVED_SIZE ((32*sizeof(u_int32_t)) - sizeof(uuid_string_t) - 48) + +struct JournalInfoBlock { + u_int32_t flags; + u_int32_t device_signature[8]; // signature used to locate our device. + u_int64_t offset; // byte offset to the journal on the device + u_int64_t size; // size in bytes of the journal + uuid_string_t ext_jnl_uuid; + char machine_serial_num[48]; + char reserved[JIB_RESERVED_SIZE]; +} __attribute__((aligned(2), packed)); +typedef struct JournalInfoBlock JournalInfoBlock; + +enum { + kJIJournalInFSMask = 0x00000001, + kJIJournalOnOtherDeviceMask = 0x00000002, + kJIJournalNeedInitMask = 0x00000004 +}; + +// +// This the content type uuid for "external journal" GPT +// partitions. Each instance of a partition also has a +// uuid that uniquely identifies that instance. 
+// +#define EXTJNL_CONTENT_TYPE_UUID "4A6F7572-6E61-11AA-AA11-00306543ECAC" + + +#ifdef __cplusplus +} +#endif + +#endif /* __HFS_FORMAT__ */ diff --git a/core/hfs_fsctl.h b/core/hfs_fsctl.h new file mode 100644 index 0000000..0b89b25 --- /dev/null +++ b/core/hfs_fsctl.h @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2004-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _HFS_FSCTL_H_ +#define _HFS_FSCTL_H_ + +#include + +#include +#include +#include +#include + +#ifdef __APPLE_API_UNSTABLE + +struct hfs_backingstoreinfo { + int signature; /* == 3419115 */ + int version; /* version of this struct (1) */ + int backingfd; /* disk image file (on backing fs) */ + int bandsize; /* sparse disk image band size */ +}; + + +typedef char pathname_t[MAXPATHLEN]; + +struct hfs_journal_info { + off_t jstart; + off_t jsize; +}; + + +// Will be deprecated and replaced by hfs_fsinfo +struct hfsinfo_metadata { + uint32_t total; + uint32_t extents; + uint32_t catalog; + uint32_t allocation; + uint32_t attribute; + uint32_t journal; + uint32_t reserved[4]; +}; + +/* + * Flags for hfs_fsinfo_data structure + */ +#define HFS_FSINFO_CLASS_A 0x0001 /* Information for class A files requested */ +#define HFS_FSINFO_CLASS_B 0x0002 /* Information for class B files requested */ +#define HFS_FSINFO_CLASS_C 0x0004 /* Information for class C files requested */ +#define HFS_FSINFO_CLASS_D 0x0008 /* Information for class D files requested */ + +/* + * Maximum number of buckets to represent range from 0 to 1TB (2^40) in + * increments of power of 2, and one catch-all bucket for anything that + * is greater than 1TB + */ +#define HFS_FSINFO_DATA_MAX_BUCKETS 42 + +/* + * Maximum number of buckets to represents percentage range from 0 to 100 + * in increments of 10. + */ +#define HFS_FSINFO_PERCENT_MAX_BUCKETS 10 + +/* + * Maximum number of buckets to represent number of file/directory name characters + * (range 1 to 255) in increments of 5. + */ +#define HFS_FSINFO_NAME_MAX_BUCKETS 51 + +/* + * Version number to ensure that the caller and the kernel have same understanding + * of the hfs_fsinfo_data structure. This version needs to be bumped whenever the + * number of buckets is changed. 
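+ *
+ * A caller fills the version (along with the request type) into the
+ * hfs_fsinfo_header_t at the start of the structure; from user space the
+ * request might look something like:
+ *
+ *	hfs_fsinfo info = { .header = { .request_type = HFS_FSINFO_DIR_VALENCE,
+ *	                                .version      = HFS_FSINFO_VERSION } };
+ *	fsctl(path, HFSIOC_GET_FSINFO, &info, 0);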
+ */
+#define HFS_FSINFO_VERSION 1
+
+/*
+ * hfs_fsinfo_data is a generic data structure to aggregate information like sizes
+ * or counts in buckets of powers of 2. Each bucket represents a range of values
+ * that is determined based on its index in the array. Specifically, buckets[i]
+ * represents values that are greater than or equal to 2^(i-1) and less than 2^i,
+ * except the last bucket, which represents the range greater than or equal to 2^(i-1).
+ *
+ * The current maximum number of buckets is 42, so we can represent the range from
+ * 0 up to 1TB in increments of powers of 2, and then a catch-all bucket for
+ * anything that is greater than or equal to 1TB.
+ *
+ * For example,
+ * bucket[0] -> greater than or equal to 0 and less than 1
+ * bucket[1] -> greater than or equal to 1 and less than 2
+ * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024
+ * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB
+ * bucket[41] -> greater than or equal to 2^(41-1) = 1TB
+ *
+ * Note that fsctls that populate this data structure can take a long time to
+ * execute as this operation can be I/O intensive (traversing btrees) and compute
+ * intensive.
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+
+/*
+ * The header includes the user input fields.
+ */
+typedef struct hfs_fsinfo_header {
+ uint32_t request_type;
+ uint16_t version;
+ uint16_t flags;
+} hfs_fsinfo_header_t;
+
+struct hfs_fsinfo_data {
+ hfs_fsinfo_header_t header;
+ uint32_t bucket[HFS_FSINFO_DATA_MAX_BUCKETS];
+};
+
+/*
+ * Structure to represent information about metadata files
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_metadata {
+ hfs_fsinfo_header_t header;
+ uint32_t extents;
+ uint32_t catalog;
+ uint32_t allocation;
+ uint32_t attribute;
+ uint32_t journal;
+};
+
+/*
+ * Structure to represent distribution of number of file name characters
+ * in increments of 5s. Each bucket represents a range of values that is
+ * determined based on its index in the array. So bucket[i] represents values
+ * that are greater than or equal to (i*5) and less than ((i+1)*5).
+ *
+ * Since this structure represents range of file name characters and the
+ * maximum number of unicode characters in HFS+ is 255, the maximum number
+ * of buckets will be 52 [0..51].
+ *
+ * For example,
+ * bucket[4] -> greater than or equal to 20 and less than 25 characters
+ * bucket[51] -> equal to 255 characters
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */
+struct hfs_fsinfo_name {
+ hfs_fsinfo_header_t header;
+ uint32_t bucket[HFS_FSINFO_NAME_MAX_BUCKETS];
+};
+
+/*
+ * Structure to represent information about content protection classes
+ *
+ * WARNING: Any changes to this structure should also update version number to
+ * ensure that the clients and kernel are reading/writing correctly.
+ */ +struct hfs_fsinfo_cprotect { + hfs_fsinfo_header_t header; + uint32_t class_A; + uint32_t class_B; + uint32_t class_C; + uint32_t class_D; + uint32_t class_E; + uint32_t class_F; +}; + +/* + * Union of all the different values returned by HFSIOC_FSINFO fsctl + */ +union hfs_fsinfo { + hfs_fsinfo_header_t header; + struct hfs_fsinfo_data data; + struct hfs_fsinfo_metadata metadata; + struct hfs_fsinfo_name name; + struct hfs_fsinfo_cprotect cprotect; +}; +typedef union hfs_fsinfo hfs_fsinfo; + +/* + * Type of FSINFO requested, specified by the caller in request_type field + */ +enum { + /* Information about number of allocation blocks for each metadata file, returns struct hfs_fsinfo_metadata */ + HFS_FSINFO_METADATA_BLOCKS_INFO = 1, + + /* Information about number of extents for each metadata file, returns struct hfs_fsinfo_metadata */ + HFS_FSINFO_METADATA_EXTENTS = 2, + + /* Information about percentage of free nodes vs used nodes in metadata btrees, returns struct hfs_fsinfo_metadata */ + HFS_FSINFO_METADATA_PERCENTFREE = 3, + + /* Distribution of number of extents for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ + HFS_FSINFO_FILE_EXTENT_COUNT = 4, + + /* Distribution of extent sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ + HFS_FSINFO_FILE_EXTENT_SIZE = 5, + + /* Distribution of file sizes for data files (data fork, no rsrc fork, no xattr), returns struct hfs_fsinfo_data */ + HFS_FSINFO_FILE_SIZE = 6, + + /* Distribution of valence for all directories, returns struct hfs_fsinfo_data */ + HFS_FSINFO_DIR_VALENCE = 7, + + /* Distribution of file/directory name size in unicode characters, returns struct hfs_fsinfo_name */ + HFS_FSINFO_NAME_SIZE = 8, + + /* Distribution of extended attribute sizes, returns hfs_fsinfo_data */ + HFS_FSINFO_XATTR_SIZE = 9, + + /* Distribution of free space for the entire file system, returns struct hfs_fsinfo_data */ + HFS_FSINFO_FREE_EXTENTS = 10, + + /* Information about number of files belonging to each class, returns hfs_fsinfo_cprotect */ + HFS_FSINFO_FILE_CPROTECT_COUNT = 11, + + /* + * Distribution of symbolic link sizes for data files (data fork, no rsrc fork, no xattr), + * returns struct hfs_fsinfo_data + */ + HFS_FSINFO_SYMLINK_SIZE = 12, +}; + + +/* HFS FS CONTROL COMMANDS */ + +#define HFSIOC_RESIZE_PROGRESS _IOR('h', 1, u_int32_t) + +#define HFSIOC_RESIZE_VOLUME _IOW('h', 2, u_int64_t) + +#define HFSIOC_CHANGE_NEXT_ALLOCATION _IOWR('h', 3, u_int32_t) +/* Magic value for next allocation to use with fcntl to set next allocation + * to zero and never update it again on new block allocation. + */ +#define HFS_NO_UPDATE_NEXT_ALLOCATION 0xffffFFFF + +#if defined(KERNEL) +#define HFSIOC_GET_VOL_CREATE_TIME_32 _IOR('h', 4, int32_t) +#define HFSIOC_GET_VOL_CREATE_TIME_64 _IOR('h', 4, int64_t) +#else +#define HFSIOC_GET_VOL_CREATE_TIME _IOR('h', 4, time_t) +#endif /* KERNEL */ + +#define HFSIOC_SETBACKINGSTOREINFO _IOW('h', 7, struct hfs_backingstoreinfo) + +#define HFSIOC_CLRBACKINGSTOREINFO _IO('h', 8) + +// 'h', 9 used to be HFSIOC_BULKACCESS which is now deprecated + +/* Unsupported - Previously used to enable/disable ACLs */ +#define HFSIOC_UNSUPPORTED _IOW('h', 10, int32_t) + +#define HFSIOC_PREV_LINK _IOWR('h', 11, u_int32_t) + +#define HFSIOC_NEXT_LINK _IOWR('h', 12, u_int32_t) + +#define HFSIOC_GETPATH _IOWR('h', 13, pathname_t) +/* By default, the path returned by HFS_GETPATH is an absolute path, + * i.e. 
it also contains the mount point of the volume on which the + * fileID exists. If the following bit is set, the path returned is + * relative to the root of the volume. + */ +#define HFS_GETPATH_VOLUME_RELATIVE 0x1 + +/* Enable/disable extent-based extended attributes */ +#define HFSIOC_SET_XATTREXTENTS_STATE _IOW('h', 14, u_int32_t) + +#if defined(KERNEL) +#define HFSIOC_EXT_BULKACCESS32 _IOW('h', 15, struct user32_ext_access_t) +#define HFSIOC_EXT_BULKACCESS64 _IOW('h', 15, struct user64_ext_access_t) +#else +#define HFSIOC_EXT_BULKACCESS _IOW('h', 15, struct ext_access_t) +#endif /* KERNEL */ + +#define HFSIOC_MARK_BOOT_CORRUPT _IO('h', 16) + +#define HFSIOC_GET_JOURNAL_INFO _IOR('h', 17, struct hfs_journal_info) + +#define HFSIOC_SET_VERY_LOW_DISK _IOW('h', 20, u_int32_t) + +#define HFSIOC_SET_LOW_DISK _IOW('h', 21, u_int32_t) + +#define HFSIOC_SET_DESIRED_DISK _IOW('h', 22, u_int32_t) + +#define HFSIOC_SET_ALWAYS_ZEROFILL _IOW('h', 23, int32_t) + /* XXXJRT Keep until 31866920 is resolved. */ +#define HFS_SET_ALWAYS_ZEROFILL IOCBASECMD(HFSIOC_SET_ALWAYS_ZEROFILL) + +#define HFSIOC_VOLUME_STATUS _IOR('h', 24, u_int32_t) + +/* Disable metadata zone for given volume */ +#define HFSIOC_DISABLE_METAZONE _IO('h', 25) + +/* Change the next CNID value */ +#define HFSIOC_CHANGE_NEXTCNID _IOWR('h', 26, u_int32_t) + /* XXXJRT Keep until 31866920 is resolved. */ +#define HFS_CHANGE_NEXTCNID IOCBASECMD(HFSIOC_CHANGE_NEXTCNID) + +/* Get the low disk space values */ +#define HFSIOC_GET_VERY_LOW_DISK _IOR('h', 27, u_int32_t) + +#define HFSIOC_GET_LOW_DISK _IOR('h', 28, u_int32_t) + +#define HFSIOC_GET_DESIRED_DISK _IOR('h', 29, u_int32_t) + +/* 30 was HFSIOC_GET_WRITE_GEN_COUNTER and is now deprecated */ + +/* 31 was HFSIOC_GET_DOCUMENT_ID and is now deprecated */ + +/* revisiond only uses this when something transforms in a way the kernel can't track such as "foo.rtf" -> "foo.rtfd" */ +#define HFSIOC_TRANSFER_DOCUMENT_ID _IOW('h', 32, u_int32_t) + + +/* + * XXX: Will be deprecated and replaced by HFSIOC_GET_FSINFO + * + * Get information about number of file system allocation blocks used by metadata + * files on the volume, including individual btrees and journal file. The caller + * can determine the size of file system allocation block using value returned as + * f_bsize by statfs(2). + */ +#define HFSIOC_FSINFO_METADATA_BLOCKS _IOWR('h', 38, struct hfsinfo_metadata) + +/* Send TRIMs for all free blocks to the underlying device */ +#define HFSIOC_CS_FREESPACE_TRIM _IOWR('h', 39, u_int32_t) + + +/* Get file system information for the given volume */ +#define HFSIOC_GET_FSINFO _IOWR('h', 45, hfs_fsinfo) + +/* Re-pin hotfile data; argument controls what state gets repinned */ +#define HFSIOC_REPIN_HOTFILE_STATE _IOWR('h', 46, u_int32_t) + +#define HFS_REPIN_METADATA 0x0001 +#define HFS_REPIN_USERDATA 0x0002 + +/* Mark a directory or file as worth caching on any underlying "fast" device */ +#define HFSIOC_SET_HOTFILE_STATE _IOWR('h', 47, u_int32_t) + +/* flags to pass to SET_HOTFILE_STATE */ +#define HFS_MARK_FASTDEVCANDIDATE 0x0001 +#define HFS_UNMARK_FASTDEVCANDIDATE 0x0002 +#define HFS_NEVER_FASTDEVCANDIDATE 0x0004 + +#define HFSIOC_SET_MAX_DEFRAG_SIZE _IOWR('h', 48, u_int32_t) + +#define HFSIOC_FORCE_ENABLE_DEFRAG _IOWR('h', 49, u_int32_t) + +/* These fsctls are ported from apfs. 
*/ +#ifndef APFSIOC_SET_NEAR_LOW_DISK +#define APFSIOC_SET_NEAR_LOW_DISK _IOW('J', 17, u_int32_t) +#endif /* APFSIOC_SET_NEAR_LOW_DISK */ + +#ifndef APFSIOC_GET_NEAR_LOW_DISK +#define APFSIOC_GET_NEAR_LOW_DISK _IOR('J', 18, u_int32_t) +#endif /* APFSIOC_GET_NEAR_LOW_DISK */ + +#endif /* __APPLE_API_UNSTABLE */ + +#endif /* ! _HFS_FSCTL_H_ */ diff --git a/core/hfs_fsinfo.c b/core/hfs_fsinfo.c new file mode 100644 index 0000000..ef8f362 --- /dev/null +++ b/core/hfs_fsinfo.c @@ -0,0 +1,889 @@ +/* + * Copyright (c) 2014-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_fsctl.h" +#include "hfs_endian.h" +#include "BTreesInternal.h" +#include "BTreesPrivate.h" +#include "FileMgrInternal.h" + +#include "hfs_cprotect.h" + + +union HFSPlusRecord { + HFSPlusCatalogFolder folder_record; + HFSPlusCatalogFile file_record; + HFSPlusCatalogThread thread_record; + HFSPlusExtentRecord extent_record; + HFSPlusAttrRecord attr_record; +}; +typedef union HFSPlusRecord HFSPlusRecord; + +union HFSPlusKey { + HFSPlusExtentKey extent_key; + HFSPlusAttrKey attr_key; +}; +typedef union HFSPlusKey HFSPlusKey; + +typedef enum traverse_btree_flag { + + //If set, extents btree will also be traversed along with catalog btree, so grab correct locks upfront + TRAVERSE_BTREE_EXTENTS = 1, + + // Getting content-protection attributes, allocate enough space to accomodate the records. 
+ TRAVERSE_BTREE_XATTR_CPROTECT = 2, + +} traverse_btree_flag_t; + + + +static errno_t hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); +static errno_t hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); +static errno_t hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo); +static errno_t fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_file_extent_size_catalog_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_file_extent_size_overflow_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_file_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_dir_valence_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_name_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t fsinfo_xattr_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +static errno_t traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, int flags, void *fsinfo, + int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *)); +static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo); +static void fsinfo_free_extents_callback(void *data, off_t free_extent_size); +#if CONFIG_PROTECT +static errno_t fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); +#endif +static errno_t fsinfo_symlink_size_callback(struct hfsmount *hfsmp, HFSPlusKey *key, HFSPlusRecord *record, void *data); + +/* + * Entry function for all the fsinfo requests from hfs_vnop_ioctl() + * Depending on the type of request, this function will call the + * appropriate sub-function and return success or failure back to + * the caller. + */ +errno_t hfs_get_fsinfo(struct hfsmount *hfsmp, void *a_data) +{ + int error = 0; + hfs_fsinfo *fsinfo_union; + uint32_t request_type; + uint32_t header_len = sizeof(hfs_fsinfo_header_t); + + fsinfo_union = (hfs_fsinfo *)a_data; + request_type = fsinfo_union->header.request_type; + + // Zero out output fields to fsinfo_union, keep the user input fields intact. 
+ bzero((char *)fsinfo_union + header_len, sizeof(hfs_fsinfo) - header_len); + + switch (request_type) { + case HFS_FSINFO_METADATA_BLOCKS_INFO: + error = hfs_fsinfo_metadata_blocks(hfsmp, &(fsinfo_union->metadata)); + break; + + case HFS_FSINFO_METADATA_EXTENTS: + error = hfs_fsinfo_metadata_extents(hfsmp, &(fsinfo_union->metadata)); + break; + + case HFS_FSINFO_METADATA_PERCENTFREE: + error = hfs_fsinfo_metadata_percentfree(hfsmp, &(fsinfo_union->metadata)); + break; + + case HFS_FSINFO_FILE_EXTENT_COUNT: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, TRAVERSE_BTREE_EXTENTS, &(fsinfo_union->data), fsinfo_file_extent_count_callback); + break; + + case HFS_FSINFO_FILE_EXTENT_SIZE: + /* Traverse the catalog btree first */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_catalog_callback); + if (error) { + break; + } + /* Traverse the overflow extents btree now */ + error = traverse_btree(hfsmp, kHFSExtentsFileID, 0, &(fsinfo_union->data), &fsinfo_file_extent_size_overflow_callback); + break; + + case HFS_FSINFO_FILE_SIZE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_file_size_callback); + break; + + case HFS_FSINFO_DIR_VALENCE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_dir_valence_callback); + break; + + case HFS_FSINFO_NAME_SIZE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->name), &fsinfo_name_size_callback); + break; + + case HFS_FSINFO_XATTR_SIZE: + /* Traverse attribute btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSAttributesFileID, 0, &(fsinfo_union->data), &fsinfo_xattr_size_callback); + break; + + case HFS_FSINFO_FREE_EXTENTS: + error = hfs_fsinfo_free_extents(hfsmp, &(fsinfo_union->data)); + break; + + case HFS_FSINFO_SYMLINK_SIZE: + /* Traverse catalog btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data), &fsinfo_symlink_size_callback); + break; + +#if CONFIG_PROTECT + case HFS_FSINFO_FILE_CPROTECT_COUNT: + /* Traverse attribute btree and invoke callback for all records */ + error = traverse_btree(hfsmp, kHFSAttributesFileID, TRAVERSE_BTREE_XATTR_CPROTECT, &(fsinfo_union->cprotect), &fsinfo_cprotect_count_callback); + break; +#endif + + default: + return ENOTSUP; + }; + + return error; +} + +/* + * This function provides information about total number of allocation blocks + * for each individual metadata file. 
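+ * The btree and allocation-file figures come straight from the ff_blocks
+ * of each cnode's data fork; the journal size is converted from bytes to
+ * allocation blocks with howmany(jnl_size, blockSize).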
+ */ +static errno_t +hfs_fsinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) +{ + int lockflags = 0; + int ret_lockflags = 0; + + /* + * Getting number of allocation blocks for all metadata files + * should be a relatively quick operation, so we grab locks for all + * the btrees at the same time + */ + lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + + /* Get information about all the btrees */ + fsinfo->extents = hfsmp->hfs_extents_cp->c_datafork->ff_blocks; + fsinfo->catalog = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks; + fsinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks; + if (hfsmp->hfs_attribute_cp) + fsinfo->attribute = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks; + else + fsinfo->attribute = 0; + + /* Done with btrees, give up the locks */ + hfs_systemfile_unlock(hfsmp, ret_lockflags); + + /* Get information about journal file */ + fsinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize); + + return 0; +} + +/* + * Helper function to count the number of valid extents in a file fork structure + */ +static uint32_t +hfs_count_extents_fp(struct filefork *ff) +{ + int i; + uint32_t count = 0; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (ff->ff_data.cf_extents[i].blockCount == 0) { + break; + } + count++; + } + return count; +} + + +/* + * This is a helper function that counts the total number of valid + * extents in all the overflow extent records for given fileID + * in overflow extents btree + */ +static errno_t +hfs_count_overflow_extents(struct hfsmount *hfsmp, uint32_t fileID, uint32_t *num_extents) +{ + int error; + FCB *fcb; + struct BTreeIterator *iterator = NULL; + FSBufferDescriptor btdata; + HFSPlusExtentKey *extentKey; + HFSPlusExtentRecord extentData; + uint32_t extent_count = 0; + int i; + + fcb = VTOF(hfsmp->hfs_extents_vp); + iterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + extentKey = (HFSPlusExtentKey *) &iterator->key; + extentKey->keyLength = kHFSPlusExtentKeyMaximumLength; + extentKey->forkType = kHFSDataForkType; + extentKey->fileID = fileID; + extentKey->startBlock = 0; + + btdata.bufferAddress = &extentData; + btdata.itemSize = sizeof(HFSPlusExtentRecord); + btdata.itemCount = 1; + + /* Search for overflow extent record */ + error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); + + /* + * We used startBlock of zero, so we will not find any records and errors + * are expected. It will also position the iterator just before the first + * overflow extent record for given fileID (if any). + */ + if (error && error != fsBTRecordNotFoundErr && error != fsBTEndOfIterationErr) + goto out; + error = 0; + + for (;;) { + + if (msleep(NULL, NULL, PINOD | PCATCH, + "hfs_fsinfo", NULL) == EINTR) { + error = EINTR; + break; + } + + error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + if (error != 0) { + /* These are expected errors, so mask them */ + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + + /* If we encounter different fileID, stop the iteration */ + if (extentKey->fileID != fileID) { + break; + } + + if (extentKey->forkType != kHFSDataForkType) + break; + + /* This is our record of interest; only count the datafork extents. 
*/ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extentData[i].blockCount == 0) { + break; + } + extent_count++; + } + } + +out: + hfs_free(iterator, sizeof(*iterator)); + + if (error == 0) { + *num_extents = extent_count; + } + return MacToVFSError(error); +} + +/* + * This function provides information about total number of extents (including + * extents from overflow extents btree, if any) for each individual metadata + * file. + */ +static errno_t +hfs_fsinfo_metadata_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) +{ + int error = 0; + int lockflags = 0; + int ret_lockflags = 0; + uint32_t overflow_count; + + /* + * Counting the number of extents for all metadata files should + * be a relatively quick operation, so we grab locks for all the + * btrees at the same time + */ + lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + + /* Get number of extents for extents overflow btree */ + fsinfo->extents = hfs_count_extents_fp(hfsmp->hfs_extents_cp->c_datafork); + + /* Get number of extents for catalog btree */ + fsinfo->catalog = hfs_count_extents_fp(hfsmp->hfs_catalog_cp->c_datafork); + if (fsinfo->catalog >= kHFSPlusExtentDensity) { + error = hfs_count_overflow_extents(hfsmp, kHFSCatalogFileID, &overflow_count); + if (error) { + goto out; + } + fsinfo->catalog += overflow_count; + } + + /* Get number of extents for allocation file */ + fsinfo->allocation = hfs_count_extents_fp(hfsmp->hfs_allocation_cp->c_datafork); + if (fsinfo->allocation >= kHFSPlusExtentDensity) { + error = hfs_count_overflow_extents(hfsmp, kHFSAllocationFileID, &overflow_count); + if (error) { + goto out; + } + fsinfo->allocation += overflow_count; + } + + /* + * Get number of extents for attribute btree. + * hfs_attribute_cp might be NULL. + */ + if (hfsmp->hfs_attribute_cp) { + fsinfo->attribute = hfs_count_extents_fp(hfsmp->hfs_attribute_cp->c_datafork); + if (fsinfo->attribute >= kHFSPlusExtentDensity) { + error = hfs_count_overflow_extents(hfsmp, kHFSAttributesFileID, &overflow_count); + if (error) { + goto out; + } + fsinfo->attribute += overflow_count; + } + } + /* Journal always has one extent */ + fsinfo->journal = 1; +out: + hfs_systemfile_unlock(hfsmp, ret_lockflags); + return error; +} + +/* + * Helper function to calculate percentage i.e. X is what percent of Y? + */ +static inline uint32_t +hfs_percent(uint32_t X, uint32_t Y) +{ + return (X * 100ll) / Y; +} + +/* + * This function provides percentage of free nodes vs total nodes for each + * individual metadata btrees, i.e. for catalog, overflow extents and + * attributes btree. This information is not applicable for allocation + * file and journal file. 
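+ *
+ * Each value is hfs_percent(freeNodes, totalNodes) taken from the in-memory
+ * BTreeControlBlock. For example (illustrative numbers only), a catalog
+ * B-tree with 1000 total nodes of which 250 are free is reported as 25.
+ * If the volume has no attributes B-tree, that field is left at 0 (the
+ * output was zeroed by hfs_get_fsinfo()).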
+ */ +static errno_t +hfs_fsinfo_metadata_percentfree(struct hfsmount *hfsmp, struct hfs_fsinfo_metadata *fsinfo) +{ + int lockflags = 0; + int ret_lockflags = 0; + BTreeControlBlockPtr btreePtr; + uint32_t free_nodes, total_nodes; + + /* + * Getting total and used nodes for all metadata btrees should + * be a relatively quick operation, so we grab locks for all the + * btrees at the same time + */ + lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + + /* Overflow extents btree */ + btreePtr = VTOF(hfsmp->hfs_extents_vp)->fcbBTCBPtr; + total_nodes = btreePtr->totalNodes; + free_nodes = btreePtr->freeNodes; + fsinfo->extents = hfs_percent(free_nodes, total_nodes); + + /* Catalog btree */ + btreePtr = VTOF(hfsmp->hfs_catalog_vp)->fcbBTCBPtr; + total_nodes = btreePtr->totalNodes; + free_nodes = btreePtr->freeNodes; + fsinfo->catalog = hfs_percent(free_nodes, total_nodes); + + /* Attributes btree */ + if (hfsmp->hfs_attribute_vp) { + btreePtr = VTOF(hfsmp->hfs_attribute_vp)->fcbBTCBPtr; + total_nodes = btreePtr->totalNodes; + free_nodes = btreePtr->freeNodes; + fsinfo->attribute = hfs_percent(free_nodes, total_nodes); + } + + hfs_systemfile_unlock(hfsmp, ret_lockflags); + return 0; +} + +/* + * Helper function to calculate log base 2 for given number + */ +static inline int +hfs_log2(uint64_t entry) +{ + return (63 - __builtin_clzll(entry|1)); +} + +/* + * Helper function to account for input entry into the data + * array based on its log base 2 value + */ +void hfs_fsinfo_data_add(struct hfs_fsinfo_data *fsinfo, uint64_t entry) +{ + /* + * From hfs_fsctl.h - + * + * hfs_fsinfo_data is generic data structure to aggregate information like sizes + * or counts in buckets of power of 2. Each bucket represents a range of values + * that is determined based on its index in the array. Specifically, buckets[i] + * represents values that are greater than or equal to 2^(i-1) and less than 2^i, + * except the last bucket which represents range greater than or equal to 2^(i-1) + * + * The current maximum number of buckets is 41, so we can represent range from + * 0 up to 1TB in increments of power of 2, and then a catch-all bucket of + * anything that is greater than or equal to 1TB. + * + * For example, + * bucket[0] -> greater than or equal to 0 and less than 1 + * bucket[1] -> greater than or equal to 1 and less than 2 + * bucket[10] -> greater than or equal to 2^(10-1) = 512 and less than 2^10 = 1024 + * bucket[20] -> greater than or equal to 2^(20-1) = 512KB and less than 2^20 = 1MB + * bucket[41] -> greater than or equal to 2^(41-1) = 1TB + */ + uint32_t bucket; + + if (entry) { + /* + * Calculate log base 2 value for the entry. + * Account for this value in the appropriate bucket. + * The last bucket is a catch-all bucket of + * anything that is greater than or equal to 1TB + */ + bucket = MIN(hfs_log2(entry) + 1, HFS_FSINFO_DATA_MAX_BUCKETS-1); + ++fsinfo->bucket[bucket]; + } else { + /* Entry is zero, so account it in 0th offset */ + fsinfo->bucket[0]++; + } +} + +/* + * Function to traverse all the records of a btree and then call caller-provided + * callback function for every record found. The type of btree is chosen based + * on the fileID provided by the caller. This fuction grabs the correct locks + * depending on the type of btree it will be traversing and flags provided + * by the caller. + * + * Note: It might drop and reacquire the locks during execution. 
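+ *
+ * For example, the HFS_FSINFO_FILE_SIZE request arrives here as
+ * traverse_btree(hfsmp, kHFSCatalogFileID, 0, &(fsinfo_union->data),
+ * &fsinfo_file_size_callback): the catalog lock is taken shared and the
+ * callback is handed every leaf record in turn. Passing the
+ * TRAVERSE_BTREE_EXTENTS flag additionally takes the extents lock, which
+ * lets a callback such as fsinfo_file_extent_count_callback consult the
+ * overflow extents B-tree while it runs.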
+ */ +static errno_t +traverse_btree(struct hfsmount *hfsmp, uint32_t btree_fileID, int flags, + void *fsinfo, int (*callback)(struct hfsmount *, HFSPlusKey *, HFSPlusRecord *, void *)) +{ + int error = 0; + int lockflags = 0; + int ret_lockflags = 0; + FCB *fcb; + struct BTreeIterator *iterator = NULL; + struct FSBufferDescriptor btdata; + int btree_operation; + HFSPlusRecord record; + HFSPlusKey *key; + uint64_t start, timeout_abs; + + switch(btree_fileID) { + case kHFSExtentsFileID: + fcb = VTOF(hfsmp->hfs_extents_vp); + lockflags = SFL_EXTENTS; + break; + case kHFSCatalogFileID: + fcb = VTOF(hfsmp->hfs_catalog_vp); + lockflags = SFL_CATALOG; + break; + case kHFSAttributesFileID: + // Attributes file doesn’t exist, There are no records to iterate. + if (hfsmp->hfs_attribute_vp == NULL) + return error; + fcb = VTOF(hfsmp->hfs_attribute_vp); + lockflags = SFL_ATTRIBUTE; + break; + + default: + return EINVAL; + } + + iterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + /* The key is initialized to zero because we are traversing entire btree */ + key = (HFSPlusKey *)&iterator->key; + + if (flags & TRAVERSE_BTREE_EXTENTS) { + lockflags |= SFL_EXTENTS; + } + + btdata.bufferAddress = &record; + btdata.itemSize = sizeof(HFSPlusRecord); + btdata.itemCount = 1; + + /* Lock btree for duration of traversal */ + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_SHARED_LOCK); + btree_operation = kBTreeFirstRecord; + + nanoseconds_to_absolutetime(HFS_FSINFO_MAX_LOCKHELD_TIME, &timeout_abs); + start = mach_absolute_time(); + + while (1) { + + if (msleep(NULL, NULL, PINOD | PCATCH, + "hfs_fsinfo", NULL) == EINTR) { + error = EINTR; + break; + } + + error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL); + if (error != 0) { + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + /* Lookup next btree record on next call to BTIterateRecord() */ + btree_operation = kBTreeNextRecord; + + /* Call our callback function and stop iteration if there are any errors */ + error = callback(hfsmp, key, &record, fsinfo); + if (error) { + break; + } + + /* let someone else use the tree after we've processed over HFS_FSINFO_MAX_LOCKHELD_TIME */ + if ((mach_absolute_time() - start) >= timeout_abs) { + + /* release b-tree locks and let someone else get the lock */ + hfs_systemfile_unlock (hfsmp, ret_lockflags); + + /* add tsleep here to force context switch and fairness */ + tsleep((caddr_t)hfsmp, PRIBIO, "hfs_fsinfo", 1); + + /* + * re-acquire the locks in the same way that we wanted them originally. + * note: it is subtle but worth pointing out that in between the time that we + * released and now want to re-acquire these locks that the b-trees may have shifted + * slightly but significantly. For example, the catalog or other b-tree could have grown + * past 8 extents and now requires the extents lock to be held in order to be safely + * manipulated. We can't be sure of the state of the b-tree from where we last left off. + */ + + ret_lockflags = hfs_systemfile_lock (hfsmp, lockflags, HFS_SHARED_LOCK); + + /* + * It's highly likely that the search key we stashed away before dropping lock + * no longer points to an existing item. Iterator's IterateRecord is able to + * re-position itself and process the next record correctly. With lock dropped, + * there might be records missed for statistic gathering, which is ok. The + * point is to get aggregate values. 
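+ *
+ * This is also why the flags actually granted are kept in ret_lockflags
+ * and used for the matching unlock: after a re-acquisition the set of
+ * locks hfs_systemfile_lock() hands back may differ from the plain
+ * lockflags we asked for (for example if a b-tree has since grown past
+ * 8 extents).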
+ */ + + start = mach_absolute_time(); + + /* loop back around and get another record */ + } + } + + hfs_systemfile_unlock(hfsmp, ret_lockflags); + hfs_free(iterator, sizeof(*iterator)); + return MacToVFSError(error); +} + +/* + * Callback function to get distribution of number of extents + * for all user files in given file system. Note that this only + * accounts for data fork, no resource fork. + */ +static errno_t +fsinfo_file_extent_count_callback(struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + int i; + int error = 0; + uint32_t num_extents = 0; + uint32_t num_overflow = 0; + uint32_t blockCount; + + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Count total number of extents for this file */ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + blockCount = record->file_record.dataFork.extents[i].blockCount; + if (blockCount == 0) { + break; + } + num_extents++; + } + /* This file has overflow extent records, so search overflow btree */ + if (num_extents >= kHFSPlusExtentDensity) { + /* The caller also hold extents overflow btree lock */ + error = hfs_count_overflow_extents(hfsmp, record->file_record.fileID, &num_overflow); + if (error) { + goto out; + } + num_extents += num_overflow; + } + hfs_fsinfo_data_add(data, num_extents); + } +out: + return error; +} + +/* + * Callback function to get distribution of individual extent sizes + * (in bytes) for all user files in given file system from catalog + * btree only. Note that this only accounts for data fork, no resource + * fork. + */ +static errno_t fsinfo_file_extent_size_catalog_callback(struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + int i; + uint32_t blockCount; + uint64_t extent_size; + + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Traverse through all valid extents */ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + blockCount = record->file_record.dataFork.extents[i].blockCount; + if (blockCount == 0) { + break; + } + extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize); + hfs_fsinfo_data_add(data, extent_size); + } + } + return 0; +} + +/* + * Callback function to get distribution of individual extent sizes + * (in bytes) for all user files in given file system from overflow + * extents btree only. Note that this only accounts for data fork, + * no resource fork. + */ +static errno_t fsinfo_file_extent_size_overflow_callback(struct hfsmount *hfsmp, + HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + int i; + uint32_t blockCount; + uint64_t extent_size; + + if (key->extent_key.fileID >= kHFSFirstUserCatalogNodeID) { + // Only count the data fork extents. + if (key->extent_key.forkType == kHFSDataForkType) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + blockCount = record->extent_record[i].blockCount; + if (blockCount == 0) { + break; + } + extent_size = hfs_blk_to_bytes(blockCount, hfsmp->blockSize); + hfs_fsinfo_data_add(data, extent_size); + } + } + } + return 0; +} + +/* + * Callback function to get distribution of file sizes (in bytes) + * for all user files in given file system. Note that this only + * accounts for data fork, no resource fork. 
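+ *
+ * Worked example (illustrative sizes): a data fork whose logicalSize is
+ * 3072 bytes increments bucket[12], since hfs_log2(3072) == 11 and
+ * MIN(11 + 1, HFS_FSINFO_DATA_MAX_BUCKETS - 1) == 12; per the bucket
+ * definition described above in hfs_fsinfo_data_add(), bucket[12] covers
+ * values greater than or equal to 2048 and less than 4096. A zero-length
+ * fork is counted in bucket[0].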
+ */ +static errno_t fsinfo_file_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Record of interest, account for the size in the bucket */ + hfs_fsinfo_data_add(data, record->file_record.dataFork.logicalSize); + } + return 0; +} + +/* + * Callback function to get distribution of directory valence + * for all directories in the given file system. + */ +static errno_t fsinfo_dir_valence_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->folder_record.recordType == kHFSPlusFolderRecord) { + hfs_fsinfo_data_add(data, record->folder_record.valence); + } + return 0; +} + +/* + * Callback function to get distribution of number of unicode + * characters in name for all files and directories for a given + * file system. + */ +static errno_t fsinfo_name_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + struct hfs_fsinfo_name *fsinfo = (struct hfs_fsinfo_name *)data; + uint32_t length; + + if ((record->folder_record.recordType == kHFSPlusFolderThreadRecord) || + (record->folder_record.recordType == kHFSPlusFileThreadRecord)) { + length = record->thread_record.nodeName.length; + /* Make sure that the nodeName is bounded, otherwise return error */ + if (length > kHFSPlusMaxFileNameChars) { + return EIO; + } + + // sanity check for a name length of zero, which isn't valid on disk. + if (length == 0) + return EIO; + + /* Round it down to nearest multiple of 5 to match our buckets granularity */ + length = (length - 1)/ 5; + /* Account this value into our bucket */ + fsinfo->bucket[length]++; + } + return 0; +} + +/* + * Callback function to get distribution of size of all extended + * attributes for a given file system. + */ +static errno_t fsinfo_xattr_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->attr_record.recordType == kHFSPlusAttrInlineData) { + /* Inline attribute */ + hfs_fsinfo_data_add(data, record->attr_record.attrData.attrSize); + } else if (record->attr_record.recordType == kHFSPlusAttrForkData) { + /* Larger attributes with extents information */ + hfs_fsinfo_data_add(data, record->attr_record.forkData.theFork.logicalSize); + } + return 0; +} + + +/* + * Callback function to get distribution of free space extents for a given file system. + */ +static void fsinfo_free_extents_callback(void *data, off_t free_extent_size) +{ + // Assume a minimum of 4 KB block size + hfs_fsinfo_data_add(data, free_extent_size / 4096); +} + +/* + * Function to get distribution of free space extents for a given file system. + */ +static errno_t hfs_fsinfo_free_extents(struct hfsmount *hfsmp, struct hfs_fsinfo_data *fsinfo) +{ + return hfs_find_free_extents(hfsmp, &fsinfo_free_extents_callback, fsinfo); +} + +/* + * Callback function to get distribution of symblock link sizes (in bytes) + * for all user files in given file system. Note that this only + * accounts for data fork, no resource fork. 
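+ *
+ * Only catalog file records whose BSD mode satisfies S_ISLNK() are counted;
+ * the value added to the buckets is the data fork's logicalSize, which for
+ * a symbolic link corresponds to the length of the stored link target
+ * (on HFS Plus the link target is kept in the data fork).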
+ */ +static errno_t fsinfo_symlink_size_callback(__unused struct hfsmount *hfsmp, + __unused HFSPlusKey *key, HFSPlusRecord *record, void *data) +{ + if (record->file_record.recordType == kHFSPlusFileRecord) { + /* Record of interest, account for the size in the bucket */ + if (S_ISLNK(record->file_record.bsdInfo.fileMode)) + hfs_fsinfo_data_add((struct hfs_fsinfo_data *)data, record->file_record.dataFork.logicalSize); + } + return 0; +} + +#if CONFIG_PROTECT +/* + * Callback function to get total number of files/directories + * for each content protection class + */ +static int fsinfo_cprotect_count_callback(struct hfsmount *hfsmp, HFSPlusKey *key, + HFSPlusRecord *record, void *data) +{ + struct hfs_fsinfo_cprotect *fsinfo = (struct hfs_fsinfo_cprotect *)data; + static const uint16_t cp_xattrname_utf16[] = CONTENT_PROTECTION_XATTR_NAME_CHARS; + /* + * NOTE: cp_xattrname_utf16_len is the number of UTF-16 code units in + * the EA name string. + */ + static const size_t cp_xattrname_utf16_len = sizeof(cp_xattrname_utf16)/2; + struct cp_xattr_v5 *xattr; + size_t xattr_len = sizeof(struct cp_xattr_v5); + struct cprotect cp_entry; + struct cprotect *cp_entryp = &cp_entry; + int error = 0; + + /* Content protect xattrs are inline attributes only, so skip all others */ + if (record->attr_record.recordType != kHFSPlusAttrInlineData) + return 0; + + /* We only look at content protection xattrs */ + if ((key->attr_key.attrNameLen != cp_xattrname_utf16_len) || + (bcmp(key->attr_key.attrName, cp_xattrname_utf16, 2 * cp_xattrname_utf16_len))) { + return 0; + } + + xattr = (struct cp_xattr_v5 *)((void *)(record->attr_record.attrData.attrData)); + error = cp_read_xattr_v5(hfsmp, xattr, xattr_len, (cprotect_t *)&cp_entryp, + CP_GET_XATTR_BASIC_INFO); + if (error) + return 0; + + /* No key present, skip this record */ + if (!ISSET(cp_entry.cp_flags, CP_HAS_A_KEY)) + return 0; + + /* Now account for the persistent class */ + switch (CP_CLASS(cp_entry.cp_pclass)) { + case PROTECTION_CLASS_A: + fsinfo->class_A++; + break; + case PROTECTION_CLASS_B: + fsinfo->class_B++; + break; + case PROTECTION_CLASS_C: + fsinfo->class_C++; + break; + case PROTECTION_CLASS_D: + fsinfo->class_D++; + break; + case PROTECTION_CLASS_E: + fsinfo->class_E++; + break; + case PROTECTION_CLASS_F: + fsinfo->class_F++; + break; + }; + + return 0; +} +#endif diff --git a/core/hfs_hotfiles.c b/core/hfs_hotfiles.c new file mode 100644 index 0000000..8d07a23 --- /dev/null +++ b/core/hfs_hotfiles.c @@ -0,0 +1,3929 @@ +/* + * Copyright (c) 2003-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_endian.h" +#include "hfs_format.h" +#include "hfs_mount.h" +#include "hfs_hotfiles.h" + +#include "BTreeScanner.h" + + +#define HFC_DEBUG 0 +#define HFC_VERBOSE 0 + + +/* + * Minimum post Tiger base time. + * Thu Mar 31 17:00:00 2005 + */ +#define HFC_MIN_BASE_TIME 0x424c8f00L + +/* + * Hot File List (runtime). + */ +typedef struct hotfileinfo { + u_int32_t hf_fileid; + u_int32_t hf_temperature; + u_int32_t hf_blocks; +} hotfileinfo_t; + +typedef struct hotfilelist { + size_t hfl_size; + u_int32_t hfl_magic; + u_int32_t hfl_version; + time_t hfl_duration; /* duration of sample period */ + int hfl_count; /* count of hot files recorded */ + int hfl_next; /* next file to move */ + int hfl_totalblocks; /* total hot file blocks */ + int hfl_reclaimblks; /* blocks to reclaim in HFV */ + u_int32_t hfl_spare[2]; + hotfileinfo_t hfl_hotfile[1]; /* array of hot files */ +} hotfilelist_t; + + +/* + * Hot File Entry (runtime). + */ +typedef struct hotfile_entry { + struct hotfile_entry *left; + struct hotfile_entry *right; + u_int32_t fileid; + u_int32_t temperature; + u_int32_t blocks; +} hotfile_entry_t; + + +// +// We cap the max temperature for non-system files to "MAX_NORMAL_TEMP" +// so that they will always have a lower temperature than system (aka +// "auto-cached") files. System files have MAX_NORMAL_TEMP added to +// their temperature which produces two bands of files (all non-system +// files will have a temp less than MAX_NORMAL_TEMP and all system +// files will have a temp greatern than MAX_NORMAL_TEMP). +// +// This puts non-system files on the left side of the hotfile btree +// (and we start evicting from the left-side of the tree). The idea is +// that we will evict non-system files more aggressively since their +// working set changes much more dynamically than system files (which +// are for the most part, static). +// +// NOTE: these values have to fit into a 32-bit int. We use a +// value of 1-billion which gives a pretty broad range +// and yet should not run afoul of any sign issues. +// +#define MAX_NORMAL_TEMP 1000000000 +#define HF_TEMP_RANGE MAX_NORMAL_TEMP + + +// +// These used to be defines of the hard coded values. But if +// we're on an cooperative fusion (CF) system we need to change +// the values (which happens in hfs_recording_init() +// +uint32_t hfc_default_file_count = 1000; +uint32_t hfc_default_duration = (3600 * 60); +uint32_t hfc_max_file_count = 5000; +uint64_t hfc_max_file_size = (10 * 1024 * 1024); + + +/* + * Hot File Recording Data (runtime). 
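+ *
+ * The entries[] array at the end of hotfile_data_t doubles as the entry
+ * pool: hfs_recording_start() chains entries[i-1].right to entries[i] to
+ * build the initial freelist, hf_getnewentry() hands entries out from that
+ * list, and hf_insert()/hf_delete() (below) link them into the in-memory
+ * tree rooted at rootentry.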
+ */ +typedef struct hotfile_data { + size_t size; + struct hfsmount *hfsmp; + long refcount; + u_int32_t activefiles; /* active number of hot files */ + u_int32_t threshold; + u_int32_t maxblocks; + hotfile_entry_t *rootentry; + hotfile_entry_t *freelist; + hotfile_entry_t *coldest; + hotfile_entry_t entries[]; +} hotfile_data_t; + +static int hfs_recording_start (struct hfsmount *); +static int hfs_recording_stop (struct hfsmount *); + +/* Hotfiles pinning routines */ +static int hfs_getvnode_and_pin (struct hfsmount *hfsmp, uint32_t fileid, uint32_t *pinned); +static int hfs_pin_extent_record (struct hfsmount *hfsmp, HFSPlusExtentRecord extents, uint32_t *pinned); +static int hfs_pin_catalog_rec (struct hfsmount *hfsmp, HFSPlusCatalogFile *cfp, int rsrc); + +/* + * Hot File Data recording functions (in-memory binary tree). + */ +static int hf_insert (hotfile_data_t *, hotfile_entry_t *); +static void hf_delete (hotfile_data_t *, u_int32_t, u_int32_t); +static hotfile_entry_t * hf_coldest (hotfile_data_t *); +static hotfile_entry_t * hf_getnewentry (hotfile_data_t *); +static void hf_getsortedlist (hotfile_data_t *, hotfilelist_t *); + +#if HFC_DEBUG +static hotfile_entry_t * hf_lookup (hotfile_data_t *, u_int32_t, u_int32_t); +static void hf_maxdepth(hotfile_entry_t *, int, int *); +static void hf_printtree (hotfile_entry_t *); +#endif + +/* + * Hot File misc support functions. + */ +static int hotfiles_collect (struct hfsmount *); +static int hotfiles_age (struct hfsmount *); +static int hotfiles_adopt (struct hfsmount *); +static int hotfiles_evict (struct hfsmount *, vfs_context_t); +static int hotfiles_refine (struct hfsmount *); +static int hotextents(struct hfsmount *, HFSPlusExtentDescriptor *); +static int hfs_addhotfile_internal(struct vnode *); +static int hfs_hotfile_cur_freeblks(hfsmount_t *hfsmp); + + +/* + * Hot File Cluster B-tree (on disk) functions. + */ +static int hfc_btree_create (struct hfsmount *, unsigned int, unsigned int); +static int hfc_btree_open (struct hfsmount *, struct vnode **); +static int hfc_btree_open_ext(struct hfsmount *hfsmp, struct vnode **vpp, int ignore_btree_errs); +static int hfc_btree_close (struct hfsmount *, struct vnode *); +static int hfc_btree_delete_record(struct hfsmount *hfsmp, BTreeIterator *iterator, HotFileKey *key); +static int hfc_btree_delete(struct hfsmount *hfsmp); +static int hfc_comparekeys (HotFileKey *, HotFileKey *); + + +char hfc_tag[] = "CLUSTERED HOT FILES B-TREE "; + + +/* + *======================================================================== + * HOT FILE INTERFACE ROUTINES + *======================================================================== + */ + +/* + * Start recording the hottest files on a file system. + * + * Requires that the hfc_mutex be held. 
+ */ +static int +hfs_recording_start(struct hfsmount *hfsmp) +{ + hotfile_data_t *hotdata; + struct timeval tv; + int maxentries; + size_t size; + int i; + int error; + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || + (hfsmp->jnl == NULL) || + (hfsmp->hfs_flags & HFS_METADATA_ZONE) == 0) { + return (EPERM); + } + if (HFSTOVCB(hfsmp)->freeBlocks < (2 * (u_int32_t)hfsmp->hfs_hotfile_maxblks)) { + return (ENOSPC); + } + if (hfsmp->hfc_stage != HFC_IDLE) { + return (EBUSY); + } + hfsmp->hfc_stage = HFC_BUSY; + + if (hfsmp->hfc_recdata) { + hfs_free(hfsmp->hfc_recdata, hfsmp->hfc_recdata->size); + hfsmp->hfc_recdata = NULL; + } + if (hfsmp->hfc_filelist) { + hfs_free(hfsmp->hfc_filelist, hfsmp->hfc_filelist->hfl_size); + hfsmp->hfc_filelist = NULL; + } + + microtime(&tv); /* Times are base on GMT time. */ + + /* + * On first startup check for suspended recording. + */ + if (hfsmp->hfc_timebase == 0 && + hfc_btree_open(hfsmp, &hfsmp->hfc_filevp) == 0) { + HotFilesInfo hotfileinfo; + + if ((BTGetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, + sizeof(hotfileinfo)) == 0) && + (SWAP_BE32 (hotfileinfo.magic) == HFC_MAGIC) && + (SWAP_BE32 (hotfileinfo.timeleft) > 0) && + (SWAP_BE32 (hotfileinfo.timebase) > 0)) { + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + if (hfsmp->hfs_hotfile_freeblks == 0) { + hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - SWAP_BE32 (hotfileinfo.usedblocks); + } + hfsmp->hfc_maxfiles = 0x7fffffff; + printf("hfs: %s: %s: hotfile freeblocks: %d, max: %d\n", hfsmp->vcbVN, __FUNCTION__, + hfsmp->hfs_hotfile_freeblks, hfsmp->hfs_hotfile_maxblks); + } else { + hfsmp->hfc_maxfiles = SWAP_BE32 (hotfileinfo.maxfilecnt); + } + hfsmp->hfc_timebase = SWAP_BE32 (hotfileinfo.timebase); + int timeleft = (int)SWAP_BE32(hotfileinfo.timeleft); + if (timeleft < 0 || timeleft > (int)(HFC_DEFAULT_DURATION*2)) { + // in case this field got botched, don't let it screw things up + // printf("hfs: hotfiles: bogus looking timeleft: %d\n", timeleft); + timeleft = HFC_DEFAULT_DURATION; + } + hfsmp->hfc_timeout = timeleft + tv.tv_sec ; + /* Fix up any bogus timebase values. */ + if (hfsmp->hfc_timebase < HFC_MIN_BASE_TIME) { + hfsmp->hfc_timebase = hfsmp->hfc_timeout - HFC_DEFAULT_DURATION; + } +#if HFC_VERBOSE + printf("hfs: Resume recording hot files on %s (%d secs left (%d); timeout %ld)\n", + hfsmp->vcbVN, SWAP_BE32 (hotfileinfo.timeleft), timeleft, hfsmp->hfc_timeout - tv.tv_sec); +#endif + } else { + hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; + hfsmp->hfc_timebase = tv.tv_sec + 1; + hfsmp->hfc_timeout = hfsmp->hfc_timebase + HFC_DEFAULT_DURATION; + } + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } else { + struct cat_attr cattr; + u_int32_t cnid; + + /* + * Make sure a btree file exists. + */ + cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL); + if ((cnid == 0) && + !S_ISREG(cattr.ca_mode) && + (error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT))) { + hfsmp->hfc_stage = HFC_IDLE; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (error); + } +#if HFC_VERBOSE + printf("hfs: begin recording hot files on %s (hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n", + hfsmp->vcbVN, + hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end, + hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles); +#endif + hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; + hfsmp->hfc_timeout = tv.tv_sec + HFC_DEFAULT_DURATION; + + /* Reset time base. 
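+	 *
+	 * When no time base exists yet it simply becomes "now + 1"; otherwise it
+	 * is only ever pulled forward, so it never sits more than
+	 * HFC_CUMULATIVE_CYCLES recording periods behind the new timeout:
+	 * hfc_timebase = MAX(hfc_timebase,
+	 *                    hfc_timeout - HFC_CUMULATIVE_CYCLES * HFC_DEFAULT_DURATION)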
*/ + if (hfsmp->hfc_timebase == 0) { + hfsmp->hfc_timebase = tv.tv_sec + 1; + } else { + time_t cumulativebase; + + cumulativebase = hfsmp->hfc_timeout - (HFC_CUMULATIVE_CYCLES * HFC_DEFAULT_DURATION); + hfsmp->hfc_timebase = MAX(hfsmp->hfc_timebase, cumulativebase); + } + } + + if ((hfsmp->hfc_maxfiles == 0) || + (hfsmp->hfc_maxfiles > HFC_MAXIMUM_FILE_COUNT)) { + hfsmp->hfc_maxfiles = HFC_DEFAULT_FILE_COUNT; + } + maxentries = hfsmp->hfc_maxfiles; + + size = sizeof(hotfile_data_t) + maxentries * sizeof(hotfile_entry_t); + hotdata = hfs_mallocz(size); + hotdata->size = size; + + for (i = 1; i < maxentries ; i++) + hotdata->entries[i-1].right = &hotdata->entries[i]; + + hotdata->freelist = &hotdata->entries[0]; + /* + * Establish minimum temperature and maximum file size. + */ + hotdata->threshold = HFC_MINIMUM_TEMPERATURE; + hotdata->maxblocks = HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize; + hotdata->hfsmp = hfsmp; + + hfsmp->hfc_recdata = hotdata; + hfsmp->hfc_stage = HFC_RECORDING; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (0); +} + +/* + * Stop recording the hotest files on a file system. + * + * Requires that the hfc_mutex be held. + */ +static int +hfs_recording_stop(struct hfsmount *hfsmp) +{ + hotfile_data_t *hotdata; + hotfilelist_t *listp; + struct timeval tv; + size_t size; + enum hfc_stage newstage = HFC_IDLE; + int error; + + if (hfsmp->hfc_stage != HFC_RECORDING) + return (EPERM); + + hfsmp->hfc_stage = HFC_BUSY; + + hotfiles_collect(hfsmp); + + + /* + * Convert hot file data into a simple file id list.... + * + * then dump the sample data + */ +#if HFC_VERBOSE + printf("hfs: end of hot file recording on %s\n", hfsmp->vcbVN); +#endif + hotdata = hfsmp->hfc_recdata; + if (hotdata == NULL) + return (0); + hfsmp->hfc_recdata = NULL; + hfsmp->hfc_stage = HFC_EVALUATION; + wakeup((caddr_t)&hfsmp->hfc_stage); + +#if HFC_VERBOSE + printf("hfs: curentries: %d\n", hotdata->activefiles); +#endif + /* + * If no hot files recorded then we're done. + */ + if (hotdata->rootentry == NULL) { + error = 0; + goto out; + } + + /* Open the B-tree file for writing... */ + if (hfsmp->hfc_filevp) + panic("hfs_recording_stop: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp); + + error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); + if (error) { + goto out; + } + + /* + * Age the previous set of clustered hot files. + */ + error = hotfiles_age(hfsmp); + if (error) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + goto out; + } + + /* + * Create a sorted list of hotest files. + */ + size = sizeof(hotfilelist_t); + size += sizeof(hotfileinfo_t) * (hotdata->activefiles - 1); + listp = hfs_mallocz(size); + listp->hfl_size = size; + + hf_getsortedlist(hotdata, listp); /* NOTE: destroys hot file tree! */ + microtime(&tv); + listp->hfl_duration = tv.tv_sec - hfsmp->hfc_timebase; + hfs_assert(!hfsmp->hfc_filelist); + hfsmp->hfc_filelist = listp; + + /* + * Account for duplicates. + */ + error = hotfiles_refine(hfsmp); + if (error) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + goto out; + } + + /* + * Compute the amount of space to reclaim... 
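+	 *
+	 * i.e. hfl_reclaimblks = MIN(hfl_totalblocks, hfs_hotfile_maxblks) -
+	 * hfs_hotfile_freeblks. Illustrative numbers: if the sorted list wants
+	 * 5000 blocks, the zone holds 8000 and 3000 of them are still free,
+	 * then 2000 blocks have to be evicted first and the next stage is
+	 * HFC_EVICTION; if nothing needs reclaiming we move straight to
+	 * HFC_ADOPTION.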
+ */ + if (listp->hfl_totalblocks > hfs_hotfile_cur_freeblks(hfsmp)) { + listp->hfl_reclaimblks = + MIN(listp->hfl_totalblocks, hfsmp->hfs_hotfile_maxblks) - + hfsmp->hfs_hotfile_freeblks; +#if HFC_VERBOSE + printf("hfs_recording_stop: need to reclaim %d blocks\n", listp->hfl_reclaimblks); +#endif + if (listp->hfl_reclaimblks) + newstage = HFC_EVICTION; + else + newstage = HFC_ADOPTION; + } else { + newstage = HFC_ADOPTION; + } + + if (newstage == HFC_ADOPTION && listp->hfl_totalblocks == 0) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + newstage = HFC_IDLE; + } +out: +#if HFC_VERBOSE + if (newstage == HFC_EVICTION) + printf("hfs: evicting coldest files\n"); + else if (newstage == HFC_ADOPTION) + printf("hfs: adopting hotest files\n"); +#endif + hfs_free(hotdata, hotdata->size); + + hfsmp->hfc_stage = newstage; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (error); +} + +static void +save_btree_user_info(struct hfsmount *hfsmp) +{ + HotFilesInfo hotfileinfo; + struct timeval tv; + + microtime(&tv); + hotfileinfo.magic = SWAP_BE32 (HFC_MAGIC); + hotfileinfo.version = SWAP_BE32 (HFC_VERSION); + hotfileinfo.duration = SWAP_BE32 (HFC_DEFAULT_DURATION); + hotfileinfo.timebase = SWAP_BE32 (hfsmp->hfc_timebase); + hotfileinfo.timeleft = SWAP_BE32 (hfsmp->hfc_timeout - tv.tv_sec); + hotfileinfo.threshold = SWAP_BE32 (HFC_MINIMUM_TEMPERATURE); + hotfileinfo.maxfileblks = SWAP_BE32 (HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize); + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + hotfileinfo.usedblocks = SWAP_BE32 (hfsmp->hfs_hotfile_maxblks - hfs_hotfile_cur_freeblks(hfsmp)); +#if HFC_VERBOSE + printf("hfs: %s: saving usedblocks = %d (timeleft: %d; timeout %ld)\n", hfsmp->vcbVN, (hfsmp->hfs_hotfile_maxblks - hfsmp->hfs_hotfile_freeblks), + SWAP_BE32(hotfileinfo.timeleft), hfsmp->hfc_timeout); +#endif + } else { + hotfileinfo.maxfilecnt = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT); + } + strlcpy((char *)hotfileinfo.tag, hfc_tag, sizeof hotfileinfo.tag); + (void) BTSetUserData(VTOF(hfsmp->hfc_filevp), &hotfileinfo, sizeof(hotfileinfo)); +} + +/* + * Suspend recording the hotest files on a file system. + */ +int +hfs_recording_suspend(struct hfsmount *hfsmp) +{ + hotfile_data_t *hotdata = NULL; + int error; + + if (hfsmp->hfc_stage == HFC_DISABLED) + return (0); + + lck_mtx_lock(&hfsmp->hfc_mutex); + + /* + * XXX NOTE + * A suspend can occur during eval/evict/adopt stage. + * In that case we would need to write out info and + * flush our HFBT vnode. Currently we just bail. 
+ */ + + hotdata = hfsmp->hfc_recdata; + if (hotdata == NULL || hfsmp->hfc_stage != HFC_RECORDING) { + error = 0; + goto out; + } + hfsmp->hfc_stage = HFC_BUSY; + +#if HFC_VERBOSE + printf("hfs: suspend hot file recording on %s\n", hfsmp->vcbVN); +#endif + error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); + if (error) { + printf("hfs_recording_suspend: err %d opening btree\n", error); + goto out; + } + + if (hfs_start_transaction(hfsmp) != 0) { + goto out; + } + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + goto end_transaction; + } + + save_btree_user_info(hfsmp); + + hfs_unlock(VTOC(hfsmp->hfc_filevp)); + +end_transaction: + hfs_end_transaction(hfsmp); + +out: + if (hfsmp->hfc_filevp) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + if (hotdata) { + hfs_free(hotdata, hotdata->size); + hfsmp->hfc_recdata = NULL; + } + hfsmp->hfc_stage = HFC_DISABLED; + wakeup((caddr_t)&hfsmp->hfc_stage); + + lck_mtx_unlock(&hfsmp->hfc_mutex); + return (error); +} + + +static void +reset_file_ids(struct hfsmount *hfsmp, uint32_t *fileid_table, int num_ids) +{ + int i, error; + + for(i=0; i < num_ids; i++) { + struct vnode *vp; + + error = hfs_vget(hfsmp, fileid_table[i], &vp, 0, 0); + if (error) { + if (error == ENOENT) { + error = 0; + continue; /* stale entry, go to next */ + } + continue; + } + + // hfs_vget returns a locked cnode so no need to lock here + + if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { + error = hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, NULL); + } + + /* + * The updates to the catalog must be journaled + */ + hfs_start_transaction(hfsmp); + + // + // turn off _all_ the hotfile related bits since we're resetting state + // + if (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevCandidateMask) { + vnode_clearfastdevicecandidate(vp); + } + + VTOC(vp)->c_attr.ca_recflags &= ~(kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask); + VTOC(vp)->c_flag |= C_MODIFIED; + + hfs_update(vp, 0); + + hfs_end_transaction(hfsmp); + + hfs_unlock(VTOC(vp)); + vnode_put(vp); + } +} + +static int +flag_hotfile(struct hfsmount *hfsmp, const char *filename) +{ + struct vnode *dvp = NULL, *fvp = NULL; + vfs_context_t ctx = vfs_context_kernel(); + int error=0; + size_t fname_len; + const char *orig_fname = filename; + + if (filename == NULL) { + return EINVAL; + } + + fname_len = strlen(filename); // do NOT include the trailing '\0' so that we break out of the loop below + + error = hfs_vfs_root(HFSTOVFS(hfsmp), &dvp, ctx); + if (error) { + return (error); + } + + /* At this point, 'dvp' must be considered iocounted */ + const char *ptr; + ptr = filename; + + while (ptr < (orig_fname + fname_len - 1)) { + for(; ptr < (orig_fname + fname_len) && *ptr && *ptr != '/'; ptr++) { + /* just keep advancing till we reach the end of the string or a slash */ + } + + struct componentname cname = { + .cn_nameiop = LOOKUP, + .cn_flags = ISLASTCN, + .cn_pnbuf = __DECONST(char *, orig_fname), + .cn_nameptr = __DECONST(char *, filename), + .cn_pnlen = fname_len, + .cn_namelen = ptr - filename + }; + + struct vnop_lookup_args ap = { + .a_dvp = dvp, + .a_vpp = &fvp, + .a_cnp = &cname, + .a_context = ctx + }; + + error = hfs_vnop_lookup(&ap); + if (error) { + /* + * If 'dvp' is non-NULL, then it has an iocount. Make sure to release it + * before bailing out. VNOP_LOOKUP could legitimately return ENOENT + * if the item didn't exist or if we raced with a delete. 
+ */ + if (dvp) { + vnode_put(dvp); + dvp = NULL; + } + return error; + } + + if (ptr < orig_fname + fname_len - 1) { + // + // we've got a multi-part pathname so drop the ref on the dir, + // make dvp become what we just looked up, and advance over + // the slash character in the pathname to get to the next part + // of the component + // + vnode_put(dvp); + dvp = fvp; + fvp = NULL; + + filename = ++ptr; // skip the slash character + } + } + + if (fvp == NULL) { + error = ENOENT; + goto out; + } + + struct cnode *cp = VTOC(fvp); + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) != 0) { + goto out; + } + + hfs_start_transaction(hfsmp); + + cp->c_attr.ca_recflags |= (kHFSFastDevCandidateMask|kHFSAutoCandidateMask); + cp->c_flag |= C_MODIFIED; + + hfs_update(fvp, 0); + + hfs_end_transaction(hfsmp); + + hfs_unlock(cp); + //printf("hfs: flagged /%s with the fast-dev-candidate|auto-candidate flags\n", filename); + + +out: + if (fvp) { + vnode_put(fvp); + fvp = NULL; + } + + if (dvp) { + vnode_put(dvp); + dvp = NULL; + } + + return error; +} + + +static void +hfs_setup_default_cf_hotfiles(struct hfsmount *hfsmp) +{ + const char *system_default_hotfiles[] = { + "usr", + "System", + "Applications", + "private/var/db/dyld" + }; + int i; + + for(i=0; i < (int)(sizeof(system_default_hotfiles)/sizeof(char *)); i++) { + flag_hotfile(hfsmp, system_default_hotfiles[i]); + } +} + + +#define NUM_FILE_RESET_IDS 4096 // so we allocate 16k to hold file-ids + +static void +hfs_hotfile_reset(struct hfsmount *hfsmp) +{ + CatalogKey * keyp; + CatalogRecord * datap; + u_int32_t dataSize; + BTScanState scanstate; + BTreeIterator * iterator = NULL; + FSBufferDescriptor record; + u_int32_t data; + u_int32_t cnid; + int error = 0; + uint32_t *fileids=NULL; + int cur_id_index = 0; + + int cleared = 0; /* debug variables */ + int filecount = 0; + int dircount = 0; + +#if HFC_VERBOSE + printf("hfs: %s: %s\n", hfsmp->vcbVN, __FUNCTION__); +#endif + + iterator = hfs_mallocz(sizeof(*iterator)); + + fileids = hfs_malloc(NUM_FILE_RESET_IDS * sizeof(uint32_t)); + + record.bufferAddress = &data; + record.itemSize = sizeof(u_int32_t); + record.itemCount = 1; + + /* + * Get ready to scan the Catalog file. + */ + error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0, + kCatSearchBufferSize, &scanstate); + if (error) { + printf("hfs_hotfile_reset: err %d BTScanInit\n", error); + goto out; + } + + /* + * Visit all the catalog btree leaf records, clearing any that have the + * HotFileCached bit set. + */ + for (;;) { + error = BTScanNextRecord(&scanstate, 0, (void **)&keyp, (void **)&datap, &dataSize); + if (error) { + if (error == btNotFound) + error = 0; + else + printf("hfs_hotfile_reset: err %d BTScanNext\n", error); + break; + } + + if (datap->recordType == kHFSPlusFolderRecord && (dataSize == sizeof(HFSPlusCatalogFolder))) { + HFSPlusCatalogFolder *dirp = (HFSPlusCatalogFolder *)datap; + + dircount++; + + if ((dirp->flags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask)) == 0) { + continue; + } + + cnid = dirp->folderID; + } else if ((datap->recordType == kHFSPlusFileRecord) && (dataSize == sizeof(HFSPlusCatalogFile))) { + HFSPlusCatalogFile *filep = (HFSPlusCatalogFile *)datap; + + filecount++; + + /* + * If the file doesn't have any of the HotFileCached bits set, ignore it. 
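+	 *
+	 * "HotFileCached bits" means any of kHFSFastDevPinnedMask,
+	 * kHFSDoNotFastDevPinMask, kHFSFastDevCandidateMask or
+	 * kHFSAutoCandidateMask. Matching cnids are only collected into
+	 * fileids[] here (NUM_FILE_RESET_IDS at a time), because the catalog
+	 * must not be modified while BTScanNextRecord() is walking it;
+	 * reset_file_ids() clears the flags afterwards.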
+ */ + if ((filep->flags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask|kHFSFastDevCandidateMask|kHFSAutoCandidateMask)) == 0) { + continue; + } + + cnid = filep->fileID; + } else { + continue; + } + + /* Skip over journal files. */ + if (cnid == hfsmp->hfs_jnlfileid || cnid == hfsmp->hfs_jnlinfoblkid) { + continue; + } + + // + // Just record the cnid of the file for now. We will modify it separately + // because we can't modify the catalog while we're scanning it. + // + fileids[cur_id_index++] = cnid; + if (cur_id_index >= NUM_FILE_RESET_IDS) { + // + // We're over the limit of file-ids so we have to terminate this + // scan, go modify all the catalog records, then restart the scan. + // This is required because it's not permissible to modify the + // catalog while scanning it. + // + (void) BTScanTerminate(&scanstate, &data, &data, &data); + + reset_file_ids(hfsmp, fileids, cur_id_index); + cleared += cur_id_index; + cur_id_index = 0; + + // restart the scan + error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0, + kCatSearchBufferSize, &scanstate); + if (error) { + printf("hfs_hotfile_reset: err %d BTScanInit\n", error); + goto out; + } + continue; + } + } + + if (cur_id_index) { + reset_file_ids(hfsmp, fileids, cur_id_index); + cleared += cur_id_index; + cur_id_index = 0; + } + + printf("hfs: cleared HotFileCache related bits on %d files out of %d (dircount %d)\n", cleared, filecount, dircount); + + (void) BTScanTerminate(&scanstate, &data, &data, &data); + +out: + hfs_free(fileids, NUM_FILE_RESET_IDS * sizeof(uint32_t)); + hfs_free(iterator, sizeof(*iterator)); + + // + // If the hotfile btree exists, delete it. We need to open + // it to be able to delete it because we need the hfc_filevp + // for deletion. + // + error = hfc_btree_open_ext(hfsmp, &hfsmp->hfc_filevp, 1); + if (!error) { + printf("hfs: hotfile_reset: deleting existing hotfile btree\n"); + hfc_btree_delete(hfsmp); + } + + if (hfsmp->hfc_filevp) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + + hfsmp->hfs_hotfile_blk_adjust = 0; + hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks; +} + + +// +// This should ONLY be called by hfs_recording_init() and the special fsctl. +// +// We assume that the hotfile btree is already opened. +// +static int +hfs_hotfile_repin_files(struct hfsmount *hfsmp) +{ + BTreeIterator * iterator = NULL; + HotFileKey * key; + filefork_t * filefork; + int error = 0; + int bt_op; + enum hfc_stage stage; + uint32_t pinned_blocks; + uint32_t num_files=0, nrsrc=0; + uint32_t total_pinned=0; + + if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) || !hfsmp->hfc_filevp) { + // + // this is only meaningful if we're pinning hotfiles + // (as opposed to the regular form of hotfiles that + // get relocated to the hotfile zone) + // + return 0; + } + +#if HFC_VERBOSE + printf("hfs: %s: %s\n", hfsmp->vcbVN, __FUNCTION__); +#endif + + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + return (EPERM); + } + + + iterator = hfs_mallocz(sizeof(*iterator)); + + stage = hfsmp->hfc_stage; + hfsmp->hfc_stage = HFC_BUSY; + + bt_op = kBTreeFirstRecord; + + key = (HotFileKey*) &iterator->key; + + filefork = VTOF(hfsmp->hfc_filevp); + int lockflags; + + while (1) { + + lockflags = 0; + /* + * Obtain the first record (ie the coldest one). 
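+	 *
+	 * Records come back in ascending key order, i.e. coldest temperature
+	 * first. Once a record whose temperature field is HFC_LOOKUPTAG shows
+	 * up, only the fileID-lookup (thread) records are left in the tree, so
+	 * the loop below treats that as the end of the hot-file records.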
+ */ + if (BTIterateRecord(filefork, bt_op, iterator, NULL, NULL) != 0) { + // no more records + error = 0; + break; + } + if (key->keyLength != HFC_KEYLENGTH) { + // printf("hfs: hotfiles_repin_files: invalid key length %d\n", key->keyLength); + error = EFTYPE; + break; + } + if (key->temperature == HFC_LOOKUPTAG) { + // ran into thread records in the hotfile btree + error = 0; + break; + } + + // + // Just lookup the records in the catalog and pin the direct + // mapped extents. Faster than instantiating full vnodes + // (and thereby thrashing the system vnode cache). + // + struct cat_desc fdesc; + struct cat_attr attr; + struct cat_fork fork; + uint8_t forktype = 0; + + lockflags = hfs_systemfile_lock(hfsmp, (SFL_CATALOG | SFL_EXTENTS), HFS_SHARED_LOCK); + /* + * Snoop the cnode hash to find out if the item we want is in-core already. + * + * We largely expect this function to fail (the items we want are probably not in the hash). + * we use the special variant which bails out as soon as it finds a vnode (even if it is + * marked as open-unlinked or actually removed on-disk. If we find a vnode, then we + * release the systemfile locks and go through the pin-vnode path instead. + */ + if (hfs_chash_snoop (hfsmp, key->fileID, 1, NULL, NULL) == 0) { + pinned_blocks = 0; + + /* unlock immediately and go through the in-core path */ + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + + error = hfs_getvnode_and_pin (hfsmp, key->fileID, &pinned_blocks); + if (error) { + /* if ENOENT, then it was deleted in the catalog. Remove from our hotfiles tracking */ + if (error == ENOENT) { + hfc_btree_delete_record(hfsmp, iterator, key); + } + /* other errors, just ignore and move on with life */ + } + else { //!error + total_pinned += pinned_blocks; + num_files++; + } + + goto next; + } + + /* If we get here, we're still holding the systemfile locks */ + error = cat_idlookup(hfsmp, key->fileID, 1, 0, &fdesc, &attr, &fork); + if (error) { + // + // this file system could have been mounted while booted from a + // different partition and thus the hotfile btree would not have + // been maintained. thus a file that was hotfile cached could + // have been deleted while booted from a different partition which + // means we need to delete it from the hotfile btree. + // + // block accounting is taken care of at the end: we re-assign + // hfsmp->hfs_hotfile_freeblks based on how many blocks we actually + // pinned. + // + hfc_btree_delete_record(hfsmp, iterator, key); + + goto next; + } + + if (fork.cf_size == 0) { + // hmmm, the data is probably in the resource fork (aka a compressed file) + error = cat_idlookup(hfsmp, key->fileID, 1, 1, &fdesc, &attr, &fork); + if (error) { + hfc_btree_delete_record(hfsmp, iterator, key); + goto next; + } + forktype = 0xff; + nrsrc++; + } + + pinned_blocks = 0; + + /* Can't release the catalog /extents lock yet, we may need to go find the overflow blocks */ + error = hfs_pin_extent_record (hfsmp, fork.cf_extents, &pinned_blocks); + if (error) { + goto next; //skip to next + } + /* add in the blocks from the inline 8 */ + total_pinned += pinned_blocks; + pinned_blocks = 0; + + /* Could this file have overflow extents? 
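+	 *
+	 * The test below is the usual HFS+ check: if the last of the eight
+	 * inline extent slots (index kHFSPlusExtentDensity - 1) has a nonzero
+	 * startBlock, the fork may continue in the extents overflow B-tree, so
+	 * those extents have to be pinned as well.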
*/ + if (fork.cf_extents[kHFSPlusExtentDensity-1].startBlock) { + /* better pin them, too */ + error = hfs_pin_overflow_extents (hfsmp, key->fileID, forktype, &pinned_blocks); + if (error) { + /* If we fail to pin all of the overflow extents, then just skip to the next file */ + goto next; + } + } + + num_files++; + if (pinned_blocks) { + /* now add in any overflow also */ + total_pinned += pinned_blocks; + } + + next: + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + } + bt_op = kBTreeNextRecord; + + } /* end while */ + +#if HFC_VERBOSE + printf("hfs: hotfiles_repin_files: re-pinned %d files (nrsrc %d, total pinned %d blks; freeblock %d, maxblocks %d, calculated free: %d)\n", + num_files, nrsrc, total_pinned, hfsmp->hfs_hotfile_freeblks, hfsmp->hfs_hotfile_maxblks, + hfsmp->hfs_hotfile_maxblks - total_pinned); +#endif + // + // make sure this is accurate based on how many blocks we actually pinned + // + hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - total_pinned; + + hfs_unlock(VTOC(hfsmp->hfc_filevp)); + + hfs_free(iterator, sizeof(*iterator)); + hfsmp->hfc_stage = stage; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (error); +} + +void +hfs_repin_hotfiles(struct hfsmount *hfsmp) +{ + int error, need_close; + + lck_mtx_lock(&hfsmp->hfc_mutex); + + if (hfsmp->hfc_filevp == NULL) { + error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); + if (!error) { + need_close = 1; + } else { + printf("hfs: failed to open the btree err=%d. Unable to re-pin hotfiles.\n", error); + lck_mtx_unlock(&hfsmp->hfc_mutex); + return; + } + } else { + need_close = 0; + } + + hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL); + + hfs_hotfile_repin_files(hfsmp); + + if (need_close) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + + lck_mtx_unlock(&hfsmp->hfc_mutex); +} + +/* + * For a given file ID, find and pin all of its overflow extents to the underlying CS + * device. Assumes that the extents overflow b-tree is locked for the duration of this call. + * + * Emit the number of blocks pinned in output argument 'pinned' + * + * Return success or failure (errno) in return value. + * + */ +int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid, + uint8_t forktype, uint32_t *pinned) { + + struct BTreeIterator *ext_iter = NULL; + ExtentKey *ext_key_ptr = NULL; + ExtentRecord ext_data; + FSBufferDescriptor btRecord; + uint16_t btRecordSize; + int error = 0; + + uint32_t pinned_blocks = 0; + + + ext_iter = hfs_mallocz(sizeof (*ext_iter)); + + BTInvalidateHint (ext_iter); + ext_key_ptr = (ExtentKey*)&ext_iter->key; + btRecord.bufferAddress = &ext_data; + btRecord.itemCount = 1; + + /* + * This is like when you delete a file; we don't actually need most of the search machinery because + * we are going to need all of the extent records that belong to this file (for a given fork type), + * so we might as well use a straight-up iterator. + * + * Position the B-Tree iterator at the first record with this file ID + */ + btRecord.itemSize = sizeof (HFSPlusExtentRecord); + ext_key_ptr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; + ext_key_ptr->hfsPlus.forkType = forktype; + ext_key_ptr->hfsPlus.pad = 0; + ext_key_ptr->hfsPlus.fileID = fileid; + ext_key_ptr->hfsPlus.startBlock = 0; + + error = BTSearchRecord (VTOF(hfsmp->hfs_extents_vp), ext_iter, &btRecord, &btRecordSize, ext_iter); + if (error == btNotFound) { + /* empty b-tree, so that's ok. we'll fall out during error check below. 
*/ + error = 0; + } + + while (1) { + uint32_t found_fileid; + uint32_t pblocks; + + error = BTIterateRecord (VTOF(hfsmp->hfs_extents_vp), kBTreeNextRecord, ext_iter, &btRecord, &btRecordSize); + if (error) { + /* swallow it if it's btNotFound, otherwise just bail out */ + if (error == btNotFound) + error = 0; + break; + } + + found_fileid = ext_key_ptr->hfsPlus.fileID; + /* + * We only do one fork type at a time. So if either the fork-type doesn't + * match what we are looking for (resource or data), OR the file id doesn't match + * which indicates that there's nothing more with this file ID as the key, then bail out + */ + if ((found_fileid != fileid) || (ext_key_ptr->hfsPlus.forkType != forktype)) { + error = 0; + break; + } + + /* Otherwise, we now have an extent record. Process and pin all of the file extents. */ + pblocks = 0; + error = hfs_pin_extent_record (hfsmp, ext_data.hfsPlus, &pblocks); + + if (error) { + break; + } + pinned_blocks += pblocks; + + /* if 8th extent is empty, then bail out */ + if (ext_data.hfsPlus[kHFSPlusExtentDensity-1].startBlock == 0) { + error = 0; + break; + } + + } // end extent-getting loop + + /* dump the iterator */ + hfs_free(ext_iter, sizeof(*ext_iter)); + + if (error == 0) { + /* + * In the event that the file has no overflow extents, pinned_blocks + * will never be updated, so we'll properly export 0 pinned blocks to caller + */ + *pinned = pinned_blocks; + } + + return error; + +} + + +static int +hfs_getvnode_and_pin (struct hfsmount *hfsmp, uint32_t fileid, uint32_t *pinned) { + struct vnode *vp; + int error = 0; + *pinned = 0; + uint32_t pblocks; + + /* + * Acquire the vnode for this file. This returns a locked cnode on success + */ + error = hfs_vget(hfsmp, fileid, &vp, 0, 0); + if (error) { + /* It's possible the file was open-unlinked. In this case, we'll get ENOENT back. */ + return error; + } + + /* + * Symlinks that may have been inserted into the hotfile zone during a previous OS are now stuck + * here. We do not want to move them. + */ + if (!vnode_isreg(vp)) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); + return EPERM; + } + + if (!(VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); + return EINVAL; + } + + error = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, &pblocks); + if (error == 0) { + *pinned = pblocks; + } + + hfs_unlock(VTOC(vp)); + vnode_put(vp); + + return error; + +} + +/* + * Pins an HFS Extent record to the underlying CoreStorage. Assumes that Catalog & Extents overflow + * B-trees are held locked, as needed. + * + * Returns the number of blocks pinned in the output argument 'pinned' + * + * Returns error status (0 || errno) in return value. + */ +static int hfs_pin_extent_record (struct hfsmount *hfsmp, HFSPlusExtentRecord extents, uint32_t *pinned) { + uint32_t pb = 0; + int i; + int error = 0; + + if (pinned == NULL) { + return EINVAL; + } + *pinned = 0; + + + + /* iterate through the extents */ + for ( i = 0; i < kHFSPlusExtentDensity; i++) { + if (extents[i].startBlock == 0) { + break; + } + + error = hfs_pin_block_range (hfsmp, HFS_PIN_IT, extents[i].startBlock, + extents[i].blockCount); + + if (error) { + break; + } + pb += extents[i].blockCount; + } + + *pinned = pb; + + return error; +} + +/* + * Consume an HFS Plus on-disk catalog record and pin its blocks + * to the underlying CS devnode. + * + * NOTE: This is an important distinction! 
+ * This function takes in an HFSPlusCatalogFile* which is the actual + * 200-some-odd-byte on-disk representation in the Catalog B-Tree (not + * one of the run-time structs that we normally use. + * + * This assumes that the catalog and extents-overflow btrees + * are locked, at least in shared mode + */ +static int hfs_pin_catalog_rec (struct hfsmount *hfsmp, HFSPlusCatalogFile *cfp, int rsrc) { + uint32_t pinned_blocks = 0; + HFSPlusForkData *forkdata; + int error = 0; + uint8_t forktype = 0; + + if (rsrc) { + forkdata = &cfp->resourceFork; + forktype = 0xff; + } + else { + forkdata = &cfp->dataFork; + } + + uint32_t pblocks = 0; + + /* iterate through the inline extents */ + error = hfs_pin_extent_record (hfsmp, forkdata->extents, &pblocks); + if (error) { + return error; + } + + pinned_blocks += pblocks; + pblocks = 0; + + /* it may have overflow extents */ + if (forkdata->extents[kHFSPlusExtentDensity-1].startBlock != 0) { + error = hfs_pin_overflow_extents (hfsmp, cfp->fileID, forktype, &pblocks); + } + pinned_blocks += pblocks; + + hfsmp->hfs_hotfile_freeblks -= pinned_blocks; + + return error; +} + + +/* + * + */ +int +hfs_recording_init(struct hfsmount *hfsmp) +{ + CatalogKey * keyp; + CatalogRecord * datap; + u_int32_t dataSize; + HFSPlusCatalogFile *filep; + BTScanState scanstate; + BTreeIterator * iterator = NULL; + FSBufferDescriptor record; + HotFileKey * key; + filefork_t * filefork; + u_int32_t data; + struct cat_attr cattr; + u_int32_t cnid; + int error = 0; + long starting_temp; + + int started_tr = 0; + int started_scan = 0; + + int inserted = 0; /* debug variables */ + int filecount = 0; + int uncacheable = 0; + + /* + * For now, only the boot volume is supported. + */ + if ((vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) == 0) { + hfsmp->hfc_stage = HFC_DISABLED; + return (EPERM); + } + + /* We grab the HFC mutex even though we're not fully mounted yet, just for orderliness */ + lck_mtx_lock (&hfsmp->hfc_mutex); + + /* + * Tracking of hot files requires up-to-date access times. + * So if access time updates are disabled, then we disable + * hot files, too. + */ + if (vfs_flags(HFSTOVFS(hfsmp)) & MNT_NOATIME) { + hfsmp->hfc_stage = HFC_DISABLED; + lck_mtx_unlock (&hfsmp->hfc_mutex); + return EPERM; + } + + // + // Check if we've been asked to suspend operation + // + cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, ".hotfile-suspend", &cattr, NULL); + if (cnid != 0) { + printf("hfs: %s: %s: hotfiles explicitly disabled! remove /.hotfiles-suspend to re-enable\n", hfsmp->vcbVN, __FUNCTION__); + hfsmp->hfc_stage = HFC_DISABLED; + lck_mtx_unlock (&hfsmp->hfc_mutex); + return EPERM; + } + + // + // Check if we've been asked to reset our state. + // + cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, ".hotfile-reset", &cattr, NULL); + if (cnid != 0) { + hfs_hotfile_reset(hfsmp); + } + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + // + // Cooperative Fusion (CF) systems use different constants + // than traditional hotfile systems. These were picked after a bit of + // experimentation - we can cache many more files on the + // ssd in an CF system and we can do so more rapidly + // so bump the limits considerably (and turn down the + // duration so that it doesn't take weeks to adopt all + // the files). + // + hfc_default_file_count = 20000; + hfc_default_duration = 300; // 5min + hfc_max_file_count = 50000; + hfc_max_file_size = (512ULL * 1024ULL * 1024ULL); + } + + /* + * If the Hot File btree exists then metadata zone is ready. 
+ */ + cnid = GetFileInfo(HFSTOVCB(hfsmp), kRootDirID, HFC_FILENAME, &cattr, NULL); + if (cnid != 0 && S_ISREG(cattr.ca_mode)) { + int recreate = 0; + + if (hfsmp->hfc_stage == HFC_DISABLED) + hfsmp->hfc_stage = HFC_IDLE; + hfsmp->hfs_hotfile_freeblks = 0; + + if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && cattr.ca_blocks > 0) { + // + // make sure the hotfile btree is pinned + // + error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); + if (!error) { + /* XXX: must fix hfs_pin_vnode too */ + hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL); + + } else { + printf("hfs: failed to open the btree err=%d. Recreating hotfile btree.\n", error); + recreate = 1; + } + + hfs_hotfile_repin_files(hfsmp); + + if (hfsmp->hfc_filevp) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + + } else if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + // hmmm, the hotfile btree is zero bytes long? how odd. let's recreate it. + printf("hfs: hotfile btree is zero bytes long?! recreating it.\n"); + recreate = 1; + } + + if (!recreate) { + /* don't forget to unlock the mutex */ + lck_mtx_unlock (&hfsmp->hfc_mutex); + return (0); + } else { + // + // open the hotfile btree file ignoring errors because + // we need the vnode pointer for hfc_btree_delete() to + // be able to do its work + // + error = hfc_btree_open_ext(hfsmp, &hfsmp->hfc_filevp, 1); + if (!error) { + // and delete it! + error = hfc_btree_delete(hfsmp); + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + } + } + + printf("hfs: %s: %s: creating the hotfile btree\n", hfsmp->vcbVN, __FUNCTION__); + if (hfs_start_transaction(hfsmp) != 0) { + lck_mtx_unlock (&hfsmp->hfc_mutex); + return EINVAL; + } + + /* B-tree creation must be journaled */ + started_tr = 1; + + error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT); + if (error) { +#if HFC_VERBOSE + printf("hfs: Error %d creating hot file b-tree on %s \n", error, hfsmp->vcbVN); +#endif + goto recording_init_out; + } + + hfs_end_transaction (hfsmp); + started_tr = 0; + /* + * Do a journal flush + flush track cache. We have to ensure that the async I/Os have been issued to the media + * before proceeding. + */ + hfs_flush (hfsmp, HFS_FLUSH_FULL); + + /* now re-start a new transaction */ + if (hfs_start_transaction (hfsmp) != 0) { + lck_mtx_unlock (&hfsmp->hfc_mutex); + return EINVAL; + } + started_tr = 1; + + /* + * Open the Hot File B-tree file for writing. + */ + if (hfsmp->hfc_filevp) + panic("hfs_recording_init: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp); + + error = hfc_btree_open(hfsmp, &hfsmp->hfc_filevp); + if (error) { +#if HFC_VERBOSE + printf("hfs: Error %d opening hot file b-tree on %s \n", error, hfsmp->vcbVN); +#endif + goto recording_init_out; + } + + /* + * This function performs work similar to namei; we must NOT hold the catalog lock while + * calling it. This will decorate catalog records as being pinning candidates. (no hotfiles work) + */ + hfs_setup_default_cf_hotfiles(hfsmp); + + /* + * now grab the hotfiles b-tree vnode/cnode lock first, as it is not classified as a systemfile. 
+ */ + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + error = EPERM; + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + /* zero it out to avoid pinning later on */ + hfsmp->hfc_filevp = NULL; + goto recording_init_out; + } + + iterator = hfs_mallocz(sizeof(*iterator)); + + key = (HotFileKey*) &iterator->key; + key->keyLength = HFC_KEYLENGTH; + + record.bufferAddress = &data; + record.itemSize = sizeof(u_int32_t); + record.itemCount = 1; + +#if HFC_VERBOSE + printf("hfs: Evaluating space for \"%s\" metadata zone... (freeblks %d)\n", HFSTOVCB(hfsmp)->vcbVN, + hfsmp->hfs_hotfile_freeblks); +#endif + + /* + * Get ready to scan the Catalog file. We explicitly do NOT grab the catalog lock because + * we're fully single-threaded at the moment (by virtue of being called during mount()), + * and if we have to grow the hotfile btree, then we would need to grab the catalog lock + * and if we take a shared lock here, it would deadlock (see ) + * + * We already started a transaction so we should already be holding the journal lock at this point. + * Note that we have to hold the journal lock / start a txn BEFORE the systemfile locks. + */ + + error = BTScanInitialize(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), 0, 0, 0, + kCatSearchBufferSize, &scanstate); + if (error) { + printf("hfs_recording_init: err %d BTScanInit\n", error); + + /* drop the systemfile locks */ + hfs_unlock(VTOC(hfsmp->hfc_filevp)); + + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + + /* zero it out to avoid pinning */ + hfsmp->hfc_filevp = NULL; + goto recording_init_out; + } + + started_scan = 1; + + filefork = VTOF(hfsmp->hfc_filevp); + + starting_temp = random() % HF_TEMP_RANGE; + + /* + * Visit all the catalog btree leaf records. We have to hold the catalog lock to do this. + * + * NOTE: The B-Tree scanner reads from the media itself. Under normal circumstances it would be + * fine to simply use b-tree routines to read blocks that correspond to b-tree nodes, because the + * block cache is going to ensure you always get the cached copy of a block (even if a journal + * txn has modified one of those blocks). That is NOT true when + * using the scanner. In particular, it will always read whatever is on-disk. So we have to ensure + * that the journal has flushed and that the async I/Os to the metadata files have been issued. + */ + for (;;) { + error = BTScanNextRecord(&scanstate, 0, (void **)&keyp, (void **)&datap, &dataSize); + if (error) { + if (error == btNotFound) + error = 0; + else + printf("hfs_recording_init: err %d BTScanNext\n", error); + break; + } + if ((datap->recordType != kHFSPlusFileRecord) || + (dataSize != sizeof(HFSPlusCatalogFile))) { + continue; + } + filep = (HFSPlusCatalogFile *)datap; + filecount++; + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + if (filep->flags & kHFSDoNotFastDevPinMask) { + uncacheable++; + } + + // + // If the file does not have the FastDevPinnedMask set, we + // can ignore it and just go to the next record. + // + if ((filep->flags & kHFSFastDevPinnedMask) == 0) { + continue; + } + } else if (filep->dataFork.totalBlocks == 0) { + continue; + } + + /* + * On a regular hdd, any file that has blocks inside + * the hot file space is recorded for later eviction. + * + * For now, resource forks are ignored. + * + * We don't do this on CF systems as there is no real + * hotfile area - we just pin/unpin blocks belonging to + * interesting files. 
+ */ + if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && !hotextents(hfsmp, &filep->dataFork.extents[0])) { + continue; + } + cnid = filep->fileID; + + /* Skip over journal files and the hotfiles B-Tree file. */ + if (cnid == hfsmp->hfs_jnlfileid + || cnid == hfsmp->hfs_jnlinfoblkid + || cnid == VTOC(hfsmp->hfc_filevp)->c_fileid) { + continue; + } + /* + * XXX - need to skip quota files as well. + */ + + uint32_t temp; + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + int rsrc = 0; + + temp = (uint32_t)starting_temp++; + if (filep->flags & kHFSAutoCandidateMask) { + temp += MAX_NORMAL_TEMP; + } + + /* use the data fork by default */ + if (filep->dataFork.totalBlocks == 0) { + /* + * but if empty, switch to rsrc as its likely + * a compressed file + */ + rsrc = 1; + } + + error = hfs_pin_catalog_rec (hfsmp, filep, rsrc); + if (error) + break; + + } else { + temp = HFC_MINIMUM_TEMPERATURE; + } + + /* Insert a hot file entry. */ + key->keyLength = HFC_KEYLENGTH; + key->temperature = temp; + key->fileID = cnid; + key->forkType = 0; + data = 0x3f3f3f3f; + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); + if (error) { + printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); + error = MacToVFSError(error); + break; + } + + /* Insert the corresponding thread record. */ + key->keyLength = HFC_KEYLENGTH; + key->temperature = HFC_LOOKUPTAG; + key->fileID = cnid; + key->forkType = 0; + data = temp; + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); + if (error) { + printf("hfs_recording_init: BTInsertRecord failed %d (fileid %d)\n", error, key->fileID); + error = MacToVFSError(error); + break; + } + inserted++; + } // end catalog iteration loop + + save_btree_user_info(hfsmp); + (void) BTFlushPath(filefork); + +recording_init_out: + + /* Unlock first, then pin after releasing everything else */ + if (hfsmp->hfc_filevp) { + hfs_unlock (VTOC(hfsmp->hfc_filevp)); + } + + if (started_scan) { + (void) BTScanTerminate (&scanstate, &data, &data, &data); + } + + if (started_tr) { + hfs_end_transaction(hfsmp); + } + +#if HFC_VERBOSE + printf("hfs: %d files identified out of %d (freeblocks is now: %d)\n", inserted, filecount, hfsmp->hfs_hotfile_freeblks); + if (uncacheable) { + printf("hfs: %d files were marked as uncacheable\n", uncacheable); + } +#endif + + if (iterator) + hfs_free(iterator, sizeof(*iterator)); + + if (hfsmp->hfc_filevp) { + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + hfs_pin_vnode(hfsmp, hfsmp->hfc_filevp, HFS_PIN_IT, NULL); + } + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + + if (error == 0) + hfsmp->hfc_stage = HFC_IDLE; + + /* Finally, unlock the HFC mutex */ + lck_mtx_unlock (&hfsmp->hfc_mutex); + + return (error); +} + +/* + * Use sync to perform ocassional background work. + */ +int +hfs_hotfilesync(struct hfsmount *hfsmp, vfs_context_t ctx) +{ + if (hfsmp->hfc_stage) { + struct timeval tv; + + lck_mtx_lock(&hfsmp->hfc_mutex); + + switch (hfsmp->hfc_stage) { + case HFC_IDLE: + (void) hfs_recording_start(hfsmp); + break; + + case HFC_RECORDING: + microtime(&tv); + if (tv.tv_sec > hfsmp->hfc_timeout) + (void) hfs_recording_stop(hfsmp); + break; + + case HFC_EVICTION: + (void) hotfiles_evict(hfsmp, ctx); + break; + + case HFC_ADOPTION: + (void) hotfiles_adopt(hfsmp); + break; + default: + break; + } + + lck_mtx_unlock(&hfsmp->hfc_mutex); + } + return (0); +} + +/* + * Add a hot file to the recording list. 
+ * + * This can happen when a hot file gets reclaimed or at the + * end of the recording period for any active hot file. + * + * NOTE: Since both the data and resource fork can be hot, + * there can be two entries for the same file id. + * + * Note: the cnode is locked on entry. + */ +int +hfs_addhotfile(struct vnode *vp) +{ + hfsmount_t *hfsmp; + int error; + + hfsmp = VTOHFS(vp); + if (hfsmp->hfc_stage != HFC_RECORDING) + return (0); + + lck_mtx_lock(&hfsmp->hfc_mutex); + error = hfs_addhotfile_internal(vp); + lck_mtx_unlock(&hfsmp->hfc_mutex); + return (error); +} + +static int +hf_ignore_process(const char *pname, size_t maxlen) +{ + if ( strncmp(pname, "mds", maxlen) == 0 + || strncmp(pname, "mdworker", maxlen) == 0 + || strncmp(pname, "mds_stores", maxlen) == 0 + || strncmp(pname, "makewhatis", maxlen) == 0) { + return 1; + } + + return 0; + +} + +static int +hfs_addhotfile_internal(struct vnode *vp) +{ + hotfile_data_t *hotdata; + hotfile_entry_t *entry; + hfsmount_t *hfsmp; + cnode_t *cp; + filefork_t *ffp; + u_int32_t temperature; + + hfsmp = VTOHFS(vp); + if (hfsmp->hfc_stage != HFC_RECORDING) + return (0); + + /* + * Only regular files are eligible for hotfiles addition. + * + * Symlinks were previously added to the list and may exist in + * extant hotfiles regions, but no new ones will be added, and no + * symlinks will now be relocated/evicted from the hotfiles region. + */ + if (!vnode_isreg(vp) || vnode_issystem(vp)) { + return (0); + } + + /* Skip resource forks for now. */ + if (VNODE_IS_RSRC(vp)) { + return (0); + } + if ((hotdata = hfsmp->hfc_recdata) == NULL) { + return (0); + } + ffp = VTOF(vp); + cp = VTOC(vp); + + if (cp->c_attr.ca_recflags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask)) { + // it's already a hotfile or can't be a hotfile... + return 0; + } + + if (vnode_isdir(vp) || vnode_issystem(vp) || (cp->c_flag & (C_DELETED | C_NOEXISTS))) { + return 0; + } + + if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && vnode_isfastdevicecandidate(vp)) { + // + // On cooperative fusion (CF) systems we have different criteria for whether something + // can be pinned to the ssd. + // + if (cp->c_flag & (C_DELETED|C_NOEXISTS)) { + // + // dead files are definitely not worth caching + // + return 0; + } else if (ffp->ff_blocks == 0 && !(cp->c_bsdflags & UF_COMPRESSED) && !(cp->c_attr.ca_recflags & kHFSFastDevCandidateMask)) { + // + // empty files aren't worth caching but compressed ones might be, as are + // newly created files that live in WorthCaching directories... + // + return 0; + } + + char pname[256]; + pname[0] = '\0'; + proc_selfname(pname, sizeof(pname)); + if (hf_ignore_process(pname, sizeof(pname))) { + // ignore i/o's from certain system daemons + return 0; + } + + temperature = cp->c_fileid; // in memory we just keep it sorted by file-id + } else { + // the normal hard drive based hotfile checks + if ((ffp->ff_bytesread == 0) || + (ffp->ff_blocks == 0) || + (ffp->ff_size == 0) || + (ffp->ff_blocks > hotdata->maxblocks) || + (cp->c_bsdflags & (UF_NODUMP | UF_COMPRESSED)) || + (cp->c_atime < hfsmp->hfc_timebase)) { + return (0); + } + + temperature = ffp->ff_bytesread / ffp->ff_size; + if (temperature < hotdata->threshold) { + return (0); + } + } + + /* + * If there is room or this file is hotter than + * the coldest one then add it to the list. 
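 *
 * [Editor's note, illustrative only; not part of the original change:
 *  on rotational media the temperature computed above is simply bytes
 *  read divided by file size for the current recording period, e.g.
 *
 *      2 MB file, 50 MB read during the period  ->  temperature 25
 *      2 MB file, read end-to-end exactly once  ->  temperature 1 (cold)
 *
 *  while on CF systems the file id itself is used, which keeps the
 *  in-memory tree ordered by file id rather than by heat.]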
+ * + */ + if ((hotdata->activefiles < hfsmp->hfc_maxfiles) || + (hotdata->coldest == NULL) || + (temperature >= hotdata->coldest->temperature)) { + ++hotdata->refcount; + entry = hf_getnewentry(hotdata); + entry->temperature = temperature; + entry->fileid = cp->c_fileid; + // + // if ffp->ff_blocks is zero, it might be compressed so make sure we record + // that there's at least one block. + // + entry->blocks = ffp->ff_blocks ? ffp->ff_blocks : 1; + if (hf_insert(hotdata, entry) == EEXIST) { + // entry is already present, don't need to add it again + entry->right = hotdata->freelist; + hotdata->freelist = entry; + } + --hotdata->refcount; + } + + return (0); +} + +/* + * Remove a hot file from the recording list. + * + * This can happen when a hot file becomes + * an active vnode (active hot files are + * not kept in the recording list until the + * end of the recording period). + * + * Note: the cnode is locked on entry. + */ +int +hfs_removehotfile(struct vnode *vp) +{ + hotfile_data_t *hotdata; + hfsmount_t *hfsmp; + cnode_t *cp; + filefork_t *ffp; + u_int32_t temperature; + + hfsmp = VTOHFS(vp); + if (hfsmp->hfc_stage != HFC_RECORDING) + return (0); + + if ((!vnode_isreg(vp)) || vnode_issystem(vp)) { + return (0); + } + + ffp = VTOF(vp); + cp = VTOC(vp); + + if ((ffp->ff_bytesread == 0) || (ffp->ff_blocks == 0) || + (ffp->ff_size == 0) || (cp->c_atime < hfsmp->hfc_timebase)) { + return (0); + } + + lck_mtx_lock(&hfsmp->hfc_mutex); + if (hfsmp->hfc_stage != HFC_RECORDING) + goto out; + if ((hotdata = hfsmp->hfc_recdata) == NULL) + goto out; + + temperature = ffp->ff_bytesread / ffp->ff_size; + if (temperature < hotdata->threshold) + goto out; + + if (hotdata->coldest && (temperature >= hotdata->coldest->temperature)) { + ++hotdata->refcount; + hf_delete(hotdata, VTOC(vp)->c_fileid, temperature); + --hotdata->refcount; + } +out: + lck_mtx_unlock(&hfsmp->hfc_mutex); + return (0); +} + +int +hfs_hotfile_deleted(__unused struct vnode *vp) +{ +#if 1 + return 0; +#else + // + // XXXdbg - this code, while it would work, would introduce a huge inefficiency + // to deleting files as the way it's written would require us to open + // the hotfile btree on every open, delete two records in it and then + // close the hotfile btree (which involves more writes). + // + // We actually can be lazy about deleting hotfile records for files + // that get deleted. When it's time to evict things, if we encounter + // a record that references a dead file (i.e. a fileid which no + // longer exists), the eviction code will remove the records. Likewise + // the code that scans the HotFile B-Tree at boot time to re-pin files + // will remove dead records. 
+ // + + hotfile_data_t *hotdata; + hfsmount_t *hfsmp; + cnode_t *cp; + filefork_t *filefork; + u_int32_t temperature; + BTreeIterator * iterator = NULL; + FSBufferDescriptor record; + HotFileKey *key; + u_int32_t data; + int error=0; + + cp = VTOC(vp); + if (cp == NULL || !(cp->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { + return 0; + } + + hfsmp = VTOHFS(vp); + if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { + return 0; + } + + if (hfc_btree_open(hfsmp, &hfsmp->hfc_filevp) != 0 || hfsmp->hfc_filevp == NULL) { + // either there is no hotfile info or it's damaged + return EINVAL; + } + + filefork = VTOF(hfsmp->hfc_filevp); + if (filefork == NULL) { + return 0; + } + + iterator = hfs_mallocz(sizeof(*iterator)); + + key = (HotFileKey*) &iterator->key; + + record.bufferAddress = &data; + record.itemSize = sizeof(u_int32_t); + record.itemCount = 1; + + key->keyLength = HFC_KEYLENGTH; + key->temperature = HFC_LOOKUPTAG; + key->fileID = cp->c_fileid; + key->forkType = 0; + + lck_mtx_lock(&hfsmp->hfc_mutex); + (void) BTInvalidateHint(iterator); + if (BTSearchRecord(filefork, iterator, &record, NULL, iterator) == 0) { + temperature = key->temperature; + hfc_btree_delete_record(hfsmp, iterator, key); + } else { + //printf("hfs: hotfile_deleted: did not find fileid %d\n", cp->c_fileid); + error = ENOENT; + } + + if ((hotdata = hfsmp->hfc_recdata) != NULL) { + // just in case, also make sure it's removed from the in-memory list as well + ++hotdata->refcount; + hf_delete(hotdata, cp->c_fileid, cp->c_fileid); + --hotdata->refcount; + } + + lck_mtx_unlock(&hfsmp->hfc_mutex); + hfs_free(iterator, sizeof(*iterator)); + + hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + + return error; +#endif +} + +int +hfs_hotfile_adjust_blocks(struct vnode *vp, int64_t num_blocks) +{ + hfsmount_t *hfsmp; + + if (vp == NULL) { + return 0; + } + + hfsmp = VTOHFS(vp); + + if (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) || num_blocks == 0 || vp == NULL) { + return 0; + } + + // + // if file is not HotFileCached or it has the CanNotHotFile cache + // bit set then there is nothing to do + // + if (!(VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask) || (VTOC(vp)->c_attr.ca_recflags & kHFSDoNotFastDevPinMask)) { + // it's not a hot file or can't be one so don't bother tracking + return 0; + } + + OSAddAtomic(num_blocks, &hfsmp->hfs_hotfile_blk_adjust); + + return (0); +} + +// +// Assumes hfsmp->hfc_mutex is LOCKED +// +static int +hfs_hotfile_cur_freeblks(hfsmount_t *hfsmp) +{ + if (hfsmp->hfc_stage < HFC_IDLE) { + return 0; + } + + int cur_blk_adjust = hfsmp->hfs_hotfile_blk_adjust; // snap a copy of this value + + if (cur_blk_adjust) { + OSAddAtomic(-cur_blk_adjust, &hfsmp->hfs_hotfile_blk_adjust); + hfsmp->hfs_hotfile_freeblks += cur_blk_adjust; + } + + return hfsmp->hfs_hotfile_freeblks; +} + + +/* + *======================================================================== + * HOT FILE MAINTENANCE ROUTINES + *======================================================================== + */ + +static int +hotfiles_collect_callback(struct vnode *vp, __unused void *cargs) +{ + if ((vnode_isreg(vp)) && !vnode_issystem(vp)) + (void) hfs_addhotfile_internal(vp); + + return (VNODE_RETURNED); +} + +/* + * Add all active hot files to the recording list. 
+ */ +static int +hotfiles_collect(struct hfsmount *hfsmp) +{ + struct mount *mp = HFSTOVFS(hfsmp); + + if (vfs_busy(mp, LK_NOWAIT)) + return (0); + + /* + * hotfiles_collect_callback will be called for each vnode + * hung off of this mount point + * the vnode will be + * properly referenced and unreferenced around the callback + */ + vnode_iterate(mp, 0, hotfiles_collect_callback, (void *)NULL); + + vfs_unbusy(mp); + + return (0); +} + + +/* + * Update the data of a btree record + * This is called from within BTUpdateRecord. + */ +static int +update_callback(const HotFileKey *key, u_int32_t *data, u_int32_t *state) +{ + if (key->temperature == HFC_LOOKUPTAG) + *data = *state; + return (0); +} + +/* + * Identify files already in hot area. + */ +static int +hotfiles_refine(struct hfsmount *hfsmp) +{ + BTreeIterator * iterator = NULL; + struct mount *mp; + filefork_t * filefork; + hotfilelist_t *listp; + FSBufferDescriptor record; + HotFileKey * key; + u_int32_t data; + int i; + int error = 0; + + if ((listp = hfsmp->hfc_filelist) == NULL) + return (0); + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + // on ssd's we don't refine the temperature since the + // replacement algorithm is simply random + return 0; + } + + mp = HFSTOVFS(hfsmp); + + iterator = hfs_mallocz(sizeof(*iterator)); + + key = (HotFileKey*) &iterator->key; + + record.bufferAddress = &data; + record.itemSize = sizeof(u_int32_t); + record.itemCount = 1; + + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + error = EPERM; + goto out1; + } + filefork = VTOF(hfsmp->hfc_filevp); + + for (i = 0; i < listp->hfl_count; ++i) { + /* + * Check if entry (thread) is already in hot area. + */ + key->keyLength = HFC_KEYLENGTH; + key->temperature = HFC_LOOKUPTAG; + key->fileID = listp->hfl_hotfile[i].hf_fileid; + key->forkType = 0; + (void) BTInvalidateHint(iterator); + if (BTSearchRecord(filefork, iterator, &record, NULL, iterator) != 0) { + continue; /* not in hot area, so skip */ + } + + /* + * Update thread entry with latest temperature. + */ + error = BTUpdateRecord(filefork, iterator, + (IterateCallBackProcPtr)update_callback, + &listp->hfl_hotfile[i].hf_temperature); + if (error) { + printf("hfs: hotfiles_refine: BTUpdateRecord failed %d (file %d)\n", error, key->fileID); + error = MacToVFSError(error); + // break; + } + /* + * Re-key entry with latest temperature. + */ + key->keyLength = HFC_KEYLENGTH; + key->temperature = data; + key->fileID = listp->hfl_hotfile[i].hf_fileid; + key->forkType = 0; + /* Pick up record data. */ + (void) BTInvalidateHint(iterator); + (void) BTSearchRecord(filefork, iterator, &record, NULL, iterator); + error = BTDeleteRecord(filefork, iterator); + if (error) { + printf("hfs: hotfiles_refine: BTDeleteRecord failed %d (file %d)\n", error, key->fileID); + error = MacToVFSError(error); + break; + } + key->keyLength = HFC_KEYLENGTH; + key->temperature = listp->hfl_hotfile[i].hf_temperature; + key->fileID = listp->hfl_hotfile[i].hf_fileid; + key->forkType = 0; + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); + if (error) { + printf("hfs: hotfiles_refine: BTInsertRecord failed %d (file %d)\n", error, key->fileID); + error = MacToVFSError(error); + break; + } + /* + * Invalidate this entry in the list. 
+ */ + listp->hfl_hotfile[i].hf_temperature = 0; + listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; + + } /* end for */ + + (void) BTFlushPath(filefork); + hfs_unlock(VTOC(hfsmp->hfc_filevp)); + +out1: + hfs_end_transaction(hfsmp); +out: + if (iterator) + hfs_free(iterator, sizeof(*iterator)); + return (error); +} + +/* + * Move new hot files into hot area. + * + * Requires that the hfc_mutex be held. + */ +static int +hotfiles_adopt(struct hfsmount *hfsmp) +{ + BTreeIterator * iterator = NULL; + struct vnode *vp; + filefork_t * filefork; + hotfilelist_t *listp; + FSBufferDescriptor record; + HotFileKey * key; + u_int32_t data; + enum hfc_stage stage; + int fileblocks; + int blksmoved; + int i; + int last; + int error = 0; + int startedtrans = 0; + // + // all files in a given adoption phase have a temperature + // that starts at a random value and then increases linearly. + // the idea is that during eviction, files that were adopted + // together will be evicted together + // + long starting_temp = random() % HF_TEMP_RANGE; + long temp_adjust = 0; + + if ((listp = hfsmp->hfc_filelist) == NULL) + return (0); + + if (hfsmp->hfc_stage != HFC_ADOPTION) { + return (EBUSY); + } + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + return (EPERM); + } + + iterator = hfs_mallocz(sizeof(*iterator)); + +#if HFC_VERBOSE + printf("hfs:%s: hotfiles_adopt: (hfl_next: %d, hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n", + hfsmp->vcbVN, + listp->hfl_next, + hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end, + hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles); +#endif + + stage = hfsmp->hfc_stage; + hfsmp->hfc_stage = HFC_BUSY; + + blksmoved = 0; + last = listp->hfl_next + HFC_FILESPERSYNC; + if (last > listp->hfl_count) + last = listp->hfl_count; + + key = (HotFileKey*) &iterator->key; + key->keyLength = HFC_KEYLENGTH; + + record.bufferAddress = &data; + record.itemSize = sizeof(u_int32_t); + record.itemCount = 1; + + filefork = VTOF(hfsmp->hfc_filevp); + + for (i = listp->hfl_next; (i < last) && (blksmoved < HFC_BLKSPERSYNC); ++i) { + /* + * Skip entries that aren't going to work. + */ + if (listp->hfl_hotfile[i].hf_temperature == 0) { + //printf("hfs: zero temp on file-id %d\n", listp->hfl_hotfile[i].hf_fileid); + listp->hfl_next++; + continue; + } + if (listp->hfl_hotfile[i].hf_fileid == VTOC(hfsmp->hfc_filevp)->c_fileid) { + //printf("hfs: cannot adopt the hotfile b-tree itself! (file-id %d)\n", listp->hfl_hotfile[i].hf_fileid); + listp->hfl_next++; + continue; + } + if (listp->hfl_hotfile[i].hf_fileid < kHFSFirstUserCatalogNodeID) { + //printf("hfs: cannot adopt system files (file-id %d)\n", listp->hfl_hotfile[i].hf_fileid); + listp->hfl_next++; + continue; + } + + /* + * Acquire a vnode for this file. + */ + error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0, 0); + if (error) { + //printf("failed to get fileid %d (err %d)\n", listp->hfl_hotfile[i].hf_fileid, error); + if (error == ENOENT) { + error = 0; + listp->hfl_next++; + continue; /* stale entry, go to next */ + } + break; + } + + //printf("hfs: examining hotfile entry w/fileid %d, temp %d, blocks %d (HotFileCached: %s)\n", + // listp->hfl_hotfile[i].hf_fileid, listp->hfl_hotfile[i].hf_temperature, + // listp->hfl_hotfile[i].hf_blocks, + // (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask) ? "YES" : "NO"); + + if (!vnode_isreg(vp)) { + /* Symlinks are ineligible for adoption into the hotfile zone. 
*/ + //printf("hfs: hotfiles_adopt: huh, not a file %d (%d)\n", listp->hfl_hotfile[i].hf_fileid, VTOC(vp)->c_cnid); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + listp->hfl_hotfile[i].hf_temperature = 0; + listp->hfl_next++; + continue; /* stale entry, go to next */ + } + if ( (VTOC(vp)->c_flag & (C_DELETED | C_NOEXISTS)) + || (!(hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && hotextents(hfsmp, &VTOF(vp)->ff_extents[0])) + || (VTOC(vp)->c_attr.ca_recflags & (kHFSFastDevPinnedMask|kHFSDoNotFastDevPinMask))) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); + listp->hfl_hotfile[i].hf_temperature = 0; + listp->hfl_next++; + listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; + continue; /* stale entry, go to next */ + } + + fileblocks = VTOF(vp)->ff_blocks; + + // + // for CF, if the file is empty (and not compressed) or it is too large, + // do not try to pin it. (note: if fileblocks == 0 but the file is marked + // as compressed, we may still be able to cache it). + // + if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && + ((fileblocks == 0 && !(VTOC(vp)->c_bsdflags & UF_COMPRESSED)) || + (unsigned int)fileblocks > (HFC_MAXIMUM_FILESIZE / (uint64_t)HFSTOVCB(hfsmp)->blockSize))) { + // don't try to cache something too large or that's zero-bytes + + vnode_clearfastdevicecandidate(vp); // turn off the fast-dev-candidate flag so we don't keep trying to cache it. + + hfs_unlock(VTOC(vp)); + vnode_put(vp); + listp->hfl_hotfile[i].hf_temperature = 0; + listp->hfl_next++; + listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; + continue; /* entry is too big, just carry on with the next guy */ + } + + // + // If a file is not an autocandidate (i.e. it's a user-tagged file desirous of + // being hotfile cached) but it is already bigger than 4 megs, don't bother + // hotfile caching it. Note that if a user tagged file starts small, gets + // adopted and then grows over time we will allow it to grow bigger than 4 megs + // which is intentional for things like the Mail or Photos database files which + // grow slowly over time and benefit from being on the FastDevice. + // + if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && + !(VTOC(vp)->c_attr.ca_recflags & kHFSAutoCandidateMask) && + (VTOC(vp)->c_attr.ca_recflags & kHFSFastDevCandidateMask) && + (unsigned int)fileblocks > ((4*1024*1024) / (uint64_t)HFSTOVCB(hfsmp)->blockSize)) { + + vnode_clearfastdevicecandidate(vp); // turn off the fast-dev-candidate flag so we don't keep trying to cache it. + + hfs_unlock(VTOC(vp)); + vnode_put(vp); + listp->hfl_hotfile[i].hf_temperature = 0; + listp->hfl_next++; + listp->hfl_totalblocks -= listp->hfl_hotfile[i].hf_blocks; + continue; /* entry is too big, just carry on with the next guy */ + } + + if (fileblocks > hfs_hotfile_cur_freeblks(hfsmp)) { + // + // No room for this file. Although eviction should have made space + // it's best that we check here as well since writes to existing + // hotfiles may have eaten up space since we performed eviction + // + hfs_unlock(VTOC(vp)); + vnode_put(vp); + listp->hfl_next++; + listp->hfl_totalblocks -= fileblocks; + continue; /* entry too big, go to next */ + } + + if ((blksmoved > 0) && + (blksmoved + fileblocks) > HFC_BLKSPERSYNC) { + // + // we've done enough work, let's be nice to the system and + // stop until the next iteration + // + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; /* adopt this entry the next time around */ + } + + // + // The size of data for a hot file record is 4 bytes. The data + // stored in hot file record is not really meaningful. 
However + // to aid debugging, we store first four bytes of the file name + // or the ASCII text "????" + // + if (VTOC(vp)->c_desc.cd_nameptr && (VTOC(vp)->c_desc.cd_namelen > 0)) { + size_t max_len; + + max_len = sizeof(u_int32_t); + if (max_len > (unsigned)VTOC(vp)->c_desc.cd_namelen) + max_len = VTOC(vp)->c_desc.cd_namelen; + + memcpy(&data, VTOC(vp)->c_desc.cd_nameptr, max_len); + } else + data = 0x3f3f3f3f; + + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + // + // For CF we pin the blocks belonging to the file + // to the "fast" (aka ssd) media + // + uint32_t pinned_blocks; + + if (vnode_isautocandidate(vp)) { + VTOC(vp)->c_attr.ca_recflags |= kHFSAutoCandidateMask; + } + if (VTOC(vp)->c_attr.ca_recflags & kHFSAutoCandidateMask) { + // + // this moves auto-cached files to the higher tier + // of "temperatures" which means they are less likely + // to get evicted (user selected hotfiles will get + // evicted first in the theory that they change more + // frequently compared to system files) + // + temp_adjust = MAX_NORMAL_TEMP; + } else { + temp_adjust = 0; + } + + hfs_unlock(VTOC(vp)); // don't need an exclusive lock for this + hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + error = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, &pinned_blocks); + + fileblocks = pinned_blocks; + + // go back to an exclusive lock since we're going to modify the cnode again + hfs_unlock(VTOC(vp)); + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + } else { + // + // Old style hotfiles moves the data to the center (aka "hot") + // region of the disk + // + error = hfs_relocate(vp, hfsmp->hfs_hotfile_start, kauth_cred_get(), current_proc()); + } + + if (!error) { + VTOC(vp)->c_attr.ca_recflags |= kHFSFastDevPinnedMask; + VTOC(vp)->c_flag |= C_MODIFIED; + } else if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && error == EALREADY) { + // + // If hfs_pin_vnode() returned EALREADY then this file is not + // ever able to be hotfile cached the normal way. This can + // happen with compressed files which have their data stored + // in an extended attribute. We flag them so that we won't + // bother to try and hotfile cache them again the next time + // they're read. + // + VTOC(vp)->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask; + VTOC(vp)->c_flag |= C_MODIFIED; + } + + hfs_unlock(VTOC(vp)); + vnode_put(vp); + if (error) { +#if HFC_VERBOSE + if (error != EALREADY) { + printf("hfs: hotfiles_adopt: could not relocate file %d (err %d)\n", listp->hfl_hotfile[i].hf_fileid, error); + } +#endif + + if (last < listp->hfl_count) { + last++; + } + /* Move on to next item. */ + listp->hfl_next++; + continue; + } + /* Keep hot file free space current. */ + hfsmp->hfs_hotfile_freeblks -= fileblocks; + listp->hfl_totalblocks -= fileblocks; + + /* Insert hot file entry */ + key->keyLength = HFC_KEYLENGTH; + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + // + // The "temperature" for a CF hotfile is simply a random + // number that we sequentially increment for each file in + // the set of files we're currently adopting. This has the + // nice property that all of the files we pin to the ssd + // in the current phase will sort together in the hotfile + // btree. When eviction time comes we will evict them + // together as well. This gives the eviction phase temporal + // locality - things written together get evicted together + // which is what ssd's like. 
+ // + listp->hfl_hotfile[i].hf_temperature = (uint32_t)temp_adjust + starting_temp++; + } + + key->temperature = listp->hfl_hotfile[i].hf_temperature; + key->fileID = listp->hfl_hotfile[i].hf_fileid; + key->forkType = 0; + + /* Start a new transaction before calling BTree code. */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + startedtrans = 1; + + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); + if (error) { + int orig_error = error; + error = MacToVFSError(error); + printf("hfs: hotfiles_adopt:1: BTInsertRecord failed %d/%d (fileid %d)\n", error, orig_error, key->fileID); + stage = HFC_IDLE; + break; + } + + /* Insert thread record */ + key->keyLength = HFC_KEYLENGTH; + key->temperature = HFC_LOOKUPTAG; + key->fileID = listp->hfl_hotfile[i].hf_fileid; + key->forkType = 0; + data = listp->hfl_hotfile[i].hf_temperature; + error = BTInsertRecord(filefork, iterator, &record, record.itemSize); + if (error) { + int orig_error = error; + error = MacToVFSError(error); + printf("hfs: hotfiles_adopt:2: BTInsertRecord failed %d/%d (fileid %d)\n", error, orig_error, key->fileID); + stage = HFC_IDLE; + break; + } else { + (void) BTFlushPath(filefork); + blksmoved += fileblocks; + } + + listp->hfl_next++; + if (listp->hfl_next >= listp->hfl_count) { + break; + } + + /* Transaction complete. */ + if (startedtrans) { + hfs_end_transaction(hfsmp); + startedtrans = 0; + } + + if (hfs_hotfile_cur_freeblks(hfsmp) <= 0) { +#if HFC_VERBOSE + printf("hfs: hotfiles_adopt: free space exhausted (%d)\n", hfsmp->hfs_hotfile_freeblks); +#endif + break; + } + } /* end for */ + +#if HFC_VERBOSE + printf("hfs: hotfiles_adopt: [%d] adopted %d blocks (%d files left)\n", listp->hfl_next, blksmoved, listp->hfl_count - i); +#endif + if (!startedtrans) { + // start a txn so we'll save the btree summary info + if (hfs_start_transaction(hfsmp) == 0) { + startedtrans = 1; + } + } + + /* Finish any outstanding transactions. */ + if (startedtrans) { + save_btree_user_info(hfsmp); + + (void) BTFlushPath(filefork); + hfs_end_transaction(hfsmp); + startedtrans = 0; + } + hfs_unlock(VTOC(hfsmp->hfc_filevp)); + + if ((listp->hfl_next >= listp->hfl_count) || (hfsmp->hfs_hotfile_freeblks <= 0)) { +#if HFC_VERBOSE + printf("hfs: hotfiles_adopt: all done relocating %d files\n", listp->hfl_count); + printf("hfs: hotfiles_adopt: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks); +#endif + stage = HFC_IDLE; + } + hfs_free(iterator, sizeof(*iterator)); + + if (stage != HFC_ADOPTION && hfsmp->hfc_filevp) { + (void) hfc_btree_close(hfsmp, hfsmp->hfc_filevp); + hfsmp->hfc_filevp = NULL; + } + hfsmp->hfc_stage = stage; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (error); +} + +/* + * Reclaim space by evicting the coldest files. + * + * Requires that the hfc_mutex be held. 
+ */ +static int +hotfiles_evict(struct hfsmount *hfsmp, vfs_context_t ctx) +{ + BTreeIterator * iterator = NULL; + struct vnode *vp; + HotFileKey * key; + filefork_t * filefork; + hotfilelist_t *listp; + enum hfc_stage stage; + u_int32_t savedtemp; + int blksmoved; + int filesmoved; + int fileblocks; + int error = 0; + int startedtrans = 0; + int bt_op; + + if (hfsmp->hfc_stage != HFC_EVICTION) { + return (EBUSY); + } + + if ((listp = hfsmp->hfc_filelist) == NULL) + return (0); + + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + return (EPERM); + } + +#if HFC_VERBOSE + printf("hfs:%s: hotfiles_evict (hotfile start/end block: %d - %d; max/free: %d/%d; maxfiles: %d)\n", + hfsmp->vcbVN, + hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end, + hfsmp->hfs_hotfile_maxblks, hfsmp->hfs_hotfile_freeblks, hfsmp->hfc_maxfiles); +#endif + + iterator = hfs_mallocz(sizeof(*iterator)); + + stage = hfsmp->hfc_stage; + hfsmp->hfc_stage = HFC_BUSY; + + filesmoved = blksmoved = 0; + bt_op = kBTreeFirstRecord; + + key = (HotFileKey*) &iterator->key; + + filefork = VTOF(hfsmp->hfc_filevp); + +#if HFC_VERBOSE + printf("hfs: hotfiles_evict: reclaim blks %d\n", listp->hfl_reclaimblks); +#endif + + while (listp->hfl_reclaimblks > 0 && + blksmoved < HFC_BLKSPERSYNC && + filesmoved < HFC_FILESPERSYNC) { + + /* + * Obtain the first record (ie the coldest one). + */ + if (BTIterateRecord(filefork, bt_op, iterator, NULL, NULL) != 0) { +#if HFC_VERBOSE + printf("hfs: hotfiles_evict: no more records\n"); +#endif + error = 0; + stage = HFC_ADOPTION; + break; + } + if (key->keyLength != HFC_KEYLENGTH) { + printf("hfs: hotfiles_evict: invalid key length %d\n", key->keyLength); + error = EFTYPE; + break; + } + if (key->temperature == HFC_LOOKUPTAG) { +#if HFC_VERBOSE + printf("hfs: hotfiles_evict: ran into thread records\n"); +#endif + error = 0; + stage = HFC_ADOPTION; + break; + } + + // Jump straight to delete for some files... + if (key->fileID == VTOC(hfsmp->hfc_filevp)->c_fileid + || key->fileID == hfsmp->hfs_jnlfileid + || key->fileID == hfsmp->hfs_jnlinfoblkid + || key->fileID < kHFSFirstUserCatalogNodeID) { + goto delete; + } + + /* + * Aquire the vnode for this file. + */ + error = hfs_vget(hfsmp, key->fileID, &vp, 0, 0); + if (error) { + if (error == ENOENT) { + goto delete; /* stale entry, go to next */ + } else { + printf("hfs: hotfiles_evict: err %d getting file %d\n", + error, key->fileID); + } + break; + } + + /* + * Symlinks that may have been inserted into the hotfile zone during a previous OS are now stuck + * here. We do not want to move them. + */ + if (!vnode_isreg(vp)) { + //printf("hfs: hotfiles_evict: huh, not a file %d\n", key->fileID); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + goto delete; /* invalid entry, go to next */ + } + + fileblocks = VTOF(vp)->ff_blocks; + if ((blksmoved > 0) && + (blksmoved + fileblocks) > HFC_BLKSPERSYNC) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; + } + /* + * Make sure file is in the hot area. + */ + if (!hotextents(hfsmp, &VTOF(vp)->ff_extents[0]) && !(VTOC(vp)->c_attr.ca_recflags & kHFSFastDevPinnedMask)) { +#if HFC_VERBOSE + printf("hfs: hotfiles_evict: file %d isn't hot!\n", key->fileID); +#endif + hfs_unlock(VTOC(vp)); + vnode_put(vp); + goto delete; /* stale entry, go to next */ + } + + /* + * Relocate file out of hot area. On cooperative fusion (CF) that just + * means un-pinning the data from the ssd. For traditional hotfiles that means moving + * the file data out of the hot region of the disk. 
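 *
 * [Editor's note, illustrative only; not part of the original change:
 *  the accounting further below credits whatever was actually released,
 *  e.g. if hfl_reclaimblks is 300 and a 512-block file is evicted,
 *  hfs_hotfile_freeblks grows by 512, hfl_reclaimblks clamps to 0, and
 *  the enclosing while-loop then ends this eviction pass so the stage
 *  can advance to HFC_ADOPTION.]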
+ */ + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + uint32_t pinned_blocks; + + hfs_unlock(VTOC(vp)); // don't need an exclusive lock for this + hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + error = hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &pinned_blocks); + fileblocks = pinned_blocks; + + if (!error) { + // go back to an exclusive lock since we're going to modify the cnode again + hfs_unlock(VTOC(vp)); + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + } + } else { + error = hfs_relocate(vp, HFSTOVCB(hfsmp)->nextAllocation, vfs_context_ucred(ctx), vfs_context_proc(ctx)); + } + if (error) { +#if HFC_VERBOSE + printf("hfs: hotfiles_evict: err %d relocating file %d\n", error, key->fileID); +#endif + hfs_unlock(VTOC(vp)); + vnode_put(vp); + bt_op = kBTreeNextRecord; + goto next; /* go to next */ + } else { + VTOC(vp)->c_attr.ca_recflags &= ~kHFSFastDevPinnedMask; + VTOC(vp)->c_flag |= C_MODIFIED; + } + + // + // We do not believe that this call to hfs_fsync() is + // necessary and it causes a journal transaction + // deadlock so we are removing it. + // + // (void) hfs_fsync(vp, MNT_WAIT, 0, p); + + hfs_unlock(VTOC(vp)); + vnode_put(vp); + + hfsmp->hfs_hotfile_freeblks += fileblocks; + listp->hfl_reclaimblks -= fileblocks; + if (listp->hfl_reclaimblks < 0) + listp->hfl_reclaimblks = 0; + blksmoved += fileblocks; + filesmoved++; +delete: + /* Start a new transaction before calling BTree code. */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + startedtrans = 1; + + error = BTDeleteRecord(filefork, iterator); + if (error) { + error = MacToVFSError(error); + break; + } + savedtemp = key->temperature; + key->temperature = HFC_LOOKUPTAG; + error = BTDeleteRecord(filefork, iterator); + if (error) { + error = MacToVFSError(error); + break; + } + key->temperature = savedtemp; +next: + (void) BTFlushPath(filefork); + + /* Transaction complete. */ + if (startedtrans) { + hfs_end_transaction(hfsmp); + startedtrans = 0; + } + + } /* end while */ + +#if HFC_VERBOSE + printf("hfs: hotfiles_evict: moved %d files (%d blks, %d to go)\n", filesmoved, blksmoved, listp->hfl_reclaimblks); +#endif + /* Finish any outstanding transactions. */ + if (startedtrans) { + save_btree_user_info(hfsmp); + + (void) BTFlushPath(filefork); + hfs_end_transaction(hfsmp); + startedtrans = 0; + } + hfs_unlock(VTOC(hfsmp->hfc_filevp)); + + /* + * Move to next stage when finished. + */ + if (listp->hfl_reclaimblks <= 0) { + stage = HFC_ADOPTION; +#if HFC_VERBOSE + printf("hfs: hotfiles_evict: %d blocks free in hot file band\n", hfsmp->hfs_hotfile_freeblks); +#endif + } + hfs_free(iterator, sizeof(*iterator)); + hfsmp->hfc_stage = stage; + wakeup((caddr_t)&hfsmp->hfc_stage); + return (error); +} + +/* + * Age the existing records in the hot files b-tree. 
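 *
 * [Editor's note, illustrative only; not part of the original change:
 *  the loop below rewrites each key with newtemp = MAX(temperature >> 1, 4)
 *  and only visits the first half of the leaf records, so one pass over
 *  temperatures 128, 37, 6, 4 would leave them at 64, 18, 4, 4; the
 *  matching thread record is then updated with the same aged value.]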
+ */ +static int +hotfiles_age(struct hfsmount *hfsmp) +{ + BTreeInfoRec btinfo; + BTreeIterator * iterator = NULL; + BTreeIterator * prev_iterator; + FSBufferDescriptor record; + FSBufferDescriptor prev_record; + HotFileKey * key; + HotFileKey * prev_key; + filefork_t * filefork; + u_int32_t data; + u_int32_t prev_data; + u_int32_t newtemp; + int error; + int i; + int numrecs; + int aged = 0; + u_int16_t reclen; + + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + // + // hotfiles don't age on CF + // + return 0; + } + + iterator = hfs_mallocz(2 * sizeof(*iterator)); + + key = (HotFileKey*) &iterator->key; + + prev_iterator = &iterator[1]; + prev_key = (HotFileKey*) &prev_iterator->key; + + record.bufferAddress = &data; + record.itemSize = sizeof(data); + record.itemCount = 1; + prev_record.bufferAddress = &prev_data; + prev_record.itemSize = sizeof(prev_data); + prev_record.itemCount = 1; + + /* + * Capture b-tree changes inside a transaction + */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out2; + } + if (hfs_lock(VTOC(hfsmp->hfc_filevp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + error = EPERM; + goto out1; + } + filefork = VTOF(hfsmp->hfc_filevp); + + error = BTGetInformation(filefork, 0, &btinfo); + if (error) { + error = MacToVFSError(error); + goto out; + } + if (btinfo.numRecords < 2) { + error = 0; + goto out; + } + + /* Only want 1st half of leaf records */ + numrecs = (btinfo.numRecords /= 2) - 1; + + error = BTIterateRecord(filefork, kBTreeFirstRecord, iterator, &record, &reclen); + if (error) { + printf("hfs_agehotfiles: BTIterateRecord: %d\n", error); + error = MacToVFSError(error); + goto out; + } + bcopy(iterator, prev_iterator, sizeof(BTreeIterator)); + prev_data = data; + + for (i = 0; i < numrecs; ++i) { + error = BTIterateRecord(filefork, kBTreeNextRecord, iterator, &record, &reclen); + if (error == 0) { + if (key->temperature < prev_key->temperature) { + printf("hfs_agehotfiles: out of order keys!\n"); + error = EFTYPE; + break; + } + if (reclen != sizeof(data)) { + printf("hfs_agehotfiles: invalid record length %d\n", reclen); + error = EFTYPE; + break; + } + if (key->keyLength != HFC_KEYLENGTH) { + printf("hfs_agehotfiles: invalid key length %d\n", key->keyLength); + error = EFTYPE; + break; + } + } else if ((error == fsBTEndOfIterationErr || error == fsBTRecordNotFoundErr) && + (i == (numrecs - 1))) { + error = 0; + } else if (error) { + printf("hfs_agehotfiles: %d of %d BTIterateRecord: %d\n", i, numrecs, error); + error = MacToVFSError(error); + break; + } + if (prev_key->temperature == HFC_LOOKUPTAG) { +#if HFC_VERBOSE + printf("hfs_agehotfiles: ran into thread record\n"); +#endif + error = 0; + break; + } + error = BTDeleteRecord(filefork, prev_iterator); + if (error) { + printf("hfs_agehotfiles: BTDeleteRecord failed %d (file %d)\n", error, prev_key->fileID); + error = MacToVFSError(error); + break; + } + + /* Age by halving the temperature (floor = 4) */ + newtemp = MAX(prev_key->temperature >> 1, 4); + prev_key->temperature = newtemp; + + error = BTInsertRecord(filefork, prev_iterator, &prev_record, prev_record.itemSize); + if (error) { + printf("hfs_agehotfiles: BTInsertRecord failed %d (file %d)\n", error, prev_key->fileID); + error = MacToVFSError(error); + break; + } + ++aged; + /* + * Update thread entry with latest temperature. 
+ */ + prev_key->temperature = HFC_LOOKUPTAG; + error = BTUpdateRecord(filefork, prev_iterator, + (IterateCallBackProcPtr)update_callback, + &newtemp); + if (error) { + printf("hfs_agehotfiles: %d of %d BTUpdateRecord failed %d (file %d, %d)\n", + i, numrecs, error, prev_key->fileID, newtemp); + error = MacToVFSError(error); + // break; + } + + bcopy(iterator, prev_iterator, sizeof(BTreeIterator)); + prev_data = data; + + } /* end for */ + +#if HFC_VERBOSE + if (error == 0) + printf("hfs_agehotfiles: aged %d records out of %d\n", aged, btinfo.numRecords); +#endif + (void) BTFlushPath(filefork); +out: + hfs_unlock(VTOC(hfsmp->hfc_filevp)); +out1: + hfs_end_transaction(hfsmp); +out2: + if (iterator) + hfs_free(iterator, 2 * sizeof(*iterator)); + return (error); +} + +/* + * Return true if any blocks (or all blocks if all is true) + * are contained in the hot file region. + */ +static int +hotextents(struct hfsmount *hfsmp, HFSPlusExtentDescriptor * extents) +{ + u_int32_t b1, b2; + int i; + int inside = 0; + + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + b1 = extents[i].startBlock; + if (b1 == 0) + break; + b2 = b1 + extents[i].blockCount - 1; + if ((b1 >= hfsmp->hfs_hotfile_start && + b2 <= hfsmp->hfs_hotfile_end) || + (b1 < hfsmp->hfs_hotfile_end && + b2 > hfsmp->hfs_hotfile_end)) { + inside = 1; + break; + } + } + return (inside); +} + + +/* + *======================================================================== + * HOT FILE B-TREE ROUTINES + *======================================================================== + */ + +/* + * Open the hot files b-tree for writing. + * + * On successful exit the vnode has a reference but not an iocount. + */ +static int +hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp) +{ + return hfc_btree_open_ext(hfsmp, vpp, 0); +} + +static int +hfc_btree_open_ext(struct hfsmount *hfsmp, struct vnode **vpp, int ignore_btree_errs) +{ + proc_t p; + struct vnode *vp; + struct cat_desc cdesc; + struct cat_attr cattr; + struct cat_fork cfork; + static char filename[] = HFC_FILENAME; + int error; + int retry = 0; + int lockflags; + int newvnode_flags = 0; + + *vpp = NULL; + p = current_proc(); + + bzero(&cdesc, sizeof(cdesc)); + cdesc.cd_parentcnid = kRootDirID; + cdesc.cd_nameptr = (const u_int8_t *)filename; + cdesc.cd_namelen = strlen(filename); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + error = cat_lookup(hfsmp, &cdesc, 0, 0, &cdesc, &cattr, &cfork, NULL); + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { + printf("hfs: hfc_btree_open: cat_lookup error %d\n", error); + return (error); + } +again: + cdesc.cd_flags |= CD_ISMETA; + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr, + &cfork, &vp, &newvnode_flags); + if (error) { + printf("hfs: hfc_btree_open: hfs_getnewvnode error %d\n", error); + cat_releasedesc(&cdesc); + return (error); + } + if (!vnode_issystem(vp)) { +#if HFC_VERBOSE + printf("hfs: hfc_btree_open: file has UBC, try again\n"); +#endif + hfs_unlock(VTOC(vp)); + vnode_recycle(vp); + vnode_put(vp); + if (retry++ == 0) + goto again; + else + return (EBUSY); + } + + /* Open the B-tree file for writing... 
*/ + error = BTOpenPath(VTOF(vp), (KeyCompareProcPtr) hfc_comparekeys); + if (error) { + if (!ignore_btree_errs) { + printf("hfs: hfc_btree_open: BTOpenPath error %d; filesize %lld\n", error, VTOF(vp)->ff_size); + error = MacToVFSError(error); + } else { + error = 0; + } + } + + hfs_unlock(VTOC(vp)); + if (error == 0) { + *vpp = vp; + vnode_ref(vp); /* keep a reference while its open */ + } + vnode_put(vp); + + if (!vnode_issystem(vp)) + panic("hfs: hfc_btree_open: not a system file (vp = %p)", vp); + + HotFilesInfo hotfileinfo; + + if (error == 0 && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { + if ((BTGetUserData(VTOF(vp), &hotfileinfo, sizeof(hotfileinfo)) == 0) && (SWAP_BE32 (hotfileinfo.magic) == HFC_MAGIC)) { + if (hfsmp->hfs_hotfile_freeblks == 0) { + hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks - SWAP_BE32 (hotfileinfo.usedblocks); + } + + hfs_hotfile_cur_freeblks(hfsmp); // factors in any adjustments that happened at run-time + } + } + + return (error); +} + +/* + * Close the hot files b-tree. + * + * On entry the vnode has a reference. + */ +static int +hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp) +{ + proc_t p = current_proc(); + int error = 0; + + + if (hfsmp->jnl) { + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); + } + + if (vnode_get(vp) == 0) { + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + (void) hfs_fsync(vp, MNT_WAIT, 0, p); + error = BTClosePath(VTOF(vp)); + hfs_unlock(VTOC(vp)); + } + vnode_rele(vp); + vnode_recycle(vp); + vnode_put(vp); + } + + return (error); +} + +// +// Assumes that hfsmp->hfc_filevp points to the hotfile btree vnode +// (i.e. you called hfc_btree_open() ahead of time) +// +static int +hfc_btree_delete_record(struct hfsmount *hfsmp, BTreeIterator *iterator, HotFileKey *key) +{ + int error; + filefork_t *filefork=VTOF(hfsmp->hfc_filevp); + + /* Start a new transaction before calling BTree code. */ + if (hfs_start_transaction(hfsmp) != 0) { + return EINVAL; + } + + error = BTDeleteRecord(filefork, iterator); + if (error) { + error = MacToVFSError(error); + printf("hfs: failed to delete record for file-id %d : err %d\n", key->fileID, error); + goto out; + } + + int savedtemp; + savedtemp = key->temperature; + key->temperature = HFC_LOOKUPTAG; + error = BTDeleteRecord(filefork, iterator); + if (error) { + error = MacToVFSError(error); + printf("hfs:2: failed to delete record for file-id %d : err %d\n", key->fileID, error); + } + key->temperature = savedtemp; + + (void) BTFlushPath(filefork); + +out: + /* Transaction complete. */ + hfs_end_transaction(hfsmp); + + return error; +} + +// +// You have to have already opened the hotfile btree so +// that hfsmp->hfc_filevp is filled in. 
+// +static int +hfc_btree_delete(struct hfsmount *hfsmp) +{ + struct vnode *dvp = NULL; + vfs_context_t ctx = vfs_context_current(); + struct vnode_attr va; + static char filename[] = HFC_FILENAME; + int error; + + error = hfs_vfs_root(HFSTOVFS(hfsmp), &dvp, ctx); + if (error) { + return (error); + } + + struct componentname cname = { + .cn_nameiop = DELETE, + .cn_flags = ISLASTCN, + .cn_pnbuf = filename, + .cn_pnlen = sizeof(filename), + .cn_nameptr = filename, + .cn_namelen = strlen(filename), + }; + + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VREG); + VATTR_SET(&va, va_mode, S_IFREG | S_IRUSR | S_IWUSR); + VATTR_SET(&va, va_uid, 0); + VATTR_SET(&va, va_gid, 0); + + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + + struct vnop_remove_args ap = { + .a_dvp = dvp, + .a_vp = hfsmp->hfc_filevp, + .a_cnp = &cname, + }; + + error = hfs_vnop_remove(&ap); + if (error) { + printf("hfs: error %d removing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); + } + + hfs_end_transaction(hfsmp); + +out: + if (dvp) { + vnode_put(dvp); + dvp = NULL; + } + + return 0; +} + + + + +/* + * Create a hot files btree file. + * + */ +static int +hfc_btree_create(struct hfsmount *hfsmp, unsigned int nodesize, unsigned int entries) +{ + struct vnode *dvp = NULL; + struct vnode *vp = NULL; + struct cnode *cp = NULL; + vfs_context_t ctx = vfs_context_current(); + struct vnode_attr va; + static char filename[] = HFC_FILENAME; + int error; + + if (hfsmp->hfc_filevp) + panic("hfs: hfc_btree_create: hfc_filevp exists (vp = %p)", hfsmp->hfc_filevp); + + error = hfs_vfs_root(HFSTOVFS(hfsmp), &dvp, ctx); + if (error) { + return (error); + } + + struct componentname cname = { + .cn_nameiop = CREATE, + .cn_flags = ISLASTCN, + .cn_pnbuf = filename, + .cn_pnlen = sizeof(filename), + .cn_nameptr = filename, + .cn_namelen = strlen(filename) + }; + + VATTR_INIT(&va); + VATTR_SET(&va, va_type, VREG); + VATTR_SET(&va, va_mode, S_IFREG | S_IRUSR | S_IWUSR); + VATTR_SET(&va, va_uid, 0); + VATTR_SET(&va, va_gid, 0); + + if (hfs_start_transaction(hfsmp) != 0) { + vnode_put(dvp); + return EINVAL; + } + + /* call ourselves directly, ignore the higher-level VFS file creation code */ + + struct vnop_create_args ap = { + .a_dvp = dvp, + .a_vpp = &vp, + .a_cnp = &cname, + .a_vap = &va + }; + + error = hfs_vnop_create(&ap); + if (error) { + printf("hfs: error %d creating HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); + goto out; + } + if (dvp) { + vnode_put(dvp); + dvp = NULL; + } + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + goto out; + } + cp = VTOC(vp); + + /* Don't use non-regular files or files with links. */ + if (!vnode_isreg(vp) || cp->c_linkcount != 1) { + error = EFTYPE; + goto out; + } + + printf("hfs: created HFBT on %s\n", HFSTOVCB(hfsmp)->vcbVN); + + if (VTOF(vp)->ff_size < nodesize) { + caddr_t buffer; + u_int16_t *index; + u_int16_t offset; + BTNodeDescriptor *ndp; + BTHeaderRec *bthp; + HotFilesInfo *hotfileinfo; + int nodecnt; + int filesize; + int entirespernode; + + /* + * Mark it invisible (truncate will pull these changes). 
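 *
 * [Editor's note, illustrative only; not part of the original change:
 *  the code below then hand-builds the b-tree header node: a node
 *  descriptor, the BTHeaderRec, the 128-byte user record (HotFilesInfo)
 *  and a map record, with each record's 16-bit offset stored big-endian
 *  in the last 8 bytes of the node.  Assuming the usual HFS+ sizes
 *  (14-byte BTNodeDescriptor, 106-byte BTHeaderRec) the layout is:
 *
 *      0            BTNodeDescriptor   (kind = kBTHeaderNode, 3 records)
 *      14           BTHeaderRec        (totalNodes, freeNodes, HFC_KEYLENGTH)
 *      120          HotFilesInfo       (magic, version, threshold, tag, ...)
 *      248          map record         (bit for node 0 set: 0x80)
 *      nodesize-8   four 16-bit record offsets, written back to front]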
+ */ + ((FndrFileInfo *)&cp->c_finderinfo[0])->fdFlags |= + SWAP_BE16 (kIsInvisible + kNameLocked); + + buffer = hfs_mallocz(nodesize); + index = (u_int16_t *)buffer; + + entirespernode = (nodesize - sizeof(BTNodeDescriptor) - 2) / + (sizeof(HotFileKey) + 6); + nodecnt = 2 + howmany(entries * 2, entirespernode); + nodecnt = roundup(nodecnt, 8); + filesize = nodecnt * nodesize; + + /* FILL IN THE NODE DESCRIPTOR: */ + ndp = (BTNodeDescriptor *)buffer; + ndp->kind = kBTHeaderNode; + ndp->numRecords = SWAP_BE16 (3); + offset = sizeof(BTNodeDescriptor); + index[(nodesize / 2) - 1] = SWAP_BE16 (offset); + + /* FILL IN THE HEADER RECORD: */ + bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset); + bthp->nodeSize = SWAP_BE16 (nodesize); + bthp->totalNodes = SWAP_BE32 (filesize / nodesize); + bthp->freeNodes = SWAP_BE32 (nodecnt - 1); + bthp->clumpSize = SWAP_BE32 (filesize); + bthp->btreeType = kUserBTreeType; /* non-metadata */ + bthp->attributes |= SWAP_BE32 (kBTBigKeysMask); + bthp->maxKeyLength = SWAP_BE16 (HFC_KEYLENGTH); + offset += sizeof(BTHeaderRec); + index[(nodesize / 2) - 2] = SWAP_BE16 (offset); + + /* FILL IN THE USER RECORD: */ + hotfileinfo = (HotFilesInfo *)((u_int8_t *)buffer + offset); + hotfileinfo->magic = SWAP_BE32 (HFC_MAGIC); + hotfileinfo->version = SWAP_BE32 (HFC_VERSION); + hotfileinfo->duration = SWAP_BE32 (HFC_DEFAULT_DURATION); + hotfileinfo->timebase = 0; + hotfileinfo->timeleft = 0; + hotfileinfo->threshold = SWAP_BE32 (HFC_MINIMUM_TEMPERATURE); + hotfileinfo->maxfileblks = SWAP_BE32 (HFC_MAXIMUM_FILESIZE / HFSTOVCB(hfsmp)->blockSize); + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + if (hfsmp->hfs_hotfile_freeblks == 0) { + hfsmp->hfs_hotfile_freeblks = hfsmp->hfs_hotfile_maxblks; + } + hotfileinfo->usedblocks = SWAP_BE32 (hfsmp->hfs_hotfile_maxblks - hfsmp->hfs_hotfile_freeblks); + } else { + hotfileinfo->maxfilecnt = SWAP_BE32 (HFC_DEFAULT_FILE_COUNT); + } + strlcpy((char *)hotfileinfo->tag, hfc_tag, + sizeof hotfileinfo->tag); + offset += kBTreeHeaderUserBytes; + index[(nodesize / 2) - 3] = SWAP_BE16 (offset); + + /* FILL IN THE MAP RECORD (only one node in use). */ + *((u_int8_t *)buffer + offset) = 0x80; + offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec) + - kBTreeHeaderUserBytes - (4 * sizeof(int16_t)); + index[(nodesize / 2) - 4] = SWAP_BE16 (offset); + + vnode_setnoflush(vp); + error = hfs_truncate(vp, (off_t)filesize, IO_NDELAY, 0, ctx); + if (error) { + printf("hfs: error %d growing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); + goto out; + } + cp->c_flag |= C_ZFWANTSYNC; + cp->c_zftimeout = 1; + + if (error == 0) { + struct vnop_write_args args; + uio_t auio; + + auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, (uintptr_t)buffer, nodesize); + + args.a_desc = &vnop_write_desc; + args.a_vp = vp; + args.a_uio = auio; + args.a_ioflag = 0; + args.a_context = ctx; + + hfs_unlock(cp); + cp = NULL; + + error = hfs_vnop_write(&args); + if (error) + printf("hfs: error %d writing HFBT on %s\n", error, HFSTOVCB(hfsmp)->vcbVN); + + uio_free(auio); + } + hfs_free(buffer, nodesize); + } +out: + hfs_end_transaction(hfsmp); + if (dvp) { + vnode_put(dvp); + } + if (vp) { + if (cp) + hfs_unlock(cp); + vnode_recycle(vp); + vnode_put(vp); + } + return (error); +} + +/* + * Compare two hot file b-tree keys. + * + * Result: +n search key > trial key + * 0 search key = trial key + * -n search key < trial key + */ +static int +hfc_comparekeys(HotFileKey *searchKey, HotFileKey *trialKey) +{ + /* + * Compared temperatures first. 
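 * (For example, a key with temperature 100 sorts after one with
 * temperature 80 regardless of file ID; only when temperatures match
 * does the file ID, and then the fork type, break the tie.)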
+ */ + if (searchKey->temperature == trialKey->temperature) { + /* + * Temperatures are equal so compare file ids. + */ + if (searchKey->fileID == trialKey->fileID) { + /* + * File ids are equal so compare fork types. + */ + if (searchKey->forkType == trialKey->forkType) { + return (0); + } else if (searchKey->forkType > trialKey->forkType) { + return (1); + } + } else if (searchKey->fileID > trialKey->fileID) { + return (1); + } + } else if (searchKey->temperature > trialKey->temperature) { + return (1); + } + + return (-1); +} + + +/* + *======================================================================== + * HOT FILE DATA COLLECTING ROUTINES + *======================================================================== + */ + +/* + * Lookup a hot file entry in the tree. + */ +#if HFC_DEBUG +static hotfile_entry_t * +hf_lookup(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature) +{ + hotfile_entry_t *entry = hotdata->rootentry; + + while (entry && + entry->temperature != temperature && + entry->fileid != fileid) { + + if (temperature > entry->temperature) + entry = entry->right; + else if (temperature < entry->temperature) + entry = entry->left; + else if (fileid > entry->fileid) + entry = entry->right; + else + entry = entry->left; + } + return (entry); +} +#endif + +/* + * Insert a hot file entry into the tree. + */ +static int +hf_insert(hotfile_data_t *hotdata, hotfile_entry_t *newentry) +{ + hotfile_entry_t *entry = hotdata->rootentry; + u_int32_t fileid = newentry->fileid; + u_int32_t temperature = newentry->temperature; + + if (entry == NULL) { + hotdata->rootentry = newentry; + hotdata->coldest = newentry; + hotdata->activefiles++; + return 0; + } + + while (entry) { + if (temperature > entry->temperature) { + if (entry->right) { + entry = entry->right; + } else { + entry->right = newentry; + break; + } + } else if (temperature < entry->temperature) { + if (entry->left) { + entry = entry->left; + } else { + entry->left = newentry; + break; + } + } else if (fileid > entry->fileid) { + if (entry->right) { + entry = entry->right; + } else { + if (entry->fileid != fileid) + entry->right = newentry; + break; + } + } else { + if (entry->left) { + entry = entry->left; + } else { + if (entry->fileid != fileid) { + entry->left = newentry; + } else { + return EEXIST; + } + break; + } + } + } + + hotdata->activefiles++; + return 0; +} + +/* + * Find the coldest entry in the tree. + */ +static hotfile_entry_t * +hf_coldest(hotfile_data_t *hotdata) +{ + hotfile_entry_t *entry = hotdata->rootentry; + + if (entry) { + while (entry->left) + entry = entry->left; + } + return (entry); +} + +/* + * Find the hottest entry in the tree. + */ +static hotfile_entry_t * +hf_hottest(hotfile_data_t *hotdata) +{ + hotfile_entry_t *entry = hotdata->rootentry; + + if (entry) { + while (entry->right) + entry = entry->right; + } + return (entry); +} + +/* + * Delete a hot file entry from the tree. + */ +static void +hf_delete(hotfile_data_t *hotdata, u_int32_t fileid, u_int32_t temperature) +{ + hotfile_entry_t *entry, *parent, *next; + + parent = NULL; + entry = hotdata->rootentry; + + while (entry && + entry->temperature != temperature && + entry->fileid != fileid) { + + parent = entry; + if (temperature > entry->temperature) + entry = entry->right; + else if (temperature < entry->temperature) + entry = entry->left; + else if (fileid > entry->fileid) + entry = entry->right; + else + entry = entry->left; + } + + if (entry) { + /* + * Reorganize the sub-trees spanning from our entry. 
+ */ + if ((next = entry->right)) { + hotfile_entry_t *pnextl, *psub; + /* + * Tree pruning: take the left branch of the + * current entry and place it at the lowest + * left branch of the current right branch + */ + psub = next; + + /* Walk the Right/Left sub tree from current entry */ + while ((pnextl = psub->left)) + psub = pnextl; + + /* Plug the old left tree to the new ->Right leftmost entry */ + psub->left = entry->left; + + } else /* only left sub-tree, simple case */ { + next = entry->left; + } + /* + * Now, plug the current entry sub tree to + * the good pointer of our parent entry. + */ + if (parent == NULL) + hotdata->rootentry = next; + else if (parent->left == entry) + parent->left = next; + else + parent->right = next; + + /* Place entry back on the free-list */ + entry->left = 0; + entry->fileid = 0; + entry->temperature = 0; + + entry->right = hotdata->freelist; + hotdata->freelist = entry; + hotdata->activefiles--; + + if (hotdata->coldest == entry || hotdata->coldest == NULL) { + hotdata->coldest = hf_coldest(hotdata); + } + + } +} + +/* + * Get a free hot file entry. + */ +static hotfile_entry_t * +hf_getnewentry(hotfile_data_t *hotdata) +{ + hotfile_entry_t * entry; + + /* + * When the free list is empty then steal the coldest one + */ + if (hotdata->freelist == NULL) { + entry = hf_coldest(hotdata); + hf_delete(hotdata, entry->fileid, entry->temperature); + } + entry = hotdata->freelist; + hotdata->freelist = entry->right; + entry->right = 0; + + return (entry); +} + + +/* + * Generate a sorted list of hot files (hottest to coldest). + * + * As a side effect, every node in the hot file tree will be + * deleted (moved to the free list). + */ +static void +hf_getsortedlist(hotfile_data_t * hotdata, hotfilelist_t *sortedlist) +{ + int i = 0; + hotfile_entry_t *entry; + + while ((entry = hf_hottest(hotdata)) != NULL) { + sortedlist->hfl_hotfile[i].hf_fileid = entry->fileid; + sortedlist->hfl_hotfile[i].hf_temperature = entry->temperature; + sortedlist->hfl_hotfile[i].hf_blocks = entry->blocks; + sortedlist->hfl_totalblocks += entry->blocks; + ++i; + + hf_delete(hotdata, entry->fileid, entry->temperature); + } + + sortedlist->hfl_count = i; + +#if HFC_VERBOSE + printf("hfs: hf_getsortedlist returning %d entries w/%d total blocks\n", i, sortedlist->hfl_totalblocks); +#endif +} + + +#if HFC_DEBUG +static void +hf_maxdepth(hotfile_entry_t * root, int depth, int *maxdepth) +{ + if (root) { + depth++; + if (depth > *maxdepth) + *maxdepth = depth; + hf_maxdepth(root->left, depth, maxdepth); + hf_maxdepth(root->right, depth, maxdepth); + } +} + +static void +hf_printtree(hotfile_entry_t * root) +{ + if (root) { + hf_printtree(root->left); + printf("hfs: temperature: % 8d, fileid %d\n", root->temperature, root->fileid); + hf_printtree(root->right); + } +} +#endif diff --git a/core/hfs_hotfiles.h b/core/hfs_hotfiles.h new file mode 100644 index 0000000..0ce32da --- /dev/null +++ b/core/hfs_hotfiles.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2003-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef __HFS_HOTFILES__ +#define __HFS_HOTFILES__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + + +#define HFC_FILENAME ".hotfiles.btree" + + +/* + * Temperature measurement constraints. + */ +#define HFC_DEFAULT_FILE_COUNT hfc_default_file_count +#define HFC_DEFAULT_DURATION hfc_default_duration +#define HFC_CUMULATIVE_CYCLES 3 +#define HFC_MAXIMUM_FILE_COUNT hfc_max_file_count +#define HFC_MAXIMUM_FILESIZE hfc_max_file_size +#define HFC_MINIMUM_TEMPERATURE 24 + + +/* + * Sync constraints. + */ +#define HFC_BLKSPERSYNC 300 +#define HFC_FILESPERSYNC 50 + + +/* + * Hot file clustering stages. + */ +enum hfc_stage { + HFC_DISABLED, + HFC_IDLE, + HFC_BUSY, + HFC_RECORDING, + HFC_EVALUATION, + HFC_EVICTION, + HFC_ADOPTION, +}; + + +/* + * B-tree file key format (on-disk). + */ +struct HotFileKey { + u_int16_t keyLength; /* length of key, excluding this field */ + u_int8_t forkType; /* 0 = data fork, FF = resource fork */ + u_int8_t pad; /* make the other fields align on 32-bit boundary */ + u_int32_t temperature; /* temperature recorded */ + u_int32_t fileID; /* file ID */ +}; +typedef struct HotFileKey HotFileKey; + +#define HFC_LOOKUPTAG 0xFFFFFFFF +#define HFC_KEYLENGTH (sizeof(HotFileKey) - sizeof(u_int16_t)) + +/* + * B-tree header node user info (on-disk). + */ +struct HotFilesInfo { + u_int32_t magic; + u_int32_t version; + u_int32_t duration; /* duration of sample period (secs) */ + u_int32_t timebase; /* start of recording period (GMT time in secs) */ + u_int32_t timeleft; /* time remaining in recording period (secs) */ + u_int32_t threshold; + u_int32_t maxfileblks; + union { + u_int32_t _maxfilecnt; // on hdd's we track the max # of files + u_int32_t _usedblocks; // on ssd's we track how many blocks are used + } _u; + u_int8_t tag[32]; +}; + +#define usedblocks _u._usedblocks +#define maxfilecnt _u._maxfilecnt + +typedef struct HotFilesInfo HotFilesInfo; + +#define HFC_MAGIC 0xFF28FF26 +#define HFC_VERSION 1 + + +struct hfsmount; +struct proc; +struct vnode; + +/* + * Hot File interface functions. 
+ */ +int hfs_hotfilesync (struct hfsmount *, vfs_context_t ctx); + +int hfs_recording_init(struct hfsmount *); +int hfs_recording_suspend (struct hfsmount *); + +int hfs_addhotfile (struct vnode *); +int hfs_removehotfile (struct vnode *); +int hfs_hotfile_deleted(struct vnode *vp); // called when a file is deleted +void hfs_repin_hotfiles(struct hfsmount *); + +// call this to adjust the number of used hotfile blocks either up/down +int hfs_hotfile_adjust_blocks(struct vnode *vp, int64_t num_blocks); + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* __HFS_HOTFILES__ */ diff --git a/core/hfs_iokit.cpp b/core/hfs_iokit.cpp new file mode 100644 index 0000000..5908364 --- /dev/null +++ b/core/hfs_iokit.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "hfs_iokit.h" +#include "hfs.h" +#include "hfs_dbg.h" + +#ifndef panic_on_assert +bool panic_on_assert; +#endif + +#if DEBUG +bool hfs_corruption_panics = true; +#endif + +class com_apple_filesystems_hfs : public IOService { + OSDeclareDefaultStructors(com_apple_filesystems_hfs) + +public: + + bool start(IOService *provider) override; + void stop(IOService *provider) override; + +protected: + vfstable_t vfs_handle; +}; + +#define super IOService +OSDefineMetaClassAndStructors(com_apple_filesystems_hfs, IOService) + +extern struct vnodeopv_desc hfs_vnodeop_opv_desc; +#if CONFIG_HFS_STD +extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; +#endif +extern struct vnodeopv_desc hfs_specop_opv_desc; +extern struct vnodeopv_desc hfs_fifoop_opv_desc; +extern struct vfsops hfs_vfsops; + +bool com_apple_filesystems_hfs::start(IOService *provider) +{ + if (!super::start(provider)) + return false; + +#ifndef panic_on_assert + panic_on_assert = PE_i_can_has_kernel_configuration() & kPEICanHasAssertions; +#endif + +#if DEBUG + PE_parse_boot_argn("hfs_corruption_panics", &hfs_corruption_panics, sizeof(hfs_corruption_panics)); +#endif + + struct vnodeopv_desc *op_descs[] = { + &hfs_vnodeop_opv_desc, +#if CONFIG_HFS_STD + &hfs_std_vnodeop_opv_desc, +#endif + &hfs_specop_opv_desc, +#if FIFO + &hfs_fifoop_opv_desc, +#endif + }; + +#define lengthof(x) (sizeof(x)/sizeof(*x)) + +#ifndef VFS_TBLVNOP_SECLUDE_RENAME +#define VFS_TBLVNOP_SECLUDE_RENAME 0 +#endif + + struct vfs_fsentry vfe = { + .vfe_vfsops = &hfs_vfsops, + .vfe_vopcnt = lengthof(op_descs), + .vfe_opvdescs = op_descs, + .vfe_fsname = "hfs", + .vfe_flags = (VFS_TBLNOTYPENUM | VFS_TBLLOCALVOL | VFS_TBLREADDIR_EXTENDED + | VFS_TBL64BITREADY | VFS_TBLVNOP_PAGEOUTV2 | VFS_TBLVNOP_PAGEINV2 + | VFS_TBLTHREADSAFE | VFS_TBLCANMOUNTROOT | VFS_TBLVNOP_SECLUDE_RENAME + | VFS_TBLNATIVEXATTR) + }; + + int ret = vfs_fsadd(&vfe, &vfs_handle); + + if (ret) { + printf("hfs: vfs_fsadd failed: %d!\n", ret); + vfs_handle = NULL; + return false; + } + + hfs_init_zones(); + + hfs_sysctl_register(); + + return true; +} + +void com_apple_filesystems_hfs::stop(IOService *provider) +{ + if (vfs_handle) { + vfs_fsremove(vfs_handle); + hfs_sysctl_unregister(); + vfs_handle = NULL; + } + + super::stop(provider); +} + +int hfs_is_ejectable(const char *cdev_name) +{ + int ret = 0; + OSDictionary *dictionary; + OSString *dev_name; + + if (strncmp(cdev_name, "/dev/", 5) == 0) { + cdev_name += 5; + } + + dictionary = IOService::serviceMatching("IOMedia"); + if( dictionary ) { + dev_name = OSString::withCString( cdev_name ); + if( dev_name ) { + IOService *service; + mach_timespec_t tv = { 5, 0 }; // wait up to "timeout" seconds for the device + + dictionary->setObject(kIOBSDNameKey, dev_name); + dictionary->retain(); + service = IOService::waitForService(dictionary, &tv); + if( service ) { + OSBoolean *ejectable = (OSBoolean *)service->getProperty("Ejectable"); + + if( ejectable ) { + ret = (int)ejectable->getValue(); + } + + } + dev_name->release(); + } + dictionary->release(); + } + + return ret; +} + +void hfs_iterate_media_with_content(const char *content_uuid_cstring, + int (*func)(const char *device, + const char *uuid_str, + void *arg), + void *arg) +{ + OSDictionary *dictionary; + OSString *content_uuid_string; + + dictionary = IOService::serviceMatching("IOMedia"); + if (dictionary) { + content_uuid_string = 
OSString::withCString(content_uuid_cstring); + if (content_uuid_string) { + IOService *service; + OSIterator *iter; + + dictionary->setObject("Content", content_uuid_string); + dictionary->retain(); + + iter = IOService::getMatchingServices(dictionary); + while (iter && (service = (IOService *)iter->getNextObject())) { + if (service) { + OSString *iostr = (OSString *) service->getProperty(kIOBSDNameKey); + OSString *uuidstr = (OSString *)service->getProperty("UUID"); + const char *uuid; + + if (iostr) { + if (uuidstr) { + uuid = uuidstr->getCStringNoCopy(); + } else { + uuid = "00000000-0000-0000-0000-000000000000"; + } + + if (!func(iostr->getCStringNoCopy(), uuid, arg)) + break; + } + } + } + if (iter) + iter->release(); + + content_uuid_string->release(); + } + dictionary->release(); + } +} + +kern_return_t hfs_get_platform_serial_number(char *serial_number_str, + uint32_t len) +{ + OSDictionary * platform_dict; + IOService *platform; + OSString * string; + + if (len < 1) { + return 0; + } + serial_number_str[0] = '\0'; + + platform_dict = IOService::serviceMatching( "IOPlatformExpertDevice" ); + if (platform_dict == NULL) { + return KERN_NOT_SUPPORTED; + } + + platform = IOService::waitForService( platform_dict ); + if (platform) { + string = (OSString *)platform->getProperty(kIOPlatformSerialNumberKey); + if (string == 0) { + return KERN_NOT_SUPPORTED; + } else { + strlcpy( serial_number_str, string->getCStringNoCopy(), len); + } + } + + return KERN_SUCCESS; +} + +// Interface with AKS + +static aks_file_system_key_services_t * +key_services(void) +{ + static aks_file_system_key_services_t *g_key_services; + + if (!g_key_services) { + IOService *platform = IOService::getPlatform(); + if (platform) { + IOReturn ret = platform->callPlatformFunction + (kAKSFileSystemKeyServices, true, &g_key_services, NULL, NULL, NULL); + if (ret) + printf("hfs: unable to get " kAKSFileSystemKeyServices " (0x%x)\n", ret); + } + } + + return g_key_services; +} + +int hfs_unwrap_key(aks_cred_t access, const aks_wrapped_key_t wrapped_key_in, + aks_raw_key_t key_out) +{ + aks_file_system_key_services_t *ks = key_services(); + if (!ks || !ks->unwrap_key) + return ENXIO; + return ks->unwrap_key(access, wrapped_key_in, key_out); +} + +int hfs_rewrap_key(aks_cred_t access, cp_key_class_t dp_class, + const aks_wrapped_key_t wrapped_key_in, + aks_wrapped_key_t wrapped_key_out) +{ + aks_file_system_key_services_t *ks = key_services(); + if (!ks || !ks->rewrap_key) + return ENXIO; + return ks->rewrap_key(access, dp_class, wrapped_key_in, wrapped_key_out); +} + +int hfs_new_key(aks_cred_t access, cp_key_class_t dp_class, + aks_raw_key_t key_out, aks_wrapped_key_t wrapped_key_out) +{ + aks_file_system_key_services_t *ks = key_services(); + if (!ks || !ks->new_key) + return ENXIO; + return ks->new_key(access, dp_class, key_out, wrapped_key_out); +} + +int hfs_backup_key(aks_cred_t access, const aks_wrapped_key_t wrapped_key_in, + aks_wrapped_key_t wrapped_key_out) +{ + aks_file_system_key_services_t *ks = key_services(); + if (!ks || !ks->backup_key) + return ENXIO; + return ks->backup_key(access, wrapped_key_in, wrapped_key_out); +} diff --git a/core/hfs_iokit.h b/core/hfs_iokit.h new file mode 100644 index 0000000..d31a062 --- /dev/null +++ b/core/hfs_iokit.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef hfs_iokit_h +#define hfs_iokit_h + +#include +#include + +__BEGIN_DECLS + +int hfs_is_ejectable(const char *cdev_name); +void hfs_iterate_media_with_content(const char *content_uuid_cstring, + int (*func)(const char *bsd_name, + const char *uuid_str, + void *arg), + void *arg); +kern_return_t hfs_get_platform_serial_number(char *serial_number_str, + uint32_t len); +int hfs_unwrap_key(aks_cred_t access, const aks_wrapped_key_t wrapped_key_in, + aks_raw_key_t key_out); +int hfs_rewrap_key(aks_cred_t access, cp_key_class_t dp_class, + const aks_wrapped_key_t wrapped_key_in, + aks_wrapped_key_t wrapped_key_out); +int hfs_new_key(aks_cred_t access, cp_key_class_t dp_class, + aks_raw_key_t key_out, aks_wrapped_key_t wrapped_key_out); +int hfs_backup_key(aks_cred_t access, const aks_wrapped_key_t wrapped_key_in, + aks_wrapped_key_t wrapped_key_out); + +__END_DECLS + +#endif /* hfs_iokit_h */ diff --git a/core/hfs_journal.c b/core/hfs_journal.c new file mode 100644 index 0000000..f1e6ee6 --- /dev/null +++ b/core/hfs_journal.c @@ -0,0 +1,4892 @@ +/* + * Copyright (c) 2002-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +// +// This file implements a simple write-ahead journaling layer. +// In theory any file system can make use of it by calling these +// functions when the fs wants to modify meta-data blocks. See +// hfs_journal.h for a more detailed description of the api and +// data structures. +// +// Dominic Giampaolo (dbg@apple.com) +// + +#ifdef KERNEL + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* OSAddAtomic */ + +#include "hfs.h" + +kern_return_t thread_terminate(thread_t); + +/* + * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT + * logging of trim-related calls within the journal. (They're + * disabled by default because there can be a lot of these events, + * and we don't want to overwhelm the kernel debug buffer. If you + * want to watch these events in particular, just set the sysctl.) + */ +static int jnl_kdebug = 0; + +HFS_SYSCTL(NODE, _vfs_generic_hfs, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal") +HFS_SYSCTL(NODE, _vfs_generic_hfs_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug") +HFS_SYSCTL(INT, _vfs_generic_hfs_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM") + +#define DBG_JOURNAL_FLUSH FSDBG_CODE(DBG_JOURNAL, 1) +#define DBG_JOURNAL_TRIM_ADD FSDBG_CODE(DBG_JOURNAL, 2) +#define DBG_JOURNAL_TRIM_REMOVE FSDBG_CODE(DBG_JOURNAL, 3) +#define DBG_JOURNAL_TRIM_REMOVE_PENDING FSDBG_CODE(DBG_JOURNAL, 4) +#define DBG_JOURNAL_TRIM_REALLOC FSDBG_CODE(DBG_JOURNAL, 5) +#define DBG_JOURNAL_TRIM_FLUSH FSDBG_CODE(DBG_JOURNAL, 6) +#define DBG_JOURNAL_TRIM_UNMAP FSDBG_CODE(DBG_JOURNAL, 7) + +/* + * Cap the journal max size to 2GB. On HFS, it will attempt to occupy + * a full allocation block if the current size is smaller than the allocation + * block on which it resides. Once we hit the exabyte filesystem range, then + * it will use 2GB allocation blocks. As a result, make the cap 2GB. + */ +#define MAX_JOURNAL_SIZE 0x80000000U + +#include +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" + +#endif /* KERNEL */ + +#include "hfs_journal.h" + +#include + +// +// By default, we grow the list of extents to trim by 4K at a time. +// We'll opt to flush a transaction if it contains at least +// JOURNAL_FLUSH_TRIM_EXTENTS extents to be trimmed (even if the number +// of modified blocks is small). 
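// Worked example, assuming dk_extent_t is a pair of 64-bit fields
// (offset, length), i.e. 16 bytes: JOURNAL_DEFAULT_TRIM_EXTENTS works
// out to 4096 / 16 = 256 extents per growth step, and
// JOURNAL_FLUSH_TRIM_EXTENTS to 256 * 15 / 16 = 240, so a transaction
// that has accumulated roughly 240 pending trim extents gets flushed
// even when it has dirtied only a few metadata blocks.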
+// +enum { + JOURNAL_DEFAULT_TRIM_BYTES = 4096, + JOURNAL_DEFAULT_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_BYTES / sizeof(dk_extent_t), + JOURNAL_FLUSH_TRIM_EXTENTS = JOURNAL_DEFAULT_TRIM_EXTENTS * 15 / 16 +}; + +unsigned int jnl_trim_flush_limit = JOURNAL_FLUSH_TRIM_EXTENTS; + +HFS_SYSCTL(UINT, _vfs_generic_hfs_jnl, OID_AUTO, trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, 0, "number of trimmed extents to cause a journal flush") + +// number of bytes to checksum in a block_list_header +// NOTE: this should be enough to clear out the header +// fields as well as the first entry of binfo[] +#define BLHDR_CHECKSUM_SIZE 32 + +static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name); +static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name); +static void unlock_condition(journal *jnl, boolean_t *condition); +static void finish_end_thread(transaction *tr); +static void write_header_thread(journal *jnl); +static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg); +static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait); +static void abort_transaction(journal *jnl, transaction *tr); +static void dump_journal(journal *jnl); + +static __inline__ void lock_oldstart(journal *jnl); +static __inline__ void unlock_oldstart(journal *jnl); +static __inline__ void lock_flush(journal *jnl); +static __inline__ void unlock_flush(journal *jnl); + + +// +// 3105942 - Coalesce writes to the same block on journal replay +// + +typedef struct bucket { + off_t block_num; + uint32_t jnl_offset; + uint32_t block_size; + int32_t cksum; +} bucket; + +#define STARTING_BUCKETS 256 + +static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr); +static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size); +static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full); +static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr); +static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting); + +#define CHECK_JOURNAL(jnl) \ + do { \ + if (jnl == NULL) { \ + panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \ + } \ + if (jnl->jdev == NULL) { \ + panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \ + } \ + if (jnl->fsdev == NULL) { \ + panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \ + } \ + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \ + panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \ + __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \ + } \ + if ( jnl->jhdr->start <= 0 \ + || jnl->jhdr->start > jnl->jhdr->size) { \ + panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \ + } \ + if ( jnl->jhdr->end <= 0 \ + || jnl->jhdr->end > jnl->jhdr->size) { \ + panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \ + } \ + } while(0) + +#define CHECK_TRANSACTION(tr) \ + do { \ + if (tr == NULL) { \ + panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \ + } \ + if (tr->jnl == NULL) 
{ \ + panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \ + } \ + if (tr->blhdr != (block_list_header *)tr->tbuffer) { \ + panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \ + } \ + if (tr->total_bytes < 0) { \ + panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \ + } \ + if (tr->journal_start < 0) { \ + panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \ + } \ + if (tr->journal_end < 0) { \ + panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \ + } \ + if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \ + panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \ + } \ + } while(0) + + + +// +// this isn't a great checksum routine but it will do for now. +// we use it to checksum the journal header and the block list +// headers that are at the start of each transaction. +// +static unsigned int +calc_checksum(const char *ptr, int len) +{ + int i; + unsigned int cksum=0; + + // this is a lame checksum but for now it'll do + for(i = 0; i < len; i++, ptr++) { + cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr); + } + + return (~cksum); +} + +// +// Journal Locking +// +lck_grp_attr_t * jnl_group_attr; +lck_attr_t * jnl_lock_attr; +lck_grp_t * jnl_mutex_group; + +void +journal_init(void) +{ + jnl_lock_attr = lck_attr_alloc_init(); + jnl_group_attr = lck_grp_attr_alloc_init(); + jnl_mutex_group = lck_grp_alloc_init("jnl-mutex", jnl_group_attr); +} + +__inline__ void +journal_lock(journal *jnl) +{ + lck_mtx_lock(&jnl->jlock); + if (jnl->owner) { + panic ("jnl: owner is %p, expected NULL\n", jnl->owner); + } + jnl->owner = current_thread(); +} + +__inline__ void +journal_unlock(journal *jnl) +{ + jnl->owner = NULL; + lck_mtx_unlock(&jnl->jlock); +} + +static __inline__ void +lock_flush(journal *jnl) +{ + lck_mtx_lock(&jnl->flock); +} + +static __inline__ void +unlock_flush(journal *jnl) +{ + lck_mtx_unlock(&jnl->flock); +} + +static __inline__ void +lock_oldstart(journal *jnl) +{ + lck_mtx_lock(&jnl->old_start_lock); +} + +static __inline__ void +unlock_oldstart(journal *jnl) +{ + lck_mtx_unlock(&jnl->old_start_lock); +} + + + +#define JNL_WRITE 0x0001 +#define JNL_READ 0x0002 +#define JNL_HEADER 0x8000 + +// +// This function sets up a fake buf and passes it directly to the +// journal device strategy routine (so that it won't get cached in +// the block cache. 
+// +// It also handles range checking the i/o so that we don't write +// outside the journal boundaries and it will wrap the i/o back +// to the beginning if necessary (skipping over the journal header) +// +static size_t +do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) +{ + int err; + off_t curlen = len; + size_t io_sz = 0; + buf_t bp; + off_t max_iosize; + bufattr_t bap; + boolean_t was_vm_privileged = FALSE; + boolean_t need_vm_privilege = FALSE; + + if (vfs_isswapmount(jnl->fsmount)) + need_vm_privilege = TRUE; + + if (*offset < 0 || *offset > jnl->jhdr->size) { + panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size); + } + + if (direction & JNL_WRITE) + max_iosize = jnl->max_write_size; + else if (direction & JNL_READ) + max_iosize = jnl->max_read_size; + else + max_iosize = 128 * 1024; + +again: + bp = buf_alloc(jnl->jdev); + + if (*offset + curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { + if (*offset == jnl->jhdr->size) { + *offset = jnl->jhdr->jhdr_size; + } else { + curlen = jnl->jhdr->size - *offset; + } + } + + if (curlen > max_iosize) { + curlen = max_iosize; + } + + if (curlen <= 0) { + panic("jnl: do_jnl_io: curlen == %lld, offset 0x%llx len %zd\n", curlen, *offset, len); + } + + if (*offset == 0 && (direction & JNL_HEADER) == 0) { + panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %lld, data %p)\n", curlen, data); + } + + /* + * As alluded to in the block comment at the top of the function, we use a "fake" iobuf + * here and issue directly to the disk device that the journal protects since we don't + * want this to enter the block cache. As a result, we lose the ability to mark it + * as a metadata buf_t for the layers below us that may care. If we were to + * simply attach the B_META flag into the b_flags this may confuse things further + * since this is an iobuf, not a metadata buffer. + * + * To address this, we use the extended bufattr struct embedded in the bp. + * Explicitly mark the buf here as a metadata buffer in its bufattr flags. + */ + bap = buf_attr(bp); + bufattr_markmeta(bap); + + if (direction & JNL_READ) + buf_setflags(bp, B_READ); + else { + /* + * don't have to set any flags + */ + vnode_startwrite(jnl->jdev); + } + buf_setsize(bp, curlen); + buf_setcount(bp, curlen); + buf_setdataptr(bp, (uintptr_t)data); + buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + + if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) { + buf_markfua(bp); + } + + if (need_vm_privilege == TRUE) { + /* + * if we block waiting for memory, and there is enough pressure to + * cause us to try and create a new swap file, we may end up deadlocking + * due to waiting for the journal on the swap file creation path... 
+ * by making ourselves vm_privileged, we give ourselves the best chance + * of not blocking + */ + was_vm_privileged = set_vm_privilege(TRUE); + } + DTRACE_IO1(journal__start, buf_t, bp); + err = VNOP_STRATEGY(bp); + if (!err) { + err = (int)buf_biowait(bp); + } + DTRACE_IO1(journal__done, buf_t, bp); + + if (need_vm_privilege == TRUE && was_vm_privileged == FALSE) + set_vm_privilege(FALSE); + + buf_free(bp); + + if (err) { + printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err); + return 0; + } + + *offset += curlen; + io_sz += curlen; + + if (io_sz != len) { + // handle wrap-around + data = (char *)data + curlen; + curlen = len - io_sz; + if (*offset >= jnl->jhdr->size) { + *offset = jnl->jhdr->jhdr_size; + } + goto again; + } + + return io_sz; +} + +static size_t +read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) +{ + return do_journal_io(jnl, offset, data, len, JNL_READ); +} + +static size_t +write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) +{ + return do_journal_io(jnl, offset, data, len, JNL_WRITE); +} + + +static size_t +read_journal_header(journal *jnl, void *data, size_t len) +{ + off_t hdr_offset = 0; + + return do_journal_io(jnl, &hdr_offset, data, len, JNL_READ|JNL_HEADER); +} + +static int +write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num) +{ + static int num_err_prints = 0; + int ret=0; + off_t jhdr_offset = 0; + // + // Flush the track cache if we're not doing force-unit-access + // writes. + // + if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { + + dk_synchronize_t sync_request = { + .options = DK_SYNCHRONIZE_OPTION_BARRIER, + }; + + /* + * If device doesn't support barrier-only flush, or + * the journal is on a different device, use full flush. + */ + if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) { + sync_request.options = 0; + jnl->flush_counter++; + } + + ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel()); + } + if (ret != 0) { + // + // Only print this error if it's a different error than the + // previous one, or if it's the first time for this device + // or if the total number of printfs is less than 25. We + // allow for up to 25 printfs to insure that some make it + // into the on-disk syslog. Otherwise if we only printed + // one, it's possible it would never make it to the syslog + // for the root volume and that makes debugging hard. + // + if ( ret != jnl->last_flush_err + || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0 + || num_err_prints++ < 25) { + + printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret); + + jnl->flags |= JOURNAL_FLUSHCACHE_ERR; + jnl->last_flush_err = ret; + } + } + + jnl->jhdr->sequence_num = sequence_num; + jnl->jhdr->checksum = 0; + jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); + + if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) { + printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name); + jnl->flags |= JOURNAL_INVALID; + return -1; + } + + // If we're not doing force-unit-access writes, then we + // have to flush after writing the journal header so that + // a future transaction doesn't sneak out to disk before + // the header does and thus overwrite data that the old + // journal header refers to. 
Saw this exact case happen + // on an IDE bus analyzer with Larry Barras so while it + // may seem obscure, it's not. + // + if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { + + dk_synchronize_t sync_request = { + .options = DK_SYNCHRONIZE_OPTION_BARRIER, + }; + + /* + * If device doesn't support barrier-only flush, or + * the journal is on a different device, use full flush. + */ + if (!(jnl->flags & JOURNAL_FEATURE_BARRIER) || (jnl->jdev != jnl->fsdev)) { + sync_request.options = 0; + jnl->flush_counter++; + } + + VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel()); + } + + return 0; +} + + + +// +// this is a work function used to free up transactions that +// completed. they can't be free'd from buffer_flushed_callback +// because it is called from deep with the disk driver stack +// and thus can't do something that would potentially cause +// paging. it gets called by each of the journal api entry +// points so stuff shouldn't hang around for too long. +// +static void +free_old_stuff(journal *jnl) +{ + transaction *tr, *next; + block_list_header *blhdr=NULL, *next_blhdr=NULL; + + if (jnl->tr_freeme == NULL) + return; + + lock_oldstart(jnl); + tr = jnl->tr_freeme; + jnl->tr_freeme = NULL; + unlock_oldstart(jnl); + + for(; tr; tr=next) { + for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) { + next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum); + blhdr->binfo[0].bnum = 0xdeadc0de; + + hfs_free(blhdr, tr->tbuffer_size); + + KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0); + } + next = tr->next; + hfs_free(tr, sizeof(*tr)); + } +} + + + +// +// This is our callback that lets us know when a buffer has been +// flushed to disk. It's called from deep within the driver stack +// and thus is quite limited in what it can do. Notably, it can +// not initiate any new i/o's or allocate/free memory. +// +static void +buffer_flushed_callback(struct buf *bp, void *arg) +{ + transaction *tr; + journal *jnl; + transaction *ctr, *prev=NULL, *next; + size_t i; + int bufsize, amt_flushed, total_bytes; + + + //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", + // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); + + // snarf out the bits we want + bufsize = buf_size(bp); + tr = (transaction *)arg; + + // then we've already seen it + if (tr == NULL) { + return; + } + + CHECK_TRANSACTION(tr); + + jnl = tr->jnl; + + CHECK_JOURNAL(jnl); + + amt_flushed = tr->num_killed; + total_bytes = tr->total_bytes; + + // update the number of blocks that have been flushed. + // this buf may represent more than one block so take + // that into account. + // + // OSAddAtomic() returns the value of tr->num_flushed before the add + // + amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed); + + + // if this transaction isn't done yet, just return as + // there is nothing to do. + // + // NOTE: we are careful to not reference anything through + // the tr pointer after doing the OSAddAtomic(). if + // this if statement fails then we are the last one + // and then it's ok to dereference "tr". + // + if ((amt_flushed + bufsize) < total_bytes) { + return; + } + + // this will single thread checking the transaction + lock_oldstart(jnl); + + if (tr->total_bytes == (int)0xfbadc0de) { + // then someone beat us to it... 
+ unlock_oldstart(jnl); + return; + } + + // mark this so that we're the owner of dealing with the + // cleanup for this transaction + tr->total_bytes = 0xfbadc0de; + + if (jnl->flags & JOURNAL_INVALID) + goto transaction_done; + + //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", + // tr, tr->journal_start, tr->journal_end, jnl); + + // find this entry in the old_start[] index and mark it completed + for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + + if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) { + jnl->old_start[i] &= ~(0x8000000000000000ULL); + break; + } + } + + if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n", + tr->journal_start, tr, jnl); + } + + + // if we are here then we need to update the journal header + // to reflect that this transaction is complete + if (tr->journal_start == jnl->active_start) { + jnl->active_start = tr->journal_end; + tr->journal_start = tr->journal_end = (off_t)0; + } + + // go through the completed_trs list and try to coalesce + // entries, restarting back at the beginning if we have to. + for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) { + if (ctr->journal_start == jnl->active_start) { + jnl->active_start = ctr->journal_end; + if (prev) { + prev->next = ctr->next; + } + if (ctr == jnl->completed_trs) { + jnl->completed_trs = ctr->next; + } + + next = jnl->completed_trs; // this starts us over again + ctr->next = jnl->tr_freeme; + jnl->tr_freeme = ctr; + ctr = NULL; + } else if (tr->journal_end == ctr->journal_start) { + ctr->journal_start = tr->journal_start; + next = jnl->completed_trs; // this starts us over again + ctr = NULL; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (tr->journal_start == ctr->journal_end) { + ctr->journal_end = tr->journal_end; + next = ctr->next; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) { + // coalesce the next entry with this one and link the next + // entry in at the head of the tr_freeme list + next = ctr->next; // temporarily use the "next" variable + ctr->journal_end = next->journal_end; + ctr->next = next->next; + next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list + jnl->tr_freeme = next; + + next = jnl->completed_trs; // this starts us over again + ctr = NULL; + } else { + next = ctr->next; + } + } + + // if this is true then we didn't merge with anyone + // so link ourselves in at the head of the completed + // transaction list. + if (tr->journal_start != 0) { + // put this entry into the correct sorted place + // in the list instead of just at the head. 
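// (Keeping completed_trs ordered by journal_start appears intended to
// keep adjacent journal ranges next to each other, so the coalescing
// pass above can merge them and advance jnl->active_start.)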
+ // + + prev = NULL; + for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { + // just keep looping + } + + if (ctr == NULL && prev == NULL) { + jnl->completed_trs = tr; + tr->next = NULL; + } else if (ctr == jnl->completed_trs) { + tr->next = jnl->completed_trs; + jnl->completed_trs = tr; + } else { + tr->next = prev->next; + prev->next = tr; + } + } else { + // if we're here this tr got merged with someone else so + // put it on the list to be free'd + tr->next = jnl->tr_freeme; + jnl->tr_freeme = tr; + } +transaction_done: + unlock_oldstart(jnl); + + unlock_condition(jnl, &jnl->asyncIO); +} + + +#include + +#define SWAP16(x) OSSwapInt16(x) +#define SWAP32(x) OSSwapInt32(x) +#define SWAP64(x) OSSwapInt64(x) + + +static void +swap_journal_header(journal *jnl) +{ + jnl->jhdr->magic = SWAP32(jnl->jhdr->magic); + jnl->jhdr->endian = SWAP32(jnl->jhdr->endian); + jnl->jhdr->start = SWAP64(jnl->jhdr->start); + jnl->jhdr->end = SWAP64(jnl->jhdr->end); + jnl->jhdr->size = SWAP64(jnl->jhdr->size); + jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size); + jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum); + jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size); + jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num); +} + +static void +swap_block_list_header(journal *jnl, block_list_header *blhdr) +{ + int i; + + blhdr->max_blocks = SWAP16(blhdr->max_blocks); + blhdr->num_blocks = SWAP16(blhdr->num_blocks); + blhdr->bytes_used = SWAP32(blhdr->bytes_used); + blhdr->checksum = SWAP32(blhdr->checksum); + blhdr->flags = SWAP32(blhdr->flags); + + if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { + printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size); + return; + } + + for(i = 0; i < blhdr->num_blocks; i++) { + blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum); + blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize); + blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum); + } +} + + +static int +update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) +{ + int ret; + struct buf *oblock_bp=NULL; + boolean_t was_vm_privileged = FALSE; + + + // first read the block we want. + ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); + if (ret != 0) { + printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret); + + if (oblock_bp) { + buf_brelse(oblock_bp); + oblock_bp = NULL; + } + + // let's try to be aggressive here and just re-write the block + oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META); + if (oblock_bp == NULL) { + printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block); + return -1; + } + } + + // make sure it's the correct size. + if (buf_size(oblock_bp) != bsize) { + buf_brelse(oblock_bp); + return -1; + } + + // copy the journal data over top of it + memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize); + + if (vfs_isswapmount(jnl->fsmount)) { + /* + * if we block waiting for memory, and there is enough pressure to + * cause us to try and create a new swap file, we may end up deadlocking + * due to waiting for the journal on the swap file creation path... 
+ * by making ourselves vm_privileged, we give ourselves the best chance + * of not blocking + */ + was_vm_privileged = set_vm_privilege(TRUE); + } + ret = VNOP_BWRITE(oblock_bp); + + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + + if (ret != 0) { + printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret); + return ret; + } + // and now invalidate it so that if someone else wants to read + // it in a different size they'll be able to do it. + ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); + if (oblock_bp) { + buf_markinvalid(oblock_bp); + buf_brelse(oblock_bp); + } + + return 0; +} + +static int +grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) +{ + struct bucket *newBuf; + int current_size = num_buckets, i; + + // return if newsize is less than the current size + if (new_size < num_buckets) { + return current_size; + } + + newBuf = hfs_malloc(new_size*sizeof(struct bucket)); + + // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); + + // copy existing elements + bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); + + // initialize the new ones + for(i = num_buckets; i < new_size; i++) { + newBuf[i].block_num = (off_t)-1; + } + + // free the old container + hfs_free(*buf_ptr, num_buckets * sizeof(struct bucket)); + + // reset the buf_ptr + *buf_ptr = newBuf; + + return new_size; +} + +static int +lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) +{ + int lo, hi, index, matches, i; + + if (num_full == 0) { + return 0; // table is empty, so insert at index=0 + } + + lo = 0; + hi = num_full - 1; + index = -1; + + // perform binary search for block_num + do { + int mid = (hi - lo)/2 + lo; + off_t this_num = (*buf_ptr)[mid].block_num; + + if (block_num == this_num) { + index = mid; + break; + } + + if (block_num < this_num) { + hi = mid; + continue; + } + + if (block_num > this_num) { + lo = mid + 1; + continue; + } + } while (lo < hi); + + // check if lo and hi converged on the match + if (block_num == (*buf_ptr)[hi].block_num) { + index = hi; + } + + // if no existing entry found, find index for new one + if (index == -1) { + index = (block_num < (*buf_ptr)[hi].block_num) ? 
hi : hi + 1; + } else { + // make sure that we return the right-most index in the case of multiple matches + matches = 0; + i = index + 1; + while (i < num_full && block_num == (*buf_ptr)[i].block_num) { + matches++; + i++; + } + + index += matches; + } + + return index; +} + +static int +insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) +{ + if (!overwriting) { + // grow the table if we're out of space - we may index the table + // with *num_full_ptr (lookup_bucket() can return a maximum value == + // *num_full_ptr), so we need to grow when we hit (*num_buckets_ptr - 1) + // to prevent out-of-bounds indexing + if (*num_full_ptr >= (*num_buckets_ptr - 1)) { + int new_size = *num_buckets_ptr * 2; + int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); + + if (grow_size < new_size) { + printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name); + return -1; + } + + *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size + } + + // if we're not inserting at the end, we need to bcopy + if (blk_index != *num_full_ptr) { + bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); + } + + (*num_full_ptr)++; // increment only if we're not overwriting + } + + // sanity check the values we're about to add + if ((off_t)offset >= jnl->jhdr->size) { + offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); + } + if (size <= 0) { + panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); + } + + (*buf_ptr)[blk_index].block_num = num; + (*buf_ptr)[blk_index].block_size = (uint32_t)size; + (*buf_ptr)[blk_index].jnl_offset = (uint32_t)offset; + (*buf_ptr)[blk_index].cksum = cksum; + + return blk_index; +} + +static int +do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) +{ + int num_to_remove, index, i, overwrite, err; + size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset; + off_t overlap, block_start, block_end; + + block_start = block_num*jhdr_size; + block_end = block_start + size; + overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size); + + // first, eliminate any overlap with the previous entry + if (blk_index != 0 && !overwrite) { + off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size; + off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size; + overlap = prev_block_end - block_start; + if (overlap > 0) { + if (overlap % jhdr_size != 0) { + panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size); + } + + // if the previous entry completely overlaps this one, we need to break it into two pieces. 
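// For example, with a 512-byte jhdr_size: a previous entry covering
// blocks 100-109 that is partially rewritten by a new 512-byte write to
// block 104 is truncated to blocks 100-103, and a second entry for
// blocks 105-109 is inserted with its jnl_offset advanced
// correspondingly, so replay never copies stale bytes over the newer
// write.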
+ if (prev_block_end > block_end) { + off_t new_num = block_end / jhdr_size; + size_t new_size = prev_block_end - block_end; + + new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); + + err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0); + if (err < 0) { + panic("jnl: do_overlap: error inserting during pre-overlap\n"); + } + } + + // Regardless, we need to truncate the previous entry to the beginning of the overlap + (*buf_ptr)[blk_index-1].block_size = (uint32_t)(block_start - prev_block_start); + (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it + } + } + + // then, bail out fast if there's no overlap with the entries that follow + if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) { + return 0; // no overlap, no overwrite + } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) { + + (*buf_ptr)[blk_index].cksum = cksum; // update this + return 1; // simple overwrite + } + + // Otherwise, find all cases of total and partial overlap. We use the special + // block_num of -2 to designate entries that are completely overlapped and must + // be eliminated. The block_num, size, and jnl_offset of partially overlapped + // entries must be adjusted to keep the array consistent. + index = blk_index; + num_to_remove = 0; + while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) { + if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) { + (*buf_ptr)[index].block_num = -2; // mark this for deletion + num_to_remove++; + } else { + overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; + if (overlap > 0) { + if (overlap % jhdr_size != 0) { + panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size); + } + + // if we partially overlap this entry, adjust its block number, jnl offset, and size + (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up + (*buf_ptr)[index].cksum = 0; + + new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around + if ((off_t)new_offset >= jnl->jhdr->size) { + new_offset = jhdr_size + (new_offset - jnl->jhdr->size); + } + (*buf_ptr)[index].jnl_offset = (uint32_t)new_offset; + + (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value + if ((*buf_ptr)[index].block_size <= 0) { + panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size); + // return -1; // if above panic is removed, return -1 for error + } + } + + } + + index++; + } + + // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out) + index--; // start with the last index used within the above loop + while (index >= blk_index) { + if ((*buf_ptr)[index].block_num == -2) { + if (index == *num_full_ptr-1) { + (*buf_ptr)[index].block_num = -1; // it's the last item in the table... 
just mark as free + } else { + bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) ); + } + (*num_full_ptr)--; + } + index--; + } + + // eliminate any stale entries at the end of the table + for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) { + (*buf_ptr)[i].block_num = -1; + } + + return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite) +} + +// PR-3105942: Coalesce writes to the same block in journal replay +// We coalesce writes by maintaining a dynamic sorted array of physical disk blocks +// to be replayed and the corresponding location in the journal which contains +// the most recent data for those blocks. The array is "played" once the all the +// blocks in the journal have been coalesced. The code for the case of conflicting/ +// overlapping writes to a single block is the most dense. Because coalescing can +// disrupt the existing time-ordering of blocks in the journal playback, care +// is taken to catch any overlaps and keep the array consistent. +static int +add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) +{ + int blk_index, overwriting; + + // on return from lookup_bucket(), blk_index is the index into the table where block_num should be + // inserted (or the index of the elem to overwrite). + blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); + + // check if the index is within bounds (if we're adding this block to the end of + // the table, blk_index will be equal to num_full) + if (blk_index < 0 || blk_index > *num_full_ptr) { + //printf("jnl: add_block: trouble adding block to co_buf\n"); + return -1; + } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); + + // Determine whether we're overwriting an existing entry by checking for overlap + overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); + if (overwriting < 0) { + return -1; // if we got an error, pass it along + } + + // returns the index, or -1 on error + blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); + + return blk_index; +} + +static int +replay_journal(journal *jnl) +{ + int i, bad_blocks=0; + unsigned int orig_checksum, checksum, check_block_checksums = 0; + size_t ret; + size_t max_bsize = 0; /* protected by block_ptr */ + block_list_header *blhdr; + off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start; + char *buff, *block_ptr=NULL; + struct bucket *co_buf; + int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; + uint32_t last_sequence_num = 0; + int replay_retry_count = 0; + + // wrap the start ptr if it points to the very end of the journal + if (jnl->jhdr->start == jnl->jhdr->size) { + jnl->jhdr->start = jnl->jhdr->jhdr_size; + } + if (jnl->jhdr->end == jnl->jhdr->size) { + jnl->jhdr->end = jnl->jhdr->jhdr_size; + } + + if (jnl->jhdr->start == jnl->jhdr->end) { + return 0; + } + + orig_jnl_start = jnl->jhdr->start; + + // allocate memory for the header_block. 
we'll read each blhdr into this + buff = hfs_malloc(jnl->jhdr->blhdr_size); + + // allocate memory for the coalesce buffer + co_buf = hfs_malloc(num_buckets*sizeof(struct bucket)); + +restart_replay: + + // initialize entries + for(i = 0; i < num_buckets; i++) { + co_buf[i].block_num = -1; + } + num_full = 0; // empty at first + + + printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", + jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); + + while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) { + offset = blhdr_offset = jnl->jhdr->start; + ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size); + if (ret != (size_t)jnl->jhdr->blhdr_size) { + printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset); + bad_blocks = 1; + goto bad_txn_handling; + } + + blhdr = (block_list_header *)buff; + + orig_checksum = blhdr->checksum; + blhdr->checksum = 0; + if (jnl->flags & JOURNAL_NEED_SWAP) { + // calculate the checksum based on the unswapped data + // because it is done byte-at-a-time. + orig_checksum = (unsigned int)SWAP32(orig_checksum); + checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); + swap_block_list_header(jnl, blhdr); + } else { + checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); + } + + + // + // XXXdbg - if these checks fail, we should replay as much + // we can in the hopes that it will still leave the + // drive in a better state than if we didn't replay + // anything + // + if (checksum != orig_checksum) { + if (check_past_jnl_end && in_uncharted_territory) { + + if (blhdr_offset != jnl->jhdr->end) { + printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); + } + + check_past_jnl_end = 0; + jnl->jhdr->end = blhdr_offset; + continue; + } + + printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", + jnl->jdev_name, blhdr_offset, orig_checksum, checksum); + + if (blhdr_offset == orig_jnl_start) { + // if there's nothing in the journal at all, just bail out altogether. + goto bad_replay; + } + + bad_blocks = 1; + goto bad_txn_handling; + } + + if ( (last_sequence_num != 0) + && (blhdr->binfo[0].u.bi.b.sequence_num != 0) + && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num) + && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) { + + txn_start_offset = jnl->jhdr->end = blhdr_offset; + + if (check_past_jnl_end) { + check_past_jnl_end = 0; + printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n", + jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); + continue; + } + + printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! 
(%d < %d)\n", + jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); + bad_blocks = 1; + goto bad_txn_handling; + } + last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num; + + if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) { + if (last_sequence_num == 0) { + check_past_jnl_end = 0; + printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n", + jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + if (jnl->jhdr->start != jnl->jhdr->end) { + jnl->jhdr->start = jnl->jhdr->end; + } + continue; + } + printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); + } + + if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size) + || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { + printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n", + jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks); + bad_blocks = 1; + goto bad_txn_handling; + } + + max_bsize = 0; + for (i = 1; i < blhdr->num_blocks; i++) { + if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) { + printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum); + bad_blocks = 1; + goto bad_txn_handling; + } + + if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) { + max_bsize = blhdr->binfo[i].u.bi.bsize; + } + } + + if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) { + check_block_checksums = 1; + block_ptr = hfs_malloc(max_bsize); + } else { + block_ptr = NULL; + } + + if (blhdr->flags & BLHDR_FIRST_HEADER) { + txn_start_offset = blhdr_offset; + } + + //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n", + // blhdr->num_blocks-1, jnl->jhdr->start); + bad_blocks = 0; + for (i = 1; i < blhdr->num_blocks; i++) { + int size, ret_val; + off_t number; + + size = blhdr->binfo[i].u.bi.bsize; + number = blhdr->binfo[i].bnum; + + // don't add "killed" blocks + if (number == (off_t)-1) { + //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i); + } else { + + if (check_block_checksums) { + int32_t disk_cksum; + off_t block_offset; + + block_offset = offset; + + // read the block so we can check the checksum + ret = read_journal_data(jnl, &block_offset, block_ptr, size); + if (ret != (size_t)size) { + printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); + bad_blocks = 1; + goto bad_txn_handling; + } + + disk_cksum = calc_checksum(block_ptr, size); + + // there is no need to swap the checksum from disk because + // it got swapped when the blhdr was read in. 
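+				// Note that a stored per-block checksum of 0 is treated as
+				// "no checksum recorded", so only blocks with a non-zero
+				// blhdr checksum are verified below.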
+ if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) { + printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n", + jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum); + printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n", + *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)], + *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]); + + bad_blocks = 1; + goto bad_txn_handling; + } + } + + + // add this bucket to co_buf, coalescing where possible + // printf("jnl: replay_journal: adding block 0x%llx\n", number); + ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full); + + if (ret_val == -1) { + printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name); + goto bad_replay; + } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number); + } + + // increment offset + offset += size; + + // check if the last block added puts us off the end of the jnl. + // if so, we need to wrap to the beginning and take any remainder + // into account + // + if (offset >= jnl->jhdr->size) { + offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); + } + } + + if (block_ptr) { + hfs_free(block_ptr, max_bsize); + block_ptr = NULL; + } + + if (bad_blocks) { + bad_txn_handling: + /* Journal replay got error before it found any valid + * transations, abort replay */ + if (txn_start_offset == 0) { + printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); + goto bad_replay; + } + + /* Repeated error during journal replay, abort replay */ + if (replay_retry_count == 3) { + printf("jnl: %s: repeated errors replaying journal! aborting journal replay.\n", jnl->jdev_name); + goto bad_replay; + } + replay_retry_count++; + + /* There was an error replaying the journal (possibly + * EIO/ENXIO from the device). So retry replaying all + * the good transactions that we found before getting + * the error. + */ + jnl->jhdr->start = orig_jnl_start; + jnl->jhdr->end = txn_start_offset; + check_past_jnl_end = 0; + last_sequence_num = 0; + printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + goto restart_replay; + } + + jnl->jhdr->start += blhdr->bytes_used; + if (jnl->jhdr->start >= jnl->jhdr->size) { + // wrap around and skip the journal header block + jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size; + } + + if (jnl->jhdr->start == jnl->jhdr->end) { + in_uncharted_territory = 1; + } + } + + if (jnl->jhdr->start != jnl->jhdr->end) { + printf("jnl: %s: start %lld != end %lld. 
resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->end = jnl->jhdr->start; + } + + //printf("jnl: replay_journal: replaying %d blocks\n", num_full); + + /* + * make sure it's at least one page in size, so + * start max_bsize at PAGE_SIZE + */ + for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) { + + if (co_buf[i].block_num == (off_t)-1) + continue; + + if (co_buf[i].block_size > max_bsize) + max_bsize = co_buf[i].block_size; + } + /* + * round max_bsize up to the nearest PAGE_SIZE multiple + */ + if (max_bsize & (PAGE_SIZE - 1)) { + max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); + } + + block_ptr = hfs_malloc(max_bsize); + + // Replay the coalesced entries in the co-buf + for(i = 0; i < num_full; i++) { + size_t size = co_buf[i].block_size; + off_t jnl_offset = (off_t) co_buf[i].jnl_offset; + off_t number = co_buf[i].block_num; + + + // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num, + // co_buf[i].block_size, co_buf[i].jnl_offset); + + if (number == (off_t)-1) { + // printf("jnl: replay_journal: skipping killed fs block\n"); + } else { + + // do journal read, and set the phys. block + ret = read_journal_data(jnl, &jnl_offset, block_ptr, size); + if (ret != size) { + printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, jnl_offset); + goto bad_replay; + } + + if (update_fs_block(jnl, block_ptr, number, size) != 0) { + goto bad_replay; + } + } + } + + + // done replaying; update jnl header + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { + goto bad_replay; + } + + printf("jnl: %s: journal replay done.\n", jnl->jdev_name); + + // free block_ptr + if (block_ptr) { + hfs_free(block_ptr, max_bsize); + block_ptr = NULL; + } + + // free the coalesce buffer + hfs_free(co_buf, num_buckets*sizeof(struct bucket)); + co_buf = NULL; + + hfs_free(buff, jnl->jhdr->blhdr_size); + return 0; + +bad_replay: + hfs_free(block_ptr, max_bsize); + hfs_free(co_buf, num_buckets*sizeof(struct bucket)); + hfs_free(buff, jnl->jhdr->blhdr_size); + + return -1; +} + + +#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024) +#define MAX_TRANSACTION_BUFFER_SIZE (3072*1024) + +// XXXdbg - so I can change it in the debugger +int def_tbuffer_size = 0; + + +// +// This function sets the size of the tbuffer and the +// size of the blhdr. It assumes that jnl->jhdr->size +// and jnl->jhdr->jhdr_size are already valid. +// +static void +size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) +{ + // + // one-time initialization based on how much memory + // there is in the machine. + // + if (def_tbuffer_size == 0) { + uint64_t memsize = 0; + size_t l = sizeof(memsize); + sysctlbyname("hw.memsize", &memsize, &l, NULL, 0); + + if (memsize < (256*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE; + } else if (memsize < (512*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2; + } else if (memsize < (1024*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3; + } else { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * (memsize / (256*1024*1024)); + } + } + + // For analyzer + hfs_assert(jnl->jhdr->jhdr_size > 0); + + // size up the transaction buffer... can't be larger than the number + // of blocks that can fit in a block_list_header block. 
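+	// Worked example (the numbers are assumptions for illustration only):
+	// with jhdr_size = 512 and the 128K default tbuffer, the buffer holds
+	// 128K/512 = 256 journal blocks, so the block list header needs
+	// 256 * sizeof(block_info) (16 bytes each) = 4096 bytes, which already
+	// satisfies the phys_blksz minimum and multiple-of checks at the end
+	// of this function.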
+ if (tbuffer_size == 0) { + jnl->tbuffer_size = def_tbuffer_size; + } else { + // make sure that the specified tbuffer_size isn't too small + if (tbuffer_size < jnl->jhdr->blhdr_size * 2) { + tbuffer_size = jnl->jhdr->blhdr_size * 2; + } + // and make sure it's an even multiple of the block size + if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) { + tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size); + } + + jnl->tbuffer_size = tbuffer_size; + } + + if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { + jnl->tbuffer_size = (jnl->jhdr->size / 2); + } + + if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { + jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE; + } + + jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); + if (jnl->jhdr->blhdr_size < phys_blksz) { + jnl->jhdr->blhdr_size = phys_blksz; + } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) { + // have to round up so we're an even multiple of the physical block size + jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1); + } +} + +static void +get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context) +{ + off_t readblockcnt; + off_t writeblockcnt; + off_t readmaxcnt=0, tmp_readmaxcnt; + off_t writemaxcnt=0, tmp_writemaxcnt; + off_t readsegcnt, writesegcnt; + int32_t features; + + if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) { + if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { + const char *name = vnode_getname_printable(devvp); + jnl->flags |= JOURNAL_DO_FUA_WRITES; + printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name, features); + vnode_putname_printable(name); + } + if (features & DK_FEATURE_UNMAP) { + jnl->flags |= JOURNAL_USE_UNMAP; + } + + if (features & DK_FEATURE_BARRIER) { + jnl->flags |= JOURNAL_FEATURE_BARRIER; + } + } + + // + // First check the max read size via several different mechanisms... + // + VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context); + + if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) { + tmp_readmaxcnt = readblockcnt * phys_blksz; + if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { + readmaxcnt = tmp_readmaxcnt; + } + } + + if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) { + readsegcnt = 0; + } + + if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { + readmaxcnt = readsegcnt * PAGE_SIZE; + } + + if (readmaxcnt == 0) { + readmaxcnt = 128 * 1024; + } else if (readmaxcnt > UINT32_MAX) { + readmaxcnt = UINT32_MAX; + } + + + // + // Now check the max writes size via several different mechanisms... 
+ // + VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context); + + if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) { + tmp_writemaxcnt = writeblockcnt * phys_blksz; + if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { + writemaxcnt = tmp_writemaxcnt; + } + } + + if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) { + writesegcnt = 0; + } + + if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { + writemaxcnt = writesegcnt * PAGE_SIZE; + } + + if (writemaxcnt == 0) { + writemaxcnt = 128 * 1024; + } else if (writemaxcnt > UINT32_MAX) { + writemaxcnt = UINT32_MAX; + } + + jnl->max_read_size = readmaxcnt; + jnl->max_write_size = writemaxcnt; + // printf("jnl: %s: max read/write: %lld k / %lld k\n", + // jnl->jdev_name ? jnl->jdev_name : "unknown", + // jnl->max_read_size/1024, jnl->max_write_size/1024); +} + + +journal * +journal_create(struct vnode *jvp, + off_t offset, + off_t journal_size, + struct vnode *fsvp, + size_t min_fs_blksz, + int32_t flags, + int32_t tbuffer_size, + void (*flush)(void *arg), + void *arg, + struct mount *fsmount) +{ + journal *jnl; + uint32_t phys_blksz, new_txn_base; + u_int32_t min_size; + const char *jdev_name; + /* + * Cap the journal max size to 2GB. On HFS, it will attempt to occupy + * a full allocation block if the current size is smaller than the allocation + * block on which it resides. Once we hit the exabyte filesystem range, then + * it will use 2GB allocation blocks. As a result, make the cap 2GB. + */ + + jdev_name = vnode_getname_printable(jvp); + + /* Get the real physical block size. */ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { + goto cleanup_jdev_name; + } + + if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { + printf("jnl: %s: create: journal size %lld looks bogus.\n", jdev_name, journal_size); + goto cleanup_jdev_name; + } + + min_size = phys_blksz * (phys_blksz / sizeof(block_info)); + /* Reject journals that are too small given the sector size of the device */ + if (journal_size < min_size) { + printf("jnl: %s: create: journal size (%lld) too small given sector size of (%u)\n", + jdev_name, journal_size, phys_blksz); + goto cleanup_jdev_name; + } + + if (phys_blksz > min_fs_blksz) { + printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n", + jdev_name, phys_blksz, min_fs_blksz); + goto cleanup_jdev_name; + } + + if ((journal_size % phys_blksz) != 0) { + printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n", + jdev_name, journal_size, phys_blksz); + goto cleanup_jdev_name; + } + + + jnl = hfs_mallocz(sizeof(struct journal)); + + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + jnl->jdev_name = jdev_name; + lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); + + // Keep a point to the mount around for use in IO throttling. 
+ jnl->fsmount = fsmount; + + get_io_info(jvp, phys_blksz, jnl, vfs_context_kernel()); + + jnl->header_buf = hfs_malloc(phys_blksz); + jnl->header_buf_size = phys_blksz; + + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)); + + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; + + // + // We try and read the journal header to see if there is already one + // out there. If there is, it's possible that it has transactions + // in it that we might replay if we happen to pick a sequence number + // that is a little less than the old one, there is a crash and the + // last txn written ends right at the start of a txn from the previous + // incarnation of this file system. If all that happens we would + // replay the transactions from the old file system and that would + // destroy your disk. Although it is extremely unlikely for all those + // conditions to happen, the probability is non-zero and the result is + // severe - you lose your file system. Therefore if we find a valid + // journal header and the sequence number is non-zero we write junk + // over the entire journal so that there is no way we will encounter + // any old transactions. This is slow but should be a rare event + // since most tools erase the journal. + // + if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz + && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC + && jnl->jhdr->sequence_num != 0) { + + new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; + printf("jnl: %s: create: avoiding old sequence number 0x%x (0x%x)\n", jdev_name, jnl->jhdr->sequence_num, new_txn_base); + +#if 0 + int i; + off_t pos=0; + + for(i = 1; i < journal_size / phys_blksz; i++) { + pos = i*phys_blksz; + + // we don't really care what data we write just so long + // as it's not a valid transaction header. since we have + // the header_buf sitting around we'll use that. 
+ write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz); + } + printf("jnl: create: done clearing journal (i=%d)\n", i); +#endif + } else { + new_txn_base = random() & 0x00ffffff; + } + + memset(jnl->header_buf, 0, phys_blksz); + + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + jnl->jhdr->endian = ENDIAN_MAGIC; + jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself + jnl->jhdr->end = phys_blksz; + jnl->jhdr->size = journal_size; + jnl->jhdr->jhdr_size = phys_blksz; + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + jnl->active_start = jnl->jhdr->start; + + // XXXdbg - for testing you can force the journal to wrap around + // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); + // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); + + jnl->jhdr->sequence_num = new_txn_base; + + lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); + lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); + lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); + + + jnl->flushing = FALSE; + jnl->asyncIO = FALSE; + jnl->flush_aborted = FALSE; + jnl->writing_header = FALSE; + jnl->async_trim = NULL; + jnl->sequence_num = jnl->jhdr->sequence_num; + + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { + printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name); + goto bad_write; + } + + goto journal_create_complete; + + +bad_write: + hfs_free(jnl->header_buf, phys_blksz); + jnl->jhdr = NULL; + hfs_free(jnl, sizeof(*jnl)); +cleanup_jdev_name: + vnode_putname_printable(jdev_name); + jnl = NULL; +journal_create_complete: + return jnl; +} + + +journal * +journal_open(struct vnode *jvp, + off_t offset, + off_t journal_size, + struct vnode *fsvp, + size_t min_fs_blksz, + int32_t flags, + int32_t tbuffer_size, + void (*flush)(void *arg), + void *arg, + struct mount *fsmount) +{ + journal *jnl; + uint32_t orig_blksz=0; + uint32_t phys_blksz; + u_int32_t min_size = 0; + int orig_checksum, checksum; + const char *jdev_name = vnode_getname_printable(jvp); + + /* Get the real physical block size. */ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { + goto cleanup_jdev_name; + } + + if (phys_blksz > min_fs_blksz) { + printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n", + jdev_name, phys_blksz, min_fs_blksz); + goto cleanup_jdev_name; + } + + if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { + printf("jnl: %s: open: journal size %lld looks bogus.\n", jdev_name, journal_size); + goto cleanup_jdev_name; + } + + min_size = phys_blksz * (phys_blksz / sizeof(block_info)); + /* Reject journals that are too small given the sector size of the device */ + if (journal_size < min_size) { + printf("jnl: %s: open: journal size (%lld) too small given sector size of (%u)\n", + jdev_name, journal_size, phys_blksz); + goto cleanup_jdev_name; + } + + if ((journal_size % phys_blksz) != 0) { + printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n", + jdev_name, journal_size, phys_blksz); + goto cleanup_jdev_name; + } + + jnl = hfs_mallocz(sizeof(struct journal)); + + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + jnl->jdev_name = jdev_name; + lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); + + /* We hold the mount to later pass to the throttling code for IO + * accounting. 
+ */ + jnl->fsmount = fsmount; + + get_io_info(jvp, phys_blksz, jnl, vfs_context_kernel()); + + jnl->header_buf = hfs_malloc(phys_blksz); + jnl->header_buf_size = phys_blksz; + + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)); + + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; + + if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) { + printf("jnl: %s: open: could not read %u bytes for the journal header.\n", + jdev_name, phys_blksz); + goto bad_journal; + } + + /* + * Check for a bad jhdr size after reading in the journal header. + * The journal header length cannot be zero + */ + if (jnl->jhdr->jhdr_size == 0) { + printf("jnl: %s: open: bad jhdr size (%d) \n", jdev_name, jnl->jhdr->jhdr_size); + goto bad_journal; + } + + orig_checksum = jnl->jhdr->checksum; + jnl->jhdr->checksum = 0; + + if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { + // do this before the swap since it's done byte-at-a-time + orig_checksum = SWAP32(orig_checksum); + checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); + swap_journal_header(jnl); + jnl->flags |= JOURNAL_NEED_SWAP; + } else { + checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); + } + + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n", + jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); + goto bad_journal; + } + + // only check if we're the current journal header magic value + if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) { + + if (orig_checksum != checksum) { + printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n", + jdev_name, orig_checksum, checksum); + + //goto bad_journal; + } + } + + // XXXdbg - convert old style magic numbers to the new one + if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) { + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + } + + if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { + /* + * The volume has probably been resized (such that we had to adjust the + * logical sector size), or copied to media with a different logical + * sector size. + * + * Temporarily change the device's logical block size to match the + * journal's header size. This will allow us to replay the journal + * safely. If the replay succeeds, we will update the journal's header + * size (later in this function). 
+ */ + orig_blksz = phys_blksz; + phys_blksz = jnl->jhdr->jhdr_size; + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, vfs_context_kernel()); + printf("jnl: %s: open: temporarily switched block size from %u to %u\n", + jdev_name, orig_blksz, phys_blksz); + } + + if ( jnl->jhdr->start <= 0 + || jnl->jhdr->start > jnl->jhdr->size + || jnl->jhdr->start > 1024*1024*1024) { + printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n", + jdev_name, jnl->jhdr->start, jnl->jhdr->size); + goto bad_journal; + } + + if ( jnl->jhdr->end <= 0 + || jnl->jhdr->end > jnl->jhdr->size + || jnl->jhdr->end > 1024*1024*1024) { + printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n", + jdev_name, jnl->jhdr->end, jnl->jhdr->size); + goto bad_journal; + } + + if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) { + printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size); + goto bad_journal; + } + +// XXXdbg - can't do these checks because hfs writes all kinds of +// non-uniform sized blocks even on devices that have a block size +// that is larger than 512 bytes (i.e. optical media w/2k blocks). +// therefore these checks will fail and so we just have to punt and +// do more relaxed checking... +// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) { + if ((jnl->jhdr->start % 512) != 0) { + printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n", + jdev_name, jnl->jhdr->start); + goto bad_journal; + } + +//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) { + if ((jnl->jhdr->end % 512) != 0) { + printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n", + jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size); + goto bad_journal; + } + + if (jnl->jhdr->blhdr_size < 0) { + //throw out invalid sizes + printf("jnl %s: open: blhdr size looks bogus! (%d) \n", + jdev_name, jnl->jhdr->blhdr_size); + goto bad_journal; + } + + // take care of replaying the journal if necessary + if (flags & JOURNAL_RESET) { + printf("jnl: %s: journal start/end pointers reset! (s 0x%llx e 0x%llx)\n", + jdev_name, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->start = jnl->jhdr->end; + } else if (replay_journal(jnl) != 0) { + printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); + goto bad_journal; + } + + /* + * When we get here, we know that the journal is empty (jnl->jhdr->start == + * jnl->jhdr->end). If the device's logical block size was different from + * the journal's header size, then we can now restore the device's logical + * block size and update the journal's header size to match. + * + * Note that we also adjust the journal's start and end so that they will + * be aligned on the new block size. We pick a new sequence number to + * avoid any problems if a replay found previous transactions using the old + * journal header size. (See the comments in journal_create(), above.) + */ + + if (orig_blksz != 0) { + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, vfs_context_kernel()); + phys_blksz = orig_blksz; + + orig_blksz = 0; + + jnl->jhdr->jhdr_size = phys_blksz; + jnl->jhdr->start = phys_blksz; + jnl->jhdr->end = phys_blksz; + jnl->jhdr->sequence_num = (jnl->jhdr->sequence_num + + (journal_size / phys_blksz) + + (random() % 16384)) & 0x00ffffff; + + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) { + printf("jnl: %s: open: failed to update journal header size\n", jdev_name); + goto bad_journal; + } + } + + // make sure this is in sync! 
+ jnl->active_start = jnl->jhdr->start; + jnl->sequence_num = jnl->jhdr->sequence_num; + + // set this now, after we've replayed the journal + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + // TODO: Does this need to change if the device's logical block size changed? + if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) { + printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, + jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); + goto bad_journal; + } + + lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); + lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); + lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); + + goto journal_open_complete; + +bad_journal: + if (orig_blksz != 0) { + phys_blksz = orig_blksz; + VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, vfs_context_kernel()); + printf("jnl: %s: open: restored block size after error\n", jdev_name); + } + hfs_free(jnl->header_buf, jnl->header_buf_size); + hfs_free(jnl, sizeof(*jnl)); +cleanup_jdev_name: + vnode_putname_printable(jdev_name); + jnl = NULL; +journal_open_complete: + return jnl; +} + + +int +journal_is_clean(struct vnode *jvp, + off_t offset, + off_t journal_size, + struct vnode *fsvp, + size_t min_fs_block_size) +{ + journal jnl; + uint32_t phys_blksz; + int ret; + int orig_checksum, checksum; + const char *jdev_name = vnode_getname_printable(jvp); + + /* Get the real physical block size. */ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { + printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name); + ret = EINVAL; + goto cleanup_jdev_name; + } + + if (phys_blksz > (uint32_t)min_fs_block_size) { + printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n", + jdev_name, phys_blksz, min_fs_block_size); + ret = EINVAL; + goto cleanup_jdev_name; + } + + if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { + printf("jnl: %s: is_clean: journal size %lld looks bogus.\n", jdev_name, journal_size); + ret = EINVAL; + goto cleanup_jdev_name; + } + + if ((journal_size % phys_blksz) != 0) { + printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n", + jdev_name, journal_size, phys_blksz); + ret = EINVAL; + goto cleanup_jdev_name; + } + + memset(&jnl, 0, sizeof(jnl)); + + jnl.header_buf = hfs_malloc(phys_blksz); + jnl.header_buf_size = phys_blksz; + + get_io_info(jvp, phys_blksz, &jnl, vfs_context_kernel()); + + jnl.jhdr = (journal_header *)jnl.header_buf; + memset(jnl.jhdr, 0, sizeof(journal_header)); + + jnl.jdev = jvp; + jnl.jdev_offset = offset; + jnl.fsdev = fsvp; + + // we have to set this up here so that do_journal_io() will work + jnl.jhdr->jhdr_size = phys_blksz; + + if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) { + printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n", + jdev_name, phys_blksz); + ret = EINVAL; + goto get_out; + } + + orig_checksum = jnl.jhdr->checksum; + jnl.jhdr->checksum = 0; + + if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { + // do this before the swap since it's done byte-at-a-time + orig_checksum = SWAP32(orig_checksum); + checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); + swap_journal_header(&jnl); + jnl.flags |= JOURNAL_NEED_SWAP; + } else { + checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); + } + + if 
(jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n", + jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC); + ret = EINVAL; + goto get_out; + } + + if (orig_checksum != checksum) { + printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum); + ret = EINVAL; + goto get_out; + } + + // + // if the start and end are equal then the journal is clean. + // otherwise it's not clean and therefore an error. + // + if (jnl.jhdr->start == jnl.jhdr->end) { + ret = 0; + } else { + ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one + } + +get_out: + hfs_free(jnl.header_buf, jnl.header_buf_size); +cleanup_jdev_name: + vnode_putname_printable(jdev_name); + return ret; +} + + +void +journal_close(journal *jnl) +{ + volatile off_t *start, *end; + int counter=0; + + CHECK_JOURNAL(jnl); + + // set this before doing anything that would block so that + // we start tearing things down properly. + // + jnl->flags |= JOURNAL_CLOSE_PENDING; + + if (jnl->owner != current_thread()) { + journal_lock(jnl); + } + + wait_condition(jnl, &jnl->flushing, "journal_close"); + + // + // only write stuff to disk if the journal is still valid + // + if ((jnl->flags & JOURNAL_INVALID) == 0) { + + if (jnl->active_tr) { + /* + * "journal_end_transaction" will fire the flush asynchronously + */ + journal_end_transaction(jnl); + } + + // flush any buffered transactions + if (jnl->cur_tr) { + transaction *tr = jnl->cur_tr; + + jnl->cur_tr = NULL; + /* + * "end_transaction" will wait for any in-progress flush to complete + * before flushing "cur_tr" synchronously("must_wait" == TRUE) + */ + end_transaction(tr, 1, NULL, NULL, FALSE, TRUE); + } + /* + * if there was an "active_tr", make sure we wait for + * it to flush if there was no "cur_tr" to process + */ + wait_condition(jnl, &jnl->flushing, "journal_close"); + + //start = &jnl->jhdr->start; + start = &jnl->active_start; + end = &jnl->jhdr->end; + + while (*start != *end && counter++ < 5000) { + //printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end); + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 2); + } + + if (*start != *end) { + printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n", + jnl->jdev_name, *start, *end); + } + + // make sure this is in sync when we close the journal + jnl->jhdr->start = jnl->active_start; + + // if this fails there's not much we can do at this point... + write_journal_header(jnl, 1, jnl->sequence_num); + } else { + // if we're here the journal isn't valid any more. + // so make sure we don't leave any locked blocks lying around + printf("jnl: %s: close: journal is invalid. 
aborting outstanding transactions\n", jnl->jdev_name); + if (jnl->active_tr || jnl->cur_tr) { + transaction *tr; + + if (jnl->active_tr) { + tr = jnl->active_tr; + jnl->active_tr = NULL; + } else { + tr = jnl->cur_tr; + jnl->cur_tr = NULL; + } + abort_transaction(jnl, tr); + + if (jnl->active_tr || jnl->cur_tr) { + panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl); + } + } + } + wait_condition(jnl, &jnl->asyncIO, "journal_close"); + + free_old_stuff(jnl); + + hfs_free(jnl->header_buf, jnl->header_buf_size); + jnl->jhdr = (void *)0xbeefbabe; + + vnode_putname_printable(jnl->jdev_name); + + journal_unlock(jnl); + lck_mtx_destroy(&jnl->old_start_lock, jnl_mutex_group); + lck_mtx_destroy(&jnl->jlock, jnl_mutex_group); + lck_mtx_destroy(&jnl->flock, jnl_mutex_group); + hfs_free(jnl, sizeof(*jnl)); +} + +static void +dump_journal(journal *jnl) +{ + transaction *ctr; + + printf("journal for dev %s:", jnl->jdev_name); + printf(" jdev_offset %.8llx\n", jnl->jdev_offset); + printf(" magic: 0x%.8x\n", jnl->jhdr->magic); + printf(" start: 0x%.8llx\n", jnl->jhdr->start); + printf(" end: 0x%.8llx\n", jnl->jhdr->end); + printf(" size: 0x%.8llx\n", jnl->jhdr->size); + printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); + printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); + printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); + + printf(" completed transactions:\n"); + for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) { + printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end); + } +} + + + +static off_t +free_space(journal *jnl) +{ + off_t free_space_offset; + + if (jnl->jhdr->start < jnl->jhdr->end) { + free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; + } else if (jnl->jhdr->start > jnl->jhdr->end) { + free_space_offset = jnl->jhdr->start - jnl->jhdr->end; + } else { + // journal is completely empty + free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size; + } + + return free_space_offset; +} + + +// +// The journal must be locked on entry to this function. +// The "desired_size" is in bytes. +// +static int +check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num) +{ + size_t i; + int counter=0; + + //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", + // desired_size, free_space(jnl)); + + if (delayed_header_write) + *delayed_header_write = FALSE; + + while (1) { + int old_start_empty; + + // make sure there's space in the journal to hold this transaction + if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) { + break; + } + if (counter++ == 5000) { + dump_journal(jnl); + panic("jnl: check_free_space: buffer flushing isn't working " + "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl, + jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); + } + if (counter > 7500) { + printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name); + return ENOSPC; + } + + // + // here's where we lazily bump up jnl->jhdr->start. we'll consume + // entries until there is enough space for the next transaction. 
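+		// (A note on the convention used here: each old_start[] slot holds
+		// the journal start offset of an earlier transaction, and its high
+		// bit (0x8000000000000000) stays set while that transaction is
+		// still being flushed; the loop below waits for the bit to clear
+		// before consuming the slot and advancing jhdr->start.)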
+ // + old_start_empty = 1; + lock_oldstart(jnl); + + for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + int lcl_counter; + + lcl_counter = 0; + while (jnl->old_start[i] & 0x8000000000000000LL) { + if (lcl_counter++ > 10000) { + panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", + jnl->old_start[i], jnl); + } + + unlock_oldstart(jnl); + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1); + lock_oldstart(jnl); + } + + if (jnl->old_start[i] == 0) { + continue; + } + + old_start_empty = 0; + jnl->jhdr->start = jnl->old_start[i]; + jnl->old_start[i] = 0; + + if (free_space(jnl) > desired_size) { + + if (delayed_header_write) + *delayed_header_write = TRUE; + else { + unlock_oldstart(jnl); + write_journal_header(jnl, 1, sequence_num); + lock_oldstart(jnl); + } + break; + } + } + unlock_oldstart(jnl); + + // if we bumped the start, loop and try again + if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + continue; + } else if (old_start_empty) { + // + // if there is nothing in old_start anymore then we can + // bump the jhdr->start to be the same as active_start + // since it is possible there was only one very large + // transaction in the old_start array. if we didn't do + // this then jhdr->start would never get updated and we + // would wind up looping until we hit the panic at the + // start of the loop. + // + jnl->jhdr->start = jnl->active_start; + + if (delayed_header_write) + *delayed_header_write = TRUE; + else + write_journal_header(jnl, 1, sequence_num); + continue; + } + + + // if the file system gave us a flush function, call it to so that + // it can flush some blocks which hopefully will cause some transactions + // to complete and thus free up space in the journal. + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + + // wait for a while to avoid being cpu-bound (this will + // put us to sleep for 10 milliseconds) + tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1); + } + + return 0; +} + +/* + * Allocate a new active transaction. + */ +static errno_t +journal_allocate_transaction(journal *jnl) +{ + transaction *tr; + boolean_t was_vm_privileged = FALSE; + + if (vfs_isswapmount(jnl->fsmount)) { + /* + * the disk driver can allocate memory on this path... + * if we block waiting for memory, and there is enough pressure to + * cause us to try and create a new swap file, we may end up deadlocking + * due to waiting for the journal on the swap file creation path... + * by making ourselves vm_privileged, we give ourselves the best chance + * of not blocking + */ + was_vm_privileged = set_vm_privilege(TRUE); + } + tr = hfs_mallocz(sizeof(transaction)); + + tr->tbuffer_size = jnl->tbuffer_size; + + tr->tbuffer = hfs_malloc(tr->tbuffer_size); + + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + + // journal replay code checksum check depends on this. 
+ memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); + // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) + memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); + + tr->blhdr = (block_list_header *)tr->tbuffer; + tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; + tr->blhdr->num_blocks = 1; // accounts for this header block + tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; + tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; + + tr->sequence_num = ++jnl->sequence_num; + tr->num_blhdrs = 1; + tr->total_bytes = jnl->jhdr->blhdr_size; + tr->jnl = jnl; + + jnl->active_tr = tr; + + return 0; +} + +int +journal_start_transaction(journal *jnl) +{ + int ret; + + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + if (jnl->owner == current_thread()) { + if (jnl->active_tr == NULL) { + panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n", + jnl, jnl->owner, current_thread()); + } + jnl->nested_count++; + return 0; + } + + journal_lock(jnl); + + if (jnl->nested_count != 0 || jnl->active_tr != NULL) { + panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n", + jnl->owner, jnl->nested_count, jnl->active_tr, jnl); + } + + jnl->nested_count = 1; + +#if JOE + // make sure there's room in the journal + if (free_space(jnl) < jnl->tbuffer_size) { + + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); + + // this is the call that really waits for space to free up + // as well as updating jnl->jhdr->start + if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) { + printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name); + ret = ENOSPC; + goto bad_start; + } + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0); + } +#endif + + // if there's a buffered transaction, use it. + if (jnl->cur_tr) { + jnl->active_tr = jnl->cur_tr; + jnl->cur_tr = NULL; + + return 0; + } + + ret = journal_allocate_transaction(jnl); + if (ret) { + goto bad_start; + } + + // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); + + return 0; + +bad_start: + jnl->nested_count = 0; + journal_unlock(jnl); + + return ret; +} + + +int +journal_modify_block_start(journal *jnl, struct buf *bp) +{ + transaction *tr; + boolean_t was_vm_privileged = FALSE; + + CHECK_JOURNAL(jnl); + + + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + if (vfs_isswapmount(jnl->fsmount)) { + /* + * if we block waiting for memory, and there is enough pressure to + * cause us to try and create a new swap file, we may end up deadlocking + * due to waiting for the journal on the swap file creation path... + * by making ourselves vm_privileged, we give ourselves the best chance + * of not blocking + */ + was_vm_privileged = set_vm_privilege(TRUE); + } + + // XXXdbg - for debugging I want this to be true. later it may + // not be necessary. + if ((buf_flags(bp) & B_META) == 0) { + panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl); + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_thread()) { + panic("jnl: modify_block_start: called w/out a transaction! 
jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); + + // can't allow blocks that aren't an even multiple of the + // underlying block size. + if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { + uint32_t phys_blksz, bad=0; + + if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { + bad = 1; + } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) { + if (phys_blksz < 512) { + panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n", + phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size); + } + + if ((buf_size(bp) % phys_blksz) != 0) { + bad = 1; + } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) { + jnl->jhdr->jhdr_size = phys_blksz; + } else { + // the phys_blksz is now larger... need to realloc the jhdr + char *new_header_buf; + + printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n", + jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz); + new_header_buf = hfs_malloc(phys_blksz); + memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size); + memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size)); + hfs_free(jnl->header_buf, jnl->header_buf_size); + jnl->header_buf = new_header_buf; + jnl->header_buf_size = phys_blksz; + + jnl->jhdr = (journal_header *)jnl->header_buf; + jnl->jhdr->jhdr_size = phys_blksz; + } + } else { + bad = 1; + } + + if (bad) { + panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", + buf_size(bp), jnl->jhdr->jhdr_size); + + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + return -1; + } + } + + // make sure that this transaction isn't bigger than the whole journal + if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { + panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n", + tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); + + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + return -1; + } + +#if DEBUG + const int f = buf_flags(bp); + hfs_assert(!ISSET(f, B_DELWRI) || ISSET(f, B_LOCKED)); +#endif + + buf_setflags(bp, B_LOCKED); + + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + + return 0; +} + +int +journal_modify_block_abort(journal *jnl, struct buf *bp) +{ + transaction *tr; + block_list_header *blhdr; + int i; + + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); + + tr = jnl->active_tr; + + // + // if there's no active transaction then we just want to + // call buf_brelse() and return since this is just a block + // that happened to be modified as part of another tr. + // + if (tr == NULL) { + buf_brelse(bp); + return 0; + } + + if (jnl->flags & JOURNAL_INVALID) { + /* Still need to buf_brelse(). Callers assume we consume the bp. */ + buf_brelse(bp); + return EINVAL; + } + + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_thread()) { + panic("jnl: modify_block_abort: called w/out a transaction! 
jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); + + // first check if it's already part of this transaction + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { + for (i = 1; i < blhdr->num_blocks; i++) { + if (bp == blhdr->binfo[i].u.bp) { + break; + } + } + + if (i < blhdr->num_blocks) { + break; + } + } + + // + // if blhdr is null, then this block has only had modify_block_start + // called on it as part of the current transaction. that means that + // it is ok to clear the LOCKED bit since it hasn't actually been + // modified. if blhdr is non-null then modify_block_end was called + // on it and so we need to keep it locked in memory. + // + if (blhdr == NULL) { + buf_clearflags(bp, B_LOCKED); + } + + buf_brelse(bp); + return 0; +} + + +int +journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg) +{ + int i = 1; + int tbuffer_offset=0; + block_list_header *blhdr, *prev=NULL; + transaction *tr; + + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + /* Still need to buf_brelse(). Callers assume we consume the bp. */ + buf_brelse(bp); + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_thread()) { + panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); + + if ((buf_flags(bp) & B_LOCKED) == 0) { + panic("jnl: modify_block_end: bp %p not locked! jnl @ %p\n", bp, jnl); + } + + // first check if it's already part of this transaction + for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { + tbuffer_offset = jnl->jhdr->blhdr_size; + + for (i = 1; i < blhdr->num_blocks; i++) { + if (bp == blhdr->binfo[i].u.bp) { + break; + } + if (blhdr->binfo[i].bnum != (off_t)-1) { + tbuffer_offset += buf_size(blhdr->binfo[i].u.bp); + } else { + tbuffer_offset += blhdr->binfo[i].u.bi.bsize; + } + } + + if (i < blhdr->num_blocks) { + break; + } + } + + if (blhdr == NULL + && prev + && (prev->num_blocks+1) <= prev->max_blocks + && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) { + blhdr = prev; + + } else if (blhdr == NULL) { + block_list_header *nblhdr; + if (prev == NULL) { + panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp); + } + + // we got to the end of the list, didn't find the block and there's + // no room in the block_list_header pointed to by prev + + // we allocate another tbuffer and link it in at the end of the list + // through prev->binfo[0].bnum. that's a skanky way to do things but + // avoids having yet another linked list of small data structures to manage. + + nblhdr = hfs_malloc(tr->tbuffer_size); + + // journal replay code checksum check depends on this. 
+ memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE); + // Fill up the rest of the block with unimportant bytes + memset(nblhdr + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); + + // initialize the new guy + nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; + nblhdr->num_blocks = 1; // accounts for this header block + nblhdr->bytes_used = jnl->jhdr->blhdr_size; + nblhdr->flags = BLHDR_CHECK_CHECKSUMS; + + tr->num_blhdrs++; + tr->total_bytes += jnl->jhdr->blhdr_size; + + // then link him in at the end + prev->binfo[0].bnum = (off_t)((long)nblhdr); + + // and finally switch to using the new guy + blhdr = nblhdr; + tbuffer_offset = jnl->jhdr->blhdr_size; + i = 1; + } + + + if ((i+1) > blhdr->max_blocks) { + panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks); + } + + // if this is true then this is a new block we haven't seen + if (i >= blhdr->num_blocks) { + int bsize; + vnode_t vp; + + vp = buf_vnode(bp); + if (vnode_ref(vp)) { + // Nobody checks the return values, so... + jnl->flags |= JOURNAL_INVALID; + + buf_brelse(bp); + + // We're probably here due to a force unmount, so EIO is appropriate + return EIO; + } + + bsize = buf_size(bp); + + blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp)); + blhdr->binfo[i].u.bp = bp; + + KERNEL_DEBUG_CONSTANT(0x3018004, kdebug_vnode(vp), blhdr->binfo[i].bnum, bsize, 0, 0); + /* + * Update the per-task logical counter for metadata write. + * We use (2 * bsize) to account for the write to the journal and the + * corresponding write to the btree. + */ + task_update_logical_writes(current_task(), (2 * bsize), TASK_WRITE_METADATA, vp); + + if (func) { + void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL; + + buf_setfilter(bp, func, arg, &old_func, &old_arg); + if (old_func != NULL && old_func != func) { + panic("jnl: modify_block_end: old func %p / arg %p (func %p)", old_func, old_arg, func); + } + } + + blhdr->bytes_used += bsize; + tr->total_bytes += bsize; + + blhdr->num_blocks++; + } + buf_bdwrite(bp); + + return 0; +} + +int +journal_kill_block(journal *jnl, struct buf *bp) +{ + int i; + int bflags; + block_list_header *blhdr; + transaction *tr; + + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + buf_brelse(bp); + return 0; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_thread()) { + panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + bflags = buf_flags(bp); + + if ( !(bflags & B_LOCKED)) + panic("jnl: modify_block_end: called with bp not B_LOCKED"); + + /* + * bp must be BL_BUSY and B_LOCKED + * first check if it's already part of this transaction + */ + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { + + for (i = 1; i < blhdr->num_blocks; i++) { + if (bp == blhdr->binfo[i].u.bp) { + vnode_t vp; + + buf_clearflags(bp, B_LOCKED); + + // this undoes the vnode_ref() in journal_modify_block_end() + vp = buf_vnode(bp); + vnode_rele_ext(vp, 0, 1); + + // if the block has the DELWRI and FILTER bits sets, then + // things are seriously weird. if it was part of another + // transaction then journal_modify_block_start() should + // have force it to be written. + // + //if ((bflags & B_DELWRI) && (bflags & B_FILTER)) { + // panic("jnl: kill block: this defies all logic! 
bp 0x%x\n", bp); + //} else { + tr->num_killed += buf_size(bp); + //} + blhdr->binfo[i].bnum = (off_t)-1; + blhdr->binfo[i].u.bp = NULL; + blhdr->binfo[i].u.bi.bsize = buf_size(bp); + + buf_markinvalid(bp); + buf_brelse(bp); + + return 0; + } + } + } + + /* + * We did not find the block in any transaction buffer but we still + * need to release it or else it will be left locked forever. + */ + buf_brelse(bp); + + return 0; +} + +/* +;________________________________________________________________________________ +; +; Routine: journal_trim_set_callback +; +; Function: Provide the journal with a routine to be called back when a +; TRIM has (or would have) been issued to the device. That +; is, the transaction has been flushed to the device, and the +; blocks freed by the transaction are now safe for reuse. +; +; CAUTION: If the journal becomes invalid (eg., due to an I/O +; error when trying to write to the journal), this callback +; will stop getting called, even if extents got freed before +; the journal became invalid! +; +; Input Arguments: +; jnl - The journal structure for the filesystem. +; callback - The function to call when the TRIM is complete. +; arg - An argument to be passed to callback. +;________________________________________________________________________________ +*/ +void +journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg) +{ + jnl->trim_callback = callback; + jnl->trim_callback_arg = arg; +} + + +/* +;________________________________________________________________________________ +; +; Routine: journal_trim_realloc +; +; Function: Increase the amount of memory allocated for the list of extents +; to be unmapped (trimmed). This routine will be called when +; adding an extent to the list, and the list already occupies +; all of the space allocated to it. This routine returns ENOMEM +; if unable to allocate more space, or 0 if the extent list was +; grown successfully. +; +; Input Arguments: +; trim - The trim list to be resized. +; +; Output: +; (result) - ENOMEM or 0. +; +; Side effects: +; The allocated_count and extents fields of tr->trim are updated +; if the function returned 0. +;________________________________________________________________________________ +*/ +static int +trim_realloc(journal *jnl, struct jnl_trim_list *trim) +{ + void *new_extents; + uint32_t new_allocated_count; + boolean_t was_vm_privileged = FALSE; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, obfuscate_addr(trim), 0, trim->allocated_count, trim->extent_count, 0); + + new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS; + + if (vfs_isswapmount(jnl->fsmount)) { + /* + * if we block waiting for memory, and there is enough pressure to + * cause us to try and create a new swap file, we may end up deadlocking + * due to waiting for the journal on the swap file creation path... + * by making ourselves vm_privileged, we give ourselves the best chance + * of not blocking + */ + was_vm_privileged = set_vm_privilege(TRUE); + } + new_extents = hfs_malloc(new_allocated_count * sizeof(dk_extent_t)); + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + + if (new_extents == NULL) { + printf("jnl: trim_realloc: unable to grow extent list!\n"); + /* + * Since we could be called when allocating space previously marked + * to be trimmed, we need to empty out the list to be safe. 
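+		 * (dropping pending trims only costs us some UNMAPs; leaving a stale,
+		 * overlapping extent in the list could let live data be trimmed later,
+		 * so emptying the list is the conservative choice)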
+ */ + trim->extent_count = 0; + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0); + return ENOMEM; + } + + /* Copy the old extent list to the newly allocated list. */ + if (trim->extents != NULL) { + memmove(new_extents, + trim->extents, + trim->allocated_count * sizeof(dk_extent_t)); + hfs_free(trim->extents, trim->allocated_count * sizeof(dk_extent_t)); + } + + trim->allocated_count = new_allocated_count; + trim->extents = new_extents; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0); + + return 0; +} + +/* + ;________________________________________________________________________________ + ; + ; Routine: trim_search_extent + ; + ; Function: Search the given extent list to see if any of its extents + ; overlap the given extent. + ; + ; Input Arguments: + ; trim - The trim list to be searched. + ; offset - The first byte of the range to be searched for. + ; length - The number of bytes of the extent being searched for. + ; overlap_start - start of the overlapping extent + ; overlap_len - length of the overlapping extent + ; + ; Output: + ; (result) - TRUE if one or more extents overlap, FALSE otherwise. + ;________________________________________________________________________________ + */ +static int +trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, + uint64_t length, uint64_t *overlap_start, uint64_t *overlap_len) +{ + uint64_t end = offset + length; + uint32_t lower = 0; /* Lowest index to search */ + uint32_t upper = trim->extent_count; /* Highest index to search + 1 */ + uint32_t middle; + + /* A binary search over the extent list. */ + while (lower < upper) { + middle = (lower + upper) / 2; + + if (trim->extents[middle].offset >= end) + upper = middle; + else if (trim->extents[middle].offset + trim->extents[middle].length <= offset) + lower = middle + 1; + else { + if (overlap_start) { + *overlap_start = trim->extents[middle].offset; + } + if (overlap_len) { + *overlap_len = trim->extents[middle].length; + } + return TRUE; + } + } + + return FALSE; +} + + +/* +;________________________________________________________________________________ +; +; Routine: journal_trim_add_extent +; +; Function: Keep track of extents that have been freed as part of this +; transaction. If the underlying device supports TRIM (UNMAP), +; then those extents will be trimmed/unmapped once the +; transaction has been written to the journal. (For example, +; SSDs can support trim/unmap and avoid having to recopy those +; blocks when doing wear leveling, and may reuse the same +; phsyical blocks for different logical blocks.) +; +; HFS also uses this, in combination with journal_trim_set_callback, +; to add recently freed extents to its free extent cache, but +; only after the transaction that freed them is committed to +; disk. (This reduces the chance of overwriting live data in +; a way that causes data loss if a transaction never gets +; written to the journal.) +; +; Input Arguments: +; jnl - The journal for the volume containing the byte range. +; offset - The first byte of the range to be trimmed. +; length - The number of bytes of the extent being trimmed. 
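+;
+; Output:
+;	(result)	- 0 on success, EINVAL if the journal is invalid, or
+;			  ENOMEM if the extent list could not be grown.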
+;________________________________________________________________________________ +*/ +int +journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) +{ + uint64_t end; + transaction *tr; + dk_extent_t *extent; + uint32_t insert_index; + uint32_t replace_count; + + CHECK_JOURNAL(jnl); + + /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */ + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, obfuscate_addr(jnl), offset, length, tr->trim.extent_count, 0); + + if (jnl->owner != current_thread()) { + panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + free_old_stuff(jnl); + + end = offset + length; + + /* + * Find the range of existing extents that can be combined with the + * input extent. We start by counting the number of extents that end + * strictly before the input extent, then count the number of extents + * that overlap or are contiguous with the input extent. + */ + extent = tr->trim.extents; + insert_index = 0; + while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) { + ++insert_index; + ++extent; + } + replace_count = 0; + while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) { + ++replace_count; + ++extent; + } + + /* + * If none of the existing extents can be combined with the input extent, + * then just insert it in the list (before item number insert_index). + */ + if (replace_count == 0) { + /* If the list was already full, we need to grow it. */ + if (tr->trim.extent_count == tr->trim.allocated_count) { + if (trim_realloc(jnl, &tr->trim) != 0) { + printf("jnl: trim_add_extent: out of memory!"); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0); + return ENOMEM; + } + } + + /* Shift any existing extents with larger offsets. */ + if (insert_index < tr->trim.extent_count) { + memmove(&tr->trim.extents[insert_index+1], + &tr->trim.extents[insert_index], + (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t)); + } + tr->trim.extent_count++; + + /* Store the new extent in the list. */ + tr->trim.extents[insert_index].offset = offset; + tr->trim.extents[insert_index].length = length; + + /* We're done. */ + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); + return 0; + } + + /* + * Update extent number insert_index to be the union of the input extent + * and all of the replaced extents. + */ + if (tr->trim.extents[insert_index].offset < offset) + offset = tr->trim.extents[insert_index].offset; + extent = &tr->trim.extents[insert_index + replace_count - 1]; + if (extent->offset + extent->length > end) + end = extent->offset + extent->length; + tr->trim.extents[insert_index].offset = offset; + tr->trim.extents[insert_index].length = end - offset; + + /* + * If we were replacing more than one existing extent, then shift any + * extents with larger offsets, and update the count of extents. + * + * We're going to leave extent #insert_index alone since it was just updated, above. + * We need to move extents from index (insert_index + replace_count) through the end of + * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1). 
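+	 *
+	 * For example, with five extents A B C D E, insert_index = 1 and replace_count = 3:
+	 * B, C and D have already been collapsed into the single union extent now stored at
+	 * index 1; the memmove below slides E down to index 2, and extent_count drops from 5 to 3.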
+ */ + if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) { + memmove(&tr->trim.extents[insert_index + 1], + &tr->trim.extents[insert_index + replace_count], + (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t)); + } + tr->trim.extent_count -= replace_count - 1; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); + return 0; +} + +/* + * journal_trim_extent_overlap + * + * Return 1 if there are any pending TRIMs that overlap with the given offset and length + * Return 0 otherwise. + */ + +int journal_trim_extent_overlap (journal *jnl, uint64_t offset, uint64_t length, uint64_t *end) { + transaction *tr = NULL; + int overlap = 0; + + uint64_t overlap_start; + uint64_t overlap_len; + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + /* + * There are two lists that need to be examined for potential overlaps: + * + * The first is the current transaction. Since this function requires that + * a transaction be active when this is called, this is the "active_tr" + * pointer in the journal struct. This has a trimlist pointer which needs + * to be searched. + */ + overlap = trim_search_extent (&tr->trim, offset, length, &overlap_start, &overlap_len); + if (overlap == 0) { + /* + * The second is the async trim list, which is only done if the current + * transaction group (active transaction) did not overlap with our target + * extent. This async trim list is the set of all previously + * committed transaction groups whose I/Os are now in-flight. We need to hold the + * trim lock in order to search this list. If we grab the list before the + * TRIM has completed, then we will compare it. If it is grabbed AFTER the + * TRIM has completed, then the pointer will be zeroed out and we won't have + * to check anything. + */ + lck_rw_lock_shared (&jnl->trim_lock); + if (jnl->async_trim != NULL) { + overlap = trim_search_extent(jnl->async_trim, offset, length, &overlap_start, &overlap_len); + } + lck_rw_unlock_shared (&jnl->trim_lock); + } + + if (overlap) { + /* compute the end (min) of the overlapping range */ + if ( (overlap_start + overlap_len) < (offset + length)) { + *end = (overlap_start + overlap_len); + } + else { + *end = (offset + length); + } + } + + + return overlap; +} + +/* + * journal_request_immediate_flush + * + * FS requests that the journal flush immediately upon the + * active transaction's completion. + * + * Returns 0 if operation succeeds + * Returns EPERM if we failed to leave hint + */ +int +journal_request_immediate_flush (journal *jnl) { + + transaction *tr = NULL; + /* + * Is a transaction still in process? You must do + * this while there are txns open + */ + tr = jnl->active_tr; + if (tr != NULL) { + CHECK_TRANSACTION(tr); + tr->flush_on_completion = TRUE; + } + else { + return EPERM; + } + return 0; +} + + + +/* +;________________________________________________________________________________ +; +; Routine: trim_remove_extent +; +; Function: Indicate that a range of bytes, some of which may have previously +; been passed to journal_trim_add_extent, is now allocated. +; Any overlapping ranges currently in the journal's trim list will +; be removed. If the underlying device supports TRIM (UNMAP), then +; these extents will not be trimmed/unmapped when the transaction +; is written to the journal. 
+; +; HFS also uses this to prevent newly allocated space from being +; added to its free extent cache (if some portion of the newly +; allocated space was recently freed). +; +; Input Arguments: +; trim - The trim list to update. +; offset - The first byte of the range to be trimmed. +; length - The number of bytes of the extent being trimmed. +;________________________________________________________________________________ +*/ +static int +trim_remove_extent(journal *jnl, struct jnl_trim_list *trim, uint64_t offset, uint64_t length) +{ + u_int64_t end; + dk_extent_t *extent; + u_int32_t keep_before; + u_int32_t keep_after; + + end = offset + length; + + /* + * Find any existing extents that start before or end after the input + * extent. These extents will be modified if they overlap the input + * extent. Other extents between them will be deleted. + */ + extent = trim->extents; + keep_before = 0; + while (keep_before < trim->extent_count && extent->offset < offset) { + ++keep_before; + ++extent; + } + keep_after = keep_before; + if (keep_after > 0) { + /* See if previous extent extends beyond both ends of input extent. */ + --keep_after; + --extent; + } + while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) { + ++keep_after; + ++extent; + } + + /* + * When we get here, the first keep_before extents (0 .. keep_before-1) + * start before the input extent, and extents (keep_after .. extent_count-1) + * end after the input extent. We'll need to keep, all of those extents, + * but possibly modify #(keep_before-1) and #keep_after to remove the portion + * that overlaps with the input extent. + */ + + /* + * Does the input extent start after and end before the same existing + * extent? If so, we have to "punch a hole" in that extent and convert + * it to two separate extents. + */ + if (keep_before > keep_after) { + /* If the list was already full, we need to grow it. */ + if (trim->extent_count == trim->allocated_count) { + if (trim_realloc(jnl, trim) != 0) { + printf("jnl: trim_remove_extent: out of memory!"); + return ENOMEM; + } + } + + /* + * Make room for a new extent by shifting extents #keep_after and later + * down by one extent. When we're done, extents #keep_before and + * #keep_after will be identical, and we can fall through to removing + * the portion that overlaps the input extent. + */ + memmove(&trim->extents[keep_before], + &trim->extents[keep_after], + (trim->extent_count - keep_after) * sizeof(dk_extent_t)); + ++trim->extent_count; + ++keep_after; + + /* + * Fall through. We now have the case where the length of extent + * #(keep_before - 1) needs to be updated, and the start of extent + * #(keep_after) needs to be updated. + */ + } + + /* + * May need to truncate the end of extent #(keep_before - 1) if it overlaps + * the input extent. + */ + if (keep_before > 0) { + extent = &trim->extents[keep_before - 1]; + if (extent->offset + extent->length > offset) { + extent->length = offset - extent->offset; + } + } + + /* + * May need to update the start of extent #(keep_after) if it overlaps the + * input extent. + */ + if (keep_after < trim->extent_count) { + extent = &trim->extents[keep_after]; + if (extent->offset < end) { + extent->length = extent->offset + extent->length - end; + extent->offset = end; + } + } + + /* + * If there were whole extents that overlapped the input extent, get rid + * of them by shifting any following extents, and updating the count. 
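+	 *
+	 * For example, with keep_before = 2 and keep_after = 4, extents #2 and #3 lie
+	 * entirely inside the input extent; extent #4 and any later extents slide down
+	 * to index 2, and extent_count shrinks by two.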
+ */ + if (keep_after > keep_before && keep_after < trim->extent_count) { + memmove(&trim->extents[keep_before], + &trim->extents[keep_after], + (trim->extent_count - keep_after) * sizeof(dk_extent_t)); + } + trim->extent_count -= keep_after - keep_before; + + return 0; +} + +/* + ;________________________________________________________________________________ + ; + ; Routine: journal_trim_remove_extent + ; + ; Function: Make note of a range of bytes, some of which may have previously + ; been passed to journal_trim_add_extent, is now in use on the + ; volume. The given bytes will be not be trimmed as part of + ; this transaction, or a pending trim of a transaction being + ; asynchronously flushed. + ; + ; Input Arguments: + ; jnl - The journal for the volume containing the byte range. + ; offset - The first byte of the range to be trimmed. + ; length - The number of bytes of the extent being trimmed. + ;________________________________________________________________________________ + */ +int +journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) +{ + int error = 0; + transaction *tr; + + CHECK_JOURNAL(jnl); + + /* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set? I think so... */ + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, obfuscate_addr(jnl), offset, length, tr->trim.extent_count, 0); + + if (jnl->owner != current_thread()) { + panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + free_old_stuff(jnl); + + error = trim_remove_extent(jnl, &tr->trim, offset, length); + if (error == 0) { + int found = FALSE; + + /* + * See if a pending trim has any extents that overlap with the + * one we were given. + */ + lck_rw_lock_shared(&jnl->trim_lock); + if (jnl->async_trim != NULL) + found = trim_search_extent(jnl->async_trim, offset, length, NULL, NULL); + lck_rw_unlock_shared(&jnl->trim_lock); + + if (found) { + /* + * There was an overlap, so avoid trimming the extent we + * just allocated. (Otherwise, it might get trimmed after + * we've written to it, which will cause that data to be + * corrupted.) + */ + uint32_t async_extent_count = 0; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, obfuscate_addr(jnl), offset, length, 0, 0); + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim != NULL) { + error = trim_remove_extent(jnl, jnl->async_trim, offset, length); + async_extent_count = jnl->async_trim->extent_count; + } + lck_rw_unlock_exclusive(&jnl->trim_lock); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0); + } + } + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0); + return error; +} + + +static int +journal_trim_flush(journal *jnl, transaction *tr) +{ + int err = 0; + boolean_t was_vm_privileged = FALSE; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, obfuscate_addr(jnl), tr, 0, tr->trim.extent_count, 0); + + if (vfs_isswapmount(jnl->fsmount)) { + /* + * the disk driver can allocate memory on this path... 
+ * if we block waiting for memory, and there is enough pressure to + * cause us to try and create a new swap file, we may end up deadlocking + * due to waiting for the journal on the swap file creation path... + * by making ourselves vm_privileged, we give ourselves the best chance + * of not blocking + */ + was_vm_privileged = set_vm_privilege(TRUE); + } + lck_rw_lock_shared(&jnl->trim_lock); + if (tr->trim.extent_count > 0) { + dk_unmap_t unmap; + + bzero(&unmap, sizeof(unmap)); + if (jnl->flags & JOURNAL_USE_UNMAP) { + unmap.extents = tr->trim.extents; + unmap.extentsCount = tr->trim.extent_count; + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, obfuscate_addr(jnl), tr, 0, tr->trim.extent_count, 0); + err = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel()); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, err, 0, 0, 0, 0); + } + + /* + * Call back into the file system to tell them that we have + * trimmed some extents and that they can now be reused. + * + * CAUTION: If the journal becomes invalid (eg., due to an I/O + * error when trying to write to the journal), this callback + * will stop getting called, even if extents got freed before + * the journal became invalid! + */ + if (jnl->trim_callback) + jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents); + } + lck_rw_unlock_shared(&jnl->trim_lock); + + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + /* + * If the transaction we're flushing was the async transaction, then + * tell the current transaction that there is no pending trim + * any more. + * + * NOTE: Since we released the lock, another thread could have + * removed one or more extents from our list. That's not a + * problem since any writes to the re-allocated blocks + * would get sent to the device after the DKIOCUNMAP. + */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim == &tr->trim) + jnl->async_trim = NULL; + lck_rw_unlock_exclusive(&jnl->trim_lock); + + /* + * By the time we get here, no other thread can discover the address + * of "tr", so it is safe for us to manipulate tr->trim without + * holding any locks. + */ + if (tr->trim.extents) { + hfs_free(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); + tr->trim.allocated_count = 0; + tr->trim.extent_count = 0; + tr->trim.extents = NULL; + } + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, err, 0, 0, 0, 0); + + return err; +} + +static int +journal_binfo_cmp(const void *a, const void *b) +{ + const block_info *bi_a = (const struct block_info *)a; + const block_info *bi_b = (const struct block_info *)b; + daddr64_t res; + + if (bi_a->bnum == (off_t)-1) { + return 1; + } + if (bi_b->bnum == (off_t)-1) { + return -1; + } + + // don't have to worry about negative block + // numbers so this is ok to do. + // + res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp)); + + return (int)res; +} + + +/* + * End a transaction. If the transaction is small enough, and we're not forcing + * a write to disk, the "active" transaction becomes the "current" transaction, + * and will be reused for the next transaction that is started (group commit). + * + * If the transaction gets written to disk (because force_it is true, or no + * group commit, or the transaction is sufficiently full), the blocks get + * written into the journal first, then the are written asynchronously. 
When + * those async writes complete, the transaction can be freed and removed from + * the journal. + * + * An optional callback can be supplied. If given, it is called after the + * the blocks have been written to the journal, but before the async writes + * of those blocks to their normal on-disk locations. This is used by + * journal_relocate so that the location of the journal can be changed and + * flushed to disk before the blocks get written to their normal locations. + * Note that the callback is only called if the transaction gets written to + * the journal during this end_transaction call; you probably want to set the + * force_it flag. + * + * Inputs: + * tr Transaction to add to the journal + * force_it If true, force this transaction to the on-disk journal immediately. + * callback See description above. Pass NULL for no callback. + * callback_arg Argument passed to callback routine. + * + * Result + * 0 No errors + * -1 An error occurred. The journal is marked invalid. + */ +static int +end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait) +{ + block_list_header *blhdr=NULL, *next=NULL; + int i, ret_val = 0; + errno_t err; + journal *jnl = tr->jnl; + struct buf *bp; + size_t tbuffer_offset; + boolean_t drop_lock_early; + + if (jnl->cur_tr) { + panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n", + jnl, jnl->cur_tr, tr); + } + + // if there weren't any modified blocks in the transaction + // just save off the transaction pointer and return. + if (tr->total_bytes == jnl->jhdr->blhdr_size) { + jnl->cur_tr = tr; + goto done; + } + + // if our transaction buffer isn't very full, just hang + // on to it and don't actually flush anything. this is + // what is known as "group commit". we will flush the + // transaction buffer if it's full or if we have more than + // one of them so we don't start hogging too much memory. + // + // We also check the device supports UNMAP/TRIM, and if so, + // the number of extents waiting to be trimmed. If it is + // small enough, then keep accumulating more (so we can + // reduce the overhead of trimming). If there was a prior + // trim error, then we stop issuing trims for this + // volume, so we can also coalesce transactions. + // + if ( force_it == 0 + && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0 + && tr->num_blhdrs < 3 + && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8)) + && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) { + + jnl->cur_tr = tr; + goto done; + } + + KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0); + + lock_condition(jnl, &jnl->flushing, "end_transaction"); + + /* + * if the previous 'finish_end_transaction' was being run + * asynchronously, it could have encountered a condition + * that caused it to mark the journal invalid... if that + * occurred while we were waiting for it to finish, we + * need to notice and abort the current transaction + */ + if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) { + unlock_condition(jnl, &jnl->flushing); + + abort_transaction(jnl, tr); + ret_val = -1; + KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); + goto done; + } + + /* + * Store a pointer to this transaction's trim list so that + * future transactions can find it. 
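+	 * (journal_trim_extent_overlap and journal_trim_remove_extent consult
+	 * jnl->async_trim, under the trim_lock, while this transaction's I/O
+	 * is still in flight)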
+ * + * Note: if there are no extents in the trim list, then don't + * bother saving the pointer since nothing can add new extents + * to the list (and other threads/transactions only care if + * there is a trim pending). + */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim != NULL) + panic("jnl: end_transaction: async_trim already non-NULL!"); + if (tr->trim.extent_count > 0) + jnl->async_trim = &tr->trim; + lck_rw_unlock_exclusive(&jnl->trim_lock); + + /* + * snapshot the transaction sequence number while we are still behind + * the journal lock since it will be bumped upon the start of the + * next transaction group which may overlap the current journal flush... + * we pass the snapshot into write_journal_header during the journal + * flush so that it can write the correct version in the header... + * because we hold the 'flushing' condition variable for the duration + * of the journal flush, 'saved_sequence_num' remains stable + */ + jnl->saved_sequence_num = jnl->sequence_num; + + /* + * if we're here we're going to flush the transaction buffer to disk. + * 'check_free_space' will not return untl there is enough free + * space for this transaction in the journal and jnl->old_start[0] + * is avaiable for use + */ + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); + + check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num); + + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0); + + // range check the end index + if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { + panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n", + jnl->jhdr->end, jnl->jhdr->size); + } + if (tr->delayed_header_write == TRUE) { + thread_t thread = THREAD_NULL; + + lock_condition(jnl, &jnl->writing_header, "end_transaction"); + /* + * fire up a thread to write the journal header + * asynchronously... when it finishes, it will call + * unlock_condition... we can overlap the preparation of + * the log and buffers during this time + */ + kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread); + } else + jnl->write_header_failed = FALSE; + + + // this transaction starts where the current journal ends + tr->journal_start = jnl->jhdr->end; + + lock_oldstart(jnl); + /* + * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy. + * slide everyone else down and put our latest guy in the last + * entry in the old_start array + */ + memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0])); + jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL; + + unlock_oldstart(jnl); + + + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + char *blkptr; + buf_t sbp; + int32_t bsize; + + tbuffer_offset = jnl->jhdr->blhdr_size; + + for (i = 1; i < blhdr->num_blocks; i++) { + + if (blhdr->binfo[i].bnum != (off_t)-1) { + void (*func)(buf_t, void *); + void *arg; + + bp = blhdr->binfo[i].u.bp; + + if (bp == NULL) { + panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n", + blhdr->binfo[i].bnum, jnl, tr); + } + /* + * acquire the bp here so that we can safely + * mess around with its data. buf_acquire() + * will return EAGAIN if the buffer was busy, + * so loop trying again. 
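+			 * once acquired, the buffer's contents are copied into the
+			 * transaction buffer and a shadow buf_t is created below, so the
+			 * original can be marked clean and released back to the cache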
+ */ + do { + err = buf_acquire(bp, BAC_REMOVE, 0, 0); + } while (err == EAGAIN); + + if (err) + panic("could not acquire bp %p (err %d)\n", bp, err); + + if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { + if (jnl->flags & JOURNAL_CLOSE_PENDING) { + buf_clearflags(bp, B_LOCKED); + buf_brelse(bp); + + /* + * this is an odd case that appears to happen occasionally + * make sure we mark this block as no longer valid + * so that we don't process it in "finish_end_transaction" since + * the bp that is recorded in our array no longer belongs + * to us (normally we substitute a shadow bp to be processed + * issuing a 'buf_bawrite' on a stale buf_t pointer leads + * to all kinds of problems. + */ + blhdr->binfo[i].bnum = (off_t)-1; + continue; + } else { + panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp)); + } + } + bsize = buf_size(bp); + + buf_setfilter(bp, NULL, NULL, &func, &arg); + + blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; + + sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0); + + if (sbp == NULL) + panic("jnl: buf_create_shadow returned NULL"); + + /* + * copy the data into the transaction buffer... + */ + memcpy(blkptr, (char *)buf_dataptr(bp), bsize); + + buf_clearflags(bp, B_LOCKED); + buf_markclean(bp); + buf_drop(bp); + + /* + * adopt the shadow buffer for this block + */ + if (func) { + /* + * transfer FS hook function to the + * shadow buffer... it will get called + * in finish_end_transaction + */ + buf_setfilter(sbp, func, arg, NULL, NULL); + } + blhdr->binfo[i].u.bp = sbp; + + } else { + // bnum == -1, only true if a block was "killed" + bsize = blhdr->binfo[i].u.bi.bsize; + } + tbuffer_offset += bsize; + } + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + } + /* + * if callback != NULL, we don't want to drop the journal + * lock, or complete end_transaction asynchronously, since + * the caller is expecting the callback to run in the calling + * context + * + * if drop_lock == FALSE, we can't complete end_transaction + * asynchronously + */ + if (callback) + drop_lock_early = FALSE; + else + drop_lock_early = drop_lock; + + if (drop_lock_early == FALSE) + must_wait = TRUE; + + if (drop_lock_early == TRUE) { + journal_unlock(jnl); + drop_lock = FALSE; + } + if (must_wait == TRUE) + ret_val = finish_end_transaction(tr, callback, callback_arg); + else { + thread_t thread = THREAD_NULL; + + /* + * fire up a thread to complete processing this transaction + * asynchronously... 
when it finishes, it will call + * unlock_condition + */ + kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread); + } + KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); +done: + if (drop_lock == TRUE) { + journal_unlock(jnl); + } + return (ret_val); +} + + +static void +finish_end_thread(transaction *tr) +{ + throttle_set_thread_io_policy(IOPOL_PASSIVE); + + finish_end_transaction(tr, NULL, NULL); + + thread_deallocate(current_thread()); + thread_terminate(current_thread()); +} + +static void +write_header_thread(journal *jnl) +{ + throttle_set_thread_io_policy(IOPOL_PASSIVE); + + if (write_journal_header(jnl, 1, jnl->saved_sequence_num)) + jnl->write_header_failed = TRUE; + else + jnl->write_header_failed = FALSE; + unlock_condition(jnl, &jnl->writing_header); + + thread_deallocate(current_thread()); + thread_terminate(current_thread()); +} + +static int +finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) +{ + int i, amt; + int ret = 0; + off_t end; + journal *jnl = tr->jnl; + buf_t bp, *bparray; + vnode_t vp; + block_list_header *blhdr=NULL, *next=NULL; + size_t tbuffer_offset; + int bufs_written = 0; + int ret_val = 0; + boolean_t was_vm_privileged = FALSE; + + KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0); + + if (vfs_isswapmount(jnl->fsmount)) { + /* + * if we block waiting for memory, and there is enough pressure to + * cause us to try and create a new swap file, we may end up deadlocking + * due to waiting for the journal on the swap file creation path... + * by making ourselves vm_privileged, we give ourselves the best chance + * of not blocking + */ + was_vm_privileged = set_vm_privilege(TRUE); + } + end = jnl->jhdr->end; + + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { + + amt = blhdr->bytes_used; + + blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num; + + blhdr->checksum = 0; + blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); + + bparray = hfs_malloc(blhdr->num_blocks * sizeof(buf_t)); + tbuffer_offset = jnl->jhdr->blhdr_size; + + for (i = 1; i < blhdr->num_blocks; i++) { + void (*func)(buf_t, void *); + void *arg; + int32_t bsize; + + /* + * finish preparing the shadow buf_t before + * calculating the individual block checksums + */ + if (blhdr->binfo[i].bnum != (off_t)-1) { + daddr64_t blkno; + daddr64_t lblkno; + + bp = blhdr->binfo[i].u.bp; + + vp = buf_vnode(bp); + blkno = buf_blkno(bp); + lblkno = buf_lblkno(bp); + + if (vp == NULL && lblkno == blkno) { + printf("jnl: %s: end_tr: bad news! buffer w/null vp and l/blkno = %qd/%qd. aborting the transaction.\n", + jnl->jdev_name, lblkno, blkno); + ret_val = -1; + goto bad_journal; + } + + // if the lblkno is the same as blkno and this bp isn't + // associated with the underlying file system device then + // we need to call bmap() to get the actual physical block. 
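+			// (hfs_vnop_blktooff and hfs_vnop_blockmap below perform that translation,
+			// and the run is verified to be physically contiguous before the physical
+			// block number is recorded in binfo)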
+ // + if ((lblkno == blkno) && (vp != jnl->fsdev)) { + off_t f_offset; + size_t contig_bytes; + + if (hfs_vnop_blktooff(&(struct vnop_blktooff_args){ + .a_vp = vp, + .a_lblkno = lblkno, + .a_offset = &f_offset + })) { + printf("jnl: %s: end_tr: vnop_blktooff failed\n", jnl->jdev_name); + ret_val = -1; + goto bad_journal; + } + + if (hfs_vnop_blockmap(&(struct vnop_blockmap_args) { + .a_vp = vp, + .a_foffset = f_offset, + .a_size = buf_count(bp), + .a_bpn = &blkno, + .a_run = &contig_bytes + })) { + printf("jnl: %s: end_tr: can't blockmap the buffer\n", jnl->jdev_name); + ret_val = -1; + goto bad_journal; + } + + if ((uint32_t)contig_bytes < buf_count(bp)) { + printf("jnl: %s: end_tr: blk not physically contiguous on disk\n", jnl->jdev_name); + ret_val = -1; + goto bad_journal; + } + buf_setblkno(bp, blkno); + } + // update this so we write out the correct physical block number! + blhdr->binfo[i].bnum = (off_t)(blkno); + + /* + * pick up the FS hook function (if any) and prepare + * to fire this buffer off in the next pass + */ + buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg); + + if (func) { + /* + * call the hook function supplied by the filesystem... + * this needs to happen BEFORE cacl_checksum in case + * the FS morphs the data in the buffer + */ + func(bp, arg); + } + bparray[i] = bp; + bsize = buf_size(bp); + blhdr->binfo[i].u.bi.bsize = bsize; + blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize); + } else { + bparray[i] = NULL; + bsize = blhdr->binfo[i].u.bi.bsize; + blhdr->binfo[i].u.bi.b.cksum = 0; + } + tbuffer_offset += bsize; + } + /* + * if we fired off the journal_write_header asynchronously in + * 'end_transaction', we need to wait for its completion + * before writing the actual journal data + */ + wait_condition(jnl, &jnl->writing_header, "finish_end_transaction"); + + if (jnl->write_header_failed == FALSE) + ret = write_journal_data(jnl, &end, blhdr, amt); + else + ret_val = -1; + /* + * put the bp pointers back so that we can + * make the final pass on them + */ + for (i = 1; i < blhdr->num_blocks; i++) + blhdr->binfo[i].u.bp = bparray[i]; + + hfs_free(bparray, blhdr->num_blocks * sizeof(buf_t)); + + if (ret_val == -1) + goto bad_journal; + + if (ret != amt) { + printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n", + jnl->jdev_name, ret, amt); + + ret_val = -1; + goto bad_journal; + } + } + jnl->jhdr->end = end; // update where the journal now ends + tr->journal_end = end; // the transaction ends here too + + if (tr->journal_start == 0 || tr->journal_end == 0) { + panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n", + tr->journal_start, tr->journal_end); + } + + if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) { + ret_val = -1; + goto bad_journal; + } + /* + * If the caller supplied a callback, call it now that the blocks have been + * written to the journal. This is used by journal_relocate so, for example, + * the file system can change its pointer to the new journal. + */ + if (callback != NULL && callback(callback_arg) != 0) { + ret_val = -1; + goto bad_journal; + } + + // + // Send a DKIOCUNMAP for the extents trimmed by this transaction, and + // free up the extent list. + // + journal_trim_flush(jnl, tr); + + // the buffer_flushed_callback will only be called for the + // real blocks that get flushed so we have to account for + // the block_list_headers here. 
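+	// (no buf_bawrite is issued for the block_list_headers themselves, so their
+	// bytes are credited to num_flushed up front, before the per-block async
+	// writes are issued below)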
+ // + tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size; + + lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction"); + + // + // setup for looping through all the blhdr's. + // + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + uint16_t num_blocks; + + /* + * grab this info ahead of issuing the buf_bawrites... + * once the last one goes out, its possible for blhdr + * to be freed (especially if we get preempted) before + * we do the last check of num_blocks or + * grab the next blhdr pointer... + */ + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + num_blocks = blhdr->num_blocks; + + /* + * we can re-order the buf ptrs because everything is written out already + */ + kx_qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp); + + /* + * need to make sure that the loop issuing the buf_bawrite's + * does not touch blhdr once the last buf_bawrite has been + * issued... at that point, we no longer have a legitmate + * reference on the associated storage since it will be + * released upon the completion of that last buf_bawrite + */ + for (i = num_blocks-1; i >= 1; i--) { + if (blhdr->binfo[i].bnum != (off_t)-1) + break; + num_blocks--; + } + for (i = 1; i < num_blocks; i++) { + + if ((bp = blhdr->binfo[i].u.bp)) { + vp = buf_vnode(bp); + + buf_bawrite(bp); + + // this undoes the vnode_ref() in journal_modify_block_end() + vnode_rele_ext(vp, 0, 1); + + bufs_written++; + } + } + } + if (bufs_written == 0) { + /* + * since we didn't issue any buf_bawrite's, there is no + * async trigger to cause the memory associated with this + * transaction to be freed... so, move it to the garbage + * list now + */ + lock_oldstart(jnl); + + tr->next = jnl->tr_freeme; + jnl->tr_freeme = tr; + + unlock_oldstart(jnl); + + unlock_condition(jnl, &jnl->asyncIO); + } + + //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n", + // tr, tr->journal_start, tr->journal_end); + +bad_journal: + if (ret_val == -1) { + abort_transaction(jnl, tr); // cleans up list of extents to be trimmed + + /* + * 'flush_aborted' is protected by the flushing condition... we need to + * set it before dropping the condition so that it will be + * noticed in 'end_transaction'... we add this additional + * aborted condition so that we can drop the 'flushing' condition + * before grabbing the journal lock... this avoids a deadlock + * in 'end_transaction' which is holding the journal lock while + * waiting for the 'flushing' condition to clear... 
+ * everyone else will notice the JOURNAL_INVALID flag + */ + jnl->flush_aborted = TRUE; + + unlock_condition(jnl, &jnl->flushing); + journal_lock(jnl); + + jnl->flags |= JOURNAL_INVALID; + jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; + + journal_unlock(jnl); + } else + unlock_condition(jnl, &jnl->flushing); + + if (vfs_isswapmount(jnl->fsmount) && (was_vm_privileged == FALSE)) + set_vm_privilege(FALSE); + + KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0); + + return (ret_val); +} + + +static void +lock_condition(journal *jnl, boolean_t *condition, const char *condition_name) +{ + + KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0); + + lock_flush(jnl); + + while (*condition == TRUE) + msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); + + *condition = TRUE; + unlock_flush(jnl); + + KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0); +} + +static void +wait_condition(journal *jnl, boolean_t *condition, const char *condition_name) +{ + + if (*condition == FALSE) + return; + + KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0); + + lock_flush(jnl); + + while (*condition == TRUE) + msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); + + unlock_flush(jnl); + + KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0); +} + +static void +unlock_condition(journal *jnl, boolean_t *condition) +{ + lock_flush(jnl); + + *condition = FALSE; + wakeup(condition); + + unlock_flush(jnl); +} + +static void +abort_transaction(journal *jnl, transaction *tr) +{ + block_list_header *blhdr, *next; + + // for each block list header, iterate over the blocks then + // free up the memory associated with the block list. + // + // find each of the primary blocks (i.e. the list could + // contain a mix of shadowed and real buf_t's depending + // on when the abort condition was detected) and mark them + // clean and locked in the cache... this at least allows + // the FS a consistent view between it's incore data structures + // and the meta-data held in the cache + // + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0); + + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + int i; + + for (i = 1; i < blhdr->num_blocks; i++) { + buf_t bp, tbp, sbp; + vnode_t bp_vp; + errno_t err; + + if (blhdr->binfo[i].bnum == (off_t)-1) + continue; + + tbp = blhdr->binfo[i].u.bp; + + bp_vp = buf_vnode(tbp); + + if (buf_shadow(tbp)) { + sbp = tbp; + buf_setfilter(tbp, NULL, NULL, NULL, NULL); + } else { + hfs_assert(ISSET(buf_flags(tbp), B_LOCKED)); + + sbp = NULL; + + do { + err = buf_acquire(tbp, BAC_REMOVE, 0, 0); + } while (err == EAGAIN); + + if (!err) { + buf_setfilter(tbp, NULL, NULL, NULL, NULL); + buf_brelse(tbp); + } + } + + if (bp_vp) { + err = buf_meta_bread(bp_vp, + buf_lblkno(tbp), + buf_size(tbp), + NOCRED, + &bp); + if (err == 0) { + if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) { + panic("jnl: abort_tr: got back a different bp! 
(bp %p should be %p, jnl %p\n", + bp, tbp, jnl); + } + /* + * once the journal has been marked INVALID and aborted, + * NO meta data can be written back to the disk, so + * mark the buf_t clean and make sure it's locked in the cache + * note: if we found a shadow, the real buf_t needs to be relocked + */ + buf_setflags(bp, B_LOCKED); + buf_markclean(bp); + buf_brelse(bp); + + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0); + + /* + * this undoes the vnode_ref() in journal_modify_block_end() + */ + vnode_rele_ext(bp_vp, 0, 1); + } else { + printf("jnl: %s: abort_tr: could not find block %lld for vnode!\n", + jnl->jdev_name, blhdr->binfo[i].bnum); + if (bp) { + buf_brelse(bp); + } + } + } + if (sbp) + buf_brelse(sbp); + } + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + + // we can free blhdr here since we won't need it any more + blhdr->binfo[0].bnum = 0xdeadc0de; + hfs_free(blhdr, tr->tbuffer_size); + } + + /* + * If the transaction we're aborting was the async transaction, then + * tell the current transaction that there is no pending trim + * any more. + */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim == &tr->trim) + jnl->async_trim = NULL; + lck_rw_unlock_exclusive(&jnl->trim_lock); + + + if (tr->trim.extents) { + hfs_free(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); + } + tr->trim.allocated_count = 0; + tr->trim.extent_count = 0; + tr->trim.extents = NULL; + tr->tbuffer = NULL; + tr->blhdr = NULL; + tr->total_bytes = 0xdbadc0de; + hfs_free(tr, sizeof(*tr)); + + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0); +} + + +int +journal_end_transaction(journal *jnl) +{ + int ret; + transaction *tr; + + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); + + if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { + return 0; + } + + if (jnl->owner != current_thread()) { + panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + jnl->nested_count--; + + if (jnl->nested_count > 0) { + return 0; + } else if (jnl->nested_count < 0) { + panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); + } + + if (jnl->flags & JOURNAL_INVALID) { + if (jnl->active_tr) { + if (jnl->cur_tr != NULL) { + panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n", + jnl, jnl->active_tr, jnl->cur_tr); + } + tr = jnl->active_tr; + jnl->active_tr = NULL; + + abort_transaction(jnl, tr); + } + journal_unlock(jnl); + + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + // clear this out here so that when check_free_space() calls + // the FS flush function, we don't panic in journal_flush() + // if the FS were to call that. note: check_free_space() is + // called from end_transaction(). + // + jnl->active_tr = NULL; + + /* Examine the force-journal-flush state in the active txn */ + if (tr->flush_on_completion == TRUE) { + /* + * If the FS requested it, disallow group commit and force the + * transaction out to disk immediately. + */ + ret = end_transaction(tr, 1, NULL, NULL, TRUE, TRUE); + } + else { + /* in the common path we can simply use the double-buffered journal */ + ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE); + } + + return ret; +} + + +/* + * Flush the contents of the journal to the disk. + * + * Input: + * wait_for_IO - + * If TRUE, wait to write in-memory journal to the disk + * consistently, and also wait to write all asynchronous + * metadata blocks to its corresponding locations + * consistently on the disk. 
This means that the journal + * is empty at this point and does not contain any + * transactions. This is overkill in normal scenarios + * but is useful whenever the metadata blocks are required + * to be consistent on-disk instead of just the journal + * being consistent; like before live verification + * and live volume resizing. + * + * If FALSE, only wait to write in-memory journal to the + * disk consistently. This means that the journal still + * contains uncommitted transactions and the file system + * metadata blocks in the journal transactions might be + * written asynchronously to the disk. But there is no + * guarantee that they are written to the disk before + * returning to the caller. Note that this option is + * sufficient for file system data integrity as it + * guarantees consistent journal content on the disk. + */ +int +journal_flush(journal *jnl, journal_flush_options_t options) +{ + boolean_t drop_lock = FALSE; + errno_t error = 0; + uint32_t flush_count = 0; + + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return -1; + } + + KDBG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl); + + if (jnl->owner != current_thread()) { + journal_lock(jnl); + drop_lock = TRUE; + } + + if (ISSET(options, JOURNAL_FLUSH_FULL)) + flush_count = jnl->flush_counter; + + // if we're not active, flush any buffered transactions + if (jnl->active_tr == NULL && jnl->cur_tr) { + transaction *tr = jnl->cur_tr; + + jnl->cur_tr = NULL; + + if (ISSET(options, JOURNAL_WAIT_FOR_IO)) { + wait_condition(jnl, &jnl->flushing, "journal_flush"); + wait_condition(jnl, &jnl->asyncIO, "journal_flush"); + } + /* + * "end_transction" will wait for any current async flush + * to complete, before flushing "cur_tr"... because we've + * specified the 'must_wait' arg as TRUE, it will then + * synchronously flush the "cur_tr" + */ + end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed + + } else { + if (drop_lock == TRUE) { + journal_unlock(jnl); + } + + /* Because of pipelined journal, the journal transactions + * might be in process of being flushed on another thread. + * If there is nothing to flush currently, we should + * synchronize ourselves with the pipelined journal thread + * to ensure that all inflight transactions, if any, are + * flushed before we return success to caller. + */ + wait_condition(jnl, &jnl->flushing, "journal_flush"); + } + if (ISSET(options, JOURNAL_WAIT_FOR_IO)) { + wait_condition(jnl, &jnl->asyncIO, "journal_flush"); + } + + if (ISSET(options, JOURNAL_FLUSH_FULL)) { + + dk_synchronize_t sync_request = { + .options = 0, + }; + + // We need a full cache flush. If it has not been done, do it here. + if (flush_count == jnl->flush_counter) + error = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel()); + + // If external journal partition is enabled, flush filesystem data partition. + if (jnl->jdev != jnl->fsdev) + error = VNOP_IOCTL(jnl->fsdev, DKIOCSYNCHRONIZE, (caddr_t)&sync_request, FWRITE, vfs_context_kernel()); + + } + + KDBG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl); + + return 0; +} + +int +journal_active(journal *jnl) +{ + if (jnl->flags & JOURNAL_INVALID) { + return -1; + } + + return (jnl->active_tr == NULL) ? 0 : 1; +} + +void * +journal_owner(journal *jnl) +{ + return jnl->owner; +} + +int journal_uses_fua(journal *jnl) +{ + if (jnl->flags & JOURNAL_DO_FUA_WRITES) + return 1; + return 0; +} + +/* + * Relocate the journal. 
+ * + * You provide the new starting offset and size for the journal. You may + * optionally provide a new tbuffer_size; passing zero defaults to not + * changing the tbuffer size except as needed to fit within the new journal + * size. + * + * You must have already started a transaction. The transaction may contain + * modified blocks (such as those needed to deallocate the old journal, + * allocate the new journal, and update the location and size of the journal + * in filesystem-private structures). Any transactions prior to the active + * transaction will be flushed to the old journal. The new journal will be + * initialized, and the blocks from the active transaction will be written to + * the new journal. + * + * The caller will need to update the structures that identify the location + * and size of the journal. These updates should be made in the supplied + * callback routine. These updates must NOT go into a transaction. You should + * force these updates to the media before returning from the callback. In the + * even of a crash, either the old journal will be found, with an empty journal, + * or the new journal will be found with the contents of the active transaction. + * + * Upon return from the callback, the blocks from the active transaction are + * written to their normal locations on disk. + * + * (Remember that we have to ensure that blocks get committed to the journal + * before being committed to their normal locations. But the blocks don't count + * as committed until the new journal is pointed at.) + * + * Upon return, there is still an active transaction: newly allocated, and + * with no modified blocks. Call journal_end_transaction as normal. You may + * modifiy additional blocks before calling journal_end_transaction, and those + * blocks will (eventually) go to the relocated journal. + * + * Inputs: + * jnl The (opened) journal to relocate. + * offset The new journal byte offset (from start of the journal device). + * journal_size The size, in bytes, of the new journal. + * tbuffer_size The new desired transaction buffer size. Pass zero to keep + * the same size as the current journal. The size will be + * modified as needed to fit the new journal. + * callback Routine called after the new journal has been initialized, + * and the active transaction written to the new journal, but + * before the blocks are written to their normal locations. + * Pass NULL for no callback. + * callback_arg An argument passed to the callback routine. + * + * Result: + * 0 No errors + * EINVAL The offset is not block aligned + * EINVAL The journal_size is not a multiple of the block size + * EINVAL The journal is invalid + * (any) An error returned by journal_flush. + * + */ +int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size, + errno_t (*callback)(void *), void *callback_arg) +{ + int ret; + transaction *tr; + size_t i = 0; + + /* + * Sanity check inputs, and adjust the size of the transaction buffer. 
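+	 * (both the new offset and the new journal size must be multiples of the
+	 * journal block size, jhdr_size)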
+ */ + if (jnl->jhdr->jhdr_size == 0) { + printf("jnl: %s: relocate: bad jhdr size (%d)\n", jnl->jdev_name, jnl->jhdr->jhdr_size); + return EINVAL; + } + + if ((offset % jnl->jhdr->jhdr_size) != 0) { + printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n", + jnl->jdev_name, offset, jnl->jhdr->jhdr_size); + return EINVAL; + } + if ((journal_size % jnl->jhdr->jhdr_size) != 0) { + printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n", + jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size); + return EINVAL; + } + + CHECK_JOURNAL(jnl); + + /* Guarantee we own the active transaction. */ + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + if (jnl->owner != current_thread()) { + panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); + } + + if (tbuffer_size == 0) + tbuffer_size = jnl->tbuffer_size; + size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size); + + /* + * Flush any non-active transactions. We have to temporarily hide the + * active transaction to make journal_flush flush out non-active but + * current (unwritten) transactions. + */ + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + jnl->active_tr = NULL; + ret = journal_flush(jnl, JOURNAL_WAIT_FOR_IO); + jnl->active_tr = tr; + + if (ret) { + return ret; + } + wait_condition(jnl, &jnl->flushing, "end_transaction"); + + /* + * At this point, we have completely flushed the contents of the current + * journal to disk (and have asynchronously written all of the txns to + * their actual desired locations). As a result, we can (and must) clear + * out the old_start array. If we do not, then if the last written transaction + * started at the beginning of the journal (starting 1 block into the + * journal file) it could confuse the buffer_flushed callback. This is + * because we're about to reset the start/end pointers of the journal header + * below. + */ + lock_oldstart(jnl); + for (i = 0; i < sizeof (jnl->old_start) / sizeof(jnl->old_start[0]); i++) { + jnl->old_start[i] = 0; + } + unlock_oldstart(jnl); + + /* Update the journal's offset and size in memory. */ + jnl->jdev_offset = offset; + jnl->jhdr->start = jnl->jhdr->end = jnl->jhdr->jhdr_size; + jnl->jhdr->size = journal_size; + jnl->active_start = jnl->jhdr->start; + + /* + * Force the active transaction to be written to the new journal. Call the + * supplied callback after the blocks have been written to the journal, but + * before they get written to their normal on-disk locations. + */ + jnl->active_tr = NULL; + ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE); + if (ret) { + printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret); + goto bad_journal; + } + + /* + * Create a new, empty transaction to be the active transaction. This way + * our caller can use journal_end_transaction as usual. + */ + ret = journal_allocate_transaction(jnl); + if (ret) { + printf("jnl: %s: relocate: could not allocate new transaction (%d)\n", jnl->jdev_name, ret); + goto bad_journal; + } + + return 0; + +bad_journal: + jnl->flags |= JOURNAL_INVALID; + abort_transaction(jnl, tr); + return ret; +} + +uint32_t journal_current_txn(journal *jnl) +{ + return jnl->sequence_num + (jnl->active_tr || jnl->cur_tr ? 0 : 1); +} diff --git a/core/hfs_journal.h b/core/hfs_journal.h new file mode 100644 index 0000000..ff8b851 --- /dev/null +++ b/core/hfs_journal.h @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * This header contains the structures and function prototypes + * for the vfs journaling code. The data types are not meant + * to be modified by user code. Just use the functions and do + * not mess around with the structs. + */ +#ifndef HFS_JOURNAL_H_ +#define HFS_JOURNAL_H_ + +#include +#include + +#ifdef __APPLE_API_UNSTABLE + +#include +#include +#include + + +typedef struct _blk_info { + int32_t bsize; + union { + int32_t cksum; + uint32_t sequence_num; + } b; +} _blk_info; + +typedef struct block_info { + off_t bnum; // block # on the file system device + union { + _blk_info bi; + struct buf *bp; + } u; +} __attribute__((__packed__)) block_info; + +typedef struct block_list_header { + u_int16_t max_blocks; // max number of blocks in this chunk + u_int16_t num_blocks; // number of valid block numbers in block_nums + int32_t bytes_used; // how many bytes of this tbuffer are used + uint32_t checksum; // on-disk: checksum of this header and binfo[0] + int32_t flags; // check-checksums, initial blhdr, etc + block_info binfo[1]; // so we can reference them by name +} block_list_header; + +#define BLHDR_CHECK_CHECKSUMS 0x0001 +#define BLHDR_FIRST_HEADER 0x0002 + + +struct journal; + +struct jnl_trim_list { + uint32_t allocated_count; + uint32_t extent_count; + dk_extent_t *extents; +}; + +typedef void (*jnl_trim_callback_t)(void *arg, uint32_t extent_count, const dk_extent_t *extents); + +typedef struct transaction { + int tbuffer_size; // in bytes + char *tbuffer; // memory copy of the transaction + block_list_header *blhdr; // points to the first byte of tbuffer + int num_blhdrs; // how many buffers we've allocated + int total_bytes; // total # of bytes in transaction + int num_flushed; // how many bytes have been flushed + int num_killed; // how many bytes were "killed" + off_t journal_start; // where in the journal this transaction starts + off_t journal_end; // where in the journal this transaction ends + struct journal *jnl; // ptr back to the journal structure + struct transaction *next; // list of tr's (either completed or to be free'd) + uint32_t sequence_num; + struct jnl_trim_list trim; + boolean_t delayed_header_write; + boolean_t flush_on_completion; //flush transaction immediately upon txn end. 
+} transaction;
+
+
+/*
+ * This is written to block zero of the journal and it
+ * maintains overall state about the journal.
+ */
+typedef struct journal_header {
+	int32_t		magic;
+	int32_t		endian;
+	volatile off_t	start;		// zero-based byte offset of the start of the first transaction
+	volatile off_t	end;		// zero-based byte offset of where free space begins
+	off_t		size;		// size in bytes of the entire journal
+	int32_t		blhdr_size;	// size in bytes of each block_list_header in the journal
+	uint32_t	checksum;
+	int32_t		jhdr_size;	// block size (in bytes) of the journal header
+	uint32_t	sequence_num;	// NEW FIELD: a monotonically increasing value assigned to all txn's
+} journal_header;
+
+#define JOURNAL_HEADER_MAGIC	0x4a4e4c78	// 'JNLx'
+#define ENDIAN_MAGIC		0x12345678
+
+//
+// we only checksum the original size of the journal_header to remain
+// backwards compatible.  The size of the original journal_header is
+// everything up to the sequence_num field, hence we use the
+// offsetof macro to calculate the size.
+//
+#define JOURNAL_HEADER_CKSUM_SIZE	(offsetof(struct journal_header, sequence_num))
+
+#define OLD_JOURNAL_HEADER_MAGIC	0x4a484452	// 'JHDR'
+
+
+/*
+ * In-memory structure about the journal.
+ */
+typedef struct journal {
+	lck_mtx_t	jlock;			// protects the struct journal data
+	lck_mtx_t	flock;			// serializes flushing of journal
+	lck_rw_t	trim_lock;		// protects the async_trim field, below
+
+
+	struct vnode	*jdev;			// vnode of the device where the journal lives
+	off_t		jdev_offset;		// byte offset to the start of the journal
+	const char	*jdev_name;
+
+	struct vnode	*fsdev;			// vnode of the file system device
+	struct mount	*fsmount;		// mount of the file system
+
+	void		(*flush)(void *arg);	// fs callback to flush meta data blocks
+	void		*flush_arg;		// arg that's passed to flush()
+
+	int32_t		flags;
+	int32_t		tbuffer_size;		// default transaction buffer size
+	boolean_t	flush_aborted;
+	boolean_t	flushing;
+	boolean_t	asyncIO;
+	boolean_t	writing_header;
+	boolean_t	write_header_failed;
+
+	struct jnl_trim_list *async_trim;	// extents to be trimmed by transaction being asynchronously flushed
+	jnl_trim_callback_t trim_callback;
+	void		*trim_callback_arg;
+
+	char		*header_buf;		// in-memory copy of the journal header
+	int32_t		header_buf_size;
+	journal_header	*jhdr;			// points to the first byte of header_buf
+
+	uint32_t	saved_sequence_num;
+	uint32_t	sequence_num;
+
+	off_t		max_read_size;
+	off_t		max_write_size;
+
+	transaction	*cur_tr;		// for group-commit
+	transaction	*completed_trs;		// out-of-order transactions that completed
+	transaction	*active_tr;		// for nested transactions
+	int32_t		nested_count;		// for nested transactions
+	void		*owner;			// a ptr that's unique to the calling process
+
+	transaction	*tr_freeme;		// transaction structs that need to be free'd
+
+	volatile off_t	active_start;		// the active start that we only keep in memory
+	lck_mtx_t	old_start_lock;		// protects the old_start
+	volatile off_t	old_start[16];		// this is how we do lazy start update
+
+	int		last_flush_err;		// last error from flushing the cache
+	uint32_t	flush_counter;		// a monotonically increasing value assigned on track cache flush
+} journal;
+
+/* internal-only journal flags (top 16 bits) */
+#define JOURNAL_CLOSE_PENDING	0x00010000
+#define JOURNAL_INVALID		0x00020000
+#define JOURNAL_FLUSHCACHE_ERR	0x00040000	// means we already printed this err
+#define JOURNAL_NEED_SWAP	0x00080000	// swap any data read from disk
+#define JOURNAL_DO_FUA_WRITES	0x00100000	// do force-unit-access writes
+#define JOURNAL_USE_UNMAP	0x00200000	// device supports UNMAP (TRIM)
+#define JOURNAL_FEATURE_BARRIER	0x00400000	// device supports barrier-only flush
+
+
+/* journal_open/create options are always in the low-16 bits */
+#define JOURNAL_OPTION_FLAGS_MASK	0x0000ffff
+
+__BEGIN_DECLS
+/*
+ * Prototypes.
+ */
+
+/*
+ * Call journal_init() to initialize the journaling code (sets up lock attributes)
+ */
+void journal_init(void);
+
+/*
+ * Call journal_create() to create a new journal.  You only
+ * call this once, typically at file system creation time.
+ *
+ * The "jvp" argument is the vnode where the journal is written.
+ * The journal starts at "offset" and is "journal_size" bytes long.
+ *
+ * The "fsvp" argument is the vnode of your file system.  It may be
+ * the same as "jvp".
+ *
+ * The "min_fs_block_size" argument is the minimum block size
+ * (in bytes) that the file system will ever write.  Typically
+ * this is the block size of the file system (1k, 4k, etc) but
+ * on HFS+ it is the minimum block size of the underlying device.
+ *
+ * The flags argument lets you disable group commit if you
+ * want tighter guarantees on transactions (in exchange for
+ * lower performance).
+ *
+ * The tbuffer_size is the size of the transaction buffer
+ * used by the journal.  If you specify zero, the journal code
+ * will use a reasonable default.  The tbuffer_size should
+ * be an integer multiple of the min_fs_block_size.
+ *
+ * Returns a valid journal pointer or NULL if one could not
+ * be created.
+ */
+journal *journal_create(struct vnode *jvp,
+			off_t offset,
+			off_t journal_size,
+			struct vnode *fsvp,
+			size_t min_fs_block_size,
+			int32_t flags,
+			int32_t tbuffer_size,
+			void (*flush)(void *arg),
+			void *arg,
+			struct mount *fsmount);
+
+/*
+ * Call journal_open() when mounting an existing file system
+ * that has a previously created journal.  It will take care
+ * of validating the journal and replaying it if necessary.
+ *
+ * See journal_create() for a description of the arguments.
+ *
+ * Returns a valid journal pointer or NULL if it runs into
+ * trouble reading/playing back the journal.
+ */
+journal *journal_open(struct vnode *jvp,
+		      off_t offset,
+		      off_t journal_size,
+		      struct vnode *fsvp,
+		      size_t min_fs_block_size,
+		      int32_t flags,
+		      int32_t tbuffer_size,
+		      void (*flush)(void *arg),
+		      void *arg,
+		      struct mount *fsmount);
+
+/*
+ * Test whether the journal is clean or not.  This is intended
+ * to be used when you're mounting read-only.  If the journal
+ * is not clean for some reason then you should not mount the
+ * volume as your data structures may be in an unknown state.
+ */
+int journal_is_clean(struct vnode *jvp,
+		     off_t offset,
+		     off_t journal_size,
+		     struct vnode *fsvp,
+		     size_t min_fs_block_size);
+
+
+/*
+ * Call journal_close() just before your file system is unmounted.
+ * It flushes any outstanding transactions and makes sure the
+ * journal is in a consistent state.
+ */
+void journal_close(journal *journalp);
+
+/*
+ * flags for journal_create/open.  Only the low 16 bits can be
+ * used for flags because the internal bits go in the high 16.
+ */
+#define JOURNAL_NO_GROUP_COMMIT	0x00000001
+#define JOURNAL_RESET		0x00000002
+
+/*
+ * Transaction related functions.
+ *
+ * Before you start modifying file system meta data, you
+ * should call journal_start_transaction().  Then before
+ * you modify each block, call journal_modify_block_start()
+ * and when you're done, journal_modify_block_end().
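+ *
+ * As an illustrative sketch (error handling omitted; "bp" stands for a
+ * metadata buffer the caller already holds), the pattern is:
+ *
+ *	journal_start_transaction(jnl);
+ *	journal_modify_block_start(jnl, bp);
+ *	... change the contents of bp ...
+ *	journal_modify_block_end(jnl, bp, NULL, NULL);
+ *	journal_end_transaction(jnl);
+ *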
+ * When you've modified the last block as part of a transaction,
+ * call journal_end_transaction() to commit the changes.
+ *
+ * If you decide to abort the modifications to a block you
+ * should call journal_modify_block_abort().
+ *
+ * If as part of a transaction you want to throw out
+ * any previous copies of a block (because it got deleted)
+ * then call journal_kill_block().  This will mark it so
+ * that the journal does not play it back (effectively
+ * dropping it).
+ *
+ * journal_trim_add_extent() marks a range of bytes on the device which should
+ * be trimmed (invalidated, unmapped).  journal_trim_remove_extent() marks a
+ * range of bytes which should no longer be trimmed.  Accumulated extents
+ * will be trimmed when the transaction is flushed to the on-disk journal.
+ */
+int journal_start_transaction(journal *jnl);
+int journal_modify_block_start(journal *jnl, struct buf *bp);
+int journal_modify_block_abort(journal *jnl, struct buf *bp);
+int journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg);
+int journal_kill_block(journal *jnl, struct buf *bp);
+int journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length);
+int journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length);
+void journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg);
+int journal_trim_extent_overlap(journal *jnl, uint64_t offset, uint64_t length, uint64_t *end);
+/* Mark state in the journal that requests an immediate journal flush upon txn completion */
+int journal_request_immediate_flush(journal *jnl);
+int journal_end_transaction(journal *jnl);
+
+int journal_active(journal *jnl);
+
+typedef enum journal_flush_options {
+	JOURNAL_WAIT_FOR_IO = 0x01,	// Flush journal and metadata blocks, wait for async IO to complete.
+	JOURNAL_FLUSH_FULL  = 0x02,	// Flush track cache to media
+} journal_flush_options_t;
+
+int journal_flush(journal *jnl, journal_flush_options_t options);
+void *journal_owner(journal *jnl);	// compare against current_thread()
+int journal_uses_fua(journal *jnl);
+void journal_lock(journal *jnl);
+void journal_unlock(journal *jnl);
+
+
+/*
+ * Relocate the journal.
+ *
+ * You provide the new starting offset and size for the journal. You may
+ * optionally provide a new tbuffer_size; passing zero defaults to not
+ * changing the tbuffer size except as needed to fit within the new journal
+ * size.
+ *
+ * You must have already started a transaction. The transaction may contain
+ * modified blocks (such as those needed to deallocate the old journal,
+ * allocate the new journal, and update the location and size of the journal
+ * in filesystem-private structures). Any transactions prior to the active
+ * transaction will be flushed to the old journal. The new journal will be
+ * initialized, and the blocks from the active transaction will be written to
+ * the new journal. The caller will need to update the structures that
+ * identify the location and size of the journal from the callback routine.
+ */
+int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size,
+	errno_t (*callback)(void *), void *callback_arg);
+
+uint32_t journal_current_txn(journal *jnl);
+
+__END_DECLS
+
+#endif /* __APPLE_API_UNSTABLE */
+#endif /* !HFS_JOURNAL_H_ */
diff --git a/core/hfs_kdebug.h b/core/hfs_kdebug.h
new file mode 100644
index 0000000..827fc4f
--- /dev/null
+++ b/core/hfs_kdebug.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2014 Apple Inc.
All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef HFS_KDEBUG_H_ +#define HFS_KDEBUG_H_ + +#include + +/* + * KERNEL_DEBUG related definitions for HFS. + * + * NOTE: The Class DBG_FSYSTEM = 3, and Subclass DBG_HFS = 8, so these + * debug codes are of the form 0x0308nnnn. + */ +#define HFSDBG_CODE(code) FSDBG_CODE(DBG_HFS, code) + +enum { + HFSDBG_WRITE = FSDBG_CODE(DBG_FSRW, 0), /* 0x3010000 */ + HFSDBG_TRUNCATE = FSDBG_CODE(DBG_FSRW, 7), /* 0x301001C */ + HFSDBG_READ = FSDBG_CODE(DBG_FSRW, 12), /* 0x3010030 */ + HFSDBG_GETNEWVNODE = FSDBG_CODE(DBG_FSRW, 37), /* 0x3010094 */ + HFSDBG_UPDATE = FSDBG_CODE(DBG_FSRW, 8192), /* 0x3018000 */ + HFSDBG_UNMAP_FREE = HFSDBG_CODE(0), /* 0x03080000 */ + HFSDBG_UNMAP_ALLOC = HFSDBG_CODE(1), /* 0x03080004 */ + HFSDBG_UNMAP_CALLBACK = HFSDBG_CODE(2), /* 0x03080008 */ + /* 0x0308000C is unused */ + HFSDBG_BLOCK_ALLOCATE = HFSDBG_CODE(4), /* 0x03080010 */ + HFSDBG_BLOCK_DEALLOCATE = HFSDBG_CODE(5), /* 0x03080014 */ + HFSDBG_READ_BITMAP_BLOCK = HFSDBG_CODE(6), /* 0x03080018 */ + HFSDBG_RELEASE_BITMAP_BLOCK = HFSDBG_CODE(7), /* 0x0308001C */ + HFSDBG_FIND_CONTIG_BITMAP = HFSDBG_CODE(8), /* 0x03080020 */ + HFSDBG_ALLOC_ANY_BITMAP = HFSDBG_CODE(9), /* 0x03080024 */ + HFSDBG_ALLOC_FIND_KNOWN = HFSDBG_CODE(10), /* 0x03080028 */ + HFSDBG_MARK_ALLOC_BITMAP = HFSDBG_CODE(11), /* 0x0308002C */ + HFSDBG_MARK_FREE_BITMAP = HFSDBG_CODE(12), /* 0x03080030 */ + HFSDBG_BLOCK_FIND_CONTIG = HFSDBG_CODE(13), /* 0x03080034 */ + HFSDBG_IS_ALLOCATED = HFSDBG_CODE(14), /* 0x03080038 */ + /* 0x0308003C is unused */ + HFSDBG_RESET_EXTENT_CACHE = HFSDBG_CODE(16), /* 0x03080040 */ + HFSDBG_REMOVE_EXTENT_CACHE = HFSDBG_CODE(17), /* 0x03080044 */ + HFSDBG_ADD_EXTENT_CACHE = HFSDBG_CODE(18), /* 0x03080048 */ + HFSDBG_READ_BITMAP_RANGE = HFSDBG_CODE(19), /* 0x0308004C */ + HFSDBG_RELEASE_SCAN_BITMAP = HFSDBG_CODE(20), /* 0x03080050 */ + HFSDBG_SYNCER = HFSDBG_CODE(21), /* 0x03080054 */ + HFSDBG_SYNCER_TIMED = HFSDBG_CODE(22), /* 0x03080058 */ + HFSDBG_UNMAP_SCAN = HFSDBG_CODE(23), /* 0x0308005C */ + HFSDBG_UNMAP_SCAN_TRIM = HFSDBG_CODE(24), /* 0x03080060 */ +}; + +/* + Parameters logged by the above tracepoints: +--------------------------------------------------------------------------------------------------------------------------------- + 
CODE EVENT NAME DBG_FUNC_START arg1, arg2, arg3, arg4, arg5 ... DBG_FUNC_END arg1, arg2, arg3, arg4, arg5 + DBG_FUNC_NONE arg1, arg2, arg3, arg4, arg5 +--------------------------------------------------------------------------------------------------------------------------------- +0x3010000 HFSDBG_WRITE offset, uio_resid, ff_size, filebytes, 0 ... uio_offset, uio_resid, ff_size, filebytes, 0 + offset, uio_resid, ff_size, filebytes, 0 +0x301001C HFSDBG_TRUNCATE length, ff_size, filebytes, 0, 0 ... length, ff_size, filebytes, retval, 0 + length, ff_size, filebytes, 0, 0 +0x3010030 HFSDBG_READ uio_offset, uio_resid, filesize, filebytes, 0 ... uio_offset, uio_resid, filesize, filebytes, 0 +0x3010094 HFSDBG_GETNEWVNODE c_vp, c_rsrc_vp, 0, 0, 0 +0x3018000 HFSDBG_UPDATE vp, tstate, 0, 0, 0 ... vp, tstate, error, 0/-1, 0 + 0 HFSDBG_UNMAP_FREE startBlock, blockCount, 0, 0, 0 ... err, 0, 0, 0, 0 + 1 HFSDBG_UNMAP_ALLOC startBlock, blockCount, 0, 0, 0 ... err, 0, 0, 0, 0 + 2 HFSDBG_UNMAP_CALLBACK 0, extentCount, 0, 0, 0 ... 0, 0, 0, 0, 0 + 3 unused + 4 HFSDBG_BLOCK_ALLOCATE startBlock, minBlocks, maxBlocks, flags, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 + 5 HFSDBG_BLOCK_DEALLOCATE startBlock, blockCount, flags, 0, 0 ... err, 0, 0, 0, 0 + 6 HFSDBG_READ_BITMAP_BLOCK startBlock, 0, 0, 0, 0 ... err, 0, 0, 0, 0 + 7 HFSDBG_RELEASE_BITMAP_BLOCK dirty, 0, 0, 0, 0 ... 0, 0, 0, 0, 0 + 8 HFSDBG_FIND_CONTIG_BITMAP startBlock, minBlocks, maxBlocks, useMeta, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 + 9 HFSDBG_ALLOC_ANY_BITMAP startBlock, endBlock, maxBlocks, useMeta, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 + 10 HFSDBG_ALLOC_FIND_KNOWN 0, 0, maxBlocks, 0, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 + 11 HFSDBG_MARK_ALLOC_BITMAP startBlock, blockCount, flags, 0, 0 ... err, 0, 0, 0, 0 + 12 HFSDBG_MARK_FREE_BITMAP startBlock, blockCount, valid, 0, 0 ... err, 0, 0, 0, 0 + 13 HFSDBG_BLOCK_FIND_CONTIG startBlock, endBlock, minBlocks, maxBlocks, 0 ... err, actualStartBlock, actualBlockCount, 0, 0 + 14 HFSDBG_IS_ALLOCATED startBlock, blockCount, stop, 0, 0 ... err, 0, actualBlockCount, 0, 0 + 15 unused + 16 HFSDBG_RESET_EXTENT_CACHE 0, 0, 0, 0, 0 ... 0, 0, 0, 0, 0 + 17 HFSDBG_REMOVE_EXTENT_CACHE startBlock, blockCount, vcbFreeExtCnt, 0, 0 ... 0, 0, vcbFreeExtCnt, extentsRemoved, 0 + 18 HFSDBG_ADD_EXTENT_CACHE startBlock, blockCount, vcbFreeExtCnt, 0, 0 ... 0, 0, vcbFreeExtCnt, retval, 0 + 19 HFSDBG_READ_BITMAP_RANGE startBlock, iosize, 0, 0, 0 ... err, 0, 0, 0, 0 + 20 HFSDBG_RELEASE_SCAN_BITMAP 0, 0, 0, 0, 0 ... 0, 0, 0, 0, 0 + 21 HFSDBG_SYNCER hfsmp, now, mnt_last_write_completed_timestamp, mnt_pending_write_size, 0 ... err, deadline, 0, 0, 0 + 22 HFSDBG_SYNCER_TIMED now, last_write_completed, hfs_mp->mnt_last_write_issued_timestamp, mnt_pending_write_size, 0 ... now, mnt_last_write_completed_timestamp, mnt_last_write_issued_timestamp, hfs_mp->mnt_pending_write_size, 0 + 23 HFSDBG_UNMAP_SCAN hfs_raw_dev, 0, 0, 0, 0 ... hfs_raw_dev, error, 0, 0, 0 + 24 HFSDBG_UNMAP_TRIM hfs_raw_dev, 0, 0, 0, 0 ... hfs_raw_dev, error, 0, 0, 0 +*/ + +#endif // HFS_KDEBUG_H_ diff --git a/core/hfs_link.c b/core/hfs_link.c new file mode 100644 index 0000000..478f519 --- /dev/null +++ b/core/hfs_link.c @@ -0,0 +1,1419 @@ +/* + * Copyright (c) 1999-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_format.h" +#include "hfs_endian.h" + + +static int cur_link_id = 0; + +/* + * Private directories where hardlink inodes reside. + */ +const char *hfs_private_names[] = { + HFSPLUSMETADATAFOLDER, /* FILE HARDLINKS */ + HFSPLUS_DIR_METADATA_FOLDER /* DIRECTORY HARDLINKS */ +}; + + +/* + * Hardlink inodes save the head of their link chain in a + * private extended attribute. The following calls are + * used to access this attribute. + */ +static int setfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t firstlink); +static int getfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t *firstlink); + +/* + * Create a new catalog link record + * + * An indirect link is a reference to an inode (the real + * file or directory record). + * + * All the indirect links for a given inode are chained + * together in a doubly linked list. + * + * Pre-Leopard file hard links do not have kHFSHasLinkChainBit + * set and do not have first/prev/next link IDs i.e. the values + * are zero. If a new link is being added to an existing + * pre-Leopard file hard link chain, do not set kHFSHasLinkChainBit. 
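+ *
+ * For illustration: if links are created for an inode in the order
+ * A, B, C, the chain is threaded newest-first, with the inode's
+ * first-link pointing at the most recently created link:
+ *
+ *	firstlink -> C -next-> B -next-> A -next-> 0
+ *
+ * and each link's prev pointer runs the opposite way (A -prev-> B, etc.).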
+ */ +static int +createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, struct cat_desc *descp, + cnid_t nextcnid, cnid_t *linkcnid, int is_inode_linkchain_set) +{ + struct FndrFileInfo *fip; + struct cat_attr attr; + + if (linknum == 0) { + printf("hfs: createindirectlink: linknum is zero!\n"); + return (EINVAL); + } + + /* Setup the default attributes */ + bzero(&attr, sizeof(attr)); + + /* Links are matched to inodes by link ID and to volumes by create date */ + attr.ca_linkref = linknum; + attr.ca_itime = hfsmp->hfs_metadata_createdate; + attr.ca_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH; + attr.ca_recflags = kHFSHasLinkChainMask | kHFSThreadExistsMask; + attr.ca_flags = UF_IMMUTABLE; + fip = (struct FndrFileInfo *)&attr.ca_finderinfo; + + if (descp->cd_flags & CD_ISDIR) { + fip->fdType = SWAP_BE32 (kHFSAliasType); + fip->fdCreator = SWAP_BE32 (kHFSAliasCreator); + fip->fdFlags = SWAP_BE16 (kIsAlias); + } else /* file */ { + fip->fdType = SWAP_BE32 (kHardLinkFileType); + fip->fdCreator = SWAP_BE32 (kHFSPlusCreator); + fip->fdFlags = SWAP_BE16 (kHasBeenInited); + /* If the file inode does not have kHFSHasLinkChainBit set + * and the next link chain ID is zero, assume that this + * is pre-Leopard file inode. Therefore clear the bit. + */ + if ((is_inode_linkchain_set == 0) && (nextcnid == 0)) { + attr.ca_recflags &= ~kHFSHasLinkChainMask; + } + } + /* Create the indirect link directly in the catalog */ + return cat_createlink(hfsmp, descp, &attr, nextcnid, linkcnid); +} + + +/* + * Make a link to the cnode cp in the directory dp + * using the name in cnp. src_vp is the vnode that + * corresponds to 'cp' which was part of the arguments to + * hfs_vnop_link. + * + * The cnodes cp and dcp must be locked. + */ +static int +hfs_makelink(struct hfsmount *hfsmp, struct vnode *src_vp, struct cnode *cp, + struct cnode *dcp, struct componentname *cnp, vfs_context_t ctx) +{ + struct proc *p = vfs_context_proc(ctx); + u_int32_t indnodeno = 0; + char inodename[32]; + struct cat_desc to_desc; + struct cat_desc link_desc; + int newlink = 0; + int lockflags; + int retval = 0; + cat_cookie_t cookie; + cnid_t orig_cnid; + cnid_t linkcnid = 0; + cnid_t orig_firstlink; + enum privdirtype type; + + type = S_ISDIR(cp->c_mode) ? DIR_HARDLINKS : FILE_HARDLINKS; + + if (cur_link_id == 0) { + cur_link_id = ((random() & 0x3fffffff) + 100); + } + + /* We don't allow link nodes in our private system directories. */ + if (dcp->c_fileid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + dcp->c_fileid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + return (EPERM); + } + + bzero(&cookie, sizeof(cat_cookie_t)); + /* Reserve some space in the Catalog file. */ + if ((retval = cat_preflight(hfsmp, (2 * CAT_CREATE)+ CAT_RENAME, &cookie, p))) { + return (retval); + } + + lockflags = SFL_CATALOG | SFL_ATTRIBUTE; + /* Directory hard links allocate space for a symlink. */ + if (type == DIR_HARDLINKS) { + lockflags |= SFL_BITMAP; + } + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + /* Save the current cnid value so we restore it if an error occurs. */ + orig_cnid = cp->c_desc.cd_cnid; + + /* + * If this is a new hardlink then we need to create the inode + * and replace the original file/dir object with a link node. + */ + if ((cp->c_linkcount == 2) && !(cp->c_flag & C_HARDLINK)) { + newlink = 1; + bzero(&to_desc, sizeof(to_desc)); + to_desc.cd_parentcnid = hfsmp->hfs_private_desc[type].cd_cnid; + to_desc.cd_cnid = cp->c_fileid; + to_desc.cd_flags = (type == DIR_HARDLINKS) ? 
CD_ISDIR : 0; + + do { + if (type == DIR_HARDLINKS) { + /* Directory hardlinks always use the cnid. */ + indnodeno = cp->c_fileid; + MAKE_DIRINODE_NAME(inodename, sizeof(inodename), + indnodeno); + } else { + /* Get a unique indirect node number */ + if (retval == 0) { + indnodeno = cp->c_fileid; + } else { + indnodeno = cur_link_id++; + } + MAKE_INODE_NAME(inodename, sizeof(inodename), + indnodeno); + } + /* Move original file/dir to data node directory */ + to_desc.cd_nameptr = (const u_int8_t *)inodename; + to_desc.cd_namelen = strlen(inodename); + + retval = cat_rename(hfsmp, &cp->c_desc, &hfsmp->hfs_private_desc[type], + &to_desc, NULL); + + if (retval != 0 && retval != EEXIST) { + printf("hfs_makelink: cat_rename to %s failed (%d) fileid=%d, vol=%s\n", + inodename, retval, cp->c_fileid, hfsmp->vcbVN); + } + } while ((retval == EEXIST) && (type == FILE_HARDLINKS)); + if (retval) + goto out; + + /* + * Replace original file/dir with a link record. + */ + + bzero(&link_desc, sizeof(link_desc)); + link_desc.cd_nameptr = cp->c_desc.cd_nameptr; + link_desc.cd_namelen = cp->c_desc.cd_namelen; + link_desc.cd_parentcnid = cp->c_parentcnid; + link_desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; + + retval = createindirectlink(hfsmp, indnodeno, &link_desc, 0, &linkcnid, true); + if (retval) { + int err; + + /* Restore the cnode's cnid. */ + cp->c_desc.cd_cnid = orig_cnid; + + /* Put the original file back. */ + err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + if (err) { + if (err != EIO && err != ENXIO) + printf("hfs_makelink: error %d from cat_rename backout 1", err); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } + if (retval != EIO && retval != ENXIO) { + printf("hfs_makelink: createindirectlink (1) failed: %d\n", retval); + retval = EIO; + } + goto out; + } + cp->c_attr.ca_linkref = indnodeno; + cp->c_desc.cd_cnid = linkcnid; + /* Directory hard links store the first link in an attribute. */ + if (type == DIR_HARDLINKS) { + if (setfirstlink(hfsmp, cp->c_fileid, linkcnid) == 0) + cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + } else /* FILE_HARDLINKS */ { + cp->c_attr.ca_firstlink = linkcnid; + } + cp->c_attr.ca_recflags |= kHFSHasLinkChainMask; + } else { + indnodeno = cp->c_attr.ca_linkref; + } + + /* + * Create a catalog entry for the new link (parentID + name). + */ + + bzero(&link_desc, sizeof(link_desc)); + link_desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + link_desc.cd_namelen = strlen(cnp->cn_nameptr); + link_desc.cd_parentcnid = dcp->c_fileid; + link_desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; + + /* Directory hard links store the first link in an attribute. */ + if (type == DIR_HARDLINKS) { + retval = getfirstlink(hfsmp, cp->c_fileid, &orig_firstlink); + } else /* FILE_HARDLINKS */ { + orig_firstlink = cp->c_attr.ca_firstlink; + } + if (retval == 0) + retval = createindirectlink(hfsmp, indnodeno, &link_desc, + orig_firstlink, &linkcnid, + (cp->c_attr.ca_recflags & kHFSHasLinkChainMask)); + if (retval && newlink) { + int err; + + /* Get rid of new link */ + (void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); + + /* Restore the cnode's cnid. */ + cp->c_desc.cd_cnid = orig_cnid; + + /* Put the original file back. 
*/ + err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + if (err) { + if (err != EIO && err != ENXIO) + printf("hfs_makelink: error %d from cat_rename backout 2", err); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } + + cp->c_attr.ca_linkref = 0; + + if (retval != EIO && retval != ENXIO) { + printf("hfs_makelink: createindirectlink (2) failed: %d\n", retval); + retval = EIO; + } + goto out; + } else if (retval == 0) { + + /* Update the original first link to point back to the new first link. */ + if (cp->c_attr.ca_recflags & kHFSHasLinkChainMask) { + (void) cat_update_siblinglinks(hfsmp, orig_firstlink, linkcnid, HFS_IGNORABLE_LINK); + + /* Update the inode's first link value. */ + if (type == DIR_HARDLINKS) { + if (setfirstlink(hfsmp, cp->c_fileid, linkcnid) == 0) + cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + } else { + cp->c_attr.ca_firstlink = linkcnid; + } + } + /* + * Finally, if this is a new hardlink then: + * - update the private system directory + * - mark the cnode as a hard link + */ + if (newlink) { + vnode_t vp; + + hfsmp->hfs_private_attr[type].ca_entries++; + /* From application perspective, directory hard link is a + * normal directory. Therefore count the new directory + * hard link for folder count calculation. + */ + if (type == DIR_HARDLINKS) { + INC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[type]); + } + retval = cat_update(hfsmp, &hfsmp->hfs_private_desc[type], + &hfsmp->hfs_private_attr[type], NULL, NULL); + if (retval) { + if (retval != EIO && retval != ENXIO) { + printf("hfs_makelink: cat_update of privdir failed! (%d)\n", retval); + retval = EIO; + } + hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); + } + cp->c_flag |= C_HARDLINK; + + /* + * Now we need to mark the vnodes as being hardlinks via the vnode_setmultipath call. + * Note that we're calling vnode_get here, which should simply add an iocount if possible, without + * doing much checking. It's safe to call this because we are protected by the cnode lock, which + * ensures that anyone trying to reclaim it will block until we release it. vnode_get will usually + * give us an extra iocount, unless the vnode is about to be reclaimed (and has no iocounts). + * In that case, we'd error out, but we'd also not care if we added the VISHARDLINK bit to the vnode. + * + * As for the iocount we're about to add, we can't necessarily always call vnode_put here. + * If the one we add is the only iocount on the vnode, and there was + * sufficient vnode pressure, it could go through VNOP_INACTIVE immediately, which would + * require the cnode lock and cause us to double-lock panic. We can only call vnode_put if we know + * that the vnode we're operating on is the one with which we came into hfs_vnop_link, because + * that means VFS took an iocount on it for us. If it's *not* the one that we came into the call + * with, then mark it as NEED_VNODE_PUT to have hfs_unlock drop it for us. hfs_vnop_link will + * unlock the cnode when it is finished. + */ + if ((vp = cp->c_vp) != NULLVP) { + if (vnode_get(vp) == 0) { + vnode_setmultipath(vp); + if (vp == src_vp) { + /* we have an iocount on data fork vnode already. 
*/ + vnode_put(vp); + } + else { + cp->c_flag |= C_NEED_DVNODE_PUT; + } + } + } + if ((vp = cp->c_rsrc_vp) != NULLVP) { + if (vnode_get(vp) == 0) { + vnode_setmultipath(vp); + if (vp == src_vp) { + vnode_put(vp); + } + else { + cp->c_flag |= C_NEED_RVNODE_PUT; + } + } + } + cp->c_flag |= C_MODIFIED; + cp->c_touch_chgtime = TRUE; + } + } +out: + hfs_systemfile_unlock(hfsmp, lockflags); + + cat_postflight(hfsmp, &cookie, p); + + if (retval == 0 && newlink) { + hfs_volupdate(hfsmp, VOL_MKFILE, 0); + } + return (retval); +} + + +/* + * link vnode operation + * + * IN vnode_t a_vp; + * IN vnode_t a_tdvp; + * IN struct componentname *a_cnp; + * IN vfs_context_t a_context; + */ +int +hfs_vnop_link(struct vnop_link_args *ap) +{ + struct hfsmount *hfsmp; + struct vnode *vp = ap->a_vp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *fdvp = NULLVP; + struct componentname *cnp = ap->a_cnp; + struct cnode *cp; + struct cnode *tdcp; + struct cnode *fdcp = NULL; + struct cat_desc todesc; + cnid_t parentcnid; + int lockflags = 0; + int intrans = 0; + enum vtype v_type; + int error, ret; + + hfsmp = VTOHFS(vp); + v_type = vnode_vtype(vp); + + /* No hard links in HFS standard file systems. */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (ENOTSUP); + } + /* Linking to a special file is not permitted. */ + if (v_type == VBLK || v_type == VCHR) { + return (EPERM); + } + + /* + * For now, return ENOTSUP for a symlink target. This can happen + * for linkat(2) when called without AT_SYMLINK_FOLLOW. + */ + if (v_type == VLNK) + return (ENOTSUP); + + cp = VTOC(vp); + + if (v_type == VDIR) { +#if CONFIG_HFS_DIRLINK + /* Make sure our private directory exists. */ + if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid == 0) { + return (EPERM); + } + /* + * Directory hardlinks (ADLs) have only been qualified on + * journaled HFS+. If/when they are tested on non-journaled + * file systems then this test can be removed. + */ + if (hfsmp->jnl == NULL) { + return (EPERM); + } + + /* Directory hardlinks also need the parent of the original directory. */ + if ((error = hfs_vget(hfsmp, hfs_currentparent(cp, /* have_lock: */ false), + &fdvp, 1, 0))) { + return (error); + } +#else + /* some platforms don't support directory hardlinks. */ + return EPERM; +#endif + } else { + /* Make sure our private directory exists. */ + if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid == 0) { + return (ENOTSUP); + } + } + if (hfs_freeblks(hfsmp, 0) == 0) { + if (fdvp) { + vnode_put(fdvp); + } + return (ENOSPC); + } + + nspace_snapshot_event(vp, VTOC(vp)->c_ctime, NAMESPACE_HANDLER_LINK_CREATE, NULL); + + /* Lock the cnodes. */ + if (fdvp) { + if ((error = hfs_lockfour(VTOC(tdvp), VTOC(vp), VTOC(fdvp), NULL, HFS_EXCLUSIVE_LOCK, NULL))) { + if (fdvp) { + vnode_put(fdvp); + } + return (error); + } + fdcp = VTOC(fdvp); + } else { + if ((error = hfs_lockpair(VTOC(tdvp), VTOC(vp), HFS_EXCLUSIVE_LOCK))) { + return (error); + } + } + tdcp = VTOC(tdvp); + /* grab the parent CNID from originlist after grabbing cnode locks */ + parentcnid = hfs_currentparent(cp, /* have_lock: */ true); + + /* + * Make sure we didn't race the src or dst parent directories with rmdir. + * Note that we should only have a src parent directory cnode lock + * if we're dealing with a directory hardlink here. 
+ */ + if (fdcp) { + if (fdcp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + } + + if (tdcp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + + /* Check the source for errors: + * too many links, immutable, race with unlink + */ + if (cp->c_linkcount >= HFS_LINK_MAX) { + error = EMLINK; + goto out; + } + if (cp->c_bsdflags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto out; + } + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + + tdcp->c_flag |= C_DIR_MODIFICATION; + + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + intrans = 1; + + todesc.cd_flags = (v_type == VDIR) ? CD_ISDIR : 0; + todesc.cd_encoding = 0; + todesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + todesc.cd_namelen = cnp->cn_namelen; + todesc.cd_parentcnid = tdcp->c_fileid; + todesc.cd_hint = 0; + todesc.cd_cnid = 0; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* If destination exists then we lost a race with create. */ + if (cat_lookup(hfsmp, &todesc, 0, 0, NULL, NULL, NULL, NULL) == 0) { + error = EEXIST; + goto out; + } + if (cp->c_flag & C_HARDLINK) { + struct cat_attr cattr; + + /* If inode is missing then we lost a race with unlink. */ + if ((cat_idlookup(hfsmp, cp->c_fileid, 0, 0, NULL, &cattr, NULL) != 0) || + (cattr.ca_fileid != cp->c_fileid)) { + error = ENOENT; + goto out; + } + } else { + cnid_t fileid; + + /* If source is missing then we lost a race with unlink. */ + if ((cat_lookup(hfsmp, &cp->c_desc, 0, 0, NULL, NULL, NULL, &fileid) != 0) || + (fileid != cp->c_fileid)) { + error = ENOENT; + goto out; + } + } + /* + * All directory links must reside in an non-ARCHIVED hierarchy. + */ + if (v_type == VDIR) { + /* + * - Source parent and destination parent cannot match + * - A link is not permitted in the root directory + * - Parent of 'pointed at' directory is not the root directory + * - The 'pointed at' directory (source) is not an ancestor + * of the new directory hard link (destination). + * - No ancestor of the new directory hard link (destination) + * is a directory hard link. 
+ */ + if ((parentcnid == tdcp->c_fileid) || + (tdcp->c_fileid == kHFSRootFolderID) || + (parentcnid == kHFSRootFolderID) || + cat_check_link_ancestry(hfsmp, tdcp->c_fileid, cp->c_fileid)) { + error = EPERM; /* abide by the rules, you did not */ + goto out; + } + } + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + + cp->c_linkcount++; + cp->c_flag |= C_MODIFIED; + cp->c_touch_chgtime = TRUE; + error = hfs_makelink(hfsmp, vp, cp, tdcp, cnp, ap->a_context); + if (error) { + cp->c_linkcount--; + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } else { + /* Invalidate negative cache entries in the destination directory */ + if (tdcp->c_flag & C_NEG_ENTRIES) { + cache_purge_negatives(tdvp); + tdcp->c_flag &= ~C_NEG_ENTRIES; + } + + /* Update the target directory and volume stats */ + tdcp->c_entries++; + if (v_type == VDIR) { + INC_FOLDERCOUNT(hfsmp, tdcp->c_attr); + tdcp->c_attr.ca_recflags |= kHFSHasChildLinkMask; + + /* Set kHFSHasChildLinkBit in the destination hierarchy */ + error = cat_set_childlinkbit(hfsmp, tdcp->c_parentcnid); + if (error) { + printf ("hfs_vnop_link: error updating destination parent chain for id=%u, vol=%s\n", tdcp->c_cnid, hfsmp->vcbVN); + error = 0; + } + } + tdcp->c_dirchangecnt++; + tdcp->c_flag |= C_MODIFIED; + hfs_incr_gencount(tdcp); + tdcp->c_touch_chgtime = TRUE; + tdcp->c_touch_modtime = TRUE; + + error = hfs_update(tdvp, 0); + if (error) { + if (error != EIO && error != ENXIO) { + printf("hfs_vnop_link: error %d updating tdvp %p\n", error, tdvp); + error = EIO; + } + hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); + } + + if ((v_type == VDIR) && + (fdcp != NULL) && + ((fdcp->c_attr.ca_recflags & kHFSHasChildLinkMask) == 0)) { + + fdcp->c_attr.ca_recflags |= kHFSHasChildLinkMask; + fdcp->c_flag |= C_MODIFIED; + fdcp->c_touch_chgtime = TRUE; + error = hfs_update(fdvp, 0); + if (error) { + if (error != EIO && error != ENXIO) { + printf("hfs_vnop_link: error %d updating fdvp %p\n", error, fdvp); + // No point changing error as it's set immediate below + } + hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); + } + + /* Set kHFSHasChildLinkBit in the source hierarchy */ + error = cat_set_childlinkbit(hfsmp, fdcp->c_parentcnid); + if (error) { + printf ("hfs_vnop_link: error updating source parent chain for id=%u, vol=%s\n", fdcp->c_cnid, hfsmp->vcbVN); + error = 0; + } + } + hfs_volupdate(hfsmp, VOL_MKFILE, + (tdcp->c_cnid == kHFSRootFolderID)); + } + + if (error == 0 && (ret = hfs_update(vp, 0)) != 0) { + if (ret != EIO && ret != ENXIO) + printf("hfs_vnop_link: error %d updating vp @ %p\n", ret, vp); + hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); + } + +out: + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (intrans) { + hfs_end_transaction(hfsmp); + } + + tdcp->c_flag &= ~C_DIR_MODIFICATION; + wakeup((caddr_t)&tdcp->c_flag); + + if (fdcp) { + hfs_unlockfour(tdcp, cp, fdcp, NULL); + } else { + hfs_unlockpair(tdcp, cp); + } + if (fdvp) { + vnode_put(fdvp); + } + return (error); +} + + +/* + * Remove a link to a hardlink file/dir. + * + * Note: dvp and vp cnodes are already locked. 
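+ *
+ * If other links remain after the removal, the link chain is repaired
+ * below: the previous link's next pointer is set to nextlinkid, the next
+ * link's prev pointer is set to prevlinkid, and if the removed link was
+ * the head of the chain, the inode's first-link (ca_firstlink, or the
+ * first-link attribute for directory hard links) is advanced to nextlinkid.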
+ */ +int +hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int skip_reserve) +{ + struct cnode *cp; + struct cnode *dcp; + struct cat_desc cndesc; + struct timeval tv; + char inodename[32]; + cnid_t prevlinkid; + cnid_t nextlinkid; + int lockflags = 0; + int started_tr; + int error; + + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (EPERM); + } + cp = VTOC(vp); + dcp = VTOC(dvp); + + dcp->c_flag |= C_DIR_MODIFICATION; + + /* Remove the entry from the namei cache: */ + cache_purge(vp); + + if ((error = hfs_start_transaction(hfsmp)) != 0) { + started_tr = 0; + goto out; + } + started_tr = 1; + + /* + * Protect against a race with rename by using the component + * name passed in and parent id from dvp (instead of using + * the cp->c_desc which may have changed). + * + * Re-lookup the component name so we get the correct cnid + * for the name (as opposed to the c_cnid in the cnode which + * could have changed before the cnode was locked). + */ + cndesc.cd_flags = vnode_isdir(vp) ? CD_ISDIR : 0; + cndesc.cd_encoding = cp->c_desc.cd_encoding; + cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + cndesc.cd_namelen = cnp->cn_namelen; + cndesc.cd_parentcnid = dcp->c_fileid; + cndesc.cd_hint = dcp->c_childhint; + + lockflags = SFL_CATALOG | SFL_ATTRIBUTE; + if (cndesc.cd_flags & CD_ISDIR) { + /* We'll be removing the alias resource allocation blocks. */ + lockflags |= SFL_BITMAP; + } + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + if ((error = cat_lookuplink(hfsmp, &cndesc, &cndesc.cd_cnid, &prevlinkid, &nextlinkid))) { + goto out; + } + + /* Reserve some space in the catalog file. */ + if (!skip_reserve && (error = cat_preflight(hfsmp, 2 * CAT_DELETE, NULL, 0))) { + goto out; + } + + /* Purge any cached origin entries for a directory or file hard link. */ + hfs_relorigin(cp, dcp->c_fileid); + if (dcp->c_fileid != dcp->c_cnid) { + hfs_relorigin(cp, dcp->c_cnid); + } + + /* Delete the link record. */ + if ((error = cat_deletelink(hfsmp, &cndesc))) { + goto out; + } + + /* Update the parent directory. */ + if (dcp->c_entries > 0) { + dcp->c_entries--; + } + if (cndesc.cd_flags & CD_ISDIR) { + DEC_FOLDERCOUNT(hfsmp, dcp->c_attr); + } + dcp->c_dirchangecnt++; + hfs_incr_gencount(dcp); + microtime(&tv); + dcp->c_touch_chgtime = dcp->c_touch_modtime = true; + dcp->c_flag |= C_MODIFIED; + hfs_update(dcp->c_vp, 0); + + /* + * If this is the last link then we need to process the inode. + * Otherwise we need to fix up the link chain. + */ + --cp->c_linkcount; + if (cp->c_linkcount < 1) { + char delname[32]; + struct cat_desc to_desc; + struct cat_desc from_desc; + + /* + * If a file inode or directory inode is being deleted, rename + * it to an open deleted file. This ensures that deletion + * of inode and its corresponding extended attributes does + * not overflow the journal. This inode will be deleted + * either in hfs_vnop_inactive() or in hfs_remove_orphans(). + * Note: a rename failure here is not fatal. 
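+ *
+ * For example, an inode record named from its link reference by
+ * MAKE_INODE_NAME (e.g. "iNode1234") ends up renamed to the open-deleted
+ * name built from its file ID by MAKE_DELETED_NAME (e.g. "temp1234"),
+ * still inside the file-hardlink metadata directory.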
+ */ + bzero(&from_desc, sizeof(from_desc)); + bzero(&to_desc, sizeof(to_desc)); + if (vnode_isdir(vp)) { + if (cp->c_entries != 0) { + panic("hfs_unlink: dir not empty (id %d, %d entries)", cp->c_fileid, cp->c_entries); + } + MAKE_DIRINODE_NAME(inodename, sizeof(inodename), + cp->c_attr.ca_linkref); + from_desc.cd_parentcnid = hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid; + from_desc.cd_flags = CD_ISDIR; + to_desc.cd_flags = CD_ISDIR; + } else { + MAKE_INODE_NAME(inodename, sizeof(inodename), + cp->c_attr.ca_linkref); + from_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + from_desc.cd_flags = 0; + to_desc.cd_flags = 0; + } + from_desc.cd_nameptr = (const u_int8_t *)inodename; + from_desc.cd_namelen = strlen(inodename); + from_desc.cd_cnid = cp->c_fileid; + + MAKE_DELETED_NAME(delname, sizeof(delname), cp->c_fileid); + to_desc.cd_nameptr = (const u_int8_t *)delname; + to_desc.cd_namelen = strlen(delname); + to_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + to_desc.cd_cnid = cp->c_fileid; + + error = cat_rename(hfsmp, &from_desc, &hfsmp->hfs_private_desc[FILE_HARDLINKS], + &to_desc, (struct cat_desc *)NULL); + if (error == 0) { + cp->c_flag |= C_DELETED; + cp->c_attr.ca_recflags &= ~kHFSHasLinkChainMask; + cp->c_attr.ca_firstlink = 0; + if (vnode_isdir(vp)) { + hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries--; + DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[DIR_HARDLINKS]); + + hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries++; + INC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + + (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[DIR_HARDLINKS], + &hfsmp->hfs_private_attr[DIR_HARDLINKS], NULL, NULL); + (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], + &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + } + } else { + error = 0; /* rename failure here is not fatal */ + } + } else /* Still some links left */ { + cnid_t firstlink; + + /* + * Update the start of the link chain. + * Note: Directory hard links store the first link in an attribute. + */ + if (vnode_isdir(vp) && + getfirstlink(hfsmp, cp->c_fileid, &firstlink) == 0 && + firstlink == cndesc.cd_cnid) { + if (setfirstlink(hfsmp, cp->c_fileid, nextlinkid) == 0) + cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + } else if (cp->c_attr.ca_firstlink == cndesc.cd_cnid) { + cp->c_attr.ca_firstlink = nextlinkid; + } + /* Update previous link. */ + if (prevlinkid) { + (void) cat_update_siblinglinks(hfsmp, prevlinkid, HFS_IGNORABLE_LINK, nextlinkid); + } + /* Update next link. */ + if (nextlinkid) { + (void) cat_update_siblinglinks(hfsmp, nextlinkid, prevlinkid, HFS_IGNORABLE_LINK); + } + } + + /* + * The call to cat_releasedesc below will only release the name + * buffer; it does not zero out the rest of the fields in the + * 'cat_desc' data structure. + * + * As a result, since there are still other links at this point, + * we need to make the current cnode descriptor point to the raw + * inode. If a path-based system call comes along first, it will + * replace the descriptor with a valid link ID. If a userland + * process already has a file descriptor open, then they will + * bypass that lookup, though. Replacing the descriptor CNID with + * the raw inode will force it to generate a new full path. + */ + cp->c_cnid = cp->c_fileid; + + /* Push new link count to disk. */ + cp->c_ctime = tv.tv_sec; + (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); + + /* All done with the system files. 
*/ + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + + /* Update file system stats. */ + hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); + + /* + * All done with this cnode's descriptor... + * + * Note: all future catalog calls for this cnode may be + * by fileid only. This is OK for HFS (which doesn't have + * file thread records) since HFS doesn't support hard links. + */ + cat_releasedesc(&cp->c_desc); + +out: + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + dcp->c_flag &= ~C_DIR_MODIFICATION; + wakeup((caddr_t)&dcp->c_flag); + + return (error); +} + + +/* + * Initialize the HFS+ private system directories. + * + * These directories are used to hold the inodes + * for file and directory hardlinks as well as + * open-unlinked files. + * + * If they don't yet exist they will get created. + * + * This call is assumed to be made during mount. + */ +void +hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) +{ + struct vnode * dvp = NULLVP; + struct cnode * dcp = NULL; + struct cat_desc *priv_descp; + struct cat_attr *priv_attrp; + struct FndrDirInfo * fndrinfo; + struct timeval tv; + int lockflags; + int trans = 0; + int error; + + if (hfsmp->hfs_flags & HFS_STANDARD) { + return; + } + + priv_descp = &hfsmp->hfs_private_desc[type]; + priv_attrp = &hfsmp->hfs_private_attr[type]; + + /* Check if directory already exists. */ + if (priv_descp->cd_cnid != 0) { + return; + } + + priv_descp->cd_parentcnid = kRootDirID; + priv_descp->cd_nameptr = (const u_int8_t *)hfs_private_names[type]; + priv_descp->cd_namelen = strlen((const char *)priv_descp->cd_nameptr); + priv_descp->cd_flags = CD_ISDIR | CD_DECOMPOSED; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_lookup(hfsmp, priv_descp, 0, 0, NULL, priv_attrp, NULL, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error == 0) { + if (type == FILE_HARDLINKS) { + hfsmp->hfs_metadata_createdate = priv_attrp->ca_itime; + } + priv_descp->cd_cnid = priv_attrp->ca_fileid; + goto exit; + } + + /* Directory is missing, if this is read-only then we're done. */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + goto exit; + } + + /* Grab the root directory so we can update it later. */ + if (hfs_vget(hfsmp, kRootDirID, &dvp, 0, 0) != 0) { + goto exit; + } + dcp = VTOC(dvp); + + /* Setup the default attributes */ + bzero(priv_attrp, sizeof(struct cat_attr)); + priv_attrp->ca_flags = UF_IMMUTABLE | UF_HIDDEN; + priv_attrp->ca_mode = S_IFDIR; + if (type == DIR_HARDLINKS) { + priv_attrp->ca_mode |= S_ISVTX | S_IRUSR | S_IXUSR | S_IRGRP | + S_IXGRP | S_IROTH | S_IXOTH; + } + priv_attrp->ca_linkcount = 1; + priv_attrp->ca_itime = hfsmp->hfs_itime; + priv_attrp->ca_recflags = kHFSHasFolderCountMask; + + fndrinfo = (struct FndrDirInfo *)&priv_attrp->ca_finderinfo; + fndrinfo->frLocation.v = SWAP_BE16(16384); + fndrinfo->frLocation.h = SWAP_BE16(16384); + fndrinfo->frFlags = SWAP_BE16(kIsInvisible + kNameLocked); + + if (hfs_start_transaction(hfsmp) != 0) { + goto exit; + } + trans = 1; + + /* Need the catalog and EA b-trees for CNID acquisition */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + /* Make sure there's space in the Catalog file. 
*/ + if (cat_preflight(hfsmp, CAT_CREATE, NULL, 0) != 0) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto exit; + } + + /* Get the CNID for use */ + cnid_t new_id; + if ((error = cat_acquire_cnid(hfsmp, &new_id))) { + hfs_systemfile_unlock (hfsmp, lockflags); + goto exit; + } + + /* Create the private directory on disk. */ + error = cat_create(hfsmp, new_id, priv_descp, priv_attrp, NULL); + if (error == 0) { + priv_descp->cd_cnid = priv_attrp->ca_fileid; + + /* Update the parent directory */ + dcp->c_entries++; + INC_FOLDERCOUNT(hfsmp, dcp->c_attr); + dcp->c_dirchangecnt++; + hfs_incr_gencount(dcp); + microtime(&tv); + dcp->c_ctime = tv.tv_sec; + dcp->c_mtime = tv.tv_sec; + (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { + goto exit; + } + if (type == FILE_HARDLINKS) { + hfsmp->hfs_metadata_createdate = priv_attrp->ca_itime; + } + hfs_volupdate(hfsmp, VOL_MKDIR, 1); +exit: + if (trans) { + hfs_end_transaction(hfsmp); + } + if (dvp) { + hfs_unlock(dcp); + vnode_put(dvp); + } + if ((error == 0) && (type == DIR_HARDLINKS)) { + hfs_xattr_init(hfsmp); + } +} + + +/* + * Lookup a hardlink link (from chain) + */ +int +hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) +{ + int lockflags; + int error; + + *prevlinkid = 0; + *nextlinkid = 0; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + error = cat_lookup_siblinglinks(hfsmp, linkfileid, prevlinkid, nextlinkid); + if (error == ENOLINK) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + + error = getfirstlink(hfsmp, linkfileid, nextlinkid); + } + hfs_systemfile_unlock(hfsmp, lockflags); + + return (error); +} + + +/* Find the oldest / last hardlink in the link chain */ +int +hfs_lookup_lastlink (struct hfsmount *hfsmp, cnid_t linkfileid, + cnid_t *lastid, struct cat_desc *cdesc) { + int lockflags; + int error; + + *lastid = 0; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + error = cat_lookup_lastlink(hfsmp, linkfileid, lastid, cdesc); + + hfs_systemfile_unlock(hfsmp, lockflags); + + /* + * cat_lookup_lastlink will zero out the lastid/cdesc arguments as needed + * upon error cases. + */ + return error; +} + + +/* + * Cache the origin of a directory or file hard link + * + * cnode must be lock on entry + */ +void +hfs_savelinkorigin(cnode_t *cp, cnid_t parentcnid) +{ + linkorigin_t *origin = NULL; + thread_t thread = current_thread(); + int count = 0; + int maxorigins = (S_ISDIR(cp->c_mode)) ? MAX_CACHED_ORIGINS : MAX_CACHED_FILE_ORIGINS; + /* + * Look for an existing origin first. If not found, create/steal one. + */ + TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { + ++count; + if (origin->lo_thread == thread) { + TAILQ_REMOVE(&cp->c_originlist, origin, lo_link); + break; + } + } + if (origin == NULL) { + /* Recycle the last (i.e., the oldest) if we have too many. 
*/ + if (count > maxorigins) { + origin = TAILQ_LAST(&cp->c_originlist, hfs_originhead); + TAILQ_REMOVE(&cp->c_originlist, origin, lo_link); + } else { + origin = hfs_malloc(sizeof(linkorigin_t)); + } + origin->lo_thread = thread; + } + origin->lo_cnid = cp->c_cnid; + origin->lo_parentcnid = parentcnid; + TAILQ_INSERT_HEAD(&cp->c_originlist, origin, lo_link); +} + +/* + * Release any cached origins for a directory or file hard link + * + * cnode must be lock on entry + */ +void +hfs_relorigins(struct cnode *cp) +{ + linkorigin_t *origin, *prev; + + TAILQ_FOREACH_SAFE(origin, &cp->c_originlist, lo_link, prev) { + hfs_free(origin, sizeof(*origin)); + } + TAILQ_INIT(&cp->c_originlist); +} + +/* + * Release a specific origin for a directory or file hard link + * + * cnode must be lock on entry + */ +void +hfs_relorigin(struct cnode *cp, cnid_t parentcnid) +{ + linkorigin_t *origin, *prev; + thread_t thread = current_thread(); + + TAILQ_FOREACH_SAFE(origin, &cp->c_originlist, lo_link, prev) { + if (origin->lo_thread == thread) { + TAILQ_REMOVE(&cp->c_originlist, origin, lo_link); + hfs_free(origin, sizeof(*origin)); + break; + } else if (origin->lo_parentcnid == parentcnid) { + /* + * If the threads don't match, then we don't want to + * delete the entry because that might cause other threads + * to fall back and use whatever happens to be in + * c_parentcnid or the wrong link ID. By setting the + * values to zero here, it should serve as an indication + * that the path is no longer valid and that's better than + * using a random parent ID or link ID. + */ + origin->lo_parentcnid = 0; + origin->lo_cnid = 0; + } + } +} + +/* + * Test if a directory or file hard link has a cached origin + * + * cnode must be lock on entry + */ +int +hfs_haslinkorigin(cnode_t *cp) +{ + if (cp->c_flag & C_HARDLINK) { + linkorigin_t *origin; + thread_t thread = current_thread(); + + TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { + if (origin->lo_thread == thread) { + return origin->lo_cnid != 0; + } + } + } + return (0); +} + +/* + * Obtain the current parent cnid of a directory or file hard link + * + * cnode must be lock on entry + */ +cnid_t +hfs_currentparent(cnode_t *cp, bool have_lock) +{ + if (cp->c_flag & C_HARDLINK) { + if (!have_lock) + hfs_lock_always(cp, HFS_SHARED_LOCK); + + linkorigin_t *origin; + thread_t thread = current_thread(); + + TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { + if (origin->lo_thread == thread) { + if (!have_lock) + hfs_unlock(cp); + return (origin->lo_parentcnid); + } + } + + if (!have_lock) + hfs_unlock(cp); + } + return (cp->c_parentcnid); +} + +/* + * Obtain the current cnid of a directory or file hard link + * + * cnode must be lock on entry + */ +cnid_t +hfs_currentcnid(cnode_t *cp) +{ + if (cp->c_flag & C_HARDLINK) { + linkorigin_t *origin; + thread_t thread = current_thread(); + + TAILQ_FOREACH(origin, &cp->c_originlist, lo_link) { + if (origin->lo_thread == thread) { + return (origin->lo_cnid); + } + } + } + return (cp->c_cnid); +} + + +/* + * Set the first link attribute for a given file id. + * + * The attributes b-tree must already be locked. + * If journaling is enabled, a transaction must already be started. 
+ */ +static int +setfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t firstlink) +{ + FCB * btfile; + BTreeIterator * iterator; + FSBufferDescriptor btdata; + u_int8_t attrdata[FIRST_LINK_XATTR_REC_SIZE]; + HFSPlusAttrData *dataptr; + int result; + u_int16_t datasize; + + if (hfsmp->hfs_attribute_cp == NULL) { + return (EPERM); + } + iterator = hfs_mallocz(sizeof(*iterator)); + + result = hfs_buildattrkey(fileid, FIRST_LINK_XATTR_NAME, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto out; + } + dataptr = (HFSPlusAttrData *)&attrdata[0]; + dataptr->recordType = kHFSPlusAttrInlineData; + dataptr->reserved[0] = 0; + dataptr->reserved[1] = 0; + + /* + * Since attrData is variable length, we calculate the size of + * attrData by subtracting the size of all other members of + * structure HFSPlusAttData from the size of attrdata. + */ + (void)snprintf((char *)&dataptr->attrData[0], + sizeof(dataptr) - (4 * sizeof(uint32_t)), + "%lu", (unsigned long)firstlink); + dataptr->attrSize = 1 + strlen((char *)&dataptr->attrData[0]); + + /* Calculate size of record rounded up to multiple of 2 bytes. */ + datasize = sizeof(HFSPlusAttrData) - 2 + dataptr->attrSize + ((dataptr->attrSize & 1) ? 1 : 0); + + btdata.bufferAddress = dataptr; + btdata.itemSize = datasize; + btdata.itemCount = 1; + + btfile = hfsmp->hfs_attribute_cp->c_datafork; + + /* Insert the attribute. */ + result = BTInsertRecord(btfile, iterator, &btdata, datasize); + if (result == btExists) { + result = BTReplaceRecord(btfile, iterator, &btdata, datasize); + } + (void) BTFlushPath(btfile); +out: + hfs_free(iterator, sizeof(*iterator)); + + return MacToVFSError(result); +} + +/* + * Get the first link attribute for a given file id. + * + * The attributes b-tree must already be locked. + */ +static int +getfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t *firstlink) +{ + FCB * btfile; + BTreeIterator * iterator; + FSBufferDescriptor btdata; + u_int8_t attrdata[FIRST_LINK_XATTR_REC_SIZE]; + HFSPlusAttrData *dataptr; + int result; + u_int16_t datasize; + + if (hfsmp->hfs_attribute_cp == NULL) { + return (EPERM); + } + iterator = hfs_mallocz(sizeof(*iterator)); + + result = hfs_buildattrkey(fileid, FIRST_LINK_XATTR_NAME, (HFSPlusAttrKey *)&iterator->key); + if (result) + goto out; + + dataptr = (HFSPlusAttrData *)&attrdata[0]; + datasize = sizeof(attrdata); + + btdata.bufferAddress = dataptr; + btdata.itemSize = sizeof(attrdata); + btdata.itemCount = 1; + + btfile = hfsmp->hfs_attribute_cp->c_datafork; + + result = BTSearchRecord(btfile, iterator, &btdata, NULL, NULL); + if (result) + goto out; + + if (dataptr->attrSize < 3) { + result = ENOENT; + goto out; + } + *firstlink = strtoul((char*)&dataptr->attrData[0], NULL, 10); +out: + hfs_free(iterator, sizeof(*iterator)); + + return MacToVFSError(result); +} + +errno_t hfs_first_link(hfsmount_t *hfsmp, cnode_t *cp, cnid_t *link_id) +{ + errno_t error = 0; + + if (S_ISDIR(cp->c_mode)) { + int lockf = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + + error = getfirstlink(hfsmp, cp->c_fileid, link_id); + + hfs_systemfile_unlock(hfsmp, lockf); + } else { + if (cp->c_attr.ca_firstlink) + *link_id = cp->c_attr.ca_firstlink; + else { + // This can happen if the cnode has been deleted + error = ENOENT; + } + } + + return error; +} diff --git a/core/hfs_lookup.c b/core/hfs_lookup.c new file mode 100644 index 0000000..943d194 --- /dev/null +++ b/core/hfs_lookup.c @@ -0,0 +1,680 @@ +/* + * Copyright (c) 1999-2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)hfs_lookup.c 1.0 + * derived from @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 + * + * (c) 1998-1999 Apple Inc. All Rights Reserved + * (c) 1990, 1992 NeXT Computer, Inc. All Rights Reserved + * + * + * hfs_lookup.c -- code to handle directory traversal on HFS/HFS+ volume + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" + + +/* + * FROM FREEBSD 3.1 + * Convert a component of a pathname into a pointer to a locked cnode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * Notice that these are the only operations that can affect the directory of the target. + * + * LOCKPARENT and WANTPARENT actually refer to the parent of the last item, + * so if ISLASTCN is not set, they should be ignored. Also they are mutually exclusive, or + * WANTPARENT really implies DONTLOCKPARENT. Either of them set means that the calling + * routine wants to access the parent of the target, locked or unlocked. + * + * Keeping the parent locked as long as possible protects from other processes + * looking up the same item, so it has to be locked until the cnode is totally finished + * + * hfs_cache_lookup() performs the following for us: + * check that it is a directory + * check accessibility of directory + * check for modification attempts on read-only mounts + * if name found in cache + * if at end of path and deleting or creating + * drop it + * else + * return name. + * return hfs_lookup() + * + * Overall outline of hfs_lookup: + * + * handle simple cases of . and .. + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * cnode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ + + +/* + * Lookup *cnp in directory *dvp, return it in *vpp. + * **vpp is held on exit. + * We create a cnode for the file, but we do NOT open the file here. + +#% lookup dvp L ? ? +#% lookup vpp - L - + + IN struct vnode *dvp - Parent node of file; + INOUT struct vnode **vpp - node of target file, its a new node if + the target vnode did not exist; + IN struct componentname *cnp - Name of file; + + * When should we lock parent_hp in here ?? 
+ */ +static int +hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int *cnode_locked, int force_casesensitive_lookup) +{ + struct cnode *dcp; /* cnode for directory being searched */ + struct vnode *tvp; /* target vnode */ + struct hfsmount *hfsmp; + int flags; + int nameiop; + int retval = 0; + int isDot; + struct cat_desc desc; + struct cat_desc cndesc; + struct cat_attr attr; + struct cat_fork fork; + int lockflags; + int newvnode_flags; + + retry: + newvnode_flags = 0; + dcp = NULL; + hfsmp = VTOHFS(dvp); + *vpp = NULL; + *cnode_locked = 0; + isDot = FALSE; + tvp = NULL; + nameiop = cnp->cn_nameiop; + flags = cnp->cn_flags; + bzero(&desc, sizeof(desc)); + + /* + * First check to see if it is a . or .., else look it up. + */ + if (flags & ISDOTDOT) { /* Wanting the parent */ + cnp->cn_flags &= ~MAKEENTRY; + goto found; /* .. is always defined */ + } else if ((cnp->cn_nameptr[0] == '.') && (cnp->cn_namelen == 1)) { + isDot = TRUE; + cnp->cn_flags &= ~MAKEENTRY; + goto found; /* We always know who we are */ + } else { + if (hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + retval = ENOENT; /* The parent no longer exists ? */ + goto exit; + } + dcp = VTOC(dvp); + + if (dcp->c_flag & C_DIR_MODIFICATION) { + // This needs to be changed to sleep on c_flag using assert_wait. + // msleep((caddr_t)&dcp->c_flag, &dcp->c_rwlock, PINOD, "hfs_vnop_lookup", 0); + hfs_unlock(dcp); + tsleep((caddr_t)dvp, PRIBIO, "hfs_lookup", 1); + + goto retry; + } + + + /* + * We shouldn't need to go to the catalog if there are no children. + * However, in the face of a minor disk corruption where the valence of + * the directory is off, we could infinite loop here if we return ENOENT + * even though there are actually items in the directory. (create will + * see the ENOENT, try to create something, which will return with + * EEXIST over and over again). As a result, always check the catalog. + */ + + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + cndesc.cd_namelen = cnp->cn_namelen; + cndesc.cd_parentcnid = dcp->c_fileid; + cndesc.cd_hint = dcp->c_childhint; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + retval = cat_lookup(hfsmp, &cndesc, 0, force_casesensitive_lookup, &desc, &attr, &fork, NULL); + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (retval == 0) { + dcp->c_childhint = desc.cd_hint; + /* + * Note: We must drop the parent lock here before calling + * hfs_getnewvnode (which takes the child lock). + */ + hfs_unlock(dcp); + dcp = NULL; + + /* Verify that the item just looked up isn't one of the hidden directories. */ + if (desc.cd_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + desc.cd_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + retval = ENOENT; + goto exit; + } + + goto found; + } + + if (retval == HFS_ERESERVEDNAME) { + /* + * We found the name in the catalog, but it is unavailable + * to us. The exact error to return to our caller depends + * on the operation, and whether we've already reached the + * last path component. In all cases, avoid a negative + * cache entry, since someone else may be able to access + * the name if their lookup is configured differently. 
+ */ + + cnp->cn_flags &= ~MAKEENTRY; + + if (((flags & ISLASTCN) == 0) || ((nameiop == LOOKUP) || (nameiop == DELETE))) { + /* A reserved name for a pure lookup is the same as the path not being present */ + retval = ENOENT; + } else { + /* A reserved name with intent to create must be rejected as impossible */ + retval = EEXIST; + } + } + if (retval != ENOENT) + goto exit; + /* + * This is a non-existing entry + * + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. + */ + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN) && + !(ISSET(dcp->c_flag, C_DELETED | C_NOEXISTS))) { + retval = EJUSTRETURN; + goto exit; + } + /* + * Insert name into the name cache (as non-existent). + */ +#if CONFIG_HFS_STD + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) +#endif + { + if ((cnp->cn_flags & MAKEENTRY) && + (nameiop != CREATE)) { + cache_enter(dvp, NULL, cnp); + dcp->c_flag |= C_NEG_ENTRIES; + } + } + goto exit; + } + +found: + if (flags & ISLASTCN) { + switch(nameiop) { + case DELETE: + cnp->cn_flags &= ~MAKEENTRY; + break; + + case RENAME: + cnp->cn_flags &= ~MAKEENTRY; + if (isDot) { + retval = EISDIR; + goto exit; + } + break; + } + } + + if (isDot) { + if ((retval = vnode_get(dvp))) + goto exit; + *vpp = dvp; + } else if (flags & ISDOTDOT) { + /* + * Directory hard links can have multiple parents so + * find the appropriate parent for the current thread. + */ + if ((retval = hfs_vget(hfsmp, hfs_currentparent(VTOC(dvp), + /* have_lock: */ false), &tvp, 0, 0))) { + goto exit; + } + *cnode_locked = 1; + *vpp = tvp; + } else { + int type = (attr.ca_mode & S_IFMT); + + if (!(flags & ISLASTCN) && (type != S_IFDIR) && (type != S_IFLNK)) { + retval = ENOTDIR; + goto exit; + } + /* Don't cache directory hardlink names. */ + if (attr.ca_recflags & kHFSHasLinkChainMask) { + cnp->cn_flags &= ~MAKEENTRY; + } + /* Names with composed chars are not cached. */ + if (cnp->cn_namelen != desc.cd_namelen) + cnp->cn_flags &= ~MAKEENTRY; + + retval = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, 0, &attr, &fork, &tvp, &newvnode_flags); + + if (retval) { + /* + * If this was a create/rename operation lookup, then by this point + * we expected to see the item returned from hfs_getnewvnode above. + * In the create case, it would probably eventually bubble out an EEXIST + * because the item existed when we were trying to create it. In the + * rename case, it would let us know that we need to go ahead and + * delete it as part of the rename. However, if we hit the condition below + * then it means that we found the element during cat_lookup above, but + * it is now no longer there. We simply behave as though we never found + * the element at all and return EJUSTRETURN. + */ + if ((retval == ENOENT) && + ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && + (flags & ISLASTCN)) { + retval = EJUSTRETURN; + } + + /* + * If this was a straight lookup operation, we may need to redrive the entire + * lookup starting from cat_lookup if the element was deleted as the result of + * a rename operation. Since rename is supposed to guarantee atomicity, then + * lookups cannot fail because the underlying element is deleted as a result of + * the rename call -- either they returned the looked up element prior to rename + * or return the newer element. If we are in this region, then all we can do is add + * workarounds to guarantee the latter case. 
The element has already been deleted, so + * we just re-try the lookup to ensure the caller gets the most recent element. + */ + if ((retval == ENOENT) && (cnp->cn_nameiop == LOOKUP) && + (newvnode_flags & (GNV_CHASH_RENAMED | GNV_CAT_DELETED))) { + if (dcp) { + hfs_unlock (dcp); + } + /* get rid of any name buffers that may have lingered from the cat_lookup call */ + cat_releasedesc (&desc); + goto retry; + } + + /* Also, re-drive the lookup if the item we looked up was a hardlink, and the number + * or name of hardlinks has changed in the interim between the cat_lookup above, and + * our call to hfs_getnewvnode. hfs_getnewvnode will validate the cattr we passed it + * against what is actually in the catalog after the cnode is created. If there were + * any issues, it will bubble out ERECYCLE, which we need to swallow and use as the + * key to redrive as well. We need to special case this below because in this case, + * it needs to occur regardless of the type of lookup we're doing here. + */ + if ((retval == ERECYCLE) && (newvnode_flags & GNV_CAT_ATTRCHANGED)) { + if (dcp) { + hfs_unlock (dcp); + } + /* get rid of any name buffers that may have lingered from the cat_lookup call */ + cat_releasedesc (&desc); + retval = 0; + goto retry; + } + + /* skip to the error-handling code if we can't retry */ + goto exit; + } + + /* + * Save the origin info for file and directory hardlinks. Directory hardlinks + * need the origin for '..' lookups, and file hardlinks need it to ensure that + * competing lookups do not cause us to vend different hardlinks than the ones requested. + */ + if (ISSET(VTOC(tvp)->c_flag, C_HARDLINK)) + hfs_savelinkorigin(VTOC(tvp), VTOC(dvp)->c_fileid); + *cnode_locked = 1; + *vpp = tvp; + } +exit: + if (dcp) { + hfs_unlock(dcp); + } + cat_releasedesc(&desc); + return (retval); +} + + + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (vp, name) where vp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + * + */ + +int +hfs_vnop_lookup(struct vnop_lookup_args *ap) +{ + struct vnode *dvp = ap->a_dvp; + struct vnode *vp; + struct cnode *cp; + struct cnode *dcp; + struct hfsmount *hfsmp; + int error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + int flags = cnp->cn_flags; + struct proc *p = vfs_context_proc(ap->a_context); + int force_casesensitive_lookup = proc_is_forcing_hfs_case_sensitivity(p); + int cnode_locked; + int fastdev_candidate = 0; + int auto_candidate = 0; + + *vpp = NULL; + dcp = VTOC(dvp); + hfsmp = VTOHFS(dvp); + + if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && (vnode_isfastdevicecandidate(dvp) || (dcp->c_attr.ca_recflags & kHFSFastDevCandidateMask)) ){ + fastdev_candidate = 1; + auto_candidate = (vnode_isautocandidate(dvp) || (dcp->c_attr.ca_recflags & kHFSAutoCandidateMask)); + } + + + /* + * Lookup an entry in the cache + * + * If the lookup succeeds, the vnode is returned in *vpp, + * and a status of -1 is returned. + * + * If the lookup determines that the name does not exist + * (negative cacheing), a status of ENOENT is returned. 
+ * + * If the lookup fails, a status of zero is returned. + */ + error = cache_lookup(dvp, vpp, cnp); + if (error != -1) { + if ((error == ENOENT) && (cnp->cn_nameiop != CREATE)) + goto exit; /* found a negative cache entry */ + goto lookup; /* did not find it in the cache */ + } + /* + * We have a name that matched + * cache_lookup returns the vp with an iocount reference already taken + */ + error = 0; + vp = *vpp; + cp = VTOC(vp); + + /* We aren't allowed to vend out vp's via lookup to the hidden directory */ + if (cp->c_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + cp->c_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + /* Drop the iocount from cache_lookup */ + vnode_put (vp); + error = ENOENT; + goto exit; + } + + if (cp->c_attr.ca_recflags & kHFSDoNotFastDevPinMask) { + fastdev_candidate = 0; + } + + /* + * If this is a hard-link vnode then we need to update + * the name (of the link), the parent ID, the cnid, the + * text encoding and the catalog hint. This enables + * getattrlist calls to return the correct link info. + */ + + /* + * Alternatively, if we are forcing a case-sensitive lookup + * on a case-insensitive volume, the namecache entry + * may have been for an incorrect case. Since we cannot + * determine case vs. normalization, redrive the catalog + * lookup based on any byte mismatch. + */ + if (((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK)) + || (force_casesensitive_lookup && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE))) { + int stale_link = 0; + + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + if ((cp->c_parentcnid != dcp->c_cnid) || + (cnp->cn_namelen != cp->c_desc.cd_namelen) || + (bcmp(cnp->cn_nameptr, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0)) { + struct cat_desc desc; + struct cat_attr lookup_attr; + int lockflags; + + if (force_casesensitive_lookup && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE)) { + /* + * Since the name in the cnode doesn't match our lookup + * string exactly, do a full lookup. + */ + hfs_unlock (cp); + + vnode_put(vp); + goto lookup; + } + + /* + * Get an updated descriptor + */ + desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + desc.cd_namelen = cnp->cn_namelen; + desc.cd_parentcnid = dcp->c_fileid; + desc.cd_hint = dcp->c_childhint; + desc.cd_encoding = 0; + desc.cd_cnid = 0; + desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; + + /* + * Because lookups call replace_desc to put a new descriptor in + * the cnode we are modifying it is possible that this cnode's + * descriptor is out of date for the parent ID / name that + * we are trying to look up. (It may point to a different hardlink). + * + * We need to be cautious that when re-supplying the + * descriptor below that the results of the catalog lookup + * still point to the same raw inode for the hardlink. This would + * not be the case if we found something in the cache above but + * the vnode it returned no longer has a valid hardlink for the + * parent ID/filename combo we are requesting. (This is because + * hfs_unlink does not directly trigger namecache removal). + * + * As a result, before vending out the vnode (and replacing + * its descriptor) verify that the fileID is the same by comparing + * the in-cnode attributes vs. the one returned from the lookup call + * below. If they do not match, treat this lookup as if we never hit + * in the cache at all. 
+ */ + + lockflags = hfs_systemfile_lock(VTOHFS(dvp), SFL_CATALOG, HFS_SHARED_LOCK); + + error = cat_lookup(VTOHFS(vp), &desc, 0, 0, &desc, &lookup_attr, NULL, NULL); + + hfs_systemfile_unlock(VTOHFS(dvp), lockflags); + + /* + * Note that cat_lookup may fail to find something with the name provided in the + * stack-based descriptor above. In that case, an ENOENT is a legitimate errno + * to be placed in error, which will get returned in the fastpath below. + */ + if (error == 0) { + if (lookup_attr.ca_fileid == cp->c_attr.ca_fileid) { + /* It still points to the right raw inode. Replacing the descriptor is fine */ + replace_desc (cp, &desc); + + /* + * Save the origin info for file and directory hardlinks. Directory hardlinks + * need the origin for '..' lookups, and file hardlinks need it to ensure that + * competing lookups do not cause us to vend different hardlinks than the ones requested. + */ + hfs_savelinkorigin(cp, dcp->c_fileid); + } + else { + /* If the fileID does not match then do NOT replace the descriptor! */ + stale_link = 1; + } + } + } + hfs_unlock (cp); + + if (stale_link) { + /* + * If we had a stale_link, then we need to pretend as though + * we never found this vnode and force a lookup through the + * traditional path. Drop the iocount acquired through + * cache_lookup above and force a cat lookup / getnewvnode + */ + vnode_put(vp); + goto lookup; + } + + if (error) { + /* + * If the cat_lookup failed then the caller will not expect + * a vnode with an iocount on it. + */ + vnode_put(vp); + } + + } + goto exit; + +lookup: + /* + * The vnode was not in the name cache or it was stale. + * + * So we need to do a real lookup. + */ + cnode_locked = 0; + + error = hfs_lookup(dvp, vpp, cnp, &cnode_locked, force_casesensitive_lookup); + + if (*vpp && (VTOC(*vpp)->c_attr.ca_recflags & kHFSDoNotFastDevPinMask)) { + fastdev_candidate = 0; + } + + if (*vpp && (VTOC(*vpp)->c_attr.ca_recflags & kHFSAutoCandidateMask)) { + //printf("vp %s / %d is an auto-candidate\n", (*vpp)->v_name ? (*vpp)->v_name : "no-name", VTOC(*vpp)->c_fileid); + auto_candidate = 1; + } + + if (cnode_locked) + hfs_unlock(VTOC(*vpp)); +exit: + if (*vpp && fastdev_candidate && !vnode_isfastdevicecandidate(*vpp)) { + vnode_setfastdevicecandidate(*vpp); + if (auto_candidate) { + vnode_setautocandidate(*vpp); + } + } + + /* + * check to see if we issued any I/O while completing this lookup and + * this thread/task is throttleable... if so, throttle now + * + * this allows us to throttle in between multiple meta data reads that + * might result due to looking up a long pathname (since we'll have to + * re-enter hfs_vnop_lookup for each component of the pathnam not in + * the VFS cache), instead of waiting until the entire path lookup has + * completed and throttling at the systemcall return + */ + if (__builtin_expect(throttle_lowpri_window(), 0)) + throttle_lowpri_io(1); + + return (error); +} + + diff --git a/core/hfs_macos_defs.h b/core/hfs_macos_defs.h new file mode 100644 index 0000000..029262d --- /dev/null +++ b/core/hfs_macos_defs.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __HFS_MACOS_TYPES__ +#define __HFS_MACOS_TYPES__ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE + +#include + +#include +#include +#include +#include +#include +#include + + +#define TARGET_OS_WIN32 0 +#define TARGET_OS_UNIX 0 + +#define PRAGMA_IMPORT 0 +#define PRAGMA_STRUCT_ALIGN 1 +#define PRAGMA_ONCE 0 +#define PRAGMA_STRUCT_PACK 0 +#define PRAGMA_STRUCT_PACKPUSH 0 + +#if __GNUC__ >= 2 + #define TYPE_LONGLONG 1 +#else + #define TYPE_LONGLONG 0 +#endif +#ifdef __cplusplus + #define TYPE_BOOL 1 +#else + #define TYPE_BOOL 0 +#endif + +#define EXTERN_API(_type) extern _type +#define EXTERN_API_C(_type) extern _type + +#define CALLBACK_API_C(_type, _name) _type ( * _name) + +#define TARGET_API_MACOS_X 1 +#define TARGET_API_MAC_OS8 0 +#define TARGET_API_MAC_CARBON 0 + + + +/****** START OF MACOSTYPES *********/ + + +/* + 4.4BSD's sys/types.h defines size_t without defining __size_t__: + Things are a lot clearer from here on if we define __size_t__ now. 
+ */ +#define __size_t__ + +/******************************************************************************** + + Special values in C + + NULL The C standard for an impossible pointer value + nil A carry over from pascal, NULL is prefered for C + +*********************************************************************************/ +#ifndef NULL + #define NULL 0 +#endif + +#ifndef nil + #define nil NULL +#endif + +typedef char * Ptr; +typedef long Size; + +typedef int16_t OSErr; +typedef u_int32_t ItemCount; +typedef u_int32_t ByteCount; +typedef u_int8_t * BytePtr; +typedef u_int32_t ByteOffset; + +typedef u_int16_t UniChar; +typedef unsigned char Str255[256]; +typedef unsigned char Str31[32]; +typedef unsigned char * StringPtr; +typedef const unsigned char * ConstStr255Param; +typedef const unsigned char * ConstStr31Param; +typedef const unsigned char * ConstUTF8Param; + +typedef u_int8_t Byte; + +typedef u_int32_t TextEncoding; +typedef UniChar * UniCharArrayPtr; +typedef const UniChar * ConstUniCharArrayPtr; + + +/******************************************************************************** + + Boolean types and values + + Boolean A one byte value, holds "false" (0) or "true" (1) + false The Boolean value of zero (0) + true The Boolean value of one (1) + +*********************************************************************************/ +/* + The identifiers "true" and "false" are becoming keywords in C++ + and work with the new built-in type "bool" + "Boolean" will remain an unsigned char for compatibility with source + code written before "bool" existed. +*/ +#if !TYPE_BOOL && !__bool_true_false_are_defined + +enum { + false = 0, + true = 1 +}; + +#endif /* !TYPE_BOOL */ + + +EXTERN_API( void ) DebugStr(const char * debuggerMsg); + +/********************************************************************************* + + Added types for HFSPlus MacOS X functionality. Needs to be incorporated to + other places + +*********************************************************************************/ + +typedef struct vnode* FileReference; + + +/***** START OF MACOSSTUBS ********/ + + +/* + SizeTDef.h -- Common definitions + + size_t - this type is defined by several ANSI headers. +*/ +#if ! 
defined (__size_t__) + #define __size_t__ + #if defined (__xlc) || defined (__xlC) || defined (__xlC__) || defined (__MWERKS__) + typedef unsigned long size_t; + #else /* __xlC */ + typedef unsigned int size_t; + #endif /* __xlC */ +#endif /* __size_t__ */ + + +/* + File: Errors.h + +*/ +enum { + noErr = 0, + dskFulErr = -34, /*disk full*/ + bdNamErr = -37, /*there may be no bad names in the final system!*/ + paramErr = -50, /*error in user parameter list*/ + memFullErr = -108, /*Not enough room in heap zone*/ + fileBoundsErr = -1309, /*file's EOF, offset, mark or size is too big*/ + kTECUsedFallbacksStatus = -8783, + +}; + + +enum { + /* Finder Flags */ + kHasBeenInited = 0x0100, + kHasCustomIcon = 0x0400, + kIsStationery = 0x0800, + kNameLocked = 0x1000, + kHasBundle = 0x2000, + kIsInvisible = 0x4000, + kIsAlias = 0x8000 +}; + +enum { + fsRtParID = 1, + fsRtDirID = 2 +}; + + +enum { + /* Mac OS encodings*/ + kTextEncodingMacRoman = 0L, + kTextEncodingMacJapanese = 1, + kTextEncodingMacChineseTrad = 2, + kTextEncodingMacKorean = 3, + kTextEncodingMacArabic = 4, + kTextEncodingMacHebrew = 5, + kTextEncodingMacGreek = 6, + kTextEncodingMacCyrillic = 7, + kTextEncodingMacDevanagari = 9, + kTextEncodingMacGurmukhi = 10, + kTextEncodingMacGujarati = 11, + kTextEncodingMacOriya = 12, + kTextEncodingMacBengali = 13, + kTextEncodingMacTamil = 14, + kTextEncodingMacTelugu = 15, + kTextEncodingMacKannada = 16, + kTextEncodingMacMalayalam = 17, + kTextEncodingMacSinhalese = 18, + kTextEncodingMacBurmese = 19, + kTextEncodingMacKhmer = 20, + kTextEncodingMacThai = 21, + kTextEncodingMacLaotian = 22, + kTextEncodingMacGeorgian = 23, + kTextEncodingMacArmenian = 24, + kTextEncodingMacChineseSimp = 25, + kTextEncodingMacTibetan = 26, + kTextEncodingMacMongolian = 27, + kTextEncodingMacEthiopic = 28, + kTextEncodingMacCentralEurRoman = 29, + kTextEncodingMacVietnamese = 30, + kTextEncodingMacExtArabic = 31, /* The following use script code 0, smRoman*/ + kTextEncodingMacSymbol = 33, + kTextEncodingMacDingbats = 34, + kTextEncodingMacTurkish = 35, + kTextEncodingMacCroatian = 36, + kTextEncodingMacIcelandic = 37, + kTextEncodingMacRomanian = 38, + kTextEncodingMacUnicode = 0x7E, + + kTextEncodingMacFarsi = 0x8C, /* Like MacArabic but uses Farsi digits */ /* The following use script code 7, smCyrillic */ + kTextEncodingMacUkrainian = 0x98, /* The following use script code 32, smUnimplemented */ +}; + + +/* PROTOTYPES */ + +#if DEBUG + extern void RequireFileLock(FileReference vp, int shareable); + #define REQUIRE_FILE_LOCK(vp,s) RequireFileLock((vp),(s)) +#else + #define REQUIRE_FILE_LOCK(vp,s) +#endif + + +EXTERN_API( void ) +BlockMoveData(const void * srcPtr, void * destPtr, Size byteCount); + +#define BlockMoveData(src, dest, len) bcopy((src), (dest), (len)) + +EXTERN_API_C( void ) +ClearMemory(void * start, u_int32_t length); + +#define ClearMemory(start, length) bzero((start), (size_t)(length)); + +/* + * The maximum number UTF-16 code units required to represent a HFS + * standard file name. The derivation for this number is not + * documented; it has been this value for some time. Mark, our + * resident Unicode expert, says "I'm not entirely certain, but I + * think it is the worst case for Korean Hangul conjoining jamos. The + * '15' is because a Str31 can contain at most 15 two-byte characters + * (in MacKorean encoding). Worst case, each one of those characters + * gets normalized to up to 5 UTF-16 code points. 
Each character is + * composed of up to three jamos; up to two of those jamos might not + * be in Unicode plane 0, which means they can take two UTF-16 code + * points (each) to represent. So your '5' is '2 + 2 + 1'." Sounds + * plausible! Safe to ship it, I say! + */ +#define MAX_HFS_UNICODE_CHARS (15*5) + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* __HFS_MACOS_TYPES__ */ diff --git a/core/hfs_mount.h b/core/hfs_mount.h new file mode 100644 index 0000000..abbe61a --- /dev/null +++ b/core/hfs_mount.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1997-2002 Apple Inc. 
All Rights Reserved + * + */ + +#ifndef _HFS_MOUNT_H_ +#define _HFS_MOUNT_H_ + +#include + +#include +#include + +/* + * Arguments to mount HFS-based filesystems + */ + +#define OVERRIDE_UNKNOWN_PERMISSIONS 0 + +#define UNKNOWNUID ((uid_t)99) +#define UNKNOWNGID ((gid_t)99) +#define UNKNOWNPERMISSIONS (S_IRWXU | S_IROTH | S_IXOTH) /* 705 */ + +#ifdef __APPLE_API_UNSTABLE +struct hfs_mount_args { +#ifndef KERNEL + char *fspec; /* block special device to mount */ +#endif + uid_t hfs_uid; /* uid that owns hfs files (standard HFS only) */ + gid_t hfs_gid; /* gid that owns hfs files (standard HFS only) */ + mode_t hfs_mask; /* mask to be applied for hfs perms (standard HFS only) */ + u_int32_t hfs_encoding; /* encoding for this volume (standard HFS only) */ + struct timezone hfs_timezone; /* user time zone info (standard HFS only) */ + int flags; /* mounting flags, see below */ + int journal_tbuffer_size; /* size in bytes of the journal transaction buffer */ + int journal_flags; /* flags to pass to journal_open/create */ + int journal_disable; /* don't use journaling (potentially dangerous) */ +}; + +#define HFSFSMNT_NOXONFILES 0x1 /* disable execute permissions for files */ +#define HFSFSMNT_WRAPPER 0x2 /* mount HFS wrapper (if it exists) */ +#define HFSFSMNT_EXTENDED_ARGS 0x4 /* indicates new fields after "flags" are valid */ + +/* + * Sysctl values for HFS + */ +#define HFS_ENCODINGBIAS 1 /* encoding matching CJK bias */ +#define HFS_EXTEND_FS 2 +#define HFS_ENABLE_JOURNALING 0x082969 +#define HFS_DISABLE_JOURNALING 0x031272 +#define HFS_REPLAY_JOURNAL 0x6a6e6c72 +#define HFS_ENABLE_RESIZE_DEBUG 4 /* enable debug code for volume resizing */ + +#endif /* __APPLE_API_UNSTABLE */ + +#endif /* ! _HFS_MOUNT_H_ */ diff --git a/core/hfs_notification.c b/core/hfs_notification.c new file mode 100644 index 0000000..614c32d --- /dev/null +++ b/core/hfs_notification.c @@ -0,0 +1,198 @@ +/* + * Copyright (C) 2003-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" +#include "hfs_dbg.h" +#include "hfs_mount.h" +#include "hfs_quota.h" +#include "hfs_endian.h" + +#include "BTreesInternal.h" +#include "FileMgrInternal.h" + + + +void hfs_generate_volume_notifications(struct hfsmount *hfsmp) +{ + fsid_t fsid; + u_int32_t freeblks, state=999; + + /* Do not generate low disk notifications for read-only volumes */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return; + } + + fsid.val[0] = hfsmp->hfs_raw_dev; + fsid.val[1] = vfs_typenum(HFSTOVFS(hfsmp)); + + freeblks = hfs_freeblks(hfsmp, 1); + + /* + * Find the theshold the number of free blocks fits into. + * We fire upon reaching a level below desired only once, + * except for when we reach the low disk or near low disk levels + * from below, in which case we do not fire unless we have also + * reached the desired disk level (hysteresis). + * This is illustrated in the following diagram: + * + * fire ^ + * --------- desired level + * | + * + * + * | + * --------- near low disk level + * fire v + * + * + * | + * --------- low disk level + * fire v + * + * + * | ^ fire + * --------- very low disk level + * fire v | + * + */ + if (freeblks < hfsmp->hfs_freespace_notify_dangerlimit) { + state = 4; + } else if (freeblks < hfsmp->hfs_freespace_notify_warninglimit) { + state = 3; + } else if (freeblks < hfsmp->hfs_freespace_notify_nearwarninglimit) { + state = 2; + } else if (freeblks < hfsmp->hfs_freespace_notify_desiredlevel) { + /* We are between the near low disk and desired levels */ + state = 1; + } else if (freeblks >= hfsmp->hfs_freespace_notify_desiredlevel) { + state = 0; + } + + /* Free blocks are less than dangerlimit for the first time */ + if (state == 4 && !(hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK)) { + /* Dump some logging to track down intermittent issues */ + printf("hfs: set VeryLowDisk: vol:%s, freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_dangerlimit); + +#if HFS_SPARSE_DEV + // If we're a sparse device, dump some info about the backing store.. 
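+ // (For a sparse/backed volume the number that matters is the backing
+ // store's own free space, so its f_bavail is logged along with the
+ // VeryLowDisk event below.)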
+ hfs_lock_mount(hfsmp); + vnode_t backing_vp = hfsmp->hfs_backingvp; + if (backing_vp && vnode_get(backing_vp) != 0) + backing_vp = NULL; + hfs_unlock_mount(hfsmp); + + if (backing_vp) { + struct vfsstatfs *sfs = vfs_statfs(vnode_mount(backing_vp)); + printf("hfs: set VeryLowDisk: vol:%s, backingstore b_avail:%lld, tag:%d\n", + hfsmp->vcbVN, sfs->f_bavail, vnode_tag(backing_vp)); + vnode_put(backing_vp); + } +#endif + + hfsmp->hfs_notification_conditions |= (VQ_VERYLOWDISK|VQ_LOWDISK|VQ_NEARLOWDISK); + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); + } else if (state == 3) { + /* Free blocks are less than warning limit for the first time */ + if (!(hfsmp->hfs_notification_conditions & VQ_LOWDISK)) { + printf("hfs: set LowDisk: vol:%s, freeblks:%d, warninglimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_warninglimit); + hfsmp->hfs_notification_conditions |= (VQ_LOWDISK|VQ_NEARLOWDISK); + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); + } else if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { + /* Free blocks count has increased from danger limit to warning limit, so just clear VERYLOWDISK warning */ + printf("hfs: clear VeryLowDisk: vol:%s, freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_dangerlimit); + hfsmp->hfs_notification_conditions &= ~VQ_VERYLOWDISK; + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); + } + } else if (state == 2) { + /* Free blocks are less than the near warning limit for the first time */ + if (!(hfsmp->hfs_notification_conditions & VQ_NEARLOWDISK)) { + printf("hfs: set NearLowDisk: vol:%s, freeblks:%d, nearwarninglimit:%d\n", hfsmp->vcbVN, freeblks, + hfsmp->hfs_freespace_notify_nearwarninglimit); + + hfsmp->hfs_notification_conditions |= VQ_NEARLOWDISK; + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); + } else { + /* Free blocks count has increased from warning/danger limit to near warning limit, + * so clear VERYLOWDISK / LOWDISK warnings, and signal if we clear VERYLOWDISK */ + hfsmp->hfs_notification_conditions &= ~VQ_LOWDISK; + if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { + printf("hfs: clear VeryLowDisk: vol:%s, freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, + hfsmp->hfs_freespace_notify_dangerlimit); + + hfsmp->hfs_notification_conditions &= ~VQ_VERYLOWDISK; + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); + } + } + } else if (state == 1) { + /* Free blocks are less than the desireable level, but more than the near warning level + * In this case, we may have to notify if we were previously underneath the danger limit */ + if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { + printf("hfs: clear VeryLowDisk: vol:%s, freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, + hfsmp->hfs_freespace_notify_dangerlimit); + + hfsmp->hfs_notification_conditions &= ~VQ_VERYLOWDISK; + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); + } + } else if (state == 0) { + /* Free blocks count has increased to desirable level, so clear all conditions */ + if (hfsmp->hfs_notification_conditions & (VQ_NEARLOWDISK|VQ_LOWDISK|VQ_VERYLOWDISK)) { + if (hfsmp->hfs_notification_conditions & VQ_NEARLOWDISK) { + printf("hfs: clear NearLowDisk: vol:%s, freeblks:%d, nearwarninglimit:%d, desiredlevel:%d\n", hfsmp->vcbVN, + freeblks, hfsmp->hfs_freespace_notify_nearwarninglimit, hfsmp->hfs_freespace_notify_desiredlevel); + } + if 
(hfsmp->hfs_notification_conditions & VQ_LOWDISK) { + printf("hfs: clear LowDisk: vol:%s, freeblks:%d, warninglimit:%d, desiredlevel:%d\n", hfsmp->vcbVN, freeblks, + hfsmp->hfs_freespace_notify_warninglimit, hfsmp->hfs_freespace_notify_desiredlevel); + } + if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { + printf("hfs: clear VeryLowDisk: vol:%s, freeblks:%d, dangerlimit:%d\n", hfsmp->vcbVN, freeblks, hfsmp->hfs_freespace_notify_warninglimit); + } + hfsmp->hfs_notification_conditions &= ~(VQ_VERYLOWDISK|VQ_LOWDISK|VQ_NEARLOWDISK); + if (hfsmp->hfs_notification_conditions == 0) { + vfs_event_signal(&fsid, VQ_UPDATE|VQ_DESIRED_DISK, (intptr_t)NULL); + } else { + vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); + } + } + } +} diff --git a/core/hfs_quota.c b/core/hfs_quota.c new file mode 100644 index 0000000..da47b9c --- /dev/null +++ b/core/hfs_quota.c @@ -0,0 +1,1014 @@ +/* + * Copyright (c) 2002-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1982, 1986, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hfs_quota.c + * derived from @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 + */ + +#if QUOTA + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_cnode.h" +#include "hfs_quota.h" +#include "hfs_mount.h" + + +/* + * Quota name to error message mapping. + */ +#if 0 +static char *quotatypes[] = INITQFNAMES; +#endif + +/* + * Set up the quotas for a cnode. + * + * This routine completely defines the semantics of quotas. + * If other criterion want to be used to establish quotas, the + * MAXQUOTAS value in quotas.h should be increased, and the + * additional dquots set up here. + */ +int +hfs_getinoquota(cp) + register struct cnode *cp; +{ + struct hfsmount *hfsmp; + struct vnode *vp; + int error; + int drop_usrquota = false; + + vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; + hfsmp = VTOHFS(vp); + /* + * Set up the user quota based on file uid. + * EINVAL means that quotas are not enabled. + */ + if (cp->c_dquot[USRQUOTA] == NODQUOT) { + error = dqget(cp->c_uid, &hfsmp->hfs_qfiles[USRQUOTA], USRQUOTA, &cp->c_dquot[USRQUOTA]); + if ((error != 0) && (error != EINVAL)) { + return error; + } else if (error == 0) { + drop_usrquota = true; + } + } + + /* + * Set up the group quota based on file gid. + * EINVAL means that quotas are not enabled. + */ + if (cp->c_dquot[GRPQUOTA] == NODQUOT) { + error = dqget(cp->c_gid, &hfsmp->hfs_qfiles[GRPQUOTA], GRPQUOTA, &cp->c_dquot[GRPQUOTA]); + if ((error != 0) && (error != EINVAL)) { + if (drop_usrquota == true) { + dqrele(cp->c_dquot[USRQUOTA]); + cp->c_dquot[USRQUOTA] = NODQUOT; + } + return error; + } + } + + return (0); +} + +/* + * Update disk usage, and take corrective action. + */ +int +hfs_chkdq(cp, change, cred, flags) + register struct cnode *cp; + int64_t change; + kauth_cred_t cred; + int flags; +{ + register struct dquot *dq; + register int i; + int64_t ncurbytes; + int error=0; + struct proc *p; + +#if DIAGNOSTIC + if ((flags & CHOWN) == 0) + hfs_chkdquot(cp); +#endif + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = cp->c_dquot[i]) == NODQUOT) + continue; + dqlock(dq); + + ncurbytes = dq->dq_curbytes + change; + if (ncurbytes >= 0) + dq->dq_curbytes = ncurbytes; + else + dq->dq_curbytes = 0; + dq->dq_flags &= ~DQ_BLKS; + dq->dq_flags |= DQ_MOD; + + dqunlock(dq); + } + return (0); + } + p = current_proc(); + /* + * This use of proc_ucred() is safe because kernproc credential never + * changes. 
+ */ + if (!IS_VALID_CRED(cred)) + cred = proc_ucred(kernproc); + if (suser(cred, NULL) || proc_forcequota(p)) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = cp->c_dquot[i]) == NODQUOT) + continue; + error = hfs_chkdqchg(cp, change, cred, i); + if (error) { + break; + } + } + } + if ((flags & FORCE) || error == 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = cp->c_dquot[i]) == NODQUOT) + continue; + dqlock(dq); + + dq->dq_curbytes += change; + dq->dq_flags |= DQ_MOD; + + dqunlock(dq); + } + } + return (error); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message and vfs event if appropriate. + */ +int +hfs_chkdqchg(cp, change, cred, type) + struct cnode *cp; + int64_t change; + kauth_cred_t cred; + int type; +{ + register struct dquot *dq = cp->c_dquot[type]; + u_int64_t ncurbytes; + struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; + + fsid_t fsid; + fsid.val[0] = VTOHFS(vp)->hfs_raw_dev; + fsid.val[1] = vfs_typenum(VTOVFS(vp)); + + dqlock(dq); + + ncurbytes = dq->dq_curbytes + change; + /* + * If user would exceed their hard limit, disallow space allocation. + */ + if (ncurbytes >= dq->dq_bhardlimit && dq->dq_bhardlimit) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + cp->c_uid == kauth_cred_getuid(cred)) { +#if 0 + printf("\nhfs: write failed, %s disk limit reached\n", + quotatypes[type]); +#endif + dq->dq_flags |= DQ_BLKS; + vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); + } + dqunlock(dq); + + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow space + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurbytes >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + struct timeval tv; + + microuptime(&tv); + if (dq->dq_curbytes < dq->dq_bsoftlimit) { + dq->dq_btime = tv.tv_sec + + VTOHFS(vp)->hfs_qfiles[type].qf_btime; +#if 0 + if (cp->c_uid == kauth_cred_getuid(cred)) + printf("\nhfs: warning, %s %s\n", + quotatypes[type], "disk quota exceeded"); +#endif + vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); + dqunlock(dq); + + return (0); + } + if (tv.tv_sec > (time_t)dq->dq_btime) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + cp->c_uid == kauth_cred_getuid(cred)) { +#if 0 + printf("\nhfs: write failed, %s %s\n", + quotatypes[type], + "disk quota exceeded for too long"); +#endif + dq->dq_flags |= DQ_BLKS; + vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); + } + dqunlock(dq); + + return (EDQUOT); + } + } + dqunlock(dq); + + return (0); +} + +/* + * Check the inode limit, applying corrective action. + */ +int +hfs_chkiq(cp, change, cred, flags) + register struct cnode *cp; + int32_t change; + kauth_cred_t cred; + int flags; +{ + register struct dquot *dq; + register int i; + int ncurinodes, error=0; + struct proc *p; + +#if DIAGNOSTIC + if ((flags & CHOWN) == 0) + hfs_chkdquot(cp); +#endif + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = cp->c_dquot[i]) == NODQUOT) + continue; + dqlock(dq); + + ncurinodes = dq->dq_curinodes + change; + if (ncurinodes >= 0) + dq->dq_curinodes = ncurinodes; + else + dq->dq_curinodes = 0; + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + + dqunlock(dq); + } + return (0); + } + p = current_proc(); + /* + * This use of proc_ucred() is safe because kernproc credential never + * changes. 
+ */ + if (!IS_VALID_CRED(cred)) + cred = proc_ucred(kernproc); + if (suser(cred, NULL) || proc_forcequota(p)) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = cp->c_dquot[i]) == NODQUOT) + continue; + error = hfs_chkiqchg(cp, change, cred, i); + if (error) { + break; + } + } + } + if ((flags & FORCE) || error == 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = cp->c_dquot[i]) == NODQUOT) + continue; + dqlock(dq); + + dq->dq_curinodes += change; + dq->dq_flags |= DQ_MOD; + + dqunlock(dq); + } + } + return (error); +} + + +/* + * Check to see if a change to a user's allocation should be permitted or not. + * Issue an error message if it should not be permitted. Return 0 if + * it should be allowed. + */ +int hfs_isiqchg_allowed(dq, hfsmp, change, cred, type, uid) + struct dquot* dq; + struct hfsmount* hfsmp; + int32_t change; + kauth_cred_t cred; + int type; + uid_t uid; +{ + u_int32_t ncurinodes; + + fsid_t fsid; + fsid.val[0] = hfsmp->hfs_raw_dev; + fsid.val[1] = vfs_typenum(HFSTOVFS(hfsmp)); + + dqlock(dq); + + ncurinodes = dq->dq_curinodes + change; + /* + * If user would exceed their hard limit, disallow cnode allocation. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_INODS) == 0 && + uid == kauth_cred_getuid(cred)) { + dq->dq_flags |= DQ_INODS; + vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); + } + dqunlock(dq); + + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow cnode + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + struct timeval tv; + + microuptime(&tv); + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; + vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); + dqunlock(dq); + return (0); + } + if (tv.tv_sec > (time_t)dq->dq_itime) { + if (((dq->dq_flags & DQ_INODS) == 0) && + (uid == kauth_cred_getuid(cred))) { + dq->dq_flags |= DQ_INODS; + vfs_event_signal(&fsid, VQ_QUOTA, (intptr_t)NULL); + } + dqunlock(dq); + + return (EDQUOT); + } + } + dqunlock(dq); + + return (0); +} + + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +int +hfs_chkiqchg(cp, change, cred, type) + struct cnode *cp; + int32_t change; + kauth_cred_t cred; + int type; +{ + register struct dquot *dq = cp->c_dquot[type]; + u_int32_t ncurinodes; + struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; + + dqlock(dq); + + ncurinodes = dq->dq_curinodes + change; + /* + * If user would exceed their hard limit, disallow cnode allocation. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_INODS) == 0 && + cp->c_uid == kauth_cred_getuid(cred)) { +#if 0 + printf("\nhfs: write failed, %s cnode limit reached\n", + quotatypes[type]); +#endif + dq->dq_flags |= DQ_INODS; + } + dqunlock(dq); + + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow cnode + * allocation. Reset time limit as they cross their soft limit. 
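+ * (The grace period applied below is hfs_qfiles[type].qf_itime seconds.)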
+ */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + struct timeval tv; + + microuptime(&tv); + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = tv.tv_sec + + VTOHFS(vp)->hfs_qfiles[type].qf_itime; +#if 0 + if (cp->c_uid == kauth_cred_getuid(cred)) + printf("\nhfs: warning, %s %s\n", + quotatypes[type], "cnode quota exceeded"); +#endif + dqunlock(dq); + + return (0); + } + if (tv.tv_sec > (time_t)dq->dq_itime) { + if ((dq->dq_flags & DQ_INODS) == 0 && + cp->c_uid == kauth_cred_getuid(cred)) { +#if 0 + printf("\nhfs: write failed, %s %s\n", + quotatypes[type], + "cnode quota exceeded for too long"); +#endif + dq->dq_flags |= DQ_INODS; + } + dqunlock(dq); + + return (EDQUOT); + } + } + dqunlock(dq); + + return (0); +} + +#if DIAGNOSTIC +/* + * On filesystems with quotas enabled, it is an error for a file to change + * size and not to have a dquot structure associated with it. + */ +void +hfs_chkdquot(cp) + register struct cnode *cp; +{ + struct vnode *vp = cp->c_vp ? cp->c_vp : cp->c_rsrc_vp; + struct hfsmount *hfsmp = VTOHFS(vp); + register int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP) + continue; + if (cp->c_dquot[i] == NODQUOT) { + vprint("chkdquot: missing dquot", vp); + panic("missing dquot"); + } + } +} +#endif + +/* + * Code to process quotactl commands. + */ + +/* + * Q_QUOTAON - set up a quota file for a particular file system. + */ +struct hfs_quotaon_cargs { + int error; +}; + +static int +hfs_quotaon_callback(struct vnode *vp, void *cargs) +{ + struct hfs_quotaon_cargs *args; + + args = (struct hfs_quotaon_cargs *)cargs; + + args->error = hfs_getinoquota(VTOC(vp)); + if (args->error) + return (VNODE_RETURNED_DONE); + + return (VNODE_RETURNED); +} + +int +hfs_quotaon(p, mp, type, fnamep) + struct proc *p; + struct mount *mp; + register int type; + caddr_t fnamep; +{ + struct hfsmount *hfsmp = VFSTOHFS(mp); + struct quotafile *qfp; + struct vnode *vp; + int error = 0; + struct hfs_quotaon_cargs args; + + /* Finish setting up quota structures. */ + dqhashinit(); + + qfp = &hfsmp->hfs_qfiles[type]; + + if ( (qf_get(qfp, QTF_OPENING)) ) + return (0); + + error = vnode_open(fnamep, FREAD|FWRITE, 0, 0, &vp, NULL); + if (error) { + goto out; + } + if (!vnode_isreg(vp)) { + (void) vnode_close(vp, FREAD|FWRITE, NULL); + error = EACCES; + goto out; + } + vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_QUOTA)); + hfs_lock_mount (hfsmp); + hfsmp->hfs_flags |= HFS_QUOTAS; + hfs_unlock_mount (hfsmp); + vnode_setnoflush(vp); + /* + * Save the credential of the process that turned on quotas. + */ + qfp->qf_cred = kauth_cred_proc_ref(p); + qfp->qf_vp = vp; + /* + * Finish initializing the quota file + */ + error = dqfileopen(qfp, type); + if (error) { + (void) vnode_close(vp, FREAD|FWRITE, NULL); + + if (IS_VALID_CRED(qfp->qf_cred)) + kauth_cred_unref(&qfp->qf_cred); + qfp->qf_vp = NULLVP; + goto out; + } + qf_put(qfp, QTF_OPENING); + + /* + * Search vnodes associated with this mount point, + * adding references to quota file being opened. + * NB: only need to add dquot's for cnodes being modified. 
+ * + * hfs_quota_callback will be called for each vnode open for + * 'write' (VNODE_WRITEABLE) hung off of this mount point + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback + */ + args.error = 0; + + vnode_iterate(mp, VNODE_WRITEABLE | VNODE_WAIT, hfs_quotaon_callback, (void *)&args); + + error = args.error; + + if (error) { + hfs_quotaoff(p, mp, type); + } + return (error); + +out: + qf_put(qfp, QTF_OPENING); + + return (error); +} + + +/* + * Q_QUOTAOFF - turn off disk quotas for a filesystem. + */ +struct hfs_quotaoff_cargs { + int type; +}; + +static int +hfs_quotaoff_callback(struct vnode *vp, void *cargs) +{ + struct hfs_quotaoff_cargs *args; + struct cnode *cp; + struct dquot *dq; + + args = (struct hfs_quotaoff_cargs *)cargs; + + cp = VTOC(vp); + + dq = cp->c_dquot[args->type]; + cp->c_dquot[args->type] = NODQUOT; + + dqrele(dq); + + return (VNODE_RETURNED); +} + +int +hfs_quotaoff(__unused struct proc *p, struct mount *mp, register int type) +{ + struct vnode *qvp; + struct hfsmount *hfsmp = VFSTOHFS(mp); + struct quotafile *qfp; + int error; + struct hfs_quotaoff_cargs args; + + /* + * If quotas haven't been initialized, there's no work to be done. + */ + if (!dqisinitialized()) + return (0); + + qfp = &hfsmp->hfs_qfiles[type]; + + if ( (qf_get(qfp, QTF_CLOSING)) ) + return (0); + qvp = qfp->qf_vp; + + /* + * Sync out any orpaned dirty dquot entries. + */ + dqsync_orphans(qfp); + + /* + * Search vnodes associated with this mount point, + * deleting any references to quota file being closed. + * + * hfs_quotaoff_callback will be called for each vnode + * hung off of this mount point + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback + */ + args.type = type; + + vnode_iterate(mp, VNODE_WAIT, hfs_quotaoff_callback, (void *)&args); + + dqflush(qvp); + /* Finish tearing down the quota file */ + dqfileclose(qfp, type); + + vnode_clearnoflush(qvp); + error = vnode_close(qvp, FREAD|FWRITE, NULL); + + qfp->qf_vp = NULLVP; + + if (IS_VALID_CRED(qfp->qf_cred)) + kauth_cred_unref(&qfp->qf_cred); + for (type = 0; type < MAXQUOTAS; type++) + if (hfsmp->hfs_qfiles[type].qf_vp != NULLVP) + break; + if (type == MAXQUOTAS) { + vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_QUOTA)); + hfs_lock_mount (hfsmp); + hfsmp->hfs_flags &= ~HFS_QUOTAS; + hfs_unlock_mount (hfsmp); + } + + qf_put(qfp, QTF_CLOSING); + + return (error); +} + +/* + * hfs_quotacheck - checks quotas mountwide for a hypothetical situation. It probes + * the quota data structures to see if adding an inode would be allowed or not. If it + * will be allowed, the change is made. Otherwise, it reports an error back out so the + * caller will know not to proceed with inode allocation in the HFS Catalog. + * + * Note that this function ONLY tests for addition of inodes, not subtraction. 
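+ *
+ * 'change' is the number of inodes that would be added; 'uid' and 'gid'
+ * select the user and group quota records that are probed (USRQUOTA and
+ * GRPQUOTA respectively).  If an id has no quota record (dqget returns
+ * EINVAL or NODQUOT), that quota type is simply skipped.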
+ */ +int hfs_quotacheck(hfsmp, change, uid, gid, cred) + struct hfsmount *hfsmp; + int change; + uid_t uid; + gid_t gid; + kauth_cred_t cred; +{ + struct dquot *dq = NULL; + struct proc *p; + int error = 0; + int i; + id_t id = uid; + + p = current_proc(); + if (!IS_VALID_CRED(cred)) { + /* This use of proc_ucred() is safe because kernproc credential never changes */ + cred = proc_ucred(kernproc); + } + + if (suser(cred, NULL) || proc_forcequota(p)) { + for (i = 0; i < MAXQUOTAS; i++) { + /* Select if user or group id should be used */ + if (i == USRQUOTA) + id = uid; + else if (i == GRPQUOTA) + id = gid; + + error = dqget(id, &hfsmp->hfs_qfiles[i], i, &dq); + if (error && (error != EINVAL)) + break; + + error = 0; + if (dq == NODQUOT) + continue; + + /* Check quota information */ + error = hfs_isiqchg_allowed(dq, hfsmp, change, cred, i, id); + if (error) { + dqrele(dq); + break; + } + + dqlock(dq); + /* Update quota information */ + dq->dq_curinodes += change; + dqunlock(dq); + dqrele(dq); + } + } + + return error; +} + + +/* + * Q_GETQUOTA - return current values in a dqblk structure. + */ +int +hfs_getquota(mp, id, type, datap) + struct mount *mp; + u_int32_t id; + int type; + caddr_t datap; +{ + struct dquot *dq; + int error; + + error = dqget(id, &VFSTOHFS(mp)->hfs_qfiles[type], type, &dq); + if (error) + return (error); + dqlock(dq); + + bcopy(&dq->dq_dqb, datap, sizeof(dq->dq_dqb)); + + dqunlock(dq); + dqrele(dq); + + return (error); +} + +/* + * Q_SETQUOTA - assign an entire dqblk structure. + */ +int +hfs_setquota(mp, id, type, datap) + struct mount *mp; + u_int32_t id; + int type; + caddr_t datap; +{ + struct dquot *dq; + struct hfsmount *hfsmp = VFSTOHFS(mp); + struct dqblk * newlimp = (struct dqblk *) datap; + struct timeval tv; + int error; + + error = dqget(id, &hfsmp->hfs_qfiles[type], type, &dq); + if (error) + return (error); + dqlock(dq); + + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. + */ + newlimp->dqb_curbytes = dq->dq_curbytes; + newlimp->dqb_curinodes = dq->dq_curinodes; + if (dq->dq_id != 0) { + newlimp->dqb_btime = dq->dq_btime; + newlimp->dqb_itime = dq->dq_itime; + } + if (newlimp->dqb_bsoftlimit && + dq->dq_curbytes >= newlimp->dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curbytes < dq->dq_bsoftlimit)) { + microuptime(&tv); + newlimp->dqb_btime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; + } + if (newlimp->dqb_isoftlimit && + dq->dq_curinodes >= newlimp->dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) { + microuptime(&tv); + newlimp->dqb_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; + } + bcopy(newlimp, &dq->dq_dqb, sizeof(dq->dq_dqb)); + if (dq->dq_curbytes < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + + dqunlock(dq); + dqrele(dq); + + return (0); +} + +/* + * Q_SETUSE - set current cnode and byte usage. 
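+ *
+ * 'datap' points to a struct dqblk supplying the new dqb_curbytes and
+ * dqb_curinodes values; the soft-limit grace timers are restarted when
+ * the new usage crosses a soft limit that was not exceeded before.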
+ */ +int +hfs_setuse(mp, id, type, datap) + struct mount *mp; + u_int32_t id; + int type; + caddr_t datap; +{ + struct hfsmount *hfsmp = VFSTOHFS(mp); + struct dquot *dq; + struct timeval tv; + int error; + struct dqblk *quotablkp = (struct dqblk *) datap; + + error = dqget(id, &hfsmp->hfs_qfiles[type], type, &dq); + if (error) + return (error); + dqlock(dq); + + /* + * Reset time limit if have a soft limit and were + * previously under it, but are now over it. + */ + if (dq->dq_bsoftlimit && dq->dq_curbytes < dq->dq_bsoftlimit && + quotablkp->dqb_curbytes >= dq->dq_bsoftlimit) { + microuptime(&tv); + dq->dq_btime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_btime; + } + if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && + quotablkp->dqb_curinodes >= dq->dq_isoftlimit) { + microuptime(&tv); + dq->dq_itime = tv.tv_sec + hfsmp->hfs_qfiles[type].qf_itime; + } + dq->dq_curbytes = quotablkp->dqb_curbytes; + dq->dq_curinodes = quotablkp->dqb_curinodes; + if (dq->dq_curbytes < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + + dqunlock(dq); + dqrele(dq); + + return (0); +} + + +/* + * Q_SYNC - sync quota files to disk. + */ +static int +hfs_qsync_callback(struct vnode *vp, __unused void *cargs) +{ + struct cnode *cp; + struct dquot *dq; + int i; + + cp = VTOC(vp); + + for (i = 0; i < MAXQUOTAS; i++) { + dq = cp->c_dquot[i]; + if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) + dqsync(dq); + } + return (VNODE_RETURNED); +} + +int +hfs_qsync(mp) + struct mount *mp; +{ + struct hfsmount *hfsmp = VFSTOHFS(mp); + int i; + + if (!dqisinitialized()) + return (0); + + /* + * Check if the mount point has any quotas. + * If not, simply return. + */ + for (i = 0; i < MAXQUOTAS; i++) + if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP) + break; + if (i == MAXQUOTAS) + return (0); + + /* + * Sync out any orpaned dirty dquot entries. + */ + for (i = 0; i < MAXQUOTAS; i++) + if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP) + dqsync_orphans(&hfsmp->hfs_qfiles[i]); + + /* + * Search vnodes associated with this mount point, + * synchronizing any modified dquot structures. + * + * hfs_qsync_callback will be called for each vnode + * hung off of this mount point + * the vnode will be + * properly referenced and unreferenced around the callback + */ + vnode_iterate(mp, 0, hfs_qsync_callback, (void *)NULL); + + return (0); +} + +/* + * Q_QUOTASTAT - get quota on/off status + */ +int +hfs_quotastat(mp, type, datap) + struct mount *mp; + register int type; + caddr_t datap; +{ + struct hfsmount *hfsmp = VFSTOHFS(mp); + int error = 0; + int qstat; + + if ((((unsigned int)vfs_flags(mp)) & MNT_QUOTA) && (hfsmp->hfs_qfiles[type].qf_vp != NULLVP)) + qstat = 1; /* quotas are on for this type */ + else + qstat = 0; /* quotas are off for this type */ + + *((int *)datap) = qstat; + return (error); +} + +#endif // QUOTA diff --git a/core/hfs_quota.h b/core/hfs_quota.h new file mode 100644 index 0000000..27ee6a5 --- /dev/null +++ b/core/hfs_quota.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2002 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)hfs_quota.h + * derived from @(#)quota.h 8.3 (Berkeley) 8/19/94 + */ + +#ifndef _HFS_QUOTA_H_ +#define _HFS_QUOTA_H_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#include + +#include + +struct cnode; +struct mount; +struct proc; +#ifndef _KAUTH_CRED_T +#define _KAUTH_CRED_T +struct ucred; +typedef struct ucred *kauth_cred_t; +#endif /* !_KAUTH_CRED_T */ +__BEGIN_DECLS +int hfs_chkdq(struct cnode *, int64_t, kauth_cred_t, int); +int hfs_chkdqchg(struct cnode *, int64_t, kauth_cred_t, int); +int hfs_chkiq(struct cnode *, int32_t, kauth_cred_t, int); +int hfs_chkiqchg(struct cnode *, int32_t, kauth_cred_t, int); +int hfs_getinoquota(struct cnode *); +int hfs_getquota(struct mount *, u_int32_t, int, caddr_t); +int hfs_qsync(struct mount *mp); +int hfs_quotaoff(struct proc *, struct mount *, int); +int hfs_quotaon(struct proc *, struct mount *, int, caddr_t); +int hfs_quotastat(struct mount *, int, caddr_t); +int hfs_setquota(struct mount *, u_int32_t, int, caddr_t); +int hfs_setuse(struct mount *, u_int32_t, int, caddr_t); +int hfs_isiqchg_allowed(struct dquot *, struct hfsmount *, int32_t, kauth_cred_t, int, uid_t); +int hfs_quotacheck (struct hfsmount *, int , uid_t, gid_t, kauth_cred_t); +__END_DECLS + +#if DIAGNOSTIC +__BEGIN_DECLS +void hfs_chkdquot(struct cnode *); +__END_DECLS +#endif +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ + +#endif /* ! _HFS_QUOTA_H_ */ diff --git a/core/hfs_readwrite.c b/core/hfs_readwrite.c new file mode 100644 index 0000000..99092aa --- /dev/null +++ b/core/hfs_readwrite.c @@ -0,0 +1,5876 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* @(#)hfs_readwrite.c 1.0 + * + * (c) 1998-2001 Apple Inc. All Rights Reserved + * + * hfs_readwrite.c -- vnode operations to deal with reading and writing files. 
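+ * This includes hfs_vnop_read and hfs_vnop_write as well as the
+ * bulk-access fcntl helpers and hfs_vnop_ioctl defined further below.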
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include + +#include + +#include + +#include "hfs.h" +#include "hfs_attrlist.h" +#include "hfs_endian.h" +#include "hfs_fsctl.h" +#include "hfs_quota.h" +#include "FileMgrInternal.h" +#include "BTreesInternal.h" +#include "hfs_cnode.h" +#include "hfs_dbg.h" + +#if HFS_CONFIG_KEY_ROLL +#include "hfs_key_roll.h" +#endif + +#define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2))) + +enum { + MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */ +}; + +/* from bsd/hfs/hfs_vfsops.c */ +extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); + +/* from hfs_hotfiles.c */ +extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid, + uint8_t forktype, uint32_t *pinned); + +static int hfs_clonefile(struct vnode *, int, int, int); +static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *); +static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context); + + +/* + * Read data from a file. + */ +int +hfs_vnop_read(struct vnop_read_args *ap) +{ + /* + struct vnop_read_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + }; + */ + + uio_t uio = ap->a_uio; + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + off_t filesize; + off_t filebytes; + off_t start_resid = uio_resid(uio); + off_t offset = uio_offset(uio); + int retval = 0; + int took_truncate_lock = 0; + int io_throttle = 0; + int throttled_count = 0; + + /* Preflight checks */ + if (!vnode_isreg(vp)) { + /* can only read regular files */ + if (vnode_isdir(vp)) + return (EISDIR); + else + return (EPERM); + } + if (start_resid == 0) + return (0); /* Nothing left to do */ + if (offset < 0) + return (EINVAL); /* cant read from a negative offset */ + +#if SECURE_KERNEL + if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == + (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { + /* Don't allow unencrypted io request from user space */ + return EPERM; + } +#endif + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */ + return 0; + } + /* otherwise read the resource fork normally */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + if (compressed) { + retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp)); + if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) { + (void) hfs_addhotfile(vp); + } + if (compressed) { + if (retval == 0) { + /* successful read, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; + + // + // compressed files are not traditional hot file candidates + // but they may be for CF (which ignores the ff_bytesread + // field) + // + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + VTOF(vp)->ff_bytesread = 0; + } + } + return retval; + } + /* otherwise the file was converted back to a regular file while we were reading it */ + retval = 0; + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + + } + } +#endif /* 
HFS_COMPRESSION */ + + cp = VTOC(vp); + fp = VTOF(vp); + hfsmp = VTOHFS(vp); + +#if CONFIG_PROTECT + if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) { + goto exit; + } + +#if HFS_CONFIG_KEY_ROLL + if (ISSET(ap->a_ioflag, IO_ENCRYPTED)) { + off_rsrc_t off_rsrc = off_rsrc_make(offset + start_resid, + VNODE_IS_RSRC(vp)); + + retval = hfs_key_roll_up_to(ap->a_context, vp, off_rsrc); + if (retval) + goto exit; + } +#endif // HFS_CONFIG_KEY_ROLL +#endif // CONFIG_PROTECT + + /* + * If this read request originated from a syscall (as opposed to + * an in-kernel page fault or something), then set it up for + * throttle checks + */ + if (ap->a_ioflag & IO_SYSCALL_DISPATCH) { + io_throttle = IO_RETURN_ON_THROTTLE; + } + +read_again: + + /* Protect against a size change. */ + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = 1; + + filesize = fp->ff_size; + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + + /* + * Check the file size. Note that per POSIX spec, we return 0 at + * file EOF, so attempting a read at an offset that is too big + * should just return 0 on HFS+. Since the return value was initialized + * to 0 above, we just jump to exit. HFS Standard has its own behavior. + */ + if (offset > filesize) { +#if CONFIG_HFS_STD + if ((hfsmp->hfs_flags & HFS_STANDARD) && + (offset > (off_t)MAXHFSFILESIZE)) { + retval = EFBIG; + } +#endif + goto exit; + } + + KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START, + (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); + + retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle); + + cp->c_touch_acctime = TRUE; + + KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END, + (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0); + + /* + * Keep track blocks read + */ + if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) { + int took_cnode_lock = 0; + off_t bytesread; + + bytesread = start_resid - uio_resid(uio); + + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + took_cnode_lock = 1; + } + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < hfsmp->hfc_timebase) { + struct timeval tv; + + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + + if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) { + // + // We don't add hotfiles for processes doing IO_EVTONLY I/O + // on the assumption that they're system processes such as + // mdworker which scan everything in the system (and thus + // do not represent user-initiated access to files) + // + (void) hfs_addhotfile(vp); + } + if (took_cnode_lock) + hfs_unlock(cp); + } +exit: + if (took_truncate_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + throttled_count++; + + retval = 0; + goto read_again; + } + if (throttled_count) + throttle_info_reset_window(NULL); + return (retval); +} + +/* + * Ideally, this wouldn't be necessary; the cluster code should be + * able to handle this on the read-side. See . 
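+ *
+ * If the current EOF does not sit on a page boundary and the write will
+ * extend the file, zero the range from the EOF up to the end of the EOF
+ * page (or zero_up_to, whichever is smaller) with a zero-fill
+ * cluster_write, so stale data in the EOF page is never exposed.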
+ */ +static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to) +{ + hfs_assert(VTOC(vp)->c_lockowner != current_thread()); + hfs_assert(VTOC(vp)->c_truncatelockowner == current_thread()); + + struct filefork *fp = VTOF(vp); + + if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) { + // Nothing to do + return 0; + } + + zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size)); + + /* N.B. At present, @zero_up_to is not important because the cluster + code will always zero up to the end of the page anyway. */ + return cluster_write(vp, NULL, fp->ff_size, zero_up_to, + fp->ff_size, 0, IO_HEADZEROFILL); +} + +/* + * Write data to a file. + */ +int +hfs_vnop_write(struct vnop_write_args *ap) +{ + uio_t uio = ap->a_uio; + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + kauth_cred_t cred = NULL; + off_t origFileSize; + off_t writelimit; + off_t bytesToAdd = 0; + off_t actualBytesAdded; + off_t filebytes; + off_t offset; + ssize_t resid; + int eflags; + int ioflag = ap->a_ioflag; + int retval = 0; + int lockflags; + int cnode_locked = 0; + int partialwrite = 0; + int do_snapshot = 1; + time_t orig_ctime=VTOC(vp)->c_ctime; + int took_truncate_lock = 0; + int io_return_on_throttle = 0; + int throttled_count = 0; + +#if HFS_COMPRESSION + if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ + int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); + switch(state) { + case FILE_IS_COMPRESSED: + return EACCES; + case FILE_IS_CONVERTING: + /* if FILE_IS_CONVERTING, we allow writes but do not + bother with snapshots or else we will deadlock. + */ + do_snapshot = 0; + break; + default: + printf("invalid state %d for compressed file\n", state); + /* fall through */ + } + } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP); + if (error != 0) { + return error; + } + } + + if (do_snapshot) { + nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio); + } + +#endif + +#if SECURE_KERNEL + if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) == + (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) { + /* Don't allow unencrypted io request from user space */ + return EPERM; + } +#endif + + resid = uio_resid(uio); + offset = uio_offset(uio); + + if (offset < 0) + return (EINVAL); + if (resid == 0) + return (E_NONE); + if (!vnode_isreg(vp)) + return (EPERM); /* Can only write regular files */ + + cp = VTOC(vp); + fp = VTOF(vp); + hfsmp = VTOHFS(vp); + +#if CONFIG_PROTECT + if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) { + goto exit; + } +#endif + + eflags = kEFDeferMask; /* defer file block allocations */ +#if HFS_SPARSE_DEV + /* + * When the underlying device is sparse and space + * is low (< 8MB), stop doing delayed allocations + * and begin doing synchronous I/O. + */ + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && + (hfs_freeblks(hfsmp, 0) < 2048)) { + eflags &= ~kEFDeferMask; + ioflag |= IO_SYNC; + } +#endif /* HFS_SPARSE_DEV */ + + if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) == + (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) { + io_return_on_throttle = IO_RETURN_ON_THROTTLE; + } + +again: + /* + * Protect against a size change. + * + * Note: If took_truncate_lock is true, then we previously got the lock shared + * but needed to upgrade to exclusive. So try getting it exclusive from the + * start. 
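+ *
+ * IO_APPEND writes take the truncate lock exclusive from the start,
+ * since an append with a non-zero resid is guaranteed to extend the file.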
+ */ + if (ioflag & IO_APPEND || took_truncate_lock) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + } + else { + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + } + took_truncate_lock = 1; + + /* Update UIO */ + if (ioflag & IO_APPEND) { + uio_setoffset(uio, fp->ff_size); + offset = fp->ff_size; + } + if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) { + retval = EPERM; + goto exit; + } + + cred = vfs_context_ucred(ap->a_context); + if (cred && suser(cred, NULL) != 0) + eflags |= kEFReserveMask; + + origFileSize = fp->ff_size; + writelimit = offset + resid; + + /* + * We may need an exclusive truncate lock for several reasons, all + * of which are because we may be writing to a (portion of a) block + * for the first time, and we need to make sure no readers see the + * prior, uninitialized contents of the block. The cases are: + * + * 1. We have unallocated (delayed allocation) blocks. We may be + * allocating new blocks to the file and writing to them. + * (A more precise check would be whether the range we're writing + * to contains delayed allocation blocks.) + * 2. We need to extend the file. The bytes between the old EOF + * and the new EOF are not yet initialized. This is important + * even if we're not allocating new blocks to the file. If the + * old EOF and new EOF are in the same block, we still need to + * protect that range of bytes until they are written for the + * first time. + * + * If we had a shared lock with the above cases, we need to try to upgrade + * to an exclusive lock. If the upgrade fails, we will lose the shared + * lock, and will need to take the truncate lock again; the took_truncate_lock + * flag will still be set, causing us to try for an exclusive lock next time. + */ + if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && + ((fp->ff_unallocblocks != 0) || + (writelimit > origFileSize))) { + if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) { + /* + * Lock upgrade failed and we lost our shared lock, try again. + * Note: we do not set took_truncate_lock=0 here. Leaving it + * set to 1 will cause us to try to get the lock exclusive. + */ + goto again; + } + else { + /* Store the owner in the c_truncatelockowner field if we successfully upgrade */ + cp->c_truncatelockowner = current_thread(); + } + } + + if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + goto exit; + } + cnode_locked = 1; + + filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize); + + if (offset > filebytes + && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)), + hfsmp->blockSize) < offset - filebytes)) { + retval = ENOSPC; + goto exit; + } + + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START, + (int)offset, uio_resid(uio), (int)fp->ff_size, + (int)filebytes, 0); + + /* Check if we do not need to extend the file */ + if (writelimit <= filebytes) { + goto sizeok; + } + + bytesToAdd = writelimit - filebytes; + +#if QUOTA + retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)), + cred, 0); + if (retval) + goto exit; +#endif /* QUOTA */ + + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto exit; + } + + while (writelimit > filebytes) { + bytesToAdd = writelimit - filebytes; + + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + /* Files that are changing size are not hot file candidates. 
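+	   (ff_bytesread feeds the hot-file temperature calculation, so it
+	    is reset while the file is being grown)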
*/ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd, + 0, eflags, &actualBytesAdded)); + + hfs_systemfile_unlock(hfsmp, lockflags); + + if ((actualBytesAdded == 0) && (retval == E_NONE)) + retval = ENOSPC; + if (retval != E_NONE) + break; + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE, + (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); + } + (void) hfs_update(vp, 0); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + (void) hfs_end_transaction(hfsmp); + + /* + * If we didn't grow the file enough try a partial write. + * POSIX expects this behavior. + */ + if ((retval == ENOSPC) && (filebytes > offset)) { + retval = 0; + partialwrite = 1; + uio_setresid(uio, (uio_resid(uio) - bytesToAdd)); + resid -= bytesToAdd; + writelimit = filebytes; + } +sizeok: + if (retval == E_NONE) { + off_t filesize; + off_t head_off; + int lflag; + + if (writelimit > fp->ff_size) { + filesize = writelimit; + struct timeval tv; + rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges); + microuptime(&tv); + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; + } else + filesize = fp->ff_size; + + lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY); + + /* + * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except + * for one case below). For the regions that lie before the + * beginning and after the end of this write that are in the + * same page, we let the cluster code handle zeroing that out + * if necessary. If those areas are not cached, the cluster + * code will try and read those areas in, and in the case + * where those regions have never been written to, + * hfs_vnop_blockmap will consult the invalid ranges and then + * indicate that. The cluster code will zero out those areas. + */ + + head_off = trunc_page_64(offset); + + if (head_off < offset && head_off >= fp->ff_size) { + /* + * The first page is beyond current EOF, so as an + * optimisation, we can pass IO_HEADZEROFILL. + */ + lflag |= IO_HEADZEROFILL; + } + + hfs_unlock(cp); + cnode_locked = 0; + + /* + * We need to tell UBC the fork's new size BEFORE calling + * cluster_write, in case any of the new pages need to be + * paged out before cluster_write completes (which does happen + * in embedded systems due to extreme memory pressure). + * Similarly, we need to tell hfs_vnop_pageout what the new EOF + * will be, so that it can pass that on to cluster_pageout, and + * allow those pageouts. + * + * We don't update ff_size yet since we don't want pageins to + * be able to see uninitialized data between the old and new + * EOF, until cluster_write has completed and initialized that + * part of the file. + * + * The vnode pager relies on the file size last given to UBC via + * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or + * ff_size (whichever is larger). NOTE: ff_new_size is always + * zero, unless we are extending the file via write. 
+ */ + if (filesize > fp->ff_size) { + retval = hfs_zero_eof_page(vp, offset); + if (retval) + goto exit; + fp->ff_new_size = filesize; + ubc_setsize(vp, filesize); + } + retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off, + 0, lflag | IO_NOZERODIRTY | io_return_on_throttle); + if (retval) { + fp->ff_new_size = 0; /* no longer extending; use ff_size */ + + if (retval == EAGAIN) { + /* + * EAGAIN indicates that we still have I/O to do, but + * that we now need to be throttled + */ + if (resid != uio_resid(uio)) { + /* + * did manage to do some I/O before returning EAGAIN + */ + resid = uio_resid(uio); + offset = uio_offset(uio); + + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + hfs_incr_gencount(cp); + } + if (filesize > fp->ff_size) { + /* + * we called ubc_setsize before the call to + * cluster_write... since we only partially + * completed the I/O, we need to + * re-adjust our idea of the filesize based + * on our interim EOF + */ + ubc_setsize(vp, offset); + + fp->ff_size = offset; + } + goto exit; + } + if (filesize > origFileSize) { + ubc_setsize(vp, origFileSize); + } + goto ioerr_exit; + } + + if (filesize > origFileSize) { + fp->ff_size = filesize; + + /* Files that are changing size are not hot file candidates. */ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + } + fp->ff_new_size = 0; /* ff_size now has the correct size */ + } + if (partialwrite) { + uio_setresid(uio, (uio_resid(uio) + bytesToAdd)); + resid += bytesToAdd; + } + + if (vnode_should_flush_after_write(vp, ioflag)) + hfs_flush(hfsmp, HFS_FLUSH_CACHE); + +ioerr_exit: + if (!cnode_locked) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + cnode_locked = 1; + } + + if (resid > uio_resid(uio)) { + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + hfs_incr_gencount(cp); + + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ + if (cp->c_mode & (S_ISUID | S_ISGID)) { + cred = vfs_context_ucred(ap->a_context); + if (cred && suser(cred, NULL)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); + } + } + } + if (retval) { + if (ioflag & IO_UNIT) { + (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC, + 0, ap->a_context); + uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio)))); + uio_setresid(uio, resid); + filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; + } + } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) + retval = hfs_update(vp, 0); + + /* Updating vcbWrCnt doesn't need to be atomic. 
*/ + hfsmp->vcbWrCnt++; + + KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END, + (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); +exit: + if (retval && took_truncate_lock + && cp->c_truncatelockowner == current_thread()) { + fp->ff_new_size = 0; + rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges); + } + + if (cnode_locked) + hfs_unlock(cp); + + if (took_truncate_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + if (retval == EAGAIN) { + throttle_lowpri_io(1); + throttled_count++; + + retval = 0; + goto again; + } + if (throttled_count) + throttle_info_reset_window(NULL); + return (retval); +} + +/* support for the "bulk-access" fcntl */ + +#define CACHE_LEVELS 16 +#define NUM_CACHE_ENTRIES (64*16) +#define PARENT_IDS_FLAG 0x100 + +struct access_cache { + int numcached; + int cachehits; /* these two for statistics gathering */ + int lookups; + unsigned int *acache; + unsigned char *haveaccess; +}; + +struct access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + int *file_ids; /* IN: array of file ids */ + gid_t *groups; /* IN: array of groups */ + short *access; /* OUT: access info for each file (0 for 'has access') */ +} __attribute__((unavailable)); // this structure is for reference purposes only + +struct user32_access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + user32_addr_t file_ids; /* IN: array of file ids */ + user32_addr_t groups; /* IN: array of groups */ + user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ +}; + +struct user64_access_t { + uid_t uid; /* IN: effective user id */ + short flags; /* IN: access requested (i.e. R_OK) */ + short num_groups; /* IN: number of groups user belongs to */ + int num_files; /* IN: number of files to process */ + user64_addr_t file_ids; /* IN: array of file ids */ + user64_addr_t groups; /* IN: array of groups */ + user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ +}; + + +// these are the "extended" versions of the above structures +// note that it is crucial that they be different sized than +// the regular version +struct ext_access_t { + uint32_t flags; /* IN: access requested (i.e. R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + uint32_t *file_ids; /* IN: Array of file ids */ + char *bitmap; /* OUT: hash-bitmap of interesting directory ids */ + short *access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents; /* future use */ + cnid_t *parents; /* future use */ +} __attribute__((unavailable)); // this structure is for reference purposes only + +struct user32_ext_access_t { + uint32_t flags; /* IN: access requested (i.e. 
R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + user32_addr_t file_ids; /* IN: Array of file ids */ + user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */ + user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents; /* future use */ + user32_addr_t parents; /* future use */ +}; + +struct user64_ext_access_t { + uint32_t flags; /* IN: access requested (i.e. R_OK) */ + uint32_t num_files; /* IN: number of files to process */ + uint32_t map_size; /* IN: size of the bit map */ + user64_addr_t file_ids; /* IN: array of file ids */ + user64_addr_t bitmap; /* IN: array of groups */ + user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */ + uint32_t num_parents;/* future use */ + user64_addr_t parents;/* future use */ +}; + + +/* + * Perform a binary search for the given parent_id. Return value is + * the index if there is a match. If no_match_indexp is non-NULL it + * will be assigned with the index to insert the item (even if it was + * not found). + */ +static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp) +{ + int index=-1; + unsigned int lo=0; + + do { + unsigned int mid = ((hi - lo)/2) + lo; + unsigned int this_id = array[mid]; + + if (parent_id == this_id) { + hi = mid; + break; + } + + if (parent_id < this_id) { + hi = mid; + continue; + } + + if (parent_id > this_id) { + lo = mid + 1; + continue; + } + } while(lo < hi); + + /* check if lo and hi converged on the match */ + if (parent_id == array[hi]) { + index = hi; + } + + if (no_match_indexp) { + *no_match_indexp = hi; + } + + return index; +} + + +static int +lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id) +{ + unsigned int hi; + int matches = 0; + int index, no_match_index; + + if (cache->numcached == 0) { + *indexp = 0; + return 0; // table is empty, so insert at index=0 and report no match + } + + if (cache->numcached > NUM_CACHE_ENTRIES) { + cache->numcached = NUM_CACHE_ENTRIES; + } + + hi = cache->numcached - 1; + + index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index); + + /* if no existing entry found, find index for new one */ + if (index == -1) { + index = no_match_index; + matches = 0; + } else { + matches = 1; + } + + *indexp = index; + return matches; +} + +/* + * Add a node to the access_cache at the given index (or do a lookup first + * to find the index if -1 is passed in). We currently do a replace rather + * than an insert if the cache is full. + */ +static void +add_node(struct access_cache *cache, int index, cnid_t nodeID, int access) +{ + int lookup_index = -1; + + /* need to do a lookup first if -1 passed for index */ + if (index == -1) { + if (lookup_bucket(cache, &lookup_index, nodeID)) { + if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) { + // only update an entry if the previous access was ESRCH (i.e. 
a scope checking error) + cache->haveaccess[lookup_index] = access; + } + + /* mission accomplished */ + return; + } else { + index = lookup_index; + } + + } + + /* if the cache is full, do a replace rather than an insert */ + if (cache->numcached >= NUM_CACHE_ENTRIES) { + cache->numcached = NUM_CACHE_ENTRIES-1; + + if (index > cache->numcached) { + index = cache->numcached; + } + } + + if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) { + index++; + } + + if (index >= 0 && index < cache->numcached) { + /* only do bcopy if we're inserting */ + bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) ); + bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) ); + } + + cache->acache[index] = nodeID; + cache->haveaccess[index] = access; + cache->numcached++; +} + + +struct cinfo { + uid_t uid; + gid_t gid; + mode_t mode; + cnid_t parentcnid; + u_int16_t recflags; +}; + +static int +snoop_callback(const cnode_t *cp, void *arg) +{ + struct cinfo *cip = arg; + + cip->uid = cp->c_uid; + cip->gid = cp->c_gid; + cip->mode = cp->c_mode; + cip->parentcnid = cp->c_parentcnid; + cip->recflags = cp->c_attr.ca_recflags; + + return (0); +} + +/* + * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item + * isn't incore, then go to the catalog. + */ +static int +do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid, + struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp) +{ + int error = 0; + + /* if this id matches the one the fsctl was called with, skip the lookup */ + if (cnid == skip_cp->c_cnid) { + cnattrp->ca_uid = skip_cp->c_uid; + cnattrp->ca_gid = skip_cp->c_gid; + cnattrp->ca_mode = skip_cp->c_mode; + cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags; + keyp->hfsPlus.parentID = skip_cp->c_parentcnid; + } else { + struct cinfo c_info; + + /* otherwise, check the cnode hash incase the file/dir is incore */ + error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info); + + if (error == EACCES) { + // File is deleted + return ENOENT; + } else if (!error) { + cnattrp->ca_uid = c_info.uid; + cnattrp->ca_gid = c_info.gid; + cnattrp->ca_mode = c_info.mode; + cnattrp->ca_recflags = c_info.recflags; + keyp->hfsPlus.parentID = c_info.parentcnid; + } else { + int lockflags; + + if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp))) + throttle_lowpri_io(1); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* lookup this cnid in the catalog */ + error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp); + + hfs_systemfile_unlock(hfsmp, lockflags); + + cache->lookups++; + } + } + + return (error); +} + + +/* + * Compute whether we have access to the given directory (nodeID) and all its parents. Cache + * up to CACHE_LEVELS as we progress towards the root. 
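+ *
+ * Returns 1 when the caller has sufficient access to every directory in
+ * the parent chain up to the root, and 0 otherwise with *err holding the
+ * errno.  The verdict for each directory visited is remembered in 'cache'
+ * so that later file ids sharing the same ancestors are answered cheaply.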
+ */ +static int +do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID, + struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, + struct vfs_context *my_context, + char *bitmap, + uint32_t map_size, + cnid_t* parents, + uint32_t num_parents) +{ + int myErr = 0; + int myResult; + HFSCatalogNodeID thisNodeID; + unsigned int myPerms; + struct cat_attr cnattr; + int cache_index = -1, scope_index = -1, scope_idx_start = -1; + CatalogKey catkey; + + int i = 0, ids_to_cache = 0; + int parent_ids[CACHE_LEVELS]; + + thisNodeID = nodeID; + while (thisNodeID >= kRootDirID) { + myResult = 0; /* default to "no access" */ + + /* check the cache before resorting to hitting the catalog */ + + /* ASSUMPTION: access info of cached entries is "final"... i.e. no need + * to look any further after hitting cached dir */ + + if (lookup_bucket(cache, &cache_index, thisNodeID)) { + cache->cachehits++; + myErr = cache->haveaccess[cache_index]; + if (scope_index != -1) { + if (myErr == ESRCH) { + myErr = 0; + } + } else { + scope_index = 0; // so we'll just use the cache result + scope_idx_start = ids_to_cache; + } + myResult = (myErr == 0) ? 1 : 0; + goto ExitThisRoutine; + } + + + if (parents) { + int tmp; + tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL); + if (scope_index == -1) + scope_index = tmp; + if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) { + scope_idx_start = ids_to_cache; + } + } + + /* remember which parents we want to cache */ + if (ids_to_cache < CACHE_LEVELS) { + parent_ids[ids_to_cache] = thisNodeID; + ids_to_cache++; + } + // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"... + if (bitmap && map_size) { + bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7)); + } + + + /* do the lookup (checks the cnode hash, then the catalog) */ + myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr); + if (myErr) { + goto ExitThisRoutine; /* no access */ + } + + /* Root always gets access. 
*/ + if (suser(myp_ucred, NULL) == 0) { + thisNodeID = catkey.hfsPlus.parentID; + myResult = 1; + continue; + } + + // if the thing has acl's, do the full permission check + if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { + struct vnode *vp; + + /* get the vnode for this cnid */ + myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0); + if ( myErr ) { + myResult = 0; + goto ExitThisRoutine; + } + + thisNodeID = VTOC(vp)->c_parentcnid; + + hfs_unlock(VTOC(vp)); + + if (vnode_vtype(vp) == VDIR) { + myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context); + } else { + myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context); + } + + vnode_put(vp); + if (myErr) { + myResult = 0; + goto ExitThisRoutine; + } + } else { + unsigned int flags; + int mode = cnattr.ca_mode & S_IFMT; + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr); + + if (mode == S_IFDIR) { + flags = R_OK | X_OK; + } else { + flags = R_OK; + } + if ( (myPerms & flags) != flags) { + myResult = 0; + myErr = EACCES; + goto ExitThisRoutine; /* no access */ + } + + /* up the hierarchy we go */ + thisNodeID = catkey.hfsPlus.parentID; + } + } + + /* if here, we have access to this node */ + myResult = 1; + + ExitThisRoutine: + if (parents && myErr == 0 && scope_index == -1) { + myErr = ESRCH; + } + + if (myErr) { + myResult = 0; + } + *err = myErr; + + /* cache the parent directory(ies) */ + for (i = 0; i < ids_to_cache; i++) { + if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) { + add_node(cache, -1, parent_ids[i], ESRCH); + } else { + add_node(cache, -1, parent_ids[i], myErr); + } + } + + return (myResult); +} + +static int +do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, + struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context) +{ + boolean_t is64bit; + + /* + * NOTE: on entry, the vnode has an io_ref. In case this vnode + * happens to be in our list of file_ids, we'll note it + * avoid calling hfs_chashget_nowait() on that id as that + * will cause a "locking against myself" panic. + */ + Boolean check_leaf = true; + + struct user64_ext_access_t *user_access_structp; + struct user64_ext_access_t tmp_user_access; + struct access_cache cache; + + int error = 0, prev_parent_check_ok=1; + unsigned int i; + + short flags; + unsigned int num_files = 0; + int map_size = 0; + int num_parents = 0; + int *file_ids=NULL; + short *access=NULL; + char *bitmap=NULL; + cnid_t *parents=NULL; + int leaf_index; + + cnid_t cnid; + cnid_t prevParent_cnid = 0; + unsigned int myPerms; + short myaccess = 0; + struct cat_attr cnattr; + CatalogKey catkey; + struct cnode *skip_cp = VTOC(vp); + kauth_cred_t cred = vfs_context_ucred(context); + proc_t p = vfs_context_proc(context); + + is64bit = proc_is64bit(p); + + /* initialize the local cache and buffers */ + cache.numcached = 0; + cache.cachehits = 0; + cache.lookups = 0; + cache.acache = NULL; + cache.haveaccess = NULL; + + /* struct copyin done during dispatch... 
need to copy file_id array separately */ + if (ap->a_data == NULL) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if (is64bit) { + if (arg_size != sizeof(struct user64_ext_access_t)) { + error = EINVAL; + goto err_exit_bulk_access; + } + + user_access_structp = (struct user64_ext_access_t *)ap->a_data; + + } else if (arg_size == sizeof(struct user32_access_t)) { + struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data; + + // convert an old style bulk-access struct to the new style + tmp_user_access.flags = accessp->flags; + tmp_user_access.num_files = accessp->num_files; + tmp_user_access.map_size = 0; + tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); + tmp_user_access.bitmap = USER_ADDR_NULL; + tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); + tmp_user_access.num_parents = 0; + user_access_structp = &tmp_user_access; + + } else if (arg_size == sizeof(struct user32_ext_access_t)) { + struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data; + + // up-cast from a 32-bit version of the struct + tmp_user_access.flags = accessp->flags; + tmp_user_access.num_files = accessp->num_files; + tmp_user_access.map_size = accessp->map_size; + tmp_user_access.num_parents = accessp->num_parents; + + tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids); + tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap); + tmp_user_access.access = CAST_USER_ADDR_T(accessp->access); + tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents); + + user_access_structp = &tmp_user_access; + } else { + error = EINVAL; + goto err_exit_bulk_access; + } + + map_size = user_access_structp->map_size; + + num_files = user_access_structp->num_files; + + num_parents= user_access_structp->num_parents; + + if (num_files < 1) { + goto err_exit_bulk_access; + } + if (num_files > 1024) { + error = EINVAL; + goto err_exit_bulk_access; + } + + if (num_parents > 1024) { + error = EINVAL; + goto err_exit_bulk_access; + } + + file_ids = hfs_malloc(sizeof(int) * num_files); + access = hfs_malloc(sizeof(short) * num_files); + if (map_size) { + bitmap = hfs_mallocz(sizeof(char) * map_size); + } + + if (num_parents) { + parents = hfs_malloc(sizeof(cnid_t) * num_parents); + } + + cache.acache = hfs_malloc(sizeof(int) * NUM_CACHE_ENTRIES); + cache.haveaccess = hfs_malloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES); + + if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids, + num_files * sizeof(int)))) { + goto err_exit_bulk_access; + } + + if (num_parents) { + if ((error = copyin(user_access_structp->parents, (caddr_t)parents, + num_parents * sizeof(cnid_t)))) { + goto err_exit_bulk_access; + } + } + + flags = user_access_structp->flags; + if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) { + flags = R_OK; + } + + /* check if we've been passed leaf node ids or parent ids */ + if (flags & PARENT_IDS_FLAG) { + check_leaf = false; + } + + /* Check access to each file_id passed in */ + for (i = 0; i < num_files; i++) { + leaf_index=-1; + cnid = (cnid_t) file_ids[i]; + + /* root always has access */ + if ((!parents) && (!suser(cred, NULL))) { + access[i] = 0; + continue; + } + + if (check_leaf) { + /* do the lookup (checks the cnode hash, then the catalog) */ + error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr); + if (error) { + access[i] = (short) error; + continue; + } + + if (parents) { + // Check if the leaf matches one of the parent scopes + leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL); + if 
(leaf_index >= 0 && parents[leaf_index] == cnid) + prev_parent_check_ok = 0; + else if (leaf_index >= 0) + prev_parent_check_ok = 1; + } + + // if the thing has acl's, do the full permission check + if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) { + struct vnode *cvp; + int myErr = 0; + /* get the vnode for this cnid */ + myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0); + if ( myErr ) { + access[i] = myErr; + continue; + } + + hfs_unlock(VTOC(cvp)); + + if (vnode_vtype(cvp) == VDIR) { + myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context); + } else { + myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context); + } + + vnode_put(cvp); + if (myErr) { + access[i] = myErr; + continue; + } + } else { + /* before calling CheckAccess(), check the target file for read access */ + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, + cnattr.ca_mode, hfsmp->hfs_mp, cred, p); + + /* fail fast if no access */ + if ((myPerms & flags) == 0) { + access[i] = EACCES; + continue; + } + } + } else { + /* we were passed an array of parent ids */ + catkey.hfsPlus.parentID = cnid; + } + + /* if the last guy had the same parent and had access, we're done */ + if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) { + cache.cachehits++; + access[i] = 0; + continue; + } + + myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID, + skip_cp, p, cred, context,bitmap, map_size, parents, num_parents); + + if (myaccess || (error == ESRCH && leaf_index != -1)) { + access[i] = 0; // have access.. no errors to report + } else { + access[i] = (error != 0 ? (short) error : EACCES); + } + + prevParent_cnid = catkey.hfsPlus.parentID; + } + + /* copyout the access array */ + if ((error = copyout((caddr_t)access, user_access_structp->access, + num_files * sizeof (short)))) { + goto err_exit_bulk_access; + } + if (map_size && bitmap) { + if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap, + map_size * sizeof (char)))) { + goto err_exit_bulk_access; + } + } + + + err_exit_bulk_access: + + hfs_free(file_ids, sizeof(int) * num_files); + hfs_free(parents, sizeof(cnid_t) * num_parents); + hfs_free(bitmap, sizeof(char) * map_size); + hfs_free(access, sizeof(short) * num_files); + hfs_free(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES); + hfs_free(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES); + + return (error); +} + + +/* end "bulk-access" support */ + + +/* + * Control filesystem operating characteristics. 
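+ *
+ * This is the dispatch point for the HFSIOC_* selectors handled by the
+ * switch statement below (HFSIOC_GETPATH, HFSIOC_SET_MAX_DEFRAG_SIZE,
+ * HFSIOC_FORCE_ENABLE_DEFRAG, HFSIOC_TRANSFER_DOCUMENT_ID, and others).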
+ */ +int +hfs_vnop_ioctl( struct vnop_ioctl_args /* { + vnode_t a_vp; + long a_command; + caddr_t a_data; + int a_fflag; + vfs_context_t a_context; + } */ *ap) +{ + struct vnode * vp = ap->a_vp; + struct hfsmount *hfsmp = VTOHFS(vp); + vfs_context_t context = ap->a_context; + kauth_cred_t cred = vfs_context_ucred(context); + proc_t p = vfs_context_proc(context); + struct vfsstatfs *vfsp; + boolean_t is64bit; + off_t jnl_start, jnl_size; + struct hfs_journal_info *jip; +#if HFS_COMPRESSION + int compressed = 0; + off_t uncompressed_size = -1; + int decmpfs_error = 0; + + if (ap->a_command == F_RDADVISE) { + /* we need to inspect the decmpfs state of the file as early as possible */ + compressed = hfs_file_is_compressed(VTOC(vp), 0); + if (compressed) { + if (VNODE_IS_RSRC(vp)) { + /* if this is the resource fork, treat it as if it were empty */ + uncompressed_size = 0; + } else { + decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0); + if (decmpfs_error != 0) { + /* failed to get the uncompressed size, we'll check for this later */ + uncompressed_size = -1; + } + } + } + } +#endif /* HFS_COMPRESSION */ + + is64bit = proc_is64bit(p); + +#if CONFIG_PROTECT +#if HFS_CONFIG_KEY_ROLL + // The HFSIOC_KEY_ROLL fsctl does its own access checks + if (ap->a_command != HFSIOC_KEY_ROLL) +#endif + { + int error = 0; + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { + return error; + } + } +#endif /* CONFIG_PROTECT */ + + switch (ap->a_command) { + + case HFSIOC_GETPATH: + { + struct vnode *file_vp; + cnid_t cnid; + int outlen; + char *bufptr; + int error; + int flags = 0; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); + } + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + bufptr = (char *)ap->a_data; + cnid = strtoul(bufptr, NULL, 10); + if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) { + flags |= BUILDPATH_VOLUME_RELATIVE; + } + + /* We need to call hfs_vfs_vget to leverage the code that will + * fix the origin list for us if needed, as opposed to calling + * hfs_vget, since we will need the parent for build_path call. 
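+ *
+ * Caller-side, the fsctl is symmetric in its buffer: the CNID goes in as a
+ * decimal string and the resolved path comes back in the same buffer. A
+ * minimal, hypothetical user-space sketch (the variable names and the use
+ * of the fsctl() wrapper from <sys/fsctl.h> are assumptions, illustrative
+ * only):
+ *
+ *     char buf[MAXPATHLEN];                    // pathname_t-sized buffer
+ *     snprintf(buf, sizeof(buf), "%u", cnid);  // CNID in ...
+ *     if (fsctl(mount_point, HFSIOC_GETPATH, buf, 0) == 0)
+ *         printf("%s\n", buf);                 // ... full path out
+ *
+ * The owner/volume-root checks earlier in this case still apply.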
+ */ + + if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) { + return (error); + } + + error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context); + vnode_put(file_vp); + + return (error); + } + + case HFSIOC_SET_MAX_DEFRAG_SIZE: + { + int error = 0; /* Assume success */ + u_int32_t maxsize = 0; + + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (!kauth_cred_issuser(cred)) { + return (EACCES); /* must be root */ + } + + maxsize = *(u_int32_t *)ap->a_data; + + hfs_lock_mount(hfsmp); + if (maxsize > HFS_MAX_DEFRAG_SIZE) { + error = EINVAL; + } + else { + hfsmp->hfs_defrag_max = maxsize; + } + hfs_unlock_mount(hfsmp); + + return (error); + } + + case HFSIOC_FORCE_ENABLE_DEFRAG: + { + int error = 0; /* Assume success */ + u_int32_t do_enable = 0; + + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (!kauth_cred_issuser(cred)) { + return (EACCES); /* must be root */ + } + + do_enable = *(u_int32_t *)ap->a_data; + + hfs_lock_mount(hfsmp); + if (do_enable != 0) { + hfsmp->hfs_defrag_nowait = 1; + } + else { + error = EINVAL; + } + + hfs_unlock_mount(hfsmp); + + return (error); + } + + + case HFSIOC_TRANSFER_DOCUMENT_ID: + { + struct cnode *cp = NULL; + int error; + u_int32_t to_fd = *(u_int32_t *)ap->a_data; + struct fileproc *to_fp; + struct vnode *to_vp; + struct cnode *to_cp; + + cp = VTOC(vp); + + if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) { + //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error); + return error; + } + if ( (error = vnode_getwithref(to_vp)) ) { + file_drop(to_fd); + return error; + } + + if (VTOHFS(to_vp) != hfsmp) { + error = EXDEV; + goto transfer_cleanup; + } + + int need_unlock = 1; + to_cp = VTOC(to_vp); + error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); + if (error != 0) { + //printf("could not lock the pair of cnodes (error %d)\n", error); + goto transfer_cleanup; + } + + if (!(cp->c_bsdflags & UF_TRACKED)) { + error = EINVAL; + } else if (to_cp->c_bsdflags & UF_TRACKED) { + // + // if the destination is already tracked, return an error + // as otherwise it's a silent deletion of the target's + // document-id + // + error = EEXIST; + } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + // + // we can use the FndrExtendedFileInfo because the doc-id is the first + // thing in both it and the ExtendedDirInfo struct which is fixed in + // format and can not change layout + // + struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16); + struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16); + + if (f_extinfo->document_id == 0) { + uint32_t new_id; + + hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id + + if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) { + // + // re-lock the pair now that we have the document-id + // + hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK); + f_extinfo->document_id = new_id; + } else { + goto transfer_cleanup; + } + } + + to_extinfo->document_id = f_extinfo->document_id; + f_extinfo->document_id = 0; + //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid); + + // make sure the destination is also UF_TRACKED + to_cp->c_bsdflags |= UF_TRACKED; + cp->c_bsdflags &= ~UF_TRACKED; + + // mark the cnodes dirty + cp->c_flag |= C_MODIFIED; + 
to_cp->c_flag |= C_MODIFIED; + + int lockflags; + if ((error = hfs_start_transaction(hfsmp)) == 0) { + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); + (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL); + + hfs_systemfile_unlock (hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + } + + add_fsevent(FSE_DOCID_CHANGED, context, + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # + FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode # + FSE_ARG_INT32, to_extinfo->document_id, + FSE_ARG_DONE); + + hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents + need_unlock = 0; + + if (need_fsevent(FSE_STAT_CHANGED, vp)) { + add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE); + } + if (need_fsevent(FSE_STAT_CHANGED, to_vp)) { + add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE); + } + } + + if (need_unlock) { + hfs_unlockpair(cp, to_cp); + } + + transfer_cleanup: + vnode_put(to_vp); + file_drop(to_fd); + + return error; + } + + + + case HFSIOC_PREV_LINK: + case HFSIOC_NEXT_LINK: + { + cnid_t linkfileid; + cnid_t nextlinkid; + cnid_t prevlinkid; + int error; + + /* Caller must be owner of file system. */ + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); + } + /* Target vnode must be file system's root. */ + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + linkfileid = *(cnid_t *)ap->a_data; + if (linkfileid < kHFSFirstUserCatalogNodeID) { + return (EINVAL); + } + if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { + return (error); + } + if (ap->a_command == HFSIOC_NEXT_LINK) { + *(cnid_t *)ap->a_data = nextlinkid; + } else { + *(cnid_t *)ap->a_data = prevlinkid; + } + return (0); + } + + case HFSIOC_RESIZE_PROGRESS: { + + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + /* file system must not be mounted read-only */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data); + } + + case HFSIOC_RESIZE_VOLUME: { + u_int64_t newsize; + u_int64_t cursize; + int ret; + + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + + /* filesystem must not be mounted read only */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + newsize = *(u_int64_t *)ap->a_data; + cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + + if (newsize == cursize) { + return (0); + } + IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize); + if (newsize > cursize) { + ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context); + } else { + ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context); + } + IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize); + return (ret); + } + case HFSIOC_CHANGE_NEXT_ALLOCATION: { + int error = 0; /* Assume success */ + u_int32_t location; + + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* 
must be owner of file system */ + } + if (!vnode_isvroot(vp)) { + return (EINVAL); + } + hfs_lock_mount(hfsmp); + location = *(u_int32_t *)ap->a_data; + if ((location >= hfsmp->allocLimit) && + (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) { + error = EINVAL; + goto fail_change_next_allocation; + } + /* Return previous value. */ + *(u_int32_t *)ap->a_data = hfsmp->nextAllocation; + if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) { + /* On magic value for location, set nextAllocation to next block + * after metadata zone and set flag in mount structure to indicate + * that nextAllocation should not be updated again. + */ + if (hfsmp->hfs_metazone_end != 0) { + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1); + } + hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION; + } else { + hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION; + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location); + } + MarkVCBDirty(hfsmp); +fail_change_next_allocation: + hfs_unlock_mount(hfsmp); + return (error); + } + +#if HFS_SPARSE_DEV + case HFSIOC_SETBACKINGSTOREINFO: { + struct vnode * di_vp; + struct hfs_backingstoreinfo *bsdata; + int error = 0; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + return (EALREADY); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + bsdata = (struct hfs_backingstoreinfo *)ap->a_data; + if (bsdata == NULL) { + return (EINVAL); + } + if ((error = file_vnode(bsdata->backingfd, &di_vp))) { + return (error); + } + if ((error = vnode_getwithref(di_vp))) { + file_drop(bsdata->backingfd); + return(error); + } + + if (vnode_mount(vp) == vnode_mount(di_vp)) { + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); + return (EINVAL); + } + + // Dropped in unmount + vnode_ref(di_vp); + + hfs_lock_mount(hfsmp); + hfsmp->hfs_backingvp = di_vp; + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4; + hfs_unlock_mount(hfsmp); + + /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */ + + /* + * If the sparse image is on a sparse image file (as opposed to a sparse + * bundle), then we may need to limit the free space to the maximum size + * of a file on that volume. So we query (using pathconf), and if we get + * a meaningful result, we cache the number of blocks for later use in + * hfs_freeblks(). + */ + hfsmp->hfs_backingfs_maxblocks = 0; + if (vnode_vtype(di_vp) == VREG) { + int terr; + int hostbits; + terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context); + if (terr == 0 && hostbits != 0 && hostbits < 64) { + u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits; + + hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize; + } + } + + /* The free extent cache is managed differently for sparse devices. + * There is a window between which the volume is mounted and the + * device is marked as sparse, so the free extent cache for this + * volume is currently initialized as normal volume (sorted by block + * count). Reset the cache so that it will be rebuilt again + * for sparse device (sorted by start block). 
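+ *
+ * Worked example for the sizing done just above (illustrative figures,
+ * assuming a 4 KiB allocation block): a host volume reporting
+ * _PC_FILESIZEBITS of 32 caps hfs_backingfs_maxblocks at
+ * 2^32 / 4096 = 1,048,576 blocks, and a bandsize of 8 MiB yields
+ * hfs_sparsebandblks = (8 MiB / 4 KiB) * 4 = 8192 blocks.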
+ */ + ResetVCBFreeExtCache(hfsmp); + + (void)vnode_put(di_vp); + file_drop(bsdata->backingfd); + return (0); + } + + case HFSIOC_CLRBACKINGSTOREINFO: { + struct vnode * tmpvp; + + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && + hfsmp->hfs_backingvp) { + + hfs_lock_mount(hfsmp); + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + tmpvp = hfsmp->hfs_backingvp; + hfsmp->hfs_backingvp = NULLVP; + hfsmp->hfs_sparsebandblks = 0; + hfs_unlock_mount(hfsmp); + + vnode_rele(tmpvp); + } + return (0); + } +#endif /* HFS_SPARSE_DEV */ + + /* Change the next CNID stored in the VH */ + case HFSIOC_CHANGE_NEXTCNID: { + int error = 0; /* Assume success */ + u_int32_t fileid; + int wraparound = 0; + int lockflags = 0; + + if (vnode_vfsisrdonly(vp)) { + return (EROFS); + } + vfsp = vfs_statfs(HFSTOVFS(hfsmp)); + if (suser(cred, NULL) && + kauth_cred_getuid(cred) != vfsp->f_owner) { + return (EACCES); /* must be owner of file system */ + } + + fileid = *(u_int32_t *)ap->a_data; + + /* Must have catalog lock excl. to advance the CNID pointer */ + lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK); + + hfs_lock_mount(hfsmp); + + /* If it is less than the current next CNID, force the wraparound bit to be set */ + if (fileid < hfsmp->vcbNxtCNID) { + wraparound=1; + } + + /* Return previous value. */ + *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID; + + hfsmp->vcbNxtCNID = fileid; + + if (wraparound) { + hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask; + } + + MarkVCBDirty(hfsmp); + hfs_unlock_mount(hfsmp); + hfs_systemfile_unlock (hfsmp, lockflags); + + return (error); + } + + case F_FREEZE_FS: { + struct mount *mp; + + mp = vnode_mount(vp); + hfsmp = VFSTOHFS(mp); + + if (!(hfsmp->jnl)) + return (ENOTSUP); + + vfsp = vfs_statfs(mp); + + if (kauth_cred_getuid(cred) != vfsp->f_owner && + !kauth_cred_issuser(cred)) + return (EACCES); + + return hfs_freeze(hfsmp); + } + + case F_THAW_FS: { + vfsp = vfs_statfs(vnode_mount(vp)); + if (kauth_cred_getuid(cred) != vfsp->f_owner && + !kauth_cred_issuser(cred)) + return (EACCES); + + return hfs_thaw(hfsmp, current_proc()); + } + + case HFSIOC_EXT_BULKACCESS32: + case HFSIOC_EXT_BULKACCESS64: { + int size; +#if CONFIG_HFS_STD + if (hfsmp->hfs_flags & HFS_STANDARD) { + return EINVAL; + } +#endif + + if (is64bit) { + size = sizeof(struct user64_ext_access_t); + } else { + size = sizeof(struct user32_ext_access_t); + } + + return do_bulk_access_check(hfsmp, vp, ap, size, context); + } + + case HFSIOC_SET_XATTREXTENTS_STATE: { + int state; + + if (ap->a_data == NULL) { + return (EINVAL); + } + + state = *(int *)ap->a_data; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + /* Super-user can enable or disable extent-based extended + * attribute support on a volume + * Note: Starting Mac OS X 10.7, extent-based extended attributes + * are enabled by default, so any change will be transient only + * till the volume is remounted. + */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return (EPERM); + } + if (state == 0 || state == 1) + return hfs_set_volxattr(hfsmp, HFSIOC_SET_XATTREXTENTS_STATE, state); + else + return (EINVAL); + } + + case F_SETSTATICCONTENT: { + int error; + int enable_static = 0; + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. 
+ * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data) { + /* + * Note that even though ap->a_data is of type caddr_t, + * the fcntl layer at the syscall handler will pass in NULL + * or 1 depending on what the argument supplied to the fcntl + * was. So it is in fact correct to check the ap->a_data + * argument for zero or non-zero value when deciding whether or not + * to enable the static bit in the cnode. + */ + enable_static = 1; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + if (enable_static) { + cp->c_flag |= C_SSD_STATIC; + } + else { + cp->c_flag &= ~C_SSD_STATIC; + } + hfs_unlock (cp); + } + return error; + } + + case F_SET_GREEDY_MODE: { + int error; + int enable_greedy_mode = 0; + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data) { + /* + * Note that even though ap->a_data is of type caddr_t, + * the fcntl layer at the syscall handler will pass in NULL + * or 1 depending on what the argument supplied to the fcntl + * was. So it is in fact correct to check the ap->a_data + * argument for zero or non-zero value when deciding whether or not + * to enable the greedy mode bit in the cnode. + */ + enable_greedy_mode = 1; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + if (enable_greedy_mode) { + cp->c_flag |= C_SSD_GREEDY_MODE; + } + else { + cp->c_flag &= ~C_SSD_GREEDY_MODE; + } + hfs_unlock (cp); + } + return error; + } + + case F_SETIOTYPE: { + int error; + uint32_t iotypeflag = 0; + + struct cnode *cp = NULL; + /* + * lock the cnode, decorate the cnode flag, and bail out. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data == NULL) { + return EINVAL; + } + + /* + * Note that even though ap->a_data is of type caddr_t, we + * can only use 32 bits of flag values. + */ + iotypeflag = (uint32_t) ap->a_data; + switch (iotypeflag) { + case F_IOTYPE_ISOCHRONOUS: + break; + default: + return EINVAL; + } + + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + cp = VTOC(vp); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + switch (iotypeflag) { + case F_IOTYPE_ISOCHRONOUS: + cp->c_flag |= C_IO_ISOCHRONOUS; + break; + default: + break; + } + hfs_unlock (cp); + } + return error; + } + + case F_MAKECOMPRESSED: { + int error = 0; + uint32_t gen_counter; + struct cnode *cp = NULL; + int reset_decmp = 0; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + + /* + * acquire & lock the cnode. + * VFS should have already authenticated the caller for us. + */ + + if (ap->a_data) { + /* + * Cast the pointer into a uint32_t so we can extract the + * supplied generation counter. + */ + gen_counter = *((uint32_t*)ap->a_data); + } + else { + return EINVAL; + } + +#if HFS_COMPRESSION + cp = VTOC(vp); + /* Grab truncate lock first; we may truncate the file */ + hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return error; + } + + /* Are there any other usecounts/FDs? 
*/ + if (vnode_isinuse(vp, 1)) { + hfs_unlock(cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return EBUSY; + } + + /* now we have the cnode locked down; Validate arguments */ + if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) { + /* EINVAL if you are trying to manipulate an IMMUTABLE file */ + hfs_unlock(cp); + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + return EINVAL; + } + + if ((hfs_get_gencount (cp)) == gen_counter) { + /* + * OK, the gen_counter matched. Go for it: + * Toggle state bits, truncate file, and suppress mtime update + */ + reset_decmp = 1; + cp->c_bsdflags |= UF_COMPRESSED; + + error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES, + ap->a_context); + } + else { + error = ESTALE; + } + + /* Unlock cnode before executing decmpfs ; they may need to get an EA */ + hfs_unlock(cp); + + /* + * Reset the decmp state while still holding the truncate lock. We need to + * serialize here against a listxattr on this node which may occur at any + * time. + * + * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed, + * that will still potentially require getting the com.apple.decmpfs EA. If the + * EA is required, then we can't hold the cnode lock, because the getxattr call is + * generic(through VFS), and can't pass along any info telling it that we're already + * holding it (the lock). If we don't serialize, then we risk listxattr stopping + * and trying to fill in the hfs_file_is_compressed info during the callback + * operation, which will result in deadlock against the b-tree node. + * + * So, to serialize against listxattr (which will grab buf_t meta references on + * the b-tree blocks), we hold the truncate lock as we're manipulating the + * decmpfs payload. + */ + if ((reset_decmp) && (error == 0)) { + decmpfs_cnode *dp = VTOCMP (vp); + if (dp != NULL) { + decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); + } + + /* Initialize the decmpfs node as needed */ + (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */ + } + + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + +#endif + return error; + } + + case F_SETBACKINGSTORE: { + + int error = 0; + + /* + * See comment in F_SETSTATICCONTENT re: using + * a null check for a_data + */ + if (ap->a_data) { + error = hfs_set_backingstore (vp, 1); + } + else { + error = hfs_set_backingstore (vp, 0); + } + + return error; + } + + case F_GETPATH_MTMINFO: { + int error = 0; + + int *data = (int*) ap->a_data; + + /* Ask if this is a backingstore vnode */ + error = hfs_is_backingstore (vp, data); + + return error; + } + + case F_FULLFSYNC: { + int error; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p); + hfs_unlock(VTOC(vp)); + } + + return error; + } + + case F_BARRIERFSYNC: { + int error; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p); + hfs_unlock(VTOC(vp)); + } + + return error; + } + + case F_CHKCLEAN: { + register struct cnode *cp; + int error; + + if (!vnode_isreg(vp)) + return EINVAL; + + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error == 0) { + cp = VTOC(vp); + /* + * used by regression test to determine if + * all the dirty pages (via write) have been cleaned + * after a call to 'fsysnc'. 
+ */ + error = is_file_clean(vp, VTOF(vp)->ff_size); + hfs_unlock(cp); + } + return (error); + } + + case F_RDADVISE: { + register struct radvisory *ra; + struct filefork *fp; + int error; + + if (!vnode_isreg(vp)) + return EINVAL; + + ra = (struct radvisory *)(ap->a_data); + fp = VTOF(vp); + + /* Protect against a size change. */ + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + +#if HFS_COMPRESSION + if (compressed) { + if (uncompressed_size == -1) { + /* fetching the uncompressed size failed above, so return the error */ + error = decmpfs_error; + } else if (ra->ra_offset >= uncompressed_size) { + error = EFBIG; + } else { + error = advisory_read(vp, uncompressed_size, ra->ra_offset, ra->ra_count); + } + } else +#endif /* HFS_COMPRESSION */ + if (ra->ra_offset >= fp->ff_size) { + error = EFBIG; + } else { + error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); + } + + hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT); + return (error); + } + + case HFSIOC_GET_VOL_CREATE_TIME_32: { + *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); + return 0; + } + + case HFSIOC_GET_VOL_CREATE_TIME_64: { + *(user64_time_t *)(ap->a_data) = (user64_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate)); + return 0; + } + + case SPOTLIGHT_IOC_GET_MOUNT_TIME: + *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time; + break; + + case SPOTLIGHT_IOC_GET_LAST_MTIME: + *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime; + break; + + case HFSIOC_GET_VERY_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit; + break; + + case HFSIOC_SET_VERY_LOW_DISK: + if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) { + return EINVAL; + } + + hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data; + break; + + case HFSIOC_GET_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit; + break; + + case HFSIOC_SET_LOW_DISK: + if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel + || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) { + + return EINVAL; + } + + hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data; + break; + + /* The following two fsctls were ported from apfs. 
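+ *
+ * Together with the HFSIOC_*_VERY_LOW_DISK, *_LOW_DISK and *_DESIRED_DISK
+ * selectors around them, the setters reject any value that would break the
+ * ordering of the free-space thresholds:
+ *
+ *     dangerlimit < warninglimit < nearwarninglimit < desiredlevel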
*/ + case APFSIOC_GET_NEAR_LOW_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_nearwarninglimit; + break; + + case APFSIOC_SET_NEAR_LOW_DISK: + if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel + || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) { + return EINVAL; + } + + hfsmp->hfs_freespace_notify_nearwarninglimit = *(uint32_t *)ap->a_data; + break; + + case HFSIOC_GET_DESIRED_DISK: + *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel; + break; + + case HFSIOC_SET_DESIRED_DISK: + if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) { + return EINVAL; + } + + hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data; + break; + + case HFSIOC_VOLUME_STATUS: + *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions; + break; + + case HFS_SET_BOOT_INFO: + if (!vnode_isvroot(vp)) + return(EINVAL); + if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner)) + return(EACCES); /* must be superuser or owner of filesystem */ + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + hfs_lock_mount (hfsmp); + bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo)); + /* Null out the cached UUID, to be safe */ + uuid_clear (hfsmp->hfs_full_uuid); + hfs_unlock_mount (hfsmp); + (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); + break; + + case HFS_GET_BOOT_INFO: + if (!vnode_isvroot(vp)) + return(EINVAL); + hfs_lock_mount (hfsmp); + bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo)); + hfs_unlock_mount(hfsmp); + break; + + /* case HFS_MARK_BOOT_CORRUPT: _IO are the same */ + case HFSIOC_MARK_BOOT_CORRUPT: + /* Mark the boot volume corrupt by setting + * kHFSVolumeInconsistentBit in the volume header. This will + * force fsck_hfs on next mount. 
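+ *
+ * A hypothetical caller-side sketch (the fsctl() call and the "/" path are
+ * assumptions; the request carries no payload and, per the checks below,
+ * must be issued by root against the boot volume's root):
+ *
+ *     int rc = fsctl("/", HFSIOC_MARK_BOOT_CORRUPT, NULL, 0);  // 0 on success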
+ */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* Allowed only on the root vnode of the boot volume */ + if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) || + !vnode_isvroot(vp)) { + return EINVAL; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n"); + hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED); + break; + + case HFSIOC_GET_JOURNAL_INFO: + jip = (struct hfs_journal_info*)ap->a_data; + + if (vp == NULLVP) + return EINVAL; + + if (hfsmp->jnl == NULL) { + jnl_start = 0; + jnl_size = 0; + } else { + jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset; + jnl_size = hfsmp->jnl_size; + } + + jip->jstart = jnl_start; + jip->jsize = jnl_size; + break; + + case HFSIOC_SET_ALWAYS_ZEROFILL: { + struct cnode *cp = VTOC(vp); + + if (*(int *)ap->a_data) { + cp->c_flag |= C_ALWAYS_ZEROFILL; + } else { + cp->c_flag &= ~C_ALWAYS_ZEROFILL; + } + break; + } + + /* case HFS_DISABLE_METAZONE: _IO are the same */ + case HFSIOC_DISABLE_METAZONE: { + /* Only root can disable metadata zone */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + /* Disable metadata zone now */ + (void) hfs_metadatazone_init(hfsmp, true); + printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN); + break; + } + + + case HFSIOC_FSINFO_METADATA_BLOCKS: { + int error; + struct hfsinfo_metadata *hinfo; + + hinfo = (struct hfsinfo_metadata *)ap->a_data; + + /* Get information about number of metadata blocks */ + error = hfs_getinfo_metadata_blocks(hfsmp, hinfo); + if (error) { + return error; + } + + break; + } + + case HFSIOC_GET_FSINFO: { + hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data; + + /* Only root is allowed to get fsinfo */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* + * Make sure that the caller's version number matches with + * the kernel's version number. This will make sure that + * if the structures being read/written into are changed + * by the kernel, the caller will not read incorrect data. + * + * The first three fields --- request_type, version and + * flags are same for all the hfs_fsinfo structures, so + * we can access the version number by assuming any + * structure for now. + */ + if (fsinfo->header.version != HFS_FSINFO_VERSION) { + return ENOTSUP; + } + + /* Make sure that the current file system is not marked inconsistent */ + if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) { + return EIO; + } + + return hfs_get_fsinfo(hfsmp, ap->a_data); + } + + case HFSIOC_CS_FREESPACE_TRIM: { + int error = 0; + int lockflags = 0; + + /* Only root allowed */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + /* + * This core functionality is similar to hfs_scan_blocks(). + * The main difference is that hfs_scan_blocks() is called + * as part of mount where we are assured that the journal is + * empty to start with. This fcntl() can be called on a + * mounted volume, therefore it has to flush the content of + * the journal as well as ensure the state of summary table. + * + * This fcntl scans over the entire allocation bitmap, + * creates list of all the free blocks, and issues TRIM + * down to the underlying device. This can take long time + * as it can generate up to 512MB of read I/O. 
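+ *
+ * In outline (an assumed summary; the detailed rationale follows in the
+ * comments below):
+ *
+ *   1. hfs_init_summary()                  if the summary table is absent
+ *   2. hfs_journal_lock()                  block new transactions
+ *   3. hfs_flush(HFS_FLUSH_JOURNAL_META)   drain the journal and its trim list
+ *   4. hfs_systemfile_lock(SFL_BITMAP)     freeze the allocation bitmap
+ *   5. hfs_journal_unlock()                safe; bitmap lock now blocks frees
+ *   6. buf_invalidateblks(allocation vp)   avoid buf_t collisions during scan
+ *   7. ScanUnmapBlocks()                   walk the bitmap, issue DKIOCUNMAPs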
+ */ + + if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) { + error = hfs_init_summary(hfsmp); + if (error) { + printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN); + return error; + } + } + + /* + * The journal maintains list of recently deallocated blocks to + * issue DKIOCUNMAPs when the corresponding journal transaction is + * flushed to the disk. To avoid any race conditions, we only + * want one active trim list and only one thread issuing DKIOCUNMAPs. + * Therefore we make sure that the journal trim list is sync'ed, + * empty, and not modifiable for the duration of our scan. + * + * Take the journal lock before flushing the journal to the disk. + * We will keep on holding the journal lock till we don't get the + * bitmap lock to make sure that no new journal transactions can + * start. This will make sure that the journal trim list is not + * modified after the journal flush and before getting bitmap lock. + * We can release the journal lock after we acquire the bitmap + * lock as it will prevent any further block deallocations. + */ + hfs_journal_lock(hfsmp); + + /* Flush the journal and wait for all I/Os to finish up */ + error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + if (error) { + hfs_journal_unlock(hfsmp); + return error; + } + + /* Take bitmap lock to ensure it is not being modified */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* Release the journal lock */ + hfs_journal_unlock(hfsmp); + + /* + * ScanUnmapBlocks reads the bitmap in large block size + * (up to 1MB) unlike the runtime which reads the bitmap + * in the 4K block size. This can cause buf_t collisions + * and potential data corruption. To avoid this, we + * invalidate all the existing buffers associated with + * the bitmap vnode before scanning it. + * + * Note: ScanUnmapBlock() cleans up all the buffers + * after itself, so there won't be any large buffers left + * for us to clean up after it returns. 
+ */ + error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + if (error) { + hfs_systemfile_unlock(hfsmp, lockflags); + return error; + } + + /* Traverse bitmap and issue DKIOCUNMAPs */ + error = ScanUnmapBlocks(hfsmp); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + return error; + } + + break; + } + + case HFSIOC_SET_HOTFILE_STATE: { + int error; + struct cnode *cp = VTOC(vp); + uint32_t hf_state = *((uint32_t*)ap->a_data); + uint32_t num_unpinned = 0; + + error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) { + return error; + } + + // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name); + if (hf_state == HFS_MARK_FASTDEVCANDIDATE) { + vnode_setfastdevicecandidate(vp); + + cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask; + cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask; + cp->c_flag |= C_MODIFIED; + } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) { + vnode_clearfastdevicecandidate(vp); + hfs_removehotfile(vp); + + if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) { + hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned); + } + + if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) { + cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask; + } + cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask); + cp->c_flag |= C_MODIFIED; + + } else { + error = EINVAL; + } + + if (num_unpinned != 0) { + lck_mtx_lock(&hfsmp->hfc_mutex); + hfsmp->hfs_hotfile_freeblks += num_unpinned; + lck_mtx_unlock(&hfsmp->hfc_mutex); + } + + hfs_unlock(cp); + return error; + } + + case HFSIOC_REPIN_HOTFILE_STATE: { + int error=0; + uint32_t repin_what = *((uint32_t*)ap->a_data); + + /* Only root allowed */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EACCES; + } + + if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) { + // this system is neither regular Fusion or Cooperative Fusion + // so this fsctl makes no sense. + return EINVAL; + } + + // + // After a converting a CoreStorage volume to be encrypted, the + // extents could have moved around underneath us. This call + // allows corestoraged to re-pin everything that should be + // pinned (it would happen on the next reboot too but that could + // be a long time away). + // + if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) { + hfs_pin_fs_metadata(hfsmp); + } + if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) { + hfs_repin_hotfiles(hfsmp); + } + if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) { + //XXX Swapfiles (marked SWAP_PINNED) may have moved too. + //XXX Do we care? They have a more transient/dynamic nature/lifetime. + } + + return error; + } + +#if HFS_CONFIG_KEY_ROLL + + case HFSIOC_KEY_ROLL: { + if (!kauth_cred_issuser(kauth_cred_get())) + return EACCES; + + hfs_key_roll_args_t *args = (hfs_key_roll_args_t *)ap->a_data; + + return hfs_key_roll_op(ap->a_context, ap->a_vp, args); + } + + case HFSIOC_GET_KEY_AUTO_ROLL: { + if (!kauth_cred_issuser(kauth_cred_get())) + return EACCES; + + hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data; + if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1) + return ENOTSUP; + args->flags = (ISSET(hfsmp->cproot_flags, CP_ROOT_AUTO_ROLL_OLD_CLASS_GENERATION) + ? 
HFS_KEY_AUTO_ROLL_OLD_CLASS_GENERATION : 0); + args->min_key_os_version = hfsmp->hfs_auto_roll_min_key_os_version; + args->max_key_os_version = hfsmp->hfs_auto_roll_max_key_os_version; + break; + } + + case HFSIOC_SET_KEY_AUTO_ROLL: { + if (!kauth_cred_issuser(kauth_cred_get())) + return EACCES; + + hfs_key_auto_roll_args_t *args = (hfs_key_auto_roll_args_t *)ap->a_data; + if (args->api_version != HFS_KEY_AUTO_ROLL_API_VERSION_1) + return ENOTSUP; + return cp_set_auto_roll(hfsmp, args); + } + +#endif // HFS_CONFIG_KEY_ROLL + +#if CONFIG_PROTECT + case F_TRANSCODEKEY: + /* + * This API is only supported when called via kernel so + * a_fflag must be set to 1 (it's not possible to get here + * with it set to 1 via fsctl). + */ + if (ap->a_fflag != 1) + return ENOTTY; + return cp_vnode_transcode(vp, (cp_key_t *)ap->a_data); + + case F_GETPROTECTIONLEVEL: + return cp_get_root_major_vers (vp, (uint32_t *)ap->a_data); + + case F_GETDEFAULTPROTLEVEL: + return cp_get_default_level(vp, (uint32_t *)ap->a_data); +#endif // CONFIG_PROTECT + + case FIOPINSWAP: + return hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT | HFS_DATALESS_PIN, + NULL); + + case FSIOC_CAS_BSDFLAGS: { + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + +#if 0 + struct fsioc_cas_bsdflags *cas = (void *)ap->a_data; + struct cnode *cp = VTOC(vp); + u_int32_t document_id = 0; + int decmpfs_reset_state = 0; + int error; + + /* Don't allow modification of the journal. */ + if (hfs_is_journal_file(hfsmp, cp)) { + return (EPERM); + } + + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (error); + } + + cas->actual_flags = cp->c_bsdflags; + if (cas->actual_flags != cas->expected_flags) { + hfs_unlock(cp); + return (0); + } + + // + // Check if we'll need a document_id. If so, we need to drop the lock + // (to avoid any possible deadlock with the root vnode which has to get + // locked to get the document id), generate the document_id, re-acquire + // the lock, and perform the CAS check again. We do it in this sequence + // in order to avoid throwing away document_ids in the case where the + // CAS check fails. Note that it can still happen, but by performing + // the check first, hopefully we can reduce the ocurrence. + // + if ((cas->new_flags & UF_TRACKED) && !(VTOC(vp)->c_bsdflags & UF_TRACKED)) { + struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&(VTOC(vp)->c_attr.ca_finderinfo) + 16); + // + // If the document_id is not set, get a new one. It will be set + // on the file down below once we hold the cnode lock. + // + if (fip->document_id == 0) { + // + // Drat, we have to generate one. Unlock the cnode, do the + // deed, re-lock the cnode, and then to the CAS check again + // to see if we lost the race. 
+ // + hfs_unlock(cp); + if (hfs_generate_document_id(hfsmp, &document_id) != 0) { + document_id = 0; + } + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (error); + } + cas->actual_flags = cp->c_bsdflags; + if (cas->actual_flags != cas->expected_flags) { + hfs_unlock(cp); + return (0); + } + } + } + + bool setting_compression = false; + + if (!(cas->actual_flags & UF_COMPRESSED) && (cas->new_flags & UF_COMPRESSED)) + setting_compression = true; + + if (setting_compression) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (VTOF(vp)->ff_size) { + // hfs_truncate will deal with the cnode lock + error = hfs_truncate(vp, 0, IO_NDELAY, 0, ap->a_context); + } + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + + if (!error) + error = hfs_set_bsd_flags(hfsmp, cp, cas->new_flags, + document_id, ap->a_context, + &decmpfs_reset_state); + if (error == 0) { + error = hfs_update(vp, 0); + } + hfs_unlock(cp); + if (error) { + return (error); + } + +#if HFS_COMPRESSION + if (decmpfs_reset_state) { + /* + * we've changed the UF_COMPRESSED flag, so reset the decmpfs state for this cnode + * but don't do it while holding the hfs cnode lock + */ + decmpfs_cnode *dp = VTOCMP(vp); + if (!dp) { + /* + * call hfs_lazy_init_decmpfs_cnode() to make sure that the decmpfs_cnode + * is filled in; we need a decmpfs_cnode to prevent decmpfs state changes + * on this file if it's locked + */ + dp = hfs_lazy_init_decmpfs_cnode(VTOC(vp)); + if (!dp) { + /* failed to allocate a decmpfs_cnode */ + return ENOMEM; /* what should this be? */ + } + } + decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); + } +#endif + break; +#endif + return ENOTSUP; + } + + default: + return (ENOTTY); + } + + return 0; +} + +/* + * select + */ +int +hfs_vnop_select(__unused struct vnop_select_args *ap) +/* + struct vnop_select_args { + vnode_t a_vp; + int a_which; + int a_fflags; + void *a_wql; + vfs_context_t a_context; + }; +*/ +{ + /* + * We should really check to see if I/O is possible. + */ + return (1); +} + +/* + * Converts a logical block number to a physical block, and optionally returns + * the amount of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize. + * The physical block number is based on the device block size, currently its 512. + * The block run is returned in logical blocks, and is the REMAINING amount of blocks + */ +int +hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp) +{ + struct filefork *fp = VTOF(vp); + struct hfsmount *hfsmp = VTOHFS(vp); + int retval = E_NONE; + u_int32_t logBlockSize; + size_t bytesContAvail = 0; + off_t blockposition; + int lockExtBtree; + int lockflags = 0; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. 
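+ *
+ * (For the read-ahead count computed further down, with illustrative
+ * figures: if MapFileBlockC reports 64 KiB of contiguous bytes and the
+ * logical block size is 4 KiB, *runp is 65536/4096 - 1 = 15, i.e. fifteen
+ * more logical blocks can be read after the one being mapped.)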
+ */ + if (vpp != NULL) + *vpp = hfsmp->hfs_devvp; + if (bnp == NULL) + return (0); + + logBlockSize = GetLogicalBlockSize(vp); + blockposition = (off_t)bn * logBlockSize; + + lockExtBtree = overflow_extents(fp); + + if (lockExtBtree) + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + + retval = MacToVFSError( + MapFileBlockC (HFSTOVCB(hfsmp), + (FCB*)fp, + MAXPHYSIO, + blockposition, + bnp, + &bytesContAvail)); + + if (lockExtBtree) + hfs_systemfile_unlock(hfsmp, lockflags); + + if (retval == E_NONE) { + /* Figure out how many read ahead blocks there are */ + if (runp != NULL) { + if (can_cluster(logBlockSize)) { + /* Make sure this result never goes negative: */ + *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1; + } else { + *runp = 0; + } + } + } + return (retval); +} + +/* + * Convert logical block number to file offset. + */ +int +hfs_vnop_blktooff(struct vnop_blktooff_args *ap) +/* + struct vnop_blktooff_args { + vnode_t a_vp; + daddr64_t a_lblkno; + off_t *a_offset; + }; +*/ +{ + if (ap->a_vp == NULL) + return (EINVAL); + *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp); + + return(0); +} + +/* + * Convert file offset to logical block number. + */ +int +hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap) +/* + struct vnop_offtoblk_args { + vnode_t a_vp; + off_t a_offset; + daddr64_t *a_lblkno; + }; +*/ +{ + if (ap->a_vp == NULL) + return (EINVAL); + *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp)); + + return(0); +} + +/* + * Map file offset to physical block number. + * + * If this function is called for write operation, and if the file + * had virtual blocks allocated (delayed allocation), real blocks + * are allocated by calling ExtendFileC(). + * + * If this function is called for read operation, and if the file + * had virtual blocks allocated (delayed allocation), no change + * to the size of file is done, and if required, rangelist is + * searched for mapping. + * + * System file cnodes are expected to be locked (shared or exclusive). + * + * -- INVALID RANGES -- + * + * Invalid ranges are used to keep track of where we have extended a + * file, but have not yet written that data to disk. In the past we + * would clear up the invalid ranges as we wrote to those areas, but + * before data was actually flushed to disk. The problem with that + * approach is that the data can be left in the cache and is therefore + * still not valid on disk. So now we clear up the ranges here, when + * the flags field has VNODE_WRITE set, indicating a write is about to + * occur. This isn't ideal (ideally we want to clear them up when + * know the data has been successfully written), but it's the best we + * can do. + * + * For reads, we use the invalid ranges here in block map to indicate + * to the caller that the data should be zeroed (a_bpn == -1). We + * have to be careful about what ranges we return to the cluster code. + * Currently the cluster code can only handle non-rounded values for + * the EOF; it cannot handle funny sized ranges in the middle of the + * file (the main problem is that it sends down odd sized I/Os to the + * disk). Our code currently works because whilst the very first + * offset and the last offset in the invalid ranges are not aligned, + * gaps in the invalid ranges between the first and last, have to be + * aligned (because we always write page sized blocks). 
For example, + * consider this arrangement: + * + * +-------------+-----+-------+------+ + * | |XXXXX| |XXXXXX| + * +-------------+-----+-------+------+ + * a b c d + * + * This shows two invalid ranges and . Whilst a and d + * are not necessarily aligned, b and c *must* be. + * + * Zero-filling occurs in a number of ways: + * + * 1. When a read occurs and we return with a_bpn == -1. + * + * 2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges + * which will cause us to iterate over the ranges bringing in + * pages that are not present in the cache and zeroing them. Any + * pages that are already in the cache are left untouched. Note + * that hfs_fsync does not always flush invalid ranges. + * + * 3. When we extend a file we zero out from the old EOF to the end + * of the page. It would be nice if we didn't have to do this if + * the page wasn't present (and could defer it), but because of + * the problem described above, we have to. + * + * The invalid ranges are also used to restrict the size that we write + * out on disk: see hfs_prepare_fork_for_update. + * + * Note that invalid ranges are ignored when neither the VNODE_READ or + * the VNODE_WRITE flag is specified. This is useful for the + * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they + * just want to know whether blocks are physically allocated or not. + */ +int +hfs_vnop_blockmap(struct vnop_blockmap_args *ap) +/* + struct vnop_blockmap_args { + vnode_t a_vp; + off_t a_foffset; + size_t a_size; + daddr64_t *a_bpn; + size_t *a_run; + void *a_poff; + int a_flags; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + struct hfsmount *hfsmp; + size_t bytesContAvail = ap->a_size; + int retval = E_NONE; + int syslocks = 0; + int lockflags = 0; + struct rl_entry *invalid_range; + enum rl_overlaptype overlaptype; + int started_tr = 0; + int tooklock = 0; + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + /* allow blockmaps to the resource fork */ + } else { + if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ + int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp)); + switch(state) { + case FILE_IS_COMPRESSED: + return ENOTSUP; + case FILE_IS_CONVERTING: + /* if FILE_IS_CONVERTING, we allow blockmap */ + break; + default: + printf("invalid state %d for compressed file\n", state); + /* fall through */ + } + } + } +#endif /* HFS_COMPRESSION */ + + /* Do not allow blockmap operation on a directory */ + if (vnode_isdir(vp)) { + return (ENOTSUP); + } + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. 
+ */ + if (ap->a_bpn == NULL) + return (0); + + hfsmp = VTOHFS(vp); + cp = VTOC(vp); + fp = VTOF(vp); + + if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) { + if (cp->c_lockowner != current_thread()) { + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + tooklock = 1; + } + + // For reads, check the invalid ranges + if (ISSET(ap->a_flags, VNODE_READ)) { + if (ap->a_foffset >= fp->ff_size) { + retval = ERANGE; + goto exit; + } + + overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset, + ap->a_foffset + (off_t)bytesContAvail - 1, + &invalid_range); + switch(overlaptype) { + case RL_MATCHINGOVERLAP: + case RL_OVERLAPCONTAINSRANGE: + case RL_OVERLAPSTARTSBEFORE: + /* There's no valid block for this byte offset */ + *ap->a_bpn = (daddr64_t)-1; + /* There's no point limiting the amount to be returned + * if the invalid range that was hit extends all the way + * to the EOF (i.e. there's no valid bytes between the + * end of this range and the file's EOF): + */ + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + + retval = 0; + goto exit; + + case RL_OVERLAPISCONTAINED: + case RL_OVERLAPENDSAFTER: + /* The range of interest hits an invalid block before the end: */ + if (invalid_range->rl_start == ap->a_foffset) { + /* There's actually no valid information to be had starting here: */ + *ap->a_bpn = (daddr64_t)-1; + if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) && + ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) { + bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset; + } + + retval = 0; + goto exit; + } else { + /* + * Sadly, the lower layers don't like us to + * return unaligned ranges, so we skip over + * any invalid ranges here that are less than + * a page: zeroing of those bits is not our + * responsibility (it's dealt with elsewhere). + */ + do { + off_t rounded_start = round_page_64(invalid_range->rl_start); + if ((off_t)bytesContAvail < rounded_start - ap->a_foffset) + break; + if (rounded_start < invalid_range->rl_end + 1) { + bytesContAvail = rounded_start - ap->a_foffset; + break; + } + } while ((invalid_range = TAILQ_NEXT(invalid_range, + rl_link))); + } + break; + + case RL_NOOVERLAP: + break; + } // switch + } + } + +#if CONFIG_PROTECT + if (cp->c_cpentry) { + const int direction = (ISSET(ap->a_flags, VNODE_WRITE) + ? VNODE_WRITE : VNODE_READ); + + cp_io_params_t io_params; + cp_io_params(hfsmp, cp->c_cpentry, + off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)), + direction, &io_params); + + if (io_params.max_len < (off_t)bytesContAvail) + bytesContAvail = io_params.max_len; + + if (io_params.phys_offset != -1) { + *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset) + / hfsmp->hfs_logical_block_size); + + retval = 0; + goto exit; + } + } +#endif + +retry: + + /* Check virtual blocks only when performing write operation */ + if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto exit; + } else { + started_tr = 1; + } + syslocks = SFL_EXTENTS | SFL_BITMAP; + + } else if (overflow_extents(fp)) { + syslocks = SFL_EXTENTS; + } + + if (syslocks) + lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK); + + /* + * Check for any delayed allocations. 
+ */ + if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) { + int64_t actbytes; + u_int32_t loanedBlocks; + + // + // Make sure we have a transaction. It's possible + // that we came in and fp->ff_unallocblocks was zero + // but during the time we blocked acquiring the extents + // btree, ff_unallocblocks became non-zero and so we + // will need to start a transaction. + // + if (started_tr == 0) { + if (syslocks) { + hfs_systemfile_unlock(hfsmp, lockflags); + syslocks = 0; + } + goto retry; + } + + /* + * Note: ExtendFileC will Release any blocks on loan and + * aquire real blocks. So we ask to extend by zero bytes + * since ExtendFileC will account for the virtual blocks. + */ + + loanedBlocks = fp->ff_unallocblocks; + retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0, + kEFAllMask | kEFNoClumpMask, &actbytes); + + if (retval) { + fp->ff_unallocblocks = loanedBlocks; + cp->c_blocks += loanedBlocks; + fp->ff_blocks += loanedBlocks; + + hfs_lock_mount (hfsmp); + hfsmp->loanedBlocks += loanedBlocks; + hfs_unlock_mount (hfsmp); + + hfs_systemfile_unlock(hfsmp, lockflags); + cp->c_flag |= C_MODIFIED; + if (started_tr) { + (void) hfs_update(vp, 0); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + hfs_end_transaction(hfsmp); + started_tr = 0; + } + goto exit; + } + } + + retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset, + ap->a_bpn, &bytesContAvail); + if (syslocks) { + hfs_systemfile_unlock(hfsmp, lockflags); + syslocks = 0; + } + + if (retval) { + /* On write, always return error because virtual blocks, if any, + * should have been allocated in ExtendFileC(). We do not + * allocate virtual blocks on read, therefore return error + * only if no virtual blocks are allocated. Otherwise we search + * rangelist for zero-fills + */ + if ((MacToVFSError(retval) != ERANGE) || + (ap->a_flags & VNODE_WRITE) || + ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) { + goto exit; + } + + /* Validate if the start offset is within logical file size */ + if (ap->a_foffset >= fp->ff_size) { + goto exit; + } + + /* + * At this point, we have encountered a failure during + * MapFileBlockC that resulted in ERANGE, and we are not + * servicing a write, and there are borrowed blocks. + * + * However, the cluster layer will not call blockmap for + * blocks that are borrowed and in-cache. We have to assume + * that because we observed ERANGE being emitted from + * MapFileBlockC, this extent range is not valid on-disk. So + * we treat this as a mapping that needs to be zero-filled + * prior to reading. + */ + + if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail) + bytesContAvail = fp->ff_size - ap->a_foffset; + + *ap->a_bpn = (daddr64_t) -1; + retval = 0; + + goto exit; + } + +exit: + if (retval == 0) { + if (ISSET(ap->a_flags, VNODE_WRITE)) { + struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges); + + // See if we might be overlapping invalid ranges... + if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) { + /* + * Mark the file as needing an update if we think the + * on-disk EOF has changed. + */ + if (ap->a_foffset <= r->rl_start) + SET(cp->c_flag, C_MODIFIED); + + /* + * This isn't the ideal place to put this. Ideally, we + * should do something *after* we have successfully + * written to the range, but that's difficult to do + * because we cannot take locks in the callback. At + * present, the cluster code will call us with VNODE_WRITE + * set just before it's about to write the data so we know + * that data is about to be written. 
If we get an I/O + * error at this point then chances are the metadata + * update to follow will also have an I/O error so the + * risk here is small. + */ + rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1, + &fp->ff_invalidranges); + + if (!TAILQ_FIRST(&fp->ff_invalidranges)) { + cp->c_flag &= ~C_ZFWANTSYNC; + cp->c_zftimeout = 0; + } + } + } + + if (ap->a_run) + *ap->a_run = bytesContAvail; + + if (ap->a_poff) + *(int *)ap->a_poff = 0; + } + + if (started_tr) { + hfs_update(vp, TRUE); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + hfs_end_transaction(hfsmp); + started_tr = 0; + } + + if (tooklock) + hfs_unlock(cp); + + return (MacToVFSError(retval)); +} + +/* + * prepare and issue the I/O + * buf_strategy knows how to deal + * with requests that require + * fragmented I/Os + */ +int +hfs_vnop_strategy(struct vnop_strategy_args *ap) +{ + buf_t bp = ap->a_bp; + vnode_t vp = buf_vnode(bp); + int error = 0; + + /* Mark buffer as containing static data if cnode flag set */ + if (VTOC(vp)->c_flag & C_SSD_STATIC) { + buf_markstatic(bp); + } + + /* Mark buffer as containing static data if cnode flag set */ + if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) { + bufattr_markgreedymode(buf_attr(bp)); + } + + /* mark buffer as containing burst mode data if cnode flag set */ + if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) { + bufattr_markisochronous(buf_attr(bp)); + } + +#if CONFIG_PROTECT + error = cp_handle_strategy(bp); + + if (error) + return error; +#endif + + error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); + + return error; +} + +int +do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context) +{ + register struct cnode *cp = VTOC(vp); + struct filefork *fp = VTOF(vp); + kauth_cred_t cred = vfs_context_ucred(context); + int retval; + off_t bytesToAdd; + off_t actualBytesAdded; + off_t filebytes; + u_int32_t fileblocks; + int blksize; + struct hfsmount *hfsmp; + int lockflags; + int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES); + + blksize = VTOVCB(vp)->blockSize; + fileblocks = fp->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START, + (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); + + if (length < 0) + return (EINVAL); + + /* This should only happen with a corrupt filesystem */ + if ((off_t)fp->ff_size < 0) + return (EINVAL); + + if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE)) + return (EFBIG); + + hfsmp = VTOHFS(vp); + + retval = E_NONE; + + /* Files that are changing size are not hot file candidates. */ + if (hfsmp->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + + /* + * We cannot just check if fp->ff_size == length (as an optimization) + * since there may be extra physical blocks that also need truncation. + */ +#if QUOTA + if ((retval = hfs_getinoquota(cp))) + return(retval); +#endif /* QUOTA */ + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of ff_size is 0, length will be at least 1. + */ + if (length > (off_t)fp->ff_size) { +#if QUOTA + retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)), + cred, 0); + if (retval) + goto Err_Exit; +#endif /* QUOTA */ + /* + * If we don't have enough physical space then + * we need to extend the physical size. + */ + if (length > filebytes) { + int eflags; + u_int32_t blockHint = 0; + + /* All or nothing and don't round up to clumpsize. 
*/ + eflags = kEFAllMask | kEFNoClumpMask; + + if (cred && (suser(cred, NULL) != 0)) { + eflags |= kEFReserveMask; /* keep a reserve */ + } + + /* + * Allocate Journal and Quota files in metadata zone. + */ + if (filebytes == 0 && + hfsmp->hfs_flags & HFS_METADATA_ZONE && + hfs_virtualmetafile(cp)) { + eflags |= kEFMetadataMask; + blockHint = hfsmp->hfs_metazone_start; + } + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + /* + * Keep growing the file as long as the current EOF is + * less than the desired value. + */ + while ((length > filebytes) && (retval == E_NONE)) { + bytesToAdd = length - filebytes; + retval = MacToVFSError(ExtendFileC(VTOVCB(vp), + (FCB*)fp, + bytesToAdd, + blockHint, + eflags, + &actualBytesAdded)); + + filebytes = (off_t)fp->ff_blocks * (off_t)blksize; + if (actualBytesAdded == 0 && retval == E_NONE) { + if (length > filebytes) + length = filebytes; + break; + } + } /* endwhile */ + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (hfsmp->jnl) { + hfs_update(vp, 0); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + + hfs_end_transaction(hfsmp); + + if (retval) + goto Err_Exit; + + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, + (int)length, (int)fp->ff_size, (int)filebytes, 0, 0); + } + + if (ISSET(flags, IO_NOZEROFILL)) { + // An optimisation for the hibernation file + if (vnode_isswap(vp)) + rl_remove_all(&fp->ff_invalidranges); + } else { + if (!vnode_issystem(vp) && retval == E_NONE) { + if (length > (off_t)fp->ff_size) { + struct timeval tv; + + /* Extending the file: time to fill out the current last page w. zeroes? */ + if (fp->ff_size & PAGE_MASK_64) { + /* There might be some valid data at the start of the (current) last page + of the file, so zero out the remainder of that page to ensure the + entire page contains valid data. */ + hfs_unlock(cp); + retval = hfs_zero_eof_page(vp, length); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + if (retval) goto Err_Exit; + } + microuptime(&tv); + rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges); + cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT; + } + } else { + panic("hfs_truncate: invoked on non-UBC object?!"); + }; + } + if (suppress_times == 0) { + cp->c_touch_modtime = TRUE; + } + fp->ff_size = length; + + } else { /* Shorten the size of the file */ + + // An optimisation for the hibernation file + if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) { + rl_remove_all(&fp->ff_invalidranges); + } else if ((off_t)fp->ff_size > length) { + /* Any space previously marked as invalid is now irrelevant: */ + rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges); + } + + /* + * Account for any unmapped blocks. Note that the new + * file length can still end up with unmapped blocks. 
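+		 * Illustrative arithmetic (made-up numbers): with blksize = 4096
+		 * and length = 10000, finalblks = (10000 + 4095) / 4096 = 3; if
+		 * dropping the loan leaves the fork with only 2 real blocks, one
+		 * block is borrowed again below so the fork still covers the new
+		 * length.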
+ */ + if (fp->ff_unallocblocks > 0) { + u_int32_t finalblks; + u_int32_t loanedBlocks; + + hfs_lock_mount(hfsmp); + loanedBlocks = fp->ff_unallocblocks; + cp->c_blocks -= loanedBlocks; + fp->ff_blocks -= loanedBlocks; + fp->ff_unallocblocks = 0; + + hfsmp->loanedBlocks -= loanedBlocks; + + finalblks = (length + blksize - 1) / blksize; + if (finalblks > fp->ff_blocks) { + /* calculate required unmapped blocks */ + loanedBlocks = finalblks - fp->ff_blocks; + hfsmp->loanedBlocks += loanedBlocks; + + fp->ff_unallocblocks = loanedBlocks; + cp->c_blocks += loanedBlocks; + fp->ff_blocks += loanedBlocks; + } + hfs_unlock_mount (hfsmp); + } + + off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + if (fp->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, + FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (hfsmp->jnl) { + if (retval == 0) { + fp->ff_size = length; + } + hfs_update(vp, 0); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + hfs_end_transaction(hfsmp); + + filebytes = (off_t)fp->ff_blocks * (off_t)blksize; + if (retval) + goto Err_Exit; +#if QUOTA + /* These are bytesreleased */ + (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0); +#endif /* QUOTA */ + + // + // Unlike when growing a file, we adjust the hotfile block count here + // instead of deeper down in the block allocation code because we do + // not necessarily have a vnode or "fcb" at the time we're deleting + // the file and so we wouldn't know if it was hotfile cached or not + // + hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize)); + + + /* + * Only set update flag if the logical length changes & we aren't + * suppressing modtime updates. + */ + if (((off_t)fp->ff_size != length) && (suppress_times == 0)) { + cp->c_touch_modtime = TRUE; + } + fp->ff_size = length; + } + if (cp->c_mode & (S_ISUID | S_ISGID)) { + if (!vfs_context_issuser(context)) + cp->c_mode &= ~(S_ISUID | S_ISGID); + } + cp->c_flag |= C_MODIFIED; + cp->c_touch_chgtime = TRUE; /* status changed */ + if (suppress_times == 0) { + cp->c_touch_modtime = TRUE; /* file data was modified */ + + /* + * If we are not suppressing the modtime update, then + * update the gen count as well. + */ + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) { + hfs_incr_gencount(cp); + } + } + + retval = hfs_update(vp, 0); + if (retval) { + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE, + -1, -1, -1, retval, 0); + } + +Err_Exit: + + KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END, + (int)length, (int)fp->ff_size, (int)filebytes, retval, 0); + + return (retval); +} + +/* + * Preparation which must be done prior to deleting the catalog record + * of a file or directory. In order to make the on-disk as safe as possible, + * we remove the catalog entry before releasing the bitmap blocks and the + * overflow extent records. However, some work must be done prior to deleting + * the catalog record. + * + * When calling this function, the cnode must exist both in memory and on-disk. + * If there are both resource fork and data fork vnodes, this function should + * be called on both. 
+ */ + +int +hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { + + struct filefork *fp = VTOF(vp); + struct cnode *cp = VTOC(vp); +#if QUOTA + int retval = 0; +#endif /* QUOTA */ + + /* Cannot truncate an HFS directory! */ + if (vnode_isdir(vp)) { + return (EISDIR); + } + + /* + * See the comment below in hfs_truncate for why we need to call + * setsize here. Essentially we want to avoid pending IO if we + * already know that the blocks are going to be released here. + * This function is only called when totally removing all storage for a file, so + * we can take a shortcut and immediately setsize (0); + */ + ubc_setsize(vp, 0); + + /* This should only happen with a corrupt filesystem */ + if ((off_t)fp->ff_size < 0) + return (EINVAL); + + /* + * We cannot just check if fp->ff_size == length (as an optimization) + * since there may be extra physical blocks that also need truncation. + */ +#if QUOTA + if ((retval = hfs_getinoquota(cp))) { + return(retval); + } +#endif /* QUOTA */ + + /* Wipe out any invalid ranges which have yet to be backed by disk */ + rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges); + + /* + * Account for any unmapped blocks. Since we're deleting the + * entire file, we don't have to worry about just shrinking + * to a smaller number of borrowed blocks. + */ + if (fp->ff_unallocblocks > 0) { + u_int32_t loanedBlocks; + + hfs_lock_mount (hfsmp); + loanedBlocks = fp->ff_unallocblocks; + cp->c_blocks -= loanedBlocks; + fp->ff_blocks -= loanedBlocks; + fp->ff_unallocblocks = 0; + + hfsmp->loanedBlocks -= loanedBlocks; + + hfs_unlock_mount (hfsmp); + } + + return 0; +} + + +/* + * Special wrapper around calling TruncateFileC. This function is useable + * even when the catalog record does not exist any longer, making it ideal + * for use when deleting a file. The simplification here is that we know + * that we are releasing all blocks. + * + * Note that this function may be called when there is no vnode backing + * the file fork in question. We may call this from hfs_vnop_inactive + * to clear out resource fork data (and may not want to clear out the data + * fork yet). As a result, we pointer-check both sets of inputs before + * doing anything with them. + * + * The caller is responsible for saving off a copy of the filefork(s) + * embedded within the cnode prior to calling this function. The pointers + * supplied as arguments must be valid even if the cnode is no longer valid. 
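+ *
+ * Purely as an illustration of the calling convention (this sketch is not
+ * taken from an actual call site), a caller might capture the forks and
+ * then call:
+ *
+ *     struct filefork dfork = *cp->c_datafork;
+ *     struct filefork rfork = *cp->c_rsrcfork;
+ *     ... remove the catalog record ...
+ *     hfs_release_storage(hfsmp, &dfork, &rfork, cp->c_fileid);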
+ */ + +int +hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, + struct filefork *rsrcfork, u_int32_t fileid) { + + off_t filebytes; + u_int32_t fileblocks; + int blksize = 0; + int error = 0; + int lockflags; + + blksize = hfsmp->blockSize; + + /* Data Fork */ + if (datafork) { + off_t prev_filebytes; + + datafork->ff_size = 0; + + fileblocks = datafork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + prev_filebytes = filebytes; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + /* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (datafork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(datafork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + struct cnode *cp = datafork ? FTOC(datafork) : NULL; + struct vnode *vp; + vp = cp ? CTOV(cp, 0) : NULL; + hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize)); + prev_filebytes = filebytes; + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + /* Resource fork */ + if (error == 0 && rsrcfork) { + rsrcfork->ff_size = 0; + + fileblocks = rsrcfork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + /* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (rsrcfork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(rsrcfork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + return error; +} + +errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock) +{ + errno_t error; + + /* + * Call ubc_setsize to give the VM subsystem a chance to do + * whatever it needs to with existing pages before we delete + * blocks. Note that symlinks don't use the UBC so we'll + * get back ENOENT in that case. 
+ */ + if (have_cnode_lock) { + error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY); + if (error == EAGAIN) { + cnode_t *cp = VTOC(vp); + + if (cp->c_truncatelockowner != current_thread()) + hfs_warn("hfs: hfs_ubc_setsize called without exclusive truncate lock!"); + + hfs_unlock(cp); + error = ubc_setsize_ex(vp, len, 0); + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + } + } else + error = ubc_setsize_ex(vp, len, 0); + + return error == ENOENT ? 0 : error; +} + +/* + * Truncate a cnode to at most length size, freeing (or adding) the + * disk blocks. + */ +int +hfs_truncate(struct vnode *vp, off_t length, int flags, + int truncateflags, vfs_context_t context) +{ + struct filefork *fp = VTOF(vp); + off_t filebytes; + u_int32_t fileblocks; + int blksize; + errno_t error = 0; + struct cnode *cp = VTOC(vp); + hfsmount_t *hfsmp = VTOHFS(vp); + + /* Cannot truncate an HFS directory! */ + if (vnode_isdir(vp)) { + return (EISDIR); + } + /* A swap file cannot change size. */ + if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) { + return (EPERM); + } + + blksize = hfsmp->blockSize; + fileblocks = fp->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + bool caller_has_cnode_lock = (cp->c_lockowner == current_thread()); + + error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock); + if (error) + return error; + + if (!caller_has_cnode_lock) { + error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) + return error; + } + + if (vnode_islnk(vp) && cp->c_datafork->ff_symlinkptr) { + hfs_free(cp->c_datafork->ff_symlinkptr, cp->c_datafork->ff_size); + cp->c_datafork->ff_symlinkptr = NULL; + } + + // have to loop truncating or growing files that are + // really big because otherwise transactions can get + // enormous and consume too many kernel resources. + + if (length < filebytes) { + while (filebytes > length) { + if ((filebytes - length) > HFS_BIGFILE_SIZE) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); + if (error) + break; + } + } else if (length > filebytes) { + kauth_cred_t cred = vfs_context_ucred(context); + const bool keep_reserve = cred && suser(cred, NULL) != 0; + + if (hfs_freeblks(hfsmp, keep_reserve) + < howmany(length - filebytes, blksize)) { + error = ENOSPC; + } else { + while (filebytes < length) { + if ((length - filebytes) > HFS_BIGFILE_SIZE) { + filebytes += HFS_BIGFILE_SIZE; + } else { + filebytes = length; + } + error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context); + if (error) + break; + } + } + } else /* Same logical size */ { + + error = do_hfs_truncate(vp, length, flags, truncateflags, context); + } + /* Files that are changing size are not hot file candidates. */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + +#if HFS_CONFIG_KEY_ROLL + if (!error && cp->c_truncatelockowner == current_thread()) { + hfs_key_roll_check(cp, true); + } +#endif + + if (!caller_has_cnode_lock) + hfs_unlock(cp); + + // Make sure UBC's size matches up (in case we didn't completely succeed) + errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock); + if (!error) + error = err2; + + return error; +} + + +/* + * Preallocate file storage space. 
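+ * (Typically reached through VNOP_ALLOCATE, e.g. on behalf of an
+ * fcntl(F_PREALLOCATE) request.)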
+ */ +int +hfs_vnop_allocate(struct vnop_allocate_args /* { + vnode_t a_vp; + off_t a_length; + u_int32_t a_flags; + off_t *a_bytesallocated; + off_t a_offset; + vfs_context_t a_context; + } */ *ap) +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + ExtendedVCB *vcb; + off_t length = ap->a_length; + off_t startingPEOF; + off_t moreBytesRequested; + off_t actualBytesAdded; + off_t filebytes; + u_int32_t fileblocks; + int retval, retval2; + u_int32_t blockHint; + u_int32_t extendFlags; /* For call to ExtendFileC */ + struct hfsmount *hfsmp; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + int lockflags; + time_t orig_ctime; + + *(ap->a_bytesallocated) = 0; + + if (!vnode_isreg(vp)) + return (EISDIR); + if (length < (off_t)0) + return (EINVAL); + + cp = VTOC(vp); + + orig_ctime = VTOC(vp)->c_ctime; + + nspace_snapshot_event(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + goto Err_Exit; + } + + fp = VTOF(vp); + hfsmp = VTOHFS(vp); + vcb = VTOVCB(vp); + + fileblocks = fp->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)vcb->blockSize; + + if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) { + retval = EINVAL; + goto Err_Exit; + } + + /* Fill in the flags word for the call to Extend the file */ + + extendFlags = kEFNoClumpMask; + if (ap->a_flags & ALLOCATECONTIG) + extendFlags |= kEFContigMask; + if (ap->a_flags & ALLOCATEALL) + extendFlags |= kEFAllMask; + if (cred && suser(cred, NULL) != 0) + extendFlags |= kEFReserveMask; + if (hfs_virtualmetafile(cp)) + extendFlags |= kEFMetadataMask; + + retval = E_NONE; + blockHint = 0; + startingPEOF = filebytes; + + if (ap->a_flags & ALLOCATEFROMPEOF) + length += filebytes; + else if (ap->a_flags & ALLOCATEFROMVOL) + blockHint = ap->a_offset / VTOVCB(vp)->blockSize; + + /* If no changes are necesary, then we're done */ + if (filebytes == length) + goto Std_Exit; + + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of filebytes is 0, length will be at least 1. + */ + if (length > filebytes) { + if (ISSET(extendFlags, kEFAllMask) + && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask)) + < howmany(length - filebytes, hfsmp->blockSize))) { + retval = ENOSPC; + goto Err_Exit; + } + + off_t total_bytes_added = 0, orig_request_size; + + orig_request_size = moreBytesRequested = length - filebytes; + +#if QUOTA + retval = hfs_chkdq(cp, + (int64_t)(roundup(moreBytesRequested, vcb->blockSize)), + cred, 0); + if (retval) + goto Err_Exit; + +#endif /* QUOTA */ + /* + * Metadata zone checks. + */ + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { + /* + * Allocate Journal and Quota files in metadata zone. + */ + if (hfs_virtualmetafile(cp)) { + blockHint = hfsmp->hfs_metazone_start; + } else if ((blockHint >= hfsmp->hfs_metazone_start) && + (blockHint <= hfsmp->hfs_metazone_end)) { + /* + * Move blockHint outside metadata zone. 
+ */ + blockHint = hfsmp->hfs_metazone_end + 1; + } + } + + + while ((length > filebytes) && (retval == E_NONE)) { + off_t bytesRequested; + + if (hfs_start_transaction(hfsmp) != 0) { + retval = EINVAL; + goto Err_Exit; + } + + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + if (moreBytesRequested >= HFS_BIGFILE_SIZE) { + bytesRequested = HFS_BIGFILE_SIZE; + } else { + bytesRequested = moreBytesRequested; + } + + if (extendFlags & kEFContigMask) { + // if we're on a sparse device, this will force it to do a + // full scan to find the space needed. + hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN; + } + + retval = MacToVFSError(ExtendFileC(vcb, + (FCB*)fp, + bytesRequested, + blockHint, + extendFlags, + &actualBytesAdded)); + + if (retval == E_NONE) { + *(ap->a_bytesallocated) += actualBytesAdded; + total_bytes_added += actualBytesAdded; + moreBytesRequested -= actualBytesAdded; + if (blockHint != 0) { + blockHint += actualBytesAdded / vcb->blockSize; + } + } + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (hfsmp->jnl) { + (void) hfs_update(vp, 0); + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + } + + hfs_end_transaction(hfsmp); + } + + + /* + * if we get an error and no changes were made then exit + * otherwise we must do the hfs_update to reflect the changes + */ + if (retval && (startingPEOF == filebytes)) + goto Err_Exit; + + /* + * Adjust actualBytesAdded to be allocation block aligned, not + * clump size aligned. + * NOTE: So what we are reporting does not affect reality + * until the file is closed, when we truncate the file to allocation + * block size. + */ + if (total_bytes_added != 0 && orig_request_size < total_bytes_added) + *(ap->a_bytesallocated) = + roundup(orig_request_size, (off_t)vcb->blockSize); + + } else { /* Shorten the size of the file */ + + /* + * N.B. At present, this code is never called. If and when we + * do start using it, it looks like there might be slightly + * strange semantics with the file size: it's possible for the + * file size to *increase* e.g. if current file size is 5, + * length is 1024 and filebytes is 4096, the file size will + * end up being 1024 bytes. This isn't necessarily a problem + * but it's not consistent with the code above which doesn't + * change the file size. 
+ */ + + retval = hfs_truncate(vp, length, 0, 0, ap->a_context); + filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize; + + /* + * if we get an error and no changes were made then exit + * otherwise we must do the hfs_update to reflect the changes + */ + if (retval && (startingPEOF == filebytes)) goto Err_Exit; +#if QUOTA + /* These are bytesreleased */ + (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0); +#endif /* QUOTA */ + + if (fp->ff_size > filebytes) { + fp->ff_size = filebytes; + + hfs_ubc_setsize(vp, fp->ff_size, true); + } + } + +Std_Exit: + cp->c_flag |= C_MODIFIED; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + retval2 = hfs_update(vp, 0); + + if (retval == 0) + retval = retval2; +Err_Exit: + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + hfs_unlock(cp); + return (retval); +} + + +/* + * Pagein for HFS filesystem + */ +int +hfs_vnop_pagein(struct vnop_pagein_args *ap) +/* + struct vnop_pagein_args { + vnode_t a_vp, + upl_t a_pl, + vm_offset_t a_pl_offset, + off_t a_f_offset, + size_t a_size, + int a_flags + vfs_context_t a_context; + }; +*/ +{ + vnode_t vp; + struct cnode *cp; + struct filefork *fp; + int error = 0; + upl_t upl; + upl_page_info_t *pl; + off_t f_offset; + off_t page_needed_f_offset; + int offset; + int isize; + int upl_size; + int pg_index; + boolean_t truncate_lock_held = FALSE; + boolean_t file_converted = FALSE; + kern_return_t kret; + + vp = ap->a_vp; + cp = VTOC(vp); + fp = VTOF(vp); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) { + /* + * If we errored here, then this means that one of two things occurred: + * 1. there was a problem with the decryption of the key. + * 2. the device is locked and we are not allowed to access this particular file. + * + * Either way, this means that we need to shut down this upl now. As long as + * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves) + * then we create a upl and immediately abort it. + */ + if (ap->a_pl == NULL) { + /* create the upl */ + ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl, + UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); + /* mark the range as needed so it doesn't immediately get discarded upon abort */ + ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1); + + /* Abort the range */ + ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); + } + + + return error; + } +#endif /* CONFIG_PROTECT */ + + if (ap->a_pl != NULL) { + /* + * this can only happen for swap files now that + * we're asking for V2 paging behavior... + * so don't need to worry about decompression, or + * keeping track of blocks read or taking the truncate lock + */ + error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, (off_t)fp->ff_size, ap->a_flags); + goto pagein_done; + } + + page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset; + +retry_pagein: + /* + * take truncate lock (shared/recursive) to guard against + * zero-fill thru fsync interfering, but only for v2 + * + * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the + * lock shared and we are allowed to recurse 1 level if this thread already + * owns the lock exclusively... this can legally occur + * if we are doing a shrinking ftruncate against a file + * that is mapped private, and the pages being truncated + * do not currently exist in the cache... in that case + * we will have to page-in the missing pages in order + * to provide them to the private mapping... 
we must + * also call hfs_unlock_truncate with a postive been_recursed + * arg to indicate that if we have recursed, there is no need to drop + * the lock. Allowing this simple recursion is necessary + * in order to avoid a certain deadlock... since the ftruncate + * already holds the truncate lock exclusively, if we try + * to acquire it shared to protect the pagein path, we will + * hang this thread + * + * NOTE: The if () block below is a workaround in order to prevent a + * VM deadlock. See rdar://7853471. + * + * If we are in a forced unmount, then launchd will still have the + * dyld_shared_cache file mapped as it is trying to reboot. If we + * take the truncate lock here to service a page fault, then our + * thread could deadlock with the forced-unmount. The forced unmount + * thread will try to reclaim the dyld_shared_cache vnode, but since it's + * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount + * thread will think it needs to copy all of the data out of the file + * and into a VM copy object. If we hold the cnode lock here, then that + * VM operation will not be able to proceed, because we'll set a busy page + * before attempting to grab the lock. Note that this isn't as simple as "don't + * call ubc_setsize" because doing that would just shift the problem to the + * ubc_msync done before the vnode is reclaimed. + * + * So, if a forced unmount on this volume is in flight AND the cnode is + * marked C_DELETED, then just go ahead and do the page in without taking + * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file + * that is not going to be available on the next mount, this seems like a + * OK solution from a correctness point of view, even though it is hacky. + */ + if (vfs_isforce(vnode_mount(vp))) { + if (cp->c_flag & C_DELETED) { + /* If we don't get it, then just go ahead and operate without the lock */ + truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); + } + } + else { + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); + truncate_lock_held = TRUE; + } + + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); + + if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { + error = EINVAL; + goto pagein_done; + } + ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1); + + upl_size = isize = ap->a_size; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + /* + * no absent pages were found in the range specified + * just abort the UPL to get rid of it and then we're done + */ + ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); + goto pagein_done; + } + } + /* + * initialize the offset variables before we touch the UPL. + * f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on + * isize is the offset into the UPL of the last page that is present. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + pg_index = 0; + offset = 0; + f_offset = ap->a_f_offset; + + while (isize) { + int xsize; + int num_of_pages; + + if ( !upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_ABSENT, so it's possible + * to get back empty slots in the UPL. 
+ * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + /* + * We know that we have at least one absent page. + * Now checking to see how many in a row we have + */ + num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize) { + if ( !upl_page_present(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; + +#if HFS_COMPRESSION + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + + if (compressed) { + + if (truncate_lock_held) { + /* + * can't hold the truncate lock when calling into the decmpfs layer + * since it calls back into this layer... even though we're only + * holding the lock in shared mode, and the re-entrant path only + * takes the lock shared, we can deadlock if some other thread + * tries to grab the lock exclusively in between. + */ + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); + truncate_lock_held = FALSE; + } + ap->a_pl = upl; + ap->a_pl_offset = offset; + ap->a_f_offset = f_offset; + ap->a_size = xsize; + + error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); + /* + * note that decpfs_pagein_compressed can change the state of + * 'compressed'... it will set it to 0 if the file is no longer + * compressed once the compression lock is successfully taken + * i.e. we would block on that lock while the file is being inflated + */ + if (error == 0 && vnode_isfastdevicecandidate(vp)) { + (void) hfs_addhotfile(vp); + } + if (compressed) { + if (error == 0) { + /* successful page-in, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; + + // + // compressed files are not traditional hot file candidates + // but they may be for CF (which ignores the ff_bytesread + // field) + // + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + } else if (error == EAGAIN) { + /* + * EAGAIN indicates someone else already holds the compression lock... + * to avoid deadlocking, we'll abort this range of pages with an + * indication that the pagein needs to be redriven + */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); + } else if (error == ENOSPC) { + + if (upl_size == PAGE_SIZE) + panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n"); + + ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); + + ap->a_size = PAGE_SIZE; + ap->a_pl = NULL; + ap->a_pl_offset = 0; + ap->a_f_offset = page_needed_f_offset; + + goto retry_pagein; + } else { + ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); + goto pagein_done; + } + goto pagein_next_range; + } + else { + /* + * Set file_converted only if the file became decompressed while we were + * paging in. If it were still compressed, we would re-start the loop using the goto + * in the above block. This avoid us overloading truncate_lock_held as our retry_pagein + * condition below, since we could have avoided taking the truncate lock to prevent + * a deadlock in the force unmount case. 
+ */ + file_converted = TRUE; + } + } + if (file_converted == TRUE) { + /* + * the file was converted back to a regular file after we first saw it as compressed + * we need to abort the upl, retake the truncate lock, recreate the UPL and start over + * reset a_size so that we consider what remains of the original request + * and null out a_upl and a_pl_offset. + * + * We should only be able to get into this block if the decmpfs_pagein_compressed + * successfully decompressed the range in question for this file. + */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); + + ap->a_size = isize; + ap->a_pl = NULL; + ap->a_pl_offset = 0; + + /* Reset file_converted back to false so that we don't infinite-loop. */ + file_converted = FALSE; + goto retry_pagein; + } + } +#endif + error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags); + + /* + * Keep track of blocks read. + */ + if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { + int bytesread; + int took_cnode_lock = 0; + + if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) + bytesread = fp->ff_size; + else + bytesread = xsize; + + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + took_cnode_lock = 1; + } + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + struct timeval tv; + + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + cp->c_touch_acctime = TRUE; + + if (vnode_isfastdevicecandidate(vp)) { + (void) hfs_addhotfile(vp); + } + if (took_cnode_lock) + hfs_unlock(cp); + } +pagein_next_range: + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; + + error = 0; + } + +pagein_done: + if (truncate_lock_held == TRUE) { + /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */ + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); + } + + return (error); +} + +/* + * Pageout for HFS filesystem. + */ +int +hfs_vnop_pageout(struct vnop_pageout_args *ap) +/* + struct vnop_pageout_args { + vnode_t a_vp, + upl_t a_pl, + vm_offset_t a_pl_offset, + off_t a_f_offset, + size_t a_size, + int a_flags + vfs_context_t a_context; + }; +*/ +{ + vnode_t vp = ap->a_vp; + struct cnode *cp; + struct filefork *fp; + int retval = 0; + off_t filesize; + upl_t upl; + upl_page_info_t* pl = NULL; + vm_offset_t a_pl_offset; + int a_flags; + int is_pageoutv2 = 0; + kern_return_t kret; + + cp = VTOC(vp); + fp = VTOF(vp); + + a_flags = ap->a_flags; + a_pl_offset = ap->a_pl_offset; + + /* + * we can tell if we're getting the new or old behavior from the UPL + */ + if ((upl = ap->a_pl) == NULL) { + int request_flags; + + is_pageoutv2 = 1; + /* + * we're in control of any UPL we commit + * make sure someone hasn't accidentally passed in UPL_NOCOMMIT + */ + a_flags &= ~UPL_NOCOMMIT; + a_pl_offset = 0; + + /* + * For V2 semantics, we want to take the cnode truncate lock + * shared to guard against the file size changing via zero-filling. + * + * However, we have to be careful because we may be invoked + * via the ubc_msync path to write out dirty mmap'd pages + * in response to a lock event on a content-protected + * filesystem (e.g. to write out class A files). 
+ * As a result, we want to take the truncate lock 'SHARED' with + * the mini-recursion locktype so that we don't deadlock/panic + * because we may be already holding the truncate lock exclusive to force any other + * IOs to have blocked behind us. + */ + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE); + + if (a_flags & UPL_MSYNC) { + request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; + } + else { + request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; + } + + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); + + if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { + retval = EINVAL; + goto pageout_done; + } + } + /* + * from this point forward upl points at the UPL we're working with + * it was either passed in or we succesfully created it + */ + + /* + * Figure out where the file ends, for pageout purposes. If + * ff_new_size > ff_size, then we're in the middle of extending the + * file via a write, so it is safe (and necessary) that we be able + * to pageout up to that point. + */ + filesize = fp->ff_size; + if (fp->ff_new_size > filesize) + filesize = fp->ff_new_size; + + /* + * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own + * UPL instead of relying on the UPL passed into us. We go ahead and do that here, + * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for + * N dirty ranges in the UPL. Note that this is almost a direct copy of the + * logic in vnode_pageout except that we need to do it after grabbing the truncate + * lock in HFS so that we don't lock invert ourselves. + * + * Note that we can still get into this function on behalf of the default pager with + * non-V2 behavior (swapfiles). However in that case, we did not grab locks above + * since fsync and other writing threads will grab the locks, then mark the + * relevant pages as busy. But the pageout codepath marks the pages as busy, + * and THEN would attempt to grab the truncate lock, which would result in deadlock. So + * we do not try to grab anything for the pre-V2 case, which should only be accessed + * by the paging/VM system. + */ + + if (is_pageoutv2) { + off_t f_offset; + int offset; + int isize; + int pg_index; + int error; + int error_ret = 0; + + isize = ap->a_size; + f_offset = ap->a_f_offset; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); + goto pageout_done; + } + } + + /* + * initialize the offset variables before we touch the UPL. + * a_f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on. + * isize is the offset into the UPL of the last non-clean page. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + + offset = 0; + pg_index = 0; + + while (isize) { + int xsize; + int num_of_pages; + + if ( !upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_DIRTY, so it's possible + * to get back empty slots in the UPL. 
+ * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + if ( !upl_dirty_page(pl, pg_index)) { + panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl); + } + + /* + * We know that we have at least one dirty page. + * Now checking to see how many in a row we have + */ + num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize) { + if ( !upl_dirty_page(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; + + if ((error = cluster_pageout(vp, upl, offset, f_offset, + xsize, filesize, a_flags))) { + if (error_ret == 0) + error_ret = error; + } + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; + } + /* capture errnos bubbled out of cluster_pageout if they occurred */ + if (error_ret != 0) { + retval = error_ret; + } + } /* end block for v2 pageout behavior */ + else { + /* + * just call cluster_pageout for old pre-v2 behavior + */ + retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset, + ap->a_size, filesize, a_flags); + } + + /* + * If data was written, update the modification time of the file + * but only if it's mapped writable; we will have touched the + * modifcation time for direct writes. + */ + if (retval == 0 && (ubc_is_mapped_writable(vp) + || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) { + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + // Check again with lock + bool mapped_writable = ubc_is_mapped_writable(vp); + if (mapped_writable + || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) { + cp->c_touch_modtime = TRUE; + cp->c_touch_chgtime = TRUE; + + /* + * We only need to increment the generation counter if + * it's currently mapped writable because we incremented + * the counter in hfs_vnop_mnomap. + */ + if (mapped_writable) + hfs_incr_gencount(VTOC(vp)); + + /* + * If setuid or setgid bits are set and this process is + * not the superuser then clear the setuid and setgid bits + * as a precaution against tampering. + */ + if ((cp->c_mode & (S_ISUID | S_ISGID)) && + (vfs_context_suser(ap->a_context) != 0)) { + cp->c_mode &= ~(S_ISUID | S_ISGID); + } + } + + hfs_unlock(cp); + } + +pageout_done: + if (is_pageoutv2) { + /* + * Release the truncate lock. Note that because + * we may have taken the lock recursively by + * being invoked via ubc_msync due to lockdown, + * we should release it recursively, too. + */ + hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE); + } + return (retval); +} + +/* + * Intercept B-Tree node writes to unswap them if necessary. + */ +int +hfs_vnop_bwrite(struct vnop_bwrite_args *ap) +{ + int retval = 0; + register struct buf *bp = ap->a_bp; + register struct vnode *vp = buf_vnode(bp); + BlockDescriptor block; + + /* Trap B-Tree writes */ + if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) || + (VTOC(vp)->c_fileid == kHFSCatalogFileID) || + (VTOC(vp)->c_fileid == kHFSAttributesFileID) || + (vp == VTOHFS(vp)->hfc_filevp)) { + + /* + * Swap and validate the node if it is in native byte order. + * This is always be true on big endian, so we always validate + * before writing here. On little endian, the node typically has + * been swapped and validated when it was written to the journal, + * so we won't do anything here. 
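+		 * The check below relies on the last two bytes of a B-tree node
+		 * holding the offset of the first record: a node still in native
+		 * order reads 0x000E (sizeof(BTNodeDescriptor)) there, whereas a
+		 * node already swapped to big endian would read byte-reversed on
+		 * a little-endian host.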
+ */ + if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) { + /* Prepare the block pointer */ + block.blockHeader = bp; + block.buffer = (char *)buf_dataptr(bp); + block.blockNum = buf_lblkno(bp); + /* not found in cache ==> came from disk */ + block.blockReadFromDisk = (buf_fromcache(bp) == 0); + block.blockSize = buf_count(bp); + + /* Endian un-swap B-Tree node */ + retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false); + if (retval) + panic("hfs_vnop_bwrite: about to write corrupt node!\n"); + } + } + + /* This buffer shouldn't be locked anymore but if it is clear it */ + if ((buf_flags(bp) & B_LOCKED)) { + // XXXdbg + if (VTOHFS(vp)->jnl) { + panic("hfs: CLEARING the lock bit on bp %p\n", bp); + } + buf_clearflags(bp, B_LOCKED); + } + retval = vn_bwrite (ap); + + return (retval); +} + + +int +hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks) +{ + _dk_cs_pin_t pin; + unsigned ioc; + int err; + + memset(&pin, 0, sizeof(pin)); + pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize; + pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize; + switch (pin_state) { + case HFS_PIN_IT: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOFASTMEDIA; + break; + case HFS_PIN_IT | HFS_TEMP_PIN: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN; + break; + case HFS_PIN_IT | HFS_DATALESS_PIN: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE; + break; + case HFS_UNPIN_IT: + ioc = _DKIOCCSUNPINEXTENT; + pin.cp_flags = 0; + break; + case HFS_UNPIN_IT | HFS_EVICT_PIN: + ioc = _DKIOCCSPINEXTENT; + pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA; + break; + default: + return EINVAL; + } + err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, vfs_context_kernel()); + return err; +} + +// +// The cnode lock should already be held on entry to this function +// +int +hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned) +{ + struct filefork *fp = VTOF(vp); + int i, err=0, need_put=0; + struct vnode *rsrc_vp=NULL; + uint32_t npinned = 0; + off_t offset; + + if (num_blocks_pinned) { + *num_blocks_pinned = 0; + } + + if (vnode_vtype(vp) != VREG) { + /* Not allowed to pin directories or symlinks */ + printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp)); + return (EPERM); + } + + if (fp->ff_unallocblocks) { + printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks); + return (EINVAL); + } + + /* + * It is possible that if the caller unlocked/re-locked the cnode after checking + * for C_NOEXISTS|C_DELETED that the file could have been deleted while the + * cnode was unlocked. So check the condition again and return ENOENT so that + * the caller knows why we failed to pin the vnode. 
+ */ + if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) { + // makes no sense to pin something that's pending deletion + return ENOENT; + } + + if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) { + if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) { + //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid, + // VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size); + + fp = VTOC(rsrc_vp)->c_rsrcfork; + need_put = 1; + } + } + if (fp->ff_blocks == 0) { + if (need_put) { + // + // use a distinct error code for a compressed file that has no resource fork; + // we return EALREADY to indicate that the data is already probably hot file + // cached because it's in an EA and the attributes btree is on the ssd + // + err = EALREADY; + } else { + err = EINVAL; + } + goto out; + } + + offset = 0; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (fp->ff_extents[i].startBlock == 0) { + break; + } + + err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount); + if (err) { + break; + } else { + npinned += fp->ff_extents[i].blockCount; + } + } + + if (err || npinned == 0) { + goto out; + } + + if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) { + uint32_t pblocks; + uint8_t forktype = 0; + + if (fp == VTOC(vp)->c_rsrcfork) { + forktype = 0xff; + } + /* + * The file could have overflow extents, better pin them. + * + * We assume that since we are holding the cnode lock for this cnode, + * the files extents cannot be manipulated, but the tree could, so we + * need to ensure that it doesn't change behind our back as we iterate it. + */ + int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); + err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks); + hfs_systemfile_unlock (hfsmp, lockflags); + + if (err) { + goto out; + } + npinned += pblocks; + } + +out: + if (num_blocks_pinned) { + *num_blocks_pinned = npinned; + } + + if (need_put && rsrc_vp) { + // + // have to unlock the cnode since it's shared between the + // resource fork vnode and the data fork vnode (and the + // vnode_put() may need to re-acquire the cnode lock to + // reclaim the resource fork vnode) + // + hfs_unlock(VTOC(vp)); + vnode_put(rsrc_vp); + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + } + return err; +} + + +/* + * Relocate a file to a new location on disk + * cnode must be locked on entry + * + * Relocation occurs by cloning the file's data from its + * current set of blocks to a new set of blocks. During + * the relocation all of the blocks (old and new) are + * owned by the file. + * + * ----------------- + * |///////////////| + * ----------------- + * 0 N (file offset) + * + * ----------------- ----------------- + * |///////////////| | | STEP 1 (acquire new blocks) + * ----------------- ----------------- + * 0 N N+1 2N + * + * ----------------- ----------------- + * |///////////////| |///////////////| STEP 2 (clone data) + * ----------------- ----------------- + * 0 N N+1 2N + * + * ----------------- + * |///////////////| STEP 3 (head truncate blocks) + * ----------------- + * 0 N + * + * During steps 2 and 3 page-outs to file offsets less + * than or equal to N are suspended. + * + * During step 3 page-ins to the file get suspended. 
+ */
+int
+hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
+	struct proc *p)
+{
+	struct cnode *cp;
+	struct filefork *fp;
+	struct hfsmount *hfsmp;
+	u_int32_t headblks;
+	u_int32_t datablks;
+	u_int32_t blksize;
+	u_int32_t growsize;
+	u_int32_t nextallocsave;
+	daddr64_t sector_a, sector_b;
+	int eflags;
+	off_t newbytes;
+	int retval;
+	int lockflags = 0;
+	int took_trunc_lock = 0;
+	int started_tr = 0;
+	enum vtype vnodetype;
+
+	vnodetype = vnode_vtype(vp);
+	if (vnodetype != VREG) {
+		/* Not allowed to move symlinks. */
+		return (EPERM);
+	}
+
+	hfsmp = VTOHFS(vp);
+	if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
+		return (ENOSPC);
+	}
+
+	cp = VTOC(vp);
+	fp = VTOF(vp);
+	if (fp->ff_unallocblocks)
+		return (EINVAL);
+
+#if CONFIG_PROTECT
+	/*
+	 *
+	 * Disable HFS file relocation on content-protected filesystems
+	 */
+	if (cp_fs_protected (hfsmp->hfs_mp)) {
+		return EINVAL;
+	}
+#endif
+	/* If it's an SSD, also disable HFS relocation */
+	if (hfsmp->hfs_flags & HFS_SSD) {
+		return EINVAL;
+	}
+
+
+	blksize = hfsmp->blockSize;
+	if (blockHint == 0)
+		blockHint = hfsmp->nextAllocation;
+
+	if (fp->ff_size > 0x7fffffff) {
+		return (EFBIG);
+	}
+
+	if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
+		hfs_unlock(cp);
+		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+		/* Force lock since the caller expects the lock to be held. */
+		if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+			return (retval);
+		}
+		/* No need to continue if file was removed. */
+		if (cp->c_flag & C_NOEXISTS) {
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+			return (ENOENT);
+		}
+		took_trunc_lock = 1;
+	}
+	headblks = fp->ff_blocks;
+	datablks = howmany(fp->ff_size, blksize);
+	growsize = datablks * blksize;
+	eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
+	if (blockHint >= hfsmp->hfs_metazone_start &&
+	    blockHint <= hfsmp->hfs_metazone_end)
+		eflags |= kEFMetadataMask;
+
+	if (hfs_start_transaction(hfsmp) != 0) {
+		if (took_trunc_lock)
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+		return (EINVAL);
+	}
+	started_tr = 1;
+	/*
+	 * Protect the extents b-tree and the allocation bitmap
+	 * during MapFileBlockC and ExtendFileC operations.
+	 */
+	lockflags = SFL_BITMAP;
+	if (overflow_extents(fp))
+		lockflags |= SFL_EXTENTS;
+	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+
+	retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
+	if (retval) {
+		retval = MacToVFSError(retval);
+		goto out;
+	}
+
+	/*
+	 * STEP 1 - acquire new allocation blocks.
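+	 * (The new blocks are appended to the same fork, so until STEP 3 the
+	 * fork temporarily owns roughly headblks + datablks allocation blocks.)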
+	 */
+	nextallocsave = hfsmp->nextAllocation;
+	retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
+	if (eflags & kEFMetadataMask) {
+		hfs_lock_mount(hfsmp);
+		HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
+		MarkVCBDirty(hfsmp);
+		hfs_unlock_mount(hfsmp);
+	}
+
+	retval = MacToVFSError(retval);
+	if (retval == 0) {
+		cp->c_flag |= C_MODIFIED;
+		if (newbytes < growsize) {
+			retval = ENOSPC;
+			goto restore;
+		} else if (fp->ff_blocks < (headblks + datablks)) {
+			printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
+			retval = ENOSPC;
+			goto restore;
+		}
+
+		retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
+		if (retval) {
+			retval = MacToVFSError(retval);
+		} else if ((sector_a + 1) == sector_b) {
+			retval = ENOSPC;
+			goto restore;
+		} else if ((eflags & kEFMetadataMask) &&
+		           ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
+		              hfsmp->hfs_metazone_end)) {
+#if 0
+			const char * filestr;
+			char emptystr = '\0';
+
+			if (cp->c_desc.cd_nameptr != NULL) {
+				filestr = (const char *)&cp->c_desc.cd_nameptr[0];
+			} else if (vnode_name(vp) != NULL) {
+				filestr = vnode_name(vp);
+			} else {
+				filestr = &emptystr;
+			}
+#endif
+			retval = ENOSPC;
+			goto restore;
+		}
+	}
+	/* Done with system locks and journal for now. */
+	hfs_systemfile_unlock(hfsmp, lockflags);
+	lockflags = 0;
+	hfs_end_transaction(hfsmp);
+	started_tr = 0;
+
+	if (retval) {
+		/*
+		 * Check to see if failure is due to excessive fragmentation.
+		 */
+		if ((retval == ENOSPC) &&
+		    (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
+			hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
+		}
+		goto out;
+	}
+	/*
+	 * STEP 2 - clone file data into the new allocation blocks.
+	 */
+
+	if (vnodetype == VLNK)
+		retval = EPERM;
+	else if (vnode_issystem(vp))
+		retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
+	else
+		retval = hfs_clonefile(vp, headblks, datablks, blksize);
+
+	/* Start transaction for step 3 or for a restore. */
+	if (hfs_start_transaction(hfsmp) != 0) {
+		retval = EINVAL;
+		goto out;
+	}
+	started_tr = 1;
+	if (retval)
+		goto restore;
+
+	/*
+	 * STEP 3 - switch to cloned data and remove old blocks.
+	 */
+	lockflags = SFL_BITMAP;
+	if (overflow_extents(fp))
+		lockflags |= SFL_EXTENTS;
+	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+
+	retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
+
+	hfs_systemfile_unlock(hfsmp, lockflags);
+	lockflags = 0;
+	if (retval)
+		goto restore;
+out:
+	if (took_trunc_lock)
+		hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+
+	if (lockflags) {
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		lockflags = 0;
+	}
+
+	/* Push cnode's new extent data to disk. */
+	if (retval == 0) {
+		hfs_update(vp, 0);
+	}
+	if (hfsmp->jnl) {
+		if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
+			(void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
+		else
+			(void) hfs_flushvolumeheader(hfsmp, 0);
+	}
+exit:
+	if (started_tr)
+		hfs_end_transaction(hfsmp);
+
+	return (retval);
+
+restore:
+	if (fp->ff_blocks == headblks) {
+		if (took_trunc_lock)
+			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
+		goto exit;
+	}
+	/*
+	 * Give back any newly allocated space.
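+	 * (Truncating the fork back to ff_size releases the blocks acquired
+	 * in STEP 1 while leaving the original data blocks in place.)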
+ */ + if (lockflags == 0) { + lockflags = SFL_BITMAP; + if (overflow_extents(fp)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + } + + (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp), + FTOC(fp)->c_fileid, false); + + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + + if (took_trunc_lock) + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + goto exit; +} + + +/* + * Clone a file's data within the file. + * + */ +static int +hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) +{ + caddr_t bufp; + size_t bufsize; + size_t copysize; + size_t iosize; + size_t offset; + off_t writebase; + uio_t auio; + int error = 0; + + writebase = blkstart * blksize; + copysize = blkcnt * blksize; + iosize = bufsize = MIN(copysize, 128 * 1024); + offset = 0; + + hfs_unlock(VTOC(vp)); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + return (error); + } +#endif /* CONFIG_PROTECT */ + + bufp = hfs_malloc(bufsize); + + auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + + while (offset < copysize) { + iosize = MIN(copysize - offset, iosize); + + uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ); + uio_addiov(auio, (uintptr_t)bufp, iosize); + + error = cluster_read(vp, auio, copysize, IO_NOCACHE); + if (error) { + printf("hfs_clonefile: cluster_read failed - %d\n", error); + break; + } + if (uio_resid(auio) != 0) { + printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio)); + error = EIO; + break; + } + + uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE); + uio_addiov(auio, (uintptr_t)bufp, iosize); + + error = cluster_write(vp, auio, writebase + offset, + writebase + offset + iosize, + uio_offset(auio), 0, IO_NOCACHE | IO_SYNC); + if (error) { + printf("hfs_clonefile: cluster_write failed - %d\n", error); + break; + } + if (uio_resid(auio) != 0) { + printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n"); + error = EIO; + break; + } + offset += iosize; + } + uio_free(auio); + + if ((blksize & PAGE_MASK)) { + /* + * since the copy may not have started on a PAGE + * boundary (or may not have ended on one), we + * may have pages left in the cache since NOCACHE + * will let partially written pages linger... + * lets just flush the entire range to make sure + * we don't have any pages left that are beyond + * (or intersect) the real LEOF of this file + */ + ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY); + } else { + /* + * No need to call ubc_msync or hfs_invalbuf + * since the file was copied using IO_NOCACHE and + * the copy was done starting and ending on a page + * boundary in the file. + */ + } + hfs_free(bufp, bufsize); + + hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + return (error); +} + +/* + * Clone a system (metadata) file. 
+ * + */ +static int +hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize, + kauth_cred_t cred, struct proc *p) +{ + caddr_t bufp; + char * offset; + size_t bufsize; + size_t iosize; + struct buf *bp = NULL; + daddr64_t blkno; + daddr64_t blk; + daddr64_t start_blk; + daddr64_t last_blk; + int breadcnt; + int i; + int error = 0; + + + iosize = GetLogicalBlockSize(vp); + bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1); + breadcnt = bufsize / iosize; + + bufp = hfs_malloc(bufsize); + + start_blk = ((daddr64_t)blkstart * blksize) / iosize; + last_blk = ((daddr64_t)blkcnt * blksize) / iosize; + blkno = 0; + + while (blkno < last_blk) { + /* + * Read up to a megabyte + */ + offset = bufp; + for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) { + error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp); + if (error) { + printf("hfs_clonesysfile: meta_bread error %d\n", error); + goto out; + } + if (buf_count(bp) != iosize) { + printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp)); + goto out; + } + bcopy((char *)buf_dataptr(bp), offset, iosize); + + buf_markinvalid(bp); + buf_brelse(bp); + bp = NULL; + + offset += iosize; + } + + /* + * Write up to a megabyte + */ + offset = bufp; + for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) { + bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META); + if (bp == NULL) { + printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno); + error = EIO; + goto out; + } + bcopy(offset, (char *)buf_dataptr(bp), iosize); + error = (int)buf_bwrite(bp); + bp = NULL; + if (error) + goto out; + offset += iosize; + } + } +out: + if (bp) { + buf_brelse(bp); + } + + hfs_free(bufp, bufsize); + + error = hfs_fsync(vp, MNT_WAIT, 0, p); + + return (error); +} + +errno_t hfs_flush_invalid_ranges(vnode_t vp) +{ + cnode_t *cp = VTOC(vp); + + hfs_assert(cp->c_lockowner == current_thread()); + hfs_assert(cp->c_truncatelockowner == current_thread()); + + if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout) + return 0; + + filefork_t *fp = VTOF(vp); + + /* + * We can't hold the cnode lock whilst we call cluster_write so we + * need to copy the extents into a local buffer. + */ + int max_exts = 16; + struct ext { + off_t start, end; + } exts_buf[max_exts]; // 256 bytes + struct ext *exts = exts_buf; + int ext_count = 0; + errno_t ret; + + struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges); + + while (r) { + /* If we have more than can fit in our stack buffer, switch + to a heap buffer. */ + if (exts == exts_buf && ext_count == max_exts) { + max_exts = 256; + exts = hfs_malloc(sizeof(struct ext) * max_exts); + memcpy(exts, exts_buf, ext_count * sizeof(struct ext)); + } + + struct rl_entry *next = TAILQ_NEXT(r, rl_link); + + exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end }; + + if (!next || (ext_count == max_exts && exts != exts_buf)) { + hfs_unlock(cp); + for (int i = 0; i < ext_count; ++i) { + ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1, + exts[i].start, 0, + IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE); + if (ret) { + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + goto exit; + } + } + + if (!next) { + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + break; + } + + /* Push any existing clusters which should clean up our invalid + ranges as they go through hfs_vnop_blockmap. */ + cluster_push(vp, 0); + + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + + /* + * Get back to where we were (given we dropped the lock). 
+ * This shouldn't be many because we pushed above. + */ + TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) { + if (r->rl_end > exts[ext_count - 1].end) + break; + } + + ext_count = 0; + } else + r = next; + } + + ret = 0; + +exit: + + if (exts != exts_buf) + hfs_free(exts, sizeof(struct ext) * max_exts); + + return ret; +} diff --git a/core/hfs_resize.c b/core/hfs_resize.c new file mode 100644 index 0000000..8686705 --- /dev/null +++ b/core/hfs_resize.c @@ -0,0 +1,3432 @@ +/* + * Copyright (c) 2013-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +#include "hfs_journal.h" +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" +#include "hfs_endian.h" +#include "hfs_btreeio.h" +#include "hfs_cprotect.h" + +/* Enable/disable debugging code for live volume resizing */ +int hfs_resize_debug = 0; + +static errno_t hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, + struct HFSPlusCatalogFile *filerec, bool *overlaps); +static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context); +static int hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context); + +/* + * Extend a file system. + */ +int +hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) +{ + struct proc *p = vfs_context_proc(context); + kauth_cred_t cred = vfs_context_ucred(context); + struct vnode *vp = NULL; + struct vnode *devvp; + struct buf *bp; + struct filefork *fp = NULL; + ExtendedVCB *vcb; + struct cat_fork forkdata; + u_int64_t oldsize; + uint32_t newblkcnt; + u_int64_t prev_phys_block_count; + u_int32_t addblks; + u_int64_t sector_count; + u_int32_t sector_size; + u_int32_t phys_sector_size; + u_int32_t overage_blocks; + daddr64_t prev_fs_alt_sector; + daddr_t bitmapblks; + int lockflags = 0; + int error; + int64_t oldBitmapSize; + + Boolean usedExtendFileC = false; + int transaction_begun = 0; + + devvp = hfsmp->hfs_devvp; + vcb = HFSTOVCB(hfsmp); + + /* + * - HFS Plus file systems only. + * - Journaling must be enabled. + * - No embedded volumes. 
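+	 *
+	 * (Editor's note, not in the original source.)  These constraints map onto
+	 * the checks immediately below: vcbSigWord == kHFSSigWord rejects plain HFS
+	 * volumes, hfsmp->jnl == NULL rejects volumes without an active journal, and
+	 * a non-zero hfsPlusIOPosOffset rejects HFS Plus volumes embedded inside an
+	 * HFS wrapper.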
+	 */
+	if ((vcb->vcbSigWord == kHFSSigWord) ||
+	    (hfsmp->jnl == NULL) ||
+	    (vcb->hfsPlusIOPosOffset != 0)) {
+		return (EPERM);
+	}
+	/*
+	 * If extending file system by non-root, then verify
+	 * ownership and check permissions.
+	 */
+	if (suser(cred, NULL)) {
+		error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0);
+
+		if (error)
+			return (error);
+		error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0);
+		if (error == 0) {
+			error = hfs_write_access(vp, cred, p, false);
+		}
+		hfs_unlock(VTOC(vp));
+		vnode_put(vp);
+		if (error)
+			return (error);
+
+		error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context);
+		if (error)
+			return (error);
+	}
+	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sector_size, 0, context)) {
+		return (ENXIO);
+	}
+	if (sector_size != hfsmp->hfs_logical_block_size) {
+		return (ENXIO);
+	}
+	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sector_count, 0, context)) {
+		return (ENXIO);
+	}
+	/* Check if partition size is correct for new file system size */
+	if ((sector_size * sector_count) < newsize) {
+		printf("hfs_extendfs: not enough space on device (vol=%s)\n", hfsmp->vcbVN);
+		return (ENOSPC);
+	}
+	error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sector_size, 0, context);
+	if (error) {
+		if ((error != ENOTSUP) && (error != ENOTTY)) {
+			return (ENXIO);
+		}
+		/* If ioctl is not supported, force physical and logical sector size to be same */
+		phys_sector_size = sector_size;
+	}
+	oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
+
+	/*
+	 * Validate new size.
+	 */
+	if ((newsize <= oldsize) || (newsize % sector_size) || (newsize % phys_sector_size)) {
+		printf("hfs_extendfs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
+		return (EINVAL);
+	}
+	uint64_t cnt = newsize / vcb->blockSize;
+	if (cnt > 0xFFFFFFFF) {
+		printf ("hfs_extendfs: current blockSize=%u too small for newsize=%qu\n", hfsmp->blockSize, newsize);
+		return (EOVERFLOW);
+	}
+
+	newblkcnt = (uint32_t)cnt;
+
+	addblks = newblkcnt - vcb->totalBlocks;
+
+	if (hfs_resize_debug) {
+		printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks);
+		printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, newblkcnt, addblks);
+	}
+	printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks);
+
+	hfs_lock_mount (hfsmp);
+	if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
+		hfs_unlock_mount(hfsmp);
+		error = EALREADY;
+		goto out;
+	}
+	hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
+	hfs_unlock_mount (hfsmp);
+
+	/* Start with a clean journal. */
+	hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
+
+	/*
+	 * Enclose changes inside a transaction.
+	 */
+	if (hfs_start_transaction(hfsmp) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+	transaction_begun = 1;
+
+
+	/* Update the hfsmp fields for the physical information about the device */
+	prev_phys_block_count = hfsmp->hfs_logical_block_count;
+	prev_fs_alt_sector = hfsmp->hfs_fs_avh_sector;
+
+	hfsmp->hfs_logical_block_count = sector_count;
+	hfsmp->hfs_logical_bytes = (uint64_t) sector_count * (uint64_t) sector_size;
+
+	/*
+	 * It is possible that the new file system is smaller than the partition size.
+	 * Therefore, update offsets for AVH accordingly.
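+	 *
+	 * (Editor's illustration, not in the original source; numbers are made up.)
+	 * Each alternate volume header (AVH) lives 1024 bytes before the end of the
+	 * region it describes.  With 512-byte device sectors, no embedded offset and
+	 * a 1,000,000-sector device of which the grown file system will use 900,000
+	 * sectors, hfs_partition_avh_sector becomes 999,998 while hfs_fs_avh_sector
+	 * becomes 899,998.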
+ */ + if (hfs_resize_debug) { + printf ("hfs_extendfs: old: partition_avh_sector=%qu, fs_avh_sector=%qu\n", + hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); + } + hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) + + HFS_ALT_SECTOR(sector_size, hfsmp->hfs_logical_block_count); + + hfsmp->hfs_fs_avh_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) + + HFS_ALT_SECTOR(sector_size, (newsize/hfsmp->hfs_logical_block_size)); + if (hfs_resize_debug) { + printf ("hfs_extendfs: new: partition_avh_sector=%qu, fs_avh_sector=%qu\n", + hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); + } + + /* + * Note: we take the attributes lock in case we have an attribute data vnode + * which needs to change size. + */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + vp = vcb->allocationsRefNum; + fp = VTOF(vp); + bcopy(&fp->ff_data, &forkdata, sizeof(forkdata)); + + /* + * Calculate additional space required (if any) by allocation bitmap. + */ + oldBitmapSize = fp->ff_size; + bitmapblks = roundup((newblkcnt+7) / 8, vcb->vcbVBMIOSize) / vcb->blockSize; + if (bitmapblks > (daddr_t)fp->ff_blocks) + bitmapblks -= fp->ff_blocks; + else + bitmapblks = 0; + + /* + * The allocation bitmap can contain unused bits that are beyond end of + * current volume's allocation blocks. Usually they are supposed to be + * zero'ed out but there can be cases where they might be marked as used. + * After extending the file system, those bits can represent valid + * allocation blocks, so we mark all the bits from the end of current + * volume to end of allocation bitmap as "free". + * + * Figure out the number of overage blocks before proceeding though, + * so we don't add more bytes to our I/O than necessary. + * First figure out the total number of blocks representable by the + * end of the bitmap file vs. the total number of blocks in the new FS. + * Then subtract away the number of blocks in the current FS. This is how much + * we can mark as free right now without having to grow the bitmap file. + */ + overage_blocks = fp->ff_blocks * vcb->blockSize * 8; + overage_blocks = MIN (overage_blocks, newblkcnt); + overage_blocks -= vcb->totalBlocks; + + BlockMarkFreeUnused(vcb, vcb->totalBlocks, overage_blocks); + + if (bitmapblks > 0) { + daddr64_t blkno; + daddr_t blkcnt; + off_t bytesAdded; + + /* + * Get the bitmap's current size (in allocation blocks) so we know + * where to start zero filling once the new space is added. We've + * got to do this before the bitmap is grown. + */ + blkno = (daddr64_t)fp->ff_blocks; + + /* + * Try to grow the allocation file in the normal way, using allocation + * blocks already existing in the file system. This way, we might be + * able to grow the bitmap contiguously, or at least in the metadata + * zone. + */ + error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0, + kEFAllMask | kEFNoClumpMask | kEFReserveMask + | kEFMetadataMask | kEFContigMask, &bytesAdded); + + if (error == 0) { + usedExtendFileC = true; + } else { + /* + * If the above allocation failed, fall back to allocating the new + * extent of the bitmap from the space we're going to add. Since those + * blocks don't yet belong to the file system, we have to update the + * extent list directly, and manually adjust the file size. 
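+			 *
+			 * (Editor's note, not in the original source.)  The fallback
+			 * extent starts at allocation block vcb->totalBlocks, i.e. the
+			 * first block of the region being added; those blocks only
+			 * become part of the volume when totalBlocks is increased
+			 * later in this same transaction.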
+ */ + bytesAdded = 0; + error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks); + if (error) { + printf("hfs_extendfs: error %d adding extents\n", error); + goto out; + } + fp->ff_blocks += bitmapblks; + VTOC(vp)->c_blocks = fp->ff_blocks; + VTOC(vp)->c_flag |= C_MODIFIED; + } + + /* + * Update the allocation file's size to include the newly allocated + * blocks. Note that ExtendFileC doesn't do this, which is why this + * statement is outside the above "if" statement. + */ + fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; + + /* + * Zero out the new bitmap blocks. + */ + { + + bp = NULL; + blkcnt = bitmapblks; + while (blkcnt > 0) { + error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp); + if (error) { + if (bp) { + buf_brelse(bp); + } + break; + } + bzero((char *)buf_dataptr(bp), vcb->blockSize); + buf_markaged(bp); + error = (int)buf_bwrite(bp); + if (error) + break; + --blkcnt; + ++blkno; + } + } + if (error) { + printf("hfs_extendfs: error %d clearing blocks\n", error); + goto out; + } + /* + * Mark the new bitmap space as allocated. + * + * Note that ExtendFileC will have marked any blocks it allocated, so + * this is only needed if we used AddFileExtent. Also note that this + * has to come *after* the zero filling of new blocks in the case where + * we used AddFileExtent (since the part of the bitmap we're touching + * is in those newly allocated blocks). + */ + if (!usedExtendFileC) { + error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks); + if (error) { + printf("hfs_extendfs: error %d setting bitmap\n", error); + goto out; + } + vcb->freeBlocks -= bitmapblks; + } + } + + /* + * Mark the new alternate VH as allocated. + */ + if (vcb->blockSize == 512) + error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2); + else + error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1); + if (error) { + printf("hfs_extendfs: error %d setting bitmap (VH)\n", error); + goto out; + } + + /* + * Mark the old alternate VH as free. + */ + if (vcb->blockSize == 512) + (void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2); + else + (void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1); + + /* + * Adjust file system variables for new space. + */ + vcb->totalBlocks += addblks; + vcb->freeBlocks += addblks; + MarkVCBDirty(vcb); + error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + if (error) { + printf("hfs_extendfs: couldn't flush volume headers (%d)", error); + /* + * Restore to old state. + */ + if (usedExtendFileC) { + (void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp), + FTOC(fp)->c_fileid, false); + } else { + fp->ff_blocks -= bitmapblks; + fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; + /* + * No need to mark the excess blocks free since those bitmap blocks + * are no longer part of the bitmap. But we do need to undo the + * effect of the "vcb->freeBlocks -= bitmapblks" above. 
+ */ + vcb->freeBlocks += bitmapblks; + } + vcb->totalBlocks -= addblks; + vcb->freeBlocks -= addblks; + hfsmp->hfs_logical_block_count = prev_phys_block_count; + hfsmp->hfs_fs_avh_sector = prev_fs_alt_sector; + /* Do not revert hfs_partition_avh_sector because the + * partition size is larger than file system size + */ + MarkVCBDirty(vcb); + if (vcb->blockSize == 512) { + if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) { + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } + } else { + if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) { + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } + } + goto out; + } + /* + * Invalidate the old alternate volume header. We are growing the filesystem so + * this sector must be returned to the FS as free space. + */ + bp = NULL; + if (prev_fs_alt_sector) { + if (buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(prev_fs_alt_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) { + journal_modify_block_start(hfsmp->jnl, bp); + + bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize); + + journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); + } else if (bp) { + buf_brelse(bp); + } + } + + /* + * Update the metadata zone size based on current volume size + */ + hfs_metadatazone_init(hfsmp, false); + + /* + * Adjust the size of hfsmp->hfs_attrdata_vp + */ + if (hfsmp->hfs_attrdata_vp) { + struct cnode *attr_cp; + struct filefork *attr_fp; + + if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) { + attr_cp = VTOC(hfsmp->hfs_attrdata_vp); + attr_fp = VTOF(hfsmp->hfs_attrdata_vp); + + attr_cp->c_blocks = newblkcnt; + attr_fp->ff_blocks = newblkcnt; + attr_fp->ff_extents[0].blockCount = newblkcnt; + attr_fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize; + ubc_setsize(hfsmp->hfs_attrdata_vp, attr_fp->ff_size); + vnode_put(hfsmp->hfs_attrdata_vp); + } + } + + /* + * We only update hfsmp->allocLimit if totalBlocks actually increased. + */ + if (error == 0) { + UpdateAllocLimit(hfsmp, hfsmp->totalBlocks); + } + + /* Release all locks and sync up journal content before + * checking and extending, if required, the journal + */ + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + } + if (transaction_begun) { + hfs_end_transaction(hfsmp); + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + transaction_begun = 0; + } + + /* Increase the journal size, if required. */ + error = hfs_extend_journal(hfsmp, sector_size, sector_count, context); + if (error) { + printf ("hfs_extendfs: Could not extend journal size\n"); + goto out_noalloc; + } + + /* Log successful extending */ + printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n", + hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize)); + +out: + if (error && fp) { + /* Restore allocation fork. 
*/ + bcopy(&forkdata, &fp->ff_data, sizeof(forkdata)); + VTOC(vp)->c_blocks = fp->ff_blocks; + + } + +out_noalloc: + hfs_lock_mount (hfsmp); + hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; + hfs_unlock_mount (hfsmp); + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (transaction_begun) { + hfs_end_transaction(hfsmp); + /* Just to be sure, sync all data to the disk */ + int flush_error = hfs_flush(hfsmp, HFS_FLUSH_FULL); + if (flush_error && !error) + error = flush_error; + } + if (error) { + printf ("hfs_extentfs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN); + } + + return MacToVFSError(error); +} + +#define HFS_MIN_SIZE (32LL * 1024LL * 1024LL) + +/* + * Truncate a file system (while still mounted). + */ +int +hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) +{ + u_int64_t oldsize; + u_int32_t newblkcnt; + u_int32_t reclaimblks = 0; + int lockflags = 0; + int transaction_begun = 0; + Boolean updateFreeBlocks = false; + Boolean disable_sparse = false; + int error = 0; + + hfs_lock_mount (hfsmp); + if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { + hfs_unlock_mount (hfsmp); + return (EALREADY); + } + hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; + hfsmp->hfs_resize_blocksmoved = 0; + hfsmp->hfs_resize_totalblocks = 0; + hfsmp->hfs_resize_progress = 0; + hfs_unlock_mount (hfsmp); + + /* + * - Journaled HFS Plus volumes only. + * - No embedded volumes. + */ + if ((hfsmp->jnl == NULL) || + (hfsmp->hfsPlusIOPosOffset != 0)) { + error = EPERM; + goto out; + } + oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + newblkcnt = newsize / hfsmp->blockSize; + reclaimblks = hfsmp->totalBlocks - newblkcnt; + + if (hfs_resize_debug) { + printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1)); + printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks); + } + + /* Make sure new size is valid. */ + if ((newsize < HFS_MIN_SIZE) || + (newsize >= oldsize) || + (newsize % hfsmp->hfs_logical_block_size) || + (newsize % hfsmp->hfs_physical_block_size)) { + printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize); + error = EINVAL; + goto out; + } + + /* + * Make sure that the file system has enough free blocks reclaim. + * + * Before resize, the disk is divided into four zones - + * A. Allocated_Stationary - These are allocated blocks that exist + * before the new end of disk. These blocks will not be + * relocated or modified during resize. + * B. Free_Stationary - These are free blocks that exist before the + * new end of disk. These blocks can be used for any new + * allocations during resize, including allocation for relocating + * data from the area of disk being reclaimed. + * C. Allocated_To-Reclaim - These are allocated blocks that exist + * beyond the new end of disk. These blocks need to be reclaimed + * during resize by allocating equal number of blocks in Free + * Stationary zone and copying the data. + * D. Free_To-Reclaim - These are free blocks that exist beyond the + * new end of disk. Nothing special needs to be done to reclaim + * them. 
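+	 *
+	 * (Editor's illustration, not in the original source; the numbers are
+	 * hypothetical.)  Suppose totalBlocks = 1000 and newblkcnt = 800, so
+	 * reclaimblks = 200.  If 150 of those 200 trailing blocks are allocated
+	 * (zone C) and 50 are free (zone D), and 120 blocks are free below the
+	 * new end of disk (zone B), then hfs_freeblks() = B + D = 170 and the
+	 * sanity check below correctly fails (200 >= 170), i.e. C (150) exceeds B (120).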
+ * + * Total number of blocks on the disk before resize: + * ------------------------------------------------ + * Total Blocks = Allocated_Stationary + Free_Stationary + + * Allocated_To-Reclaim + Free_To-Reclaim + * + * Total number of blocks that need to be reclaimed: + * ------------------------------------------------ + * Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim + * + * Note that the check below also makes sure that we have enough space + * to relocate data from Allocated_To-Reclaim to Free_Stationary. + * Therefore we do not need to check total number of blocks to relocate + * later in the code. + * + * The condition below gets converted to: + * + * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim + * + * which is equivalent to: + * + * Allocated To-Reclaim >= Free Stationary + */ + if (reclaimblks >= hfs_freeblks(hfsmp, 1)) { + printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); + error = ENOSPC; + goto out; + } + + /* Start with a clean journal. */ + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + transaction_begun = 1; + + /* Take the bitmap lock to update the alloc limit field */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* + * Prevent new allocations from using the part we're trying to truncate. + * + * NOTE: allocLimit is set to the allocation block number where the new + * alternate volume header will be. That way there will be no files to + * interfere with allocating the new alternate volume header, and no files + * in the allocation blocks beyond (i.e. the blocks we're trying to + * truncate away. + */ + if (hfsmp->blockSize == 512) { + error = UpdateAllocLimit (hfsmp, newblkcnt - 2); + } + else { + error = UpdateAllocLimit (hfsmp, newblkcnt - 1); + } + + /* Sparse devices use first fit allocation which is not ideal + * for volume resize which requires best fit allocation. If a + * sparse device is being truncated, disable the sparse device + * property temporarily for the duration of resize. Also reset + * the free extent cache so that it is rebuilt as sorted by + * totalBlocks instead of startBlock. + * + * Note that this will affect all allocations on the volume and + * ideal fix would be just to modify resize-related allocations, + * but it will result in complexity like handling of two free + * extent caches sorted differently, etc. So we stick to this + * solution for now. + */ + hfs_lock_mount (hfsmp); + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + ResetVCBFreeExtCache(hfsmp); + disable_sparse = true; + } + + /* + * Update the volume free block count to reflect the total number + * of free blocks that will exist after a successful resize. + * Relocation of extents will result in no net change in the total + * free space on the disk. Therefore the code that allocates + * space for new extent and deallocates the old extent explicitly + * prevents updating the volume free block count. It will also + * prevent false disk full error when the number of blocks in + * an extent being relocated is more than the free blocks that + * will exist after the volume is resized. 
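+	 *
+	 * (Editor's illustration, not in the original source; numbers are
+	 * hypothetical.)  With 10,000 free blocks and reclaimblks = 4,000,
+	 * freeBlocks drops to 6,000 here.  When a 2,500-block extent is later
+	 * relocated, its new blocks are allocated and its old blocks freed with
+	 * HFS_ALLOC_SKIPFREEBLKS, so the published free count stays at 6,000.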
+ */ + hfsmp->reclaimBlocks = reclaimblks; + hfsmp->freeBlocks -= reclaimblks; + updateFreeBlocks = true; + hfs_unlock_mount(hfsmp); + + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + } + + /* + * Update the metadata zone size to match the new volume size, + * and if it too less, metadata zone might be disabled. + */ + hfs_metadatazone_init(hfsmp, false); + + /* + * If some files have blocks at or beyond the location of the + * new alternate volume header, recalculate free blocks and + * reclaim blocks. Otherwise just update free blocks count. + * + * The current allocLimit is set to the location of new alternate + * volume header, and reclaimblks are the total number of blocks + * that need to be reclaimed. So the check below is really + * ignoring the blocks allocated for old alternate volume header. + */ + if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) { + /* + * hfs_reclaimspace will use separate transactions when + * relocating files (so we don't overwhelm the journal). + */ + hfs_end_transaction(hfsmp); + transaction_begun = 0; + + /* Attempt to reclaim some space. */ + error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context); + if (error != 0) { + printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error); + error = ENOSPC; + goto out; + } + + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + transaction_begun = 1; + + /* Check if we're clear now. */ + error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks); + if (error != 0) { + printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error); + error = EAGAIN; /* tell client to try again */ + goto out; + } + } + + /* + * Note: we take the attributes lock in case we have an attribute data vnode + * which needs to change size. + */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* + * Allocate last 1KB for alternate volume header. + */ + error = BlockMarkAllocated(hfsmp, hfsmp->allocLimit, (hfsmp->blockSize == 512) ? 2 : 1); + if (error) { + printf("hfs_truncatefs: Error %d allocating new alternate volume header\n", error); + goto out; + } + + /* + * Mark the old alternate volume header as free. + * We don't bother shrinking allocation bitmap file. + */ + if (hfsmp->blockSize == 512) + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2); + else + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1); + + /* Don't invalidate the old AltVH yet. It is still valid until the partition size is updated ! */ + + /* Log successful shrinking. */ + printf("hfs_truncatefs: shrank \"%s\" to %d blocks (was %d blocks)\n", + hfsmp->vcbVN, newblkcnt, hfsmp->totalBlocks); + + /* + * Adjust file system variables and flush them to disk. + * + * Note that although the logical block size is updated here, it is only + * done for the benefit/convenience of the partition management software. The + * logical block count change has not yet actually been propagated to + * the disk device yet (and we won't get any notification when it does). + */ + hfsmp->totalBlocks = newblkcnt; + hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size; + hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size; + hfsmp->reclaimBlocks = 0; + + /* + * At this point, a smaller HFS file system exists in a larger volume. 
+ * As per volume format, the alternate volume header is located 1024 bytes + * before end of the partition. So, until the partition is also resized, + * a valid alternate volume header will need to be updated at 1024 bytes + * before end of the volume. Under normal circumstances, a file system + * resize is always followed by a volume resize, so we also need to + * write a copy of the new alternate volume header at 1024 bytes before + * end of the new file system. + */ + if (hfs_resize_debug) { + printf ("hfs_truncatefs: old: partition_avh_sector=%qu, fs_avh_sector=%qu\n", + hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); + } + hfsmp->hfs_fs_avh_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); + /* Note hfs_partition_avh_sector stays unchanged! partition size has not yet been modified */ + if (hfs_resize_debug) { + printf ("hfs_truncatefs: new: partition_avh_sector=%qu, fs_avh_sector=%qu\n", + hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); + } + + MarkVCBDirty(hfsmp); + error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + if (error) { + panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error); + } + + /* + * Adjust the size of hfsmp->hfs_attrdata_vp + */ + if (hfsmp->hfs_attrdata_vp) { + struct cnode *cp; + struct filefork *fp; + + if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) { + cp = VTOC(hfsmp->hfs_attrdata_vp); + fp = VTOF(hfsmp->hfs_attrdata_vp); + + cp->c_blocks = newblkcnt; + fp->ff_blocks = newblkcnt; + fp->ff_extents[0].blockCount = newblkcnt; + fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize; + ubc_setsize(hfsmp->hfs_attrdata_vp, fp->ff_size); + vnode_put(hfsmp->hfs_attrdata_vp); + } + } + +out: + /* + * Update the allocLimit to acknowledge the last one or two blocks now. + * Add it to the tree as well if necessary. + */ + UpdateAllocLimit (hfsmp, hfsmp->totalBlocks); + + hfs_lock_mount (hfsmp); + if (disable_sparse == true) { + /* Now that resize is completed, set the volume to be sparse + * device again so that all further allocations will be first + * fit instead of best fit. Reset free extent cache so that + * it is rebuilt. + */ + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + ResetVCBFreeExtCache(hfsmp); + } + + if (error && (updateFreeBlocks == true)) { + hfsmp->freeBlocks += reclaimblks; + } + hfsmp->reclaimBlocks = 0; + + if (hfsmp->nextAllocation >= hfsmp->allocLimit) { + hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1; + } + hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; + hfs_unlock_mount (hfsmp); + + /* On error, reset the metadata zone for original volume size */ + if (error && (updateFreeBlocks == true)) { + hfs_metadatazone_init(hfsmp, false); + } + + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (transaction_begun) { + hfs_end_transaction(hfsmp); + /* Just to be sure, sync all data to the disk */ + int flush_error = hfs_flush(hfsmp, HFS_FLUSH_FULL); + if (flush_error && !error) + error = flush_error; + } + + if (error) { + printf ("hfs_truncatefs: failed error=%d on vol=%s\n", MacToVFSError(error), hfsmp->vcbVN); + } + + return MacToVFSError(error); +} + + +/* + * Invalidate the physical block numbers associated with buffer cache blocks + * in the given extent of the given vnode. 
+ */ +struct hfs_inval_blk_no { + daddr64_t sectorStart; + daddr64_t sectorCount; +}; +static int +hfs_invalidate_block_numbers_callback(buf_t bp, void *args_in) +{ + daddr64_t blkno; + struct hfs_inval_blk_no *args; + + blkno = buf_blkno(bp); + args = args_in; + + if (blkno >= args->sectorStart && blkno < args->sectorStart+args->sectorCount) + buf_setblkno(bp, buf_lblkno(bp)); + + return BUF_RETURNED; +} +static void +hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sectorCount) +{ + struct hfs_inval_blk_no args; + args.sectorStart = sectorStart; + args.sectorCount = sectorCount; + + buf_iterate(vp, hfs_invalidate_block_numbers_callback, BUF_SCAN_DIRTY|BUF_SCAN_CLEAN, &args); +} + + +/* + * Copy the contents of an extent to a new location. Also invalidates the + * physical block number of any buffer cache block in the copied extent + * (so that if the block is written, it will go through VNOP_BLOCKMAP to + * determine the new physical block number). + * + * At this point, for regular files, we hold the truncate lock exclusive + * and the cnode lock exclusive. + */ +static int +hfs_copy_extent( + struct hfsmount *hfsmp, + struct vnode *vp, /* The file whose extent is being copied. */ + u_int32_t oldStart, /* The start of the source extent. */ + u_int32_t newStart, /* The start of the destination extent. */ + u_int32_t blockCount, /* The number of allocation blocks to copy. */ + __unused vfs_context_t context) +{ + int err = 0; + size_t bufferSize; + void *buffer = NULL; + struct vfsioattr ioattr; + buf_t bp = NULL; + off_t resid; + size_t ioSize; + u_int32_t ioSizeSectors; /* Device sectors in this I/O */ + daddr64_t srcSector, destSector; + u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size; +#if CONFIG_PROTECT + int cpenabled = 0; +#endif + + /* + * Sanity check that we have locked the vnode of the file we're copying. + * + * But since hfs_systemfile_lock() doesn't actually take the lock on + * the allocation file if a journal is active, ignore the check if the + * file being copied is the allocation file. + */ + struct cnode *cp = VTOC(vp); + if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread()) + panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp); + +#if CONFIG_PROTECT + /* + * Prepare the CP blob and get it ready for use, if necessary. + * + * Note that we specifically *exclude* system vnodes (catalog, bitmap, extents, EAs), + * because they are implicitly protected via the media key on iOS. As such, they + * must not be relocated except with the media key. So it is OK to not pass down + * a special cpentry to the IOMedia/LwVM code for handling. + */ + if (!vnode_issystem (vp) && vnode_isreg(vp) && cp_fs_protected (hfsmp->hfs_mp)) { + cpenabled = 1; + } +#endif + + /* + * Determine the I/O size to use + * + * NOTE: Many external drives will result in an ioSize of 128KB. + * TODO: Should we use a larger buffer, doing several consecutive + * reads, then several consecutive writes? 
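+	 *
+	 * (Editor's illustration, not in the original source; numbers are
+	 * hypothetical.)  With a 128 KB bufferSize, 4 KB allocation blocks and
+	 * 512-byte device sectors, each pass below moves 32 allocation blocks
+	 * (256 sectors), so a 100-block extent is copied in four read/write
+	 * passes of 32, 32, 32 and 4 blocks.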
+ */ + vfs_ioattr(hfsmp->hfs_mp, &ioattr); + bufferSize = MIN(ioattr.io_maxreadcnt, ioattr.io_maxwritecnt); + buffer = hfs_malloc(bufferSize); + + /* Get a buffer for doing the I/O */ + bp = buf_alloc(hfsmp->hfs_devvp); + buf_setdataptr(bp, (uintptr_t)buffer); + + resid = (off_t) blockCount * (off_t) hfsmp->blockSize; + srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size; + destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size; + while (resid > 0) { + ioSize = MIN(bufferSize, (size_t) resid); + ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size; + + /* Prepare the buffer for reading */ + buf_reset(bp, B_READ); + buf_setsize(bp, ioSize); + buf_setcount(bp, ioSize); + buf_setblkno(bp, srcSector); + buf_setlblkno(bp, srcSector); + + /* + * Note that because this is an I/O to the device vp + * it is correct to have lblkno and blkno both point to the + * start sector being read from. If it were being issued against the + * underlying file then that would be different. + */ + + /* Attach the new CP blob to the buffer if needed */ +#if CONFIG_PROTECT + if (cpenabled) { + /* attach the RELOCATION_INFLIGHT flag for the underlying call to VNOP_STRATEGY */ + cp->c_cpentry->cp_flags |= CP_RELOCATION_INFLIGHT; + bufattr_setcpx(buf_attr(bp), hfsmp->hfs_resize_cpx); + + /* Initialize the content protection file offset to start at 0 */ + bufattr_setcpoff(buf_attr(bp), 0); + } +#endif + + /* Do the read */ + err = VNOP_STRATEGY(bp); + if (!err) + err = buf_biowait(bp); + if (err) { +#if CONFIG_PROTECT + /* Turn the flag off in error cases. */ + if (cpenabled) { + cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT; + } +#endif + printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err); + break; + } + + /* Prepare the buffer for writing */ + buf_reset(bp, B_WRITE); + buf_setsize(bp, ioSize); + buf_setcount(bp, ioSize); + buf_setblkno(bp, destSector); + buf_setlblkno(bp, destSector); + if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl)) + buf_markfua(bp); + +#if CONFIG_PROTECT + /* Attach the CP to the buffer if needed */ + if (cpenabled) { + bufattr_setcpx(buf_attr(bp), hfsmp->hfs_resize_cpx); + /* + * The last STRATEGY call may have updated the cp file offset behind our + * back, so we cannot trust it. Re-initialize the content protection + * file offset back to 0 before initiating the write portion of this I/O. + */ + bufattr_setcpoff(buf_attr(bp), 0); + } +#endif + + /* Do the write */ + vnode_startwrite(hfsmp->hfs_devvp); + err = VNOP_STRATEGY(bp); + if (!err) { + err = buf_biowait(bp); + } +#if CONFIG_PROTECT + /* Turn the flag off regardless once the strategy call finishes. */ + if (cpenabled) { + cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT; + } +#endif + if (err) { + printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err); + break; + } + + resid -= ioSize; + srcSector += ioSizeSectors; + destSector += ioSizeSectors; + } + if (bp) + buf_free(bp); + hfs_free(buffer, bufferSize); + + /* Make sure all writes have been flushed to disk. */ + if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) { + + err = hfs_flush(hfsmp, HFS_FLUSH_CACHE); + if (err) { + printf("hfs_copy_extent: hfs_flush failed (%d)\n", err); + err = 0; /* Don't fail the copy. */ + } + } + + if (!err) + hfs_invalidate_sectors(vp, (daddr64_t)oldStart*sectorsPerBlock, (daddr64_t)blockCount*sectorsPerBlock); + + return err; +} + + +/* Structure to store state of reclaiming extents from a + * given file. 
hfs_reclaim_file()/hfs_reclaim_xattr() + * initializes the values in this structure which are then + * used by code that reclaims and splits the extents. + */ +struct hfs_reclaim_extent_info { + struct vnode *vp; + u_int32_t fileID; + u_int8_t forkType; + u_int8_t is_dirlink; /* Extent belongs to directory hard link */ + u_int8_t is_sysfile; /* Extent belongs to system file */ + u_int8_t is_xattr; /* Extent belongs to extent-based xattr */ + u_int8_t extent_index; + int lockflags; /* Locks that reclaim and split code should grab before modifying the extent record */ + u_int32_t blocks_relocated; /* Total blocks relocated for this file till now */ + u_int32_t recStartBlock; /* File allocation block number (FABN) for current extent record */ + u_int32_t cur_blockCount; /* Number of allocation blocks that have been checked for reclaim */ + struct filefork *catalog_fp; /* If non-NULL, extent is from catalog record */ + union record { + HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */ + HFSPlusAttrRecord xattr; /* Attribute record for large EAs */ + } record; + HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being processed. + * For catalog extent record, points to the correct + * extent information in filefork. For overflow extent + * record, or xattr record, points to extent record + * in the structure above + */ + struct cat_desc *dirlink_desc; + struct cat_attr *dirlink_attr; + struct filefork *dirlink_fork; /* For directory hard links, fp points actually to this */ + struct BTreeIterator *iterator; /* Shared read/write iterator, hfs_reclaim_file/xattr() + * use it for reading and hfs_reclaim_extent()/hfs_split_extent() + * use it for writing updated extent record + */ + struct FSBufferDescriptor btdata; /* Shared btdata for reading/writing extent record, same as iterator above */ + u_int16_t recordlen; + int overflow_count; /* For debugging, counter for overflow extent record */ + FCB *fcb; /* Pointer to the current btree being traversed */ +}; + +/* + * Split the current extent into two extents, with first extent + * to contain given number of allocation blocks. Splitting of + * extent creates one new extent entry which can result in + * shifting of many entries through all the extent records of a + * file, and/or creating a new extent record in the overflow + * extent btree. + * + * Example: + * The diagram below represents two consecutive extent records, + * for simplicity, lets call them record X and X+1 respectively. + * Interesting extent entries have been denoted by letters. + * If the letter is unchanged before and after split, it means + * that the extent entry was not modified during the split. + * A '.' means that the entry remains unchanged after the split + * and is not relevant for our example. A '0' means that the + * extent entry is empty. + * + * If there isn't sufficient contiguous free space to relocate + * an extent (extent "C" below), we will have to break the one + * extent into multiple smaller extents, and relocate each of + * the smaller extents individually. The way we do this is by + * finding the largest contiguous free space that is currently + * available (N allocation blocks), and then convert extent "C" + * into two extents, C1 and C2, that occupy exactly the same + * allocation blocks as extent C. Extent C1 is the first + * N allocation blocks of extent C, and extent C2 is the remainder + * of extent C. 
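+ * (Editor's illustration, not in the original source; numbers are
+ * hypothetical.)  If extent C is (startBlock 5000, blockCount 100) and the
+ * largest contiguous free run is N = 60 blocks, the split produces
+ * C1 = (5000, 60) and C2 = (5060, 40), covering exactly the same blocks.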
Then we can relocate extent C1 since we know + * we have enough contiguous free space to relocate it in its + * entirety. We then repeat the process starting with extent C2. + * + * In record X, only the entries following entry C are shifted, and + * the original entry C is replaced with two entries C1 and C2 which + * are actually two extent entries for contiguous allocation blocks. + * + * Note that the entry E from record X is shifted into record X+1 as + * the new first entry. Since the first entry of record X+1 is updated, + * the FABN will also get updated with the blockCount of entry E. + * This also results in shifting of all extent entries in record X+1. + * Note that the number of empty entries after the split has been + * changed from 3 to 2. + * + * Before: + * record X record X+1 + * ---------------------===--------- --------------------------------- + * | A | . | . | . | B | C | D | E | | F | . | . | . | G | 0 | 0 | 0 | + * ---------------------===--------- --------------------------------- + * + * After: + * ---------------------=======----- --------------------------------- + * | A | . | . | . | B | C1| C2| D | | E | F | . | . | . | G | 0 | 0 | + * ---------------------=======----- --------------------------------- + * + * C1.startBlock = C.startBlock + * C1.blockCount = N + * + * C2.startBlock = C.startBlock + N + * C2.blockCount = C.blockCount - N + * + * FABN = old FABN - E.blockCount + * + * Inputs: + * extent_info - This is the structure that contains state about + * the current file, extent, and extent record that + * is being relocated. This structure is shared + * among code that traverses through all the extents + * of the file, code that relocates extents, and + * code that splits the extent. + * newBlockCount - The blockCount of the extent to be split after + * successfully split operation. + * Output: + * Zero on success, non-zero on failure. + */ +static int +hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount) +{ + int error = 0; + int index = extent_info->extent_index; + int i; + HFSPlusExtentDescriptor shift_extent; /* Extent entry that should be shifted into next extent record */ + HFSPlusExtentDescriptor last_extent; + HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */ + HFSPlusExtentRecord *extents_rec = NULL; + HFSPlusExtentKey *extents_key = NULL; + HFSPlusAttrRecord *xattr_rec = NULL; + HFSPlusAttrKey *xattr_key = NULL; + struct BTreeIterator iterator; + struct FSBufferDescriptor btdata; + uint16_t reclen; + uint32_t read_recStartBlock; /* Starting allocation block number to read old extent record */ + uint32_t write_recStartBlock; /* Starting allocation block number to insert newly updated extent record */ + Boolean create_record = false; + Boolean is_xattr; + struct cnode *cp; + + is_xattr = extent_info->is_xattr; + extents = extent_info->extents; + cp = VTOC(extent_info->vp); + + if (newBlockCount == 0) { + if (hfs_resize_debug) { + printf ("hfs_split_extent: No splitting required for newBlockCount=0\n"); + } + return error; + } + + if (hfs_resize_debug) { + printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount); + } + + /* Extents overflow btree can not have more than 8 extents. + * No split allowed if the 8th extent is already used. 
+ */ + if ((extent_info->fileID == kHFSExtentsFileID) && (extents[kHFSPlusExtentDensity - 1].blockCount != 0)) { + printf ("hfs_split_extent: Maximum 8 extents allowed for extents overflow btree, cannot split further.\n"); + error = ENOSPC; + goto out; + } + + /* Determine the starting allocation block number for the following + * overflow extent record, if any, before the current record + * gets modified. + */ + read_recStartBlock = extent_info->recStartBlock; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extents[i].blockCount == 0) { + break; + } + read_recStartBlock += extents[i].blockCount; + } + + /* Shift and split */ + if (index == kHFSPlusExtentDensity-1) { + /* The new extent created after split will go into following overflow extent record */ + shift_extent.startBlock = extents[index].startBlock + newBlockCount; + shift_extent.blockCount = extents[index].blockCount - newBlockCount; + + /* Last extent in the record will be split, so nothing to shift */ + } else { + /* Splitting of extents can result in at most of one + * extent entry to be shifted into following overflow extent + * record. So, store the last extent entry for later. + */ + shift_extent = extents[kHFSPlusExtentDensity-1]; + if ((hfs_resize_debug) && (shift_extent.blockCount != 0)) { + printf ("hfs_split_extent: Save 7:(%u,%u) to shift into overflow record\n", shift_extent.startBlock, shift_extent.blockCount); + } + + /* Start shifting extent information from the end of the extent + * record to the index where we want to insert the new extent. + * Note that kHFSPlusExtentDensity-1 is already saved above, and + * does not need to be shifted. The extent entry that is being + * split does not get shifted. + */ + for (i = kHFSPlusExtentDensity-2; i > index; i--) { + if (hfs_resize_debug) { + if (extents[i].blockCount) { + printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount); + } + } + extents[i+1] = extents[i]; + } + } + + if (index == kHFSPlusExtentDensity-1) { + /* The second half of the extent being split will be the overflow + * entry that will go into following overflow extent record. The + * value has been stored in 'shift_extent' above, so there is + * nothing to be done here. + */ + } else { + /* Update the values in the second half of the extent being split + * before updating the first half of the split. Note that the + * extent to split or first half of the split is at index 'index' + * and a new extent or second half of the split will be inserted at + * 'index+1' or into following overflow extent record. + */ + extents[index+1].startBlock = extents[index].startBlock + newBlockCount; + extents[index+1].blockCount = extents[index].blockCount - newBlockCount; + } + /* Update the extent being split, only the block count will change */ + extents[index].blockCount = newBlockCount; + + if (hfs_resize_debug) { + printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount); + if (index != kHFSPlusExtentDensity-1) { + printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount); + } else { + printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount); + } + } + + /* Write out information about the newly split extent to the disk */ + if (extent_info->catalog_fp) { + /* (extent_info->catalog_fp != NULL) means the newly split + * extent exists in the catalog record. This means that + * the cnode was updated. 
Therefore, to write out the changes, + * mark the cnode as modified. We cannot call hfs_update() + * in this function because the caller hfs_reclaim_extent() + * is holding the catalog lock currently. + */ + cp->c_flag |= C_MODIFIED; + } else { + /* The newly split extent is for large EAs or is in overflow + * extent record, so update it directly in the btree using the + * iterator information from the shared extent_info structure + */ + error = BTReplaceRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), extent_info->recordlen); + if (error) { + printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error); + goto out; + } + } + + /* No extent entry to be shifted into another extent overflow record */ + if (shift_extent.blockCount == 0) { + if (hfs_resize_debug) { + printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n"); + } + error = 0; + goto out; + } + + /* The overflow extent entry has to be shifted into an extent + * overflow record. This means that we might have to shift + * extent entries from all subsequent overflow records by one. + * We start iteration from the first record to the last record, + * and shift the extent entry from one record to another. + * We might have to create a new extent record for the last + * extent entry for the file. + */ + + /* Initialize iterator to search the next record */ + bzero(&iterator, sizeof(iterator)); + if (is_xattr) { + /* Copy the key from the iterator that was used to update the modified attribute record. */ + xattr_key = (HFSPlusAttrKey *)&(iterator.key); + bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey)); + /* Note: xattr_key->startBlock will be initialized later in the iteration loop */ + + xattr_rec = hfs_malloc(sizeof(*xattr_rec)); + + btdata.bufferAddress = xattr_rec; + btdata.itemSize = sizeof(HFSPlusAttrRecord); + btdata.itemCount = 1; + extents = xattr_rec->overflowExtents.extents; + } else { + /* Initialize the extent key for the current file */ + extents_key = (HFSPlusExtentKey *) &(iterator.key); + extents_key->keyLength = kHFSPlusExtentKeyMaximumLength; + extents_key->forkType = extent_info->forkType; + extents_key->fileID = extent_info->fileID; + /* Note: extents_key->startBlock will be initialized later in the iteration loop */ + + extents_rec = hfs_malloc(sizeof(*extents_rec)); + + btdata.bufferAddress = extents_rec; + btdata.itemSize = sizeof(HFSPlusExtentRecord); + btdata.itemCount = 1; + extents = extents_rec[0]; + } + + /* The overflow extent entry has to be shifted into an extent + * overflow record. This means that we might have to shift + * extent entries from all subsequent overflow records by one. + * We start iteration from the first record to the last record, + * examine one extent record in each iteration and shift one + * extent entry from one record to another. We might have to + * create a new extent record for the last extent entry for the + * file. + * + * If shift_extent.blockCount is non-zero, it means that there is + * an extent entry that needs to be shifted into the next + * overflow extent record. We keep on going till there are no such + * entries left to be shifted. This will also change the starting + * allocation block number of the extent record which is part of + * the key for the extent record in each iteration. 
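+	 * (Editor's illustration, not in the original source; numbers are
+	 * hypothetical.)  If the next overflow record's key currently has
+	 * startBlock = 800 and the entry being shifted into it covers 50
+	 * blocks, the record is re-inserted with write_recStartBlock =
+	 * 800 - 50 = 750.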
Note that + * because the extent record key is changing while we are searching, + * the record can not be updated directly, instead it has to be + * deleted and inserted again. + */ + while (shift_extent.blockCount) { + if (hfs_resize_debug) { + printf ("hfs_split_extent: Will shift (%u,%u) into overflow record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock); + } + + /* Search if there is any existing overflow extent record + * that matches the current file and the logical start block + * number. + * + * For this, the logical start block number in the key is + * the value calculated based on the logical start block + * number of the current extent record and the total number + * of blocks existing in the current extent record. + */ + if (is_xattr) { + xattr_key->startBlock = read_recStartBlock; + } else { + extents_key->startBlock = read_recStartBlock; + } + error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator); + if (error) { + if (error != btNotFound) { + printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); + goto out; + } + /* No matching record was found, so create a new extent record. + * Note: Since no record was found, we can't rely on the + * btree key in the iterator any longer. This will be initialized + * later before we insert the record. + */ + create_record = true; + } + + /* The extra extent entry from the previous record is being inserted + * as the first entry in the current extent record. This will change + * the file allocation block number (FABN) of the current extent + * record, which is the startBlock value from the extent record key. + * Since one extra entry is being inserted in the record, the new + * FABN for the record will less than old FABN by the number of blocks + * in the new extent entry being inserted at the start. We have to + * do this before we update read_recStartBlock to point at the + * startBlock of the following record. + */ + write_recStartBlock = read_recStartBlock - shift_extent.blockCount; + if (hfs_resize_debug) { + if (create_record) { + printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock); + } + } + + /* Now update the read_recStartBlock to account for total number + * of blocks in this extent record. It will now point to the + * starting allocation block number for the next extent record. + */ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extents[i].blockCount == 0) { + break; + } + read_recStartBlock += extents[i].blockCount; + } + + if (create_record == true) { + /* Initialize new record content with only one extent entry */ + bzero(extents, sizeof(HFSPlusExtentRecord)); + /* The new record will contain only one extent entry */ + extents[0] = shift_extent; + /* There are no more overflow extents to be shifted */ + shift_extent.startBlock = shift_extent.blockCount = 0; + + if (is_xattr) { + /* BTSearchRecord above returned btNotFound, + * but since the attribute btree is never empty + * if we are trying to insert new overflow + * record for the xattrs, the extents_key will + * contain correct data. So we don't need to + * re-initialize it again like below. 
+ */ + + /* Initialize the new xattr record */ + xattr_rec->recordType = kHFSPlusAttrExtents; + xattr_rec->overflowExtents.reserved = 0; + reclen = sizeof(HFSPlusAttrExtents); + } else { + /* BTSearchRecord above returned btNotFound, + * which means that extents_key content might + * not correspond to the record that we are + * trying to create, especially when the extents + * overflow btree is empty. So we reinitialize + * the extents_key again always. + */ + extents_key->keyLength = kHFSPlusExtentKeyMaximumLength; + extents_key->forkType = extent_info->forkType; + extents_key->fileID = extent_info->fileID; + + /* Initialize the new extent record */ + reclen = sizeof(HFSPlusExtentRecord); + } + } else { + /* The overflow extent entry from previous record will be + * the first entry in this extent record. If the last + * extent entry in this record is valid, it will be shifted + * into the following extent record as its first entry. So + * save the last entry before shifting entries in current + * record. + */ + last_extent = extents[kHFSPlusExtentDensity-1]; + + /* Shift all entries by one index towards the end */ + for (i = kHFSPlusExtentDensity-2; i >= 0; i--) { + extents[i+1] = extents[i]; + } + + /* Overflow extent entry saved from previous record + * is now the first entry in the current record. + */ + extents[0] = shift_extent; + + if (hfs_resize_debug) { + printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock); + } + + /* The last entry from current record will be the + * overflow entry which will be the first entry for + * the following extent record. + */ + shift_extent = last_extent; + + /* Since the key->startBlock is being changed for this record, + * it should be deleted and inserted with the new key. + */ + error = BTDeleteRecord(extent_info->fcb, &iterator); + if (error) { + printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); + goto out; + } + if (hfs_resize_debug) { + printf ("hfs_split_extent: Deleted extent record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock)); + } + } + + /* Insert the newly created or modified extent record */ + bzero(&iterator.hint, sizeof(iterator.hint)); + if (is_xattr) { + xattr_key->startBlock = write_recStartBlock; + } else { + extents_key->startBlock = write_recStartBlock; + } + error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen); + if (error) { + printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error); + goto out; + } + if (hfs_resize_debug) { + printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock); + } + } + +out: + /* + * Extents overflow btree or attributes btree headers might have + * been modified during the split/shift operation, so flush the + * changes to the disk while we are inside journal transaction. + * We should only be able to generate I/O that modifies the B-Tree + * header nodes while we're in the middle of a journal transaction. + * Otherwise it might result in panic during unmount. + */ + BTFlushPath(extent_info->fcb); + + hfs_free(extents_rec, sizeof(*extents_rec)); + hfs_free(xattr_rec, sizeof(*xattr_rec)); + return error; +} + + +/* + * Relocate an extent if it lies beyond the expected end of volume. + * + * This function is called for every extent of the file being relocated. 
+ * It allocates space for relocation, copies the data, deallocates + * the old extent, and update corresponding on-disk extent. If the function + * does not find contiguous space to relocate an extent, it splits the + * extent in smaller size to be able to relocate it out of the area of + * disk being reclaimed. As an optimization, if an extent lies partially + * in the area of the disk being reclaimed, it is split so that we only + * have to relocate the area that was overlapping with the area of disk + * being reclaimed. + * + * Note that every extent is relocated in its own transaction so that + * they do not overwhelm the journal. This function handles the extent + * record that exists in the catalog record, extent record from overflow + * extents btree, and extents for large EAs. + * + * Inputs: + * extent_info - This is the structure that contains state about + * the current file, extent, and extent record that + * is being relocated. This structure is shared + * among code that traverses through all the extents + * of the file, code that relocates extents, and + * code that splits the extent. + */ +static int +hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context) +{ + int error = 0; + int index; + struct cnode *cp; + u_int32_t oldStartBlock; + u_int32_t oldBlockCount; + u_int32_t newStartBlock = 0; + u_int32_t newBlockCount; + u_int32_t roundedBlockCount; + uint16_t node_size; + uint32_t remainder_blocks; + u_int32_t alloc_flags; + int blocks_allocated = false; + + index = extent_info->extent_index; + cp = VTOC(extent_info->vp); + + oldStartBlock = extent_info->extents[index].startBlock; + oldBlockCount = extent_info->extents[index].blockCount; + + if (0 && hfs_resize_debug) { + printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount); + } + + /* If the current extent lies completely within allocLimit, + * it does not require any relocation. + */ + if ((oldStartBlock + oldBlockCount) <= allocLimit) { + extent_info->cur_blockCount += oldBlockCount; + return error; + } + + /* Every extent should be relocated in its own transaction + * to make sure that we don't overflow the journal buffer. + */ + error = hfs_start_transaction(hfsmp); + if (error) { + return error; + } + extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK); + + /* Check if the extent lies partially in the area to reclaim, + * i.e. it starts before allocLimit and ends beyond allocLimit. + * We have already skipped extents that lie completely within + * allocLimit in the check above, so we only check for the + * startBlock. If it lies partially, split it so that we + * only relocate part of the extent. + */ + if (oldStartBlock < allocLimit) { + newBlockCount = allocLimit - oldStartBlock; + + if (hfs_resize_debug) { + int idx = extent_info->extent_index; + printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount); + } + + /* If the extent belongs to a btree, check and trim + * it to be multiple of the node size. + */ + if (extent_info->is_sysfile) { + node_size = get_btree_nodesize(extent_info->vp); + /* If the btree node size is less than the block size, + * splitting this extent will not split a node across + * different extents. 
So we only check and trim if + * node size is more than the allocation block size. + */ + if (node_size > hfsmp->blockSize) { + remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize); + if (remainder_blocks) { + newBlockCount -= remainder_blocks; + if (hfs_resize_debug) { + printf ("hfs_reclaim_extent: Round-down newBlockCount to be multiple of nodeSize, node_allocblks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount); + } + } + } + /* The newBlockCount is zero because of rounding-down so that + * btree nodes are not split across extents. Therefore this + * straddling extent across resize-boundary does not require + * splitting. Skip over to relocating of complete extent. + */ + if (newBlockCount == 0) { + if (hfs_resize_debug) { + printf ("hfs_reclaim_extent: After round-down newBlockCount=0, skip split, relocate full extent\n"); + } + goto relocate_full_extent; + } + } + + /* Split the extents into two parts --- the first extent lies + * completely within allocLimit and therefore does not require + * relocation. The second extent will require relocation which + * will be handled when the caller calls this function again + * for the next extent. + */ + error = hfs_split_extent(extent_info, newBlockCount); + if (error == 0) { + /* Split success, no relocation required */ + goto out; + } + /* Split failed, so try to relocate entire extent */ + if (hfs_resize_debug) { + int idx = extent_info->extent_index; + printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks failed, relocate full extent\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount); + } + } + +relocate_full_extent: + /* At this point, the current extent requires relocation. + * We will try to allocate space equal to the size of the extent + * being relocated first to try to relocate it without splitting. + * If the allocation fails, we will try to allocate contiguous + * blocks out of metadata zone. If that allocation also fails, + * then we will take a whatever contiguous block run is returned + * by the allocation, split the extent into two parts, and then + * relocate the first splitted extent. + */ + alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS; + if (extent_info->is_sysfile) { + alloc_flags |= HFS_ALLOC_METAZONE; + } + + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, + &newStartBlock, &newBlockCount); + if ((extent_info->is_sysfile == false) && + ((error == dskFulErr) || (error == ENOSPC))) { + /* For non-system files, try reallocating space in metadata zone */ + alloc_flags |= HFS_ALLOC_METAZONE; + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, + alloc_flags, &newStartBlock, &newBlockCount); + } + if ((error == dskFulErr) || (error == ENOSPC)) { + /* + * We did not find desired contiguous space for this + * extent, when we asked for it, including the metazone allocations. + * At this point we are not worrying about getting contiguity anymore. + * + * HOWEVER, if we now allow blocks to be used which were recently + * de-allocated, we may find a contiguous range (though this seems + * unlikely). As a result, assume that we will have to split the + * current extent into two pieces, but if we are able to satisfy + * the request with a single extent, detect that as well. 
+ */ + alloc_flags &= ~HFS_ALLOC_FORCECONTIG; + alloc_flags |= HFS_ALLOC_FLUSHTXN; + + error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, + alloc_flags, &newStartBlock, &newBlockCount); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; + } + + /* + * Allowing recently deleted extents may now allow us to find + * a single contiguous extent in the amount & size desired. If so, + * do NOT split this extent into two pieces. This is technically a + * check for "< oldBlockCount", but we use != to highlight the point + * that the special case is when they're equal. The allocator should + * never vend back more blocks than were requested. + */ + if (newBlockCount != oldBlockCount) { + blocks_allocated = true; + + /* The number of blocks allocated is less than the requested + * number of blocks. For btree extents, check and trim the + * extent to be multiple of the node size. + */ + if (extent_info->is_sysfile) { + node_size = get_btree_nodesize(extent_info->vp); + if (node_size > hfsmp->blockSize) { + remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize); + if (remainder_blocks) { + roundedBlockCount = newBlockCount - remainder_blocks; + /* Free tail-end blocks of the newly allocated extent */ + BlockDeallocate(hfsmp, newStartBlock + roundedBlockCount, + newBlockCount - roundedBlockCount, + HFS_ALLOC_SKIPFREEBLKS); + newBlockCount = roundedBlockCount; + if (hfs_resize_debug) { + printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount); + } + if (newBlockCount == 0) { + printf ("hfs_reclaim_extent: Not enough contiguous blocks available to relocate fileID=%d\n", extent_info->fileID); + error = ENOSPC; + goto out; + } + } + } + } + + /* The number of blocks allocated is less than the number of + * blocks requested, so split this extent --- the first extent + * will be relocated as part of this function call and the caller + * will handle relocating the second extent by calling this + * function again for the second extent. 
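+ * For example (illustrative numbers, not from the original comment): if the
+ * old extent spans 100 blocks but only 40 contiguous blocks could be
+ * allocated, hfs_split_extent() below splits it at 40 blocks; the first 40
+ * are relocated in this call and the remaining 60 are picked up on the
+ * caller's next pass over the (now updated) extent record.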
+ */ + error = hfs_split_extent(extent_info, newBlockCount); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; + } + oldBlockCount = newBlockCount; + } /* end oldBlockCount != newBlockCount */ + } /* end allocation request for any available free space */ + + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; + } + blocks_allocated = true; + + /* Copy data from old location to new location */ + error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock, + newStartBlock, newBlockCount, context); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error); + goto out; + } + + /* Update the extent record with the new start block information */ + extent_info->extents[index].startBlock = newStartBlock; + + /* Sync the content back to the disk */ + if (extent_info->catalog_fp) { + /* Update the extents in catalog record */ + if (extent_info->is_dirlink) { + error = cat_update_dirlink(hfsmp, extent_info->forkType, + extent_info->dirlink_desc, extent_info->dirlink_attr, + &(extent_info->dirlink_fork->ff_data)); + } else { + cp->c_flag |= C_MODIFIED; + /* If this is a system file, sync volume headers on disk */ + if (extent_info->is_sysfile) { + error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + } + } + } else { + /* Replace record for extents overflow or extents-based xattrs */ + error = BTReplaceRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), extent_info->recordlen); + } + if (error) { + printf ("hfs_reclaim_extent: fileID=%u, update record error=%u\n", extent_info->fileID, error); + goto out; + } + + /* Deallocate the old extent */ + error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); + if (error) { + printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error); + goto out; + } + extent_info->blocks_relocated += newBlockCount; + + if (hfs_resize_debug) { + printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + } + +out: + if (error != 0) { + if (blocks_allocated == true) { + BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); + } + } else { + /* On success, increment the total allocation blocks processed */ + extent_info->cur_blockCount += newBlockCount; + } + + hfs_systemfile_unlock(hfsmp, extent_info->lockflags); + + /* For a non-system file, if an extent entry from catalog record + * was modified, sync the in-memory changes to the catalog record + * on disk before ending the transaction. 
+ */ + if ((extent_info->catalog_fp) && + (extent_info->is_sysfile == false)) { + hfs_update(extent_info->vp, 0); + } + + hfs_end_transaction(hfsmp); + + return error; +} + +/* Report intermediate progress during volume resize */ +static void +hfs_truncatefs_progress(struct hfsmount *hfsmp) +{ + u_int32_t cur_progress = 0; + + hfs_resize_progress(hfsmp, &cur_progress); + if (cur_progress > (hfsmp->hfs_resize_progress + 9)) { + printf("hfs_truncatefs: %d%% done...\n", cur_progress); + hfsmp->hfs_resize_progress = cur_progress; + } + return; +} + +/* + * Reclaim space at the end of a volume for given file and forktype. + * + * This routine attempts to move any extent which contains allocation blocks + * at or after "allocLimit." A separate transaction is used for every extent + * that needs to be moved. If there is not contiguous space available for + * moving an extent, it can be split into smaller extents. The contents of + * any moved extents are read and written via the volume's device vnode -- + * NOT via "vp." During the move, moved blocks which are part of a transaction + * have their physical block numbers invalidated so they will eventually be + * written to their new locations. + * + * This function is also called for directory hard links. Directory hard links + * are regular files with no data fork and resource fork that contains alias + * information for backward compatibility with pre-Leopard systems. However + * non-Mac OS X implementation can add/modify data fork or resource fork + * information to directory hard links, so we check, and if required, relocate + * both data fork and resource fork. + * + * Inputs: + * hfsmp The volume being resized. + * vp The vnode for the system file. + * fileID ID of the catalog record that needs to be relocated + * forktype The type of fork that needs relocated, + * kHFSResourceForkType for resource fork, + * kHFSDataForkType for data fork + * allocLimit Allocation limit for the new volume size, + * do not use this block or beyond. All extents + * that use this block or any blocks beyond this limit + * will be relocated. + * + * Side Effects: + * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation + * blocks that were relocated. + */ +static int +hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, + u_int8_t forktype, u_long allocLimit, vfs_context_t context) +{ + int error = 0; + struct hfs_reclaim_extent_info *extent_info; + int i; + int lockflags = 0; + struct cnode *cp; + struct filefork *fp; + int took_truncate_lock = false; + int release_desc = false; + HFSPlusExtentKey *key; + + /* If there is no vnode for this file, then there's nothing to do. 
*/ + if (vp == NULL) { + return 0; + } + + cp = VTOC(vp); + + if (hfs_resize_debug) { + const char *filename = (const char *) cp->c_desc.cd_nameptr; + int namelen = cp->c_desc.cd_namelen; + + if (filename == NULL) { + filename = ""; + namelen = 0; + } + printf("hfs_reclaim_file: reclaiming '%.*s'\n", namelen, filename); + } + + extent_info = hfs_mallocz(sizeof(struct hfs_reclaim_extent_info)); + + extent_info->vp = vp; + extent_info->fileID = fileID; + extent_info->forkType = forktype; + extent_info->is_sysfile = vnode_issystem(vp); + if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) { + extent_info->is_dirlink = true; + } + /* We always need allocation bitmap and extent btree lock */ + lockflags = SFL_BITMAP | SFL_EXTENTS; + if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) { + lockflags |= SFL_CATALOG; + } else if (fileID == kHFSAttributesFileID) { + lockflags |= SFL_ATTRIBUTE; + } else if (fileID == kHFSStartupFileID) { + lockflags |= SFL_STARTUP; + } + extent_info->lockflags = lockflags; + extent_info->fcb = VTOF(hfsmp->hfs_extents_vp); + + /* Flush data associated with current file on disk. + * + * If the current vnode is directory hard link, no flushing of + * journal or vnode is required. The current kernel does not + * modify data/resource fork of directory hard links, so nothing + * will be in the cache. If a directory hard link is newly created, + * the resource fork data is written directly using devvp and + * the code that actually relocates data (hfs_copy_extent()) also + * uses devvp for its I/O --- so they will see a consistent copy. + */ + if (extent_info->is_sysfile) { + /* If the current vnode is system vnode, flush journal + * to make sure that all data is written to the disk. + */ + error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + if (error) { + printf ("hfs_reclaim_file: journal_flush returned %d\n", error); + goto out; + } + } else if (extent_info->is_dirlink == false) { + /* Flush all blocks associated with this regular file vnode. + * Normally there should not be buffer cache blocks for regular + * files, but for objects like symlinks, we can have buffer cache + * blocks associated with the vnode. Therefore we call + * buf_flushdirtyblks() also. + */ + buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file"); + + hfs_unlock(cp); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = true; + (void) cluster_push(vp, 0); + error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + if (error) { + goto out; + } + + /* If the file no longer exists, nothing left to do */ + if (cp->c_flag & C_NOEXISTS) { + error = 0; + goto out; + } + + /* Wait for any in-progress writes to this vnode to complete, so that we'll + * be copying consistent bits. (Otherwise, it's possible that an async + * write will complete to the old extent after we read from it. That + * could lead to corruption.) + */ + error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file"); + if (error) { + goto out; + } + } + + if (hfs_resize_debug) { + printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? 
"dirlink" : "file"), fileID); + } + + if (extent_info->is_dirlink) { + extent_info->dirlink_desc = hfs_malloc(sizeof(struct cat_desc)); + extent_info->dirlink_attr = hfs_malloc(sizeof(struct cat_attr)); + extent_info->dirlink_fork = hfs_mallocz(sizeof(struct filefork)); + + /* Lookup catalog record for directory hard link and + * create a fake filefork for the value looked up from + * the disk. + */ + fp = extent_info->dirlink_fork; + extent_info->dirlink_fork->ff_cp = cp; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + error = cat_lookup_dirlink(hfsmp, fileID, forktype, + extent_info->dirlink_desc, extent_info->dirlink_attr, + &(extent_info->dirlink_fork->ff_data)); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error); + goto out; + } + release_desc = true; + } else { + fp = VTOF(vp); + } + + extent_info->catalog_fp = fp; + extent_info->recStartBlock = 0; + extent_info->extents = extent_info->catalog_fp->ff_extents; + /* Relocate extents from the catalog record */ + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + if (fp->ff_extents[i].blockCount == 0) { + break; + } + extent_info->extent_index = i; + error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); + if (error) { + printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error); + goto out; + } + } + + /* If the number of allocation blocks processed for reclaiming + * are less than total number of blocks for the file, continuing + * working on overflow extents record. + */ + if (fp->ff_blocks <= extent_info->cur_blockCount) { + if (0 && hfs_resize_debug) { + printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount); + } + goto out; + } + + if (hfs_resize_debug) { + printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount); + } + + extent_info->iterator = hfs_mallocz(sizeof(struct BTreeIterator)); + key = (HFSPlusExtentKey *) &(extent_info->iterator->key); + key->keyLength = kHFSPlusExtentKeyMaximumLength; + key->forkType = forktype; + key->fileID = fileID; + key->startBlock = extent_info->cur_blockCount; + + extent_info->btdata.bufferAddress = extent_info->record.overflow; + extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord); + extent_info->btdata.itemCount = 1; + + extent_info->catalog_fp = NULL; + + /* Search the first overflow extent with expected startBlock as 'cur_blockCount' */ + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + error = BTSearchRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), &(extent_info->recordlen), + extent_info->iterator); + hfs_systemfile_unlock(hfsmp, lockflags); + while (error == 0) { + extent_info->overflow_count++; + extent_info->recStartBlock = key->startBlock; + extent_info->extents = extent_info->record.overflow; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extent_info->record.overflow[i].blockCount == 0) { + goto out; + } + extent_info->extent_index = i; + error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); + if (error) { + printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, 
extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error); + goto out; + } + } + + /* Look for more overflow records */ + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord, + extent_info->iterator, &(extent_info->btdata), + &(extent_info->recordlen)); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + break; + } + /* Stop when we encounter a different file or fork. */ + if ((key->fileID != fileID) || (key->forkType != forktype)) { + break; + } + } + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + +out: + /* If any blocks were relocated, account them and report progress */ + if (extent_info->blocks_relocated) { + hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated; + hfs_truncatefs_progress(hfsmp); + if (fileID < kHFSFirstUserCatalogNodeID) { + printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n", + extent_info->blocks_relocated, fileID, hfsmp->vcbVN); + } + } + if (extent_info->iterator) { + hfs_free(extent_info->iterator, sizeof(*extent_info->iterator)); + } + if (release_desc == true) { + cat_releasedesc(extent_info->dirlink_desc); + } + if (extent_info->dirlink_desc) { + hfs_free(extent_info->dirlink_desc, sizeof(*extent_info->dirlink_desc)); + } + if (extent_info->dirlink_attr) { + hfs_free(extent_info->dirlink_attr, sizeof(*extent_info->dirlink_attr)); + } + if (extent_info->dirlink_fork) { + hfs_free(extent_info->dirlink_fork, sizeof(*extent_info->dirlink_fork)); + } + if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) { + hfs_update(vp, 0); + } + if (took_truncate_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + if (extent_info) { + hfs_free(extent_info, sizeof(*extent_info)); + } + if (hfs_resize_debug) { + printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? "rsrc" : "data"), fileID, error); + } + + return error; +} + + +/* + * This journal_relocate callback updates the journal info block to point + * at the new journal location. This write must NOT be done using the + * transaction. We must write the block immediately. We must also force + * it to get to the media so that the new journal location will be seen by + * the replay code before we can safely let journaled blocks be written + * to their normal locations. + * + * The tests for journal_uses_fua below are mildly hacky. Since the journal + * and the file system are both on the same device, I'm leveraging what + * the journal has decided about FUA. 
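+ * Concretely: if the journal device honors force-unit-access, the rewritten
+ * journal info block is marked FUA; otherwise the callback below falls back
+ * to an explicit cache flush after the write.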
+ */ +struct hfs_journal_relocate_args { + struct hfsmount *hfsmp; + vfs_context_t context; + u_int32_t newStartBlock; + u_int32_t newBlockCount; +}; + +static errno_t +hfs_journal_relocate_callback(void *_args) +{ + int error; + struct hfs_journal_relocate_args *args = _args; + struct hfsmount *hfsmp = args->hfsmp; + buf_t bp; + JournalInfoBlock *jibp; + + error = buf_meta_bread(hfsmp->hfs_devvp, + (uint64_t)hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), + hfsmp->blockSize, vfs_context_ucred(args->context), &bp); + if (error) { + printf("hfs_journal_relocate_callback: failed to read JIB (%d)\n", error); + if (bp) { + buf_brelse(bp); + } + return error; + } + jibp = (JournalInfoBlock*) buf_dataptr(bp); + jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize); + jibp->size = SWAP_BE64((u_int64_t)args->newBlockCount * hfsmp->blockSize); + if (journal_uses_fua(hfsmp->jnl)) + buf_markfua(bp); + error = buf_bwrite(bp); + if (error) { + printf("hfs_journal_relocate_callback: failed to write JIB (%d)\n", error); + return error; + } + if (!journal_uses_fua(hfsmp->jnl)) { + error = hfs_flush(hfsmp, HFS_FLUSH_CACHE); + if (error) { + printf("hfs_journal_relocate_callback: hfs_flush failed (%d)\n", error); + error = 0; /* Don't fail the operation. */ + } + } + + return error; +} + + +/* Type of resize operation in progress */ +#define HFS_RESIZE_TRUNCATE 1 +#define HFS_RESIZE_EXTEND 2 + +/* + * Core function to relocate the journal file. This function takes the + * journal size of the newly relocated journal --- the caller can + * provide a new journal size if they want to change the size of + * the journal. The function takes care of updating the journal info + * block and all other data structures correctly. + * + * Note: This function starts a transaction and grabs the btree locks. + */ +static int +hfs_relocate_journal_file(struct hfsmount *hfsmp, u_int32_t jnl_size, int resize_type, vfs_context_t context) +{ + int error; + int journal_err; + int lockflags; + u_int32_t oldStartBlock; + u_int32_t newStartBlock; + u_int32_t oldBlockCount; + u_int32_t newBlockCount; + u_int32_t jnlBlockCount; + u_int32_t alloc_skipfreeblks; + struct cat_desc journal_desc; + struct cat_attr journal_attr; + struct cat_fork journal_fork; + struct hfs_journal_relocate_args callback_args; + + /* Calculate the number of allocation blocks required for the journal */ + jnlBlockCount = howmany(jnl_size, hfsmp->blockSize); + + /* + * During truncatefs(), the volume free block count is updated + * before relocating data and reflects the total number of free + * blocks that will exist on volume after the resize is successful. + * This means that the allocation blocks required for relocation + * have already been reserved and accounted for in the free block + * count. Therefore, block allocation and deallocation routines + * can skip the free block check by passing HFS_ALLOC_SKIPFREEBLKS + * flag. + * + * This special handling is not required when the file system + * is being extended as we want all the allocated and deallocated + * blocks to be accounted for correctly. 
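+ * For example (illustrative numbers): while truncating, allocating 2048
+ * blocks for the relocated journal and later freeing the old journal's 2048
+ * blocks both leave the volume free-block count untouched, because that
+ * count already describes the post-resize volume.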
+ */ + if (resize_type == HFS_RESIZE_TRUNCATE) { + alloc_skipfreeblks = HFS_ALLOC_SKIPFREEBLKS; + } else { + alloc_skipfreeblks = 0; + } + + error = hfs_start_transaction(hfsmp); + if (error) { + printf("hfs_relocate_journal_file: hfs_start_transaction returned %d\n", error); + return error; + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + error = BlockAllocate(hfsmp, 1, jnlBlockCount, jnlBlockCount, + HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_FLUSHTXN | alloc_skipfreeblks, + &newStartBlock, &newBlockCount); + if (error) { + printf("hfs_relocate_journal_file: BlockAllocate returned %d\n", error); + goto fail; + } + if (newBlockCount != jnlBlockCount) { + printf("hfs_relocate_journal_file: newBlockCount != jnlBlockCount (%u, %u)\n", newBlockCount, jnlBlockCount); + goto free_fail; + } + + error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, 0, &journal_desc, &journal_attr, &journal_fork); + if (error) { + printf("hfs_relocate_journal_file: cat_idlookup returned %d\n", error); + goto free_fail; + } + + oldStartBlock = journal_fork.cf_extents[0].startBlock; + oldBlockCount = journal_fork.cf_extents[0].blockCount; + error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, alloc_skipfreeblks); + if (error) { + printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error); + goto free_fail; + } + + /* Update the catalog record for .journal */ + journal_fork.cf_size = hfs_blk_to_bytes(newBlockCount, hfsmp->blockSize); + journal_fork.cf_extents[0].startBlock = newStartBlock; + journal_fork.cf_extents[0].blockCount = newBlockCount; + journal_fork.cf_blocks = newBlockCount; + error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL); + cat_releasedesc(&journal_desc); /* all done with cat descriptor */ + if (error) { + printf("hfs_relocate_journal_file: cat_update returned %d\n", error); + goto free_fail; + } + + /* + * If the journal is part of the file system, then tell the journal + * code about the new location. If the journal is on an external + * device, then just keep using it as-is. + */ + if (hfsmp->jvp == hfsmp->hfs_devvp) { + callback_args.hfsmp = hfsmp; + callback_args.context = context; + callback_args.newStartBlock = newStartBlock; + callback_args.newBlockCount = newBlockCount; + + error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize, + (off_t)newBlockCount*hfsmp->blockSize, 0, + hfs_journal_relocate_callback, &callback_args); + if (error) { + /* NOTE: journal_relocate will mark the journal invalid. 
*/ + printf("hfs_relocate_journal_file: journal_relocate returned %d\n", error); + goto fail; + } + if (hfs_resize_debug) { + printf ("hfs_relocate_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + } + hfsmp->jnl_start = newStartBlock; + hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize; + } + + hfs_systemfile_unlock(hfsmp, lockflags); + error = hfs_end_transaction(hfsmp); + if (error) { + printf("hfs_relocate_journal_file: hfs_end_transaction returned %d\n", error); + } + + return error; + +free_fail: + journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS); + if (journal_err) { + printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } +fail: + hfs_systemfile_unlock(hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + if (hfs_resize_debug) { + printf ("hfs_relocate_journal_file: Error relocating journal file (error=%d)\n", error); + } + return error; +} + + +/* + * Relocate the journal file when the file system is being truncated. + * We do not down-size the journal when the file system size is + * reduced, so we always provide the current journal size to the + * relocate code. + */ +static int +hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) +{ + int error = 0; + u_int32_t startBlock; + u_int32_t blockCount = hfsmp->jnl_size / hfsmp->blockSize; + + /* + * Figure out the location of the .journal file. When the journal + * is on an external device, we need to look up the .journal file. + */ + if (hfsmp->jvp == hfsmp->hfs_devvp) { + startBlock = hfsmp->jnl_start; + blockCount = hfsmp->jnl_size / hfsmp->blockSize; + } else { + u_int32_t fileid; + u_int32_t old_jnlfileid; + struct cat_attr attr; + struct cat_fork fork; + + /* + * The cat_lookup inside GetFileInfo will fail because hfs_jnlfileid + * is set, and it is trying to hide the .journal file. So temporarily + * unset the field while calling GetFileInfo. + */ + old_jnlfileid = hfsmp->hfs_jnlfileid; + hfsmp->hfs_jnlfileid = 0; + fileid = GetFileInfo(hfsmp, kHFSRootFolderID, ".journal", &attr, &fork); + hfsmp->hfs_jnlfileid = old_jnlfileid; + if (fileid != old_jnlfileid) { + printf("hfs_reclaim_journal_file: cannot find .journal file!\n"); + return EIO; + } + + startBlock = fork.cf_extents[0].startBlock; + blockCount = fork.cf_extents[0].blockCount; + } + + if (startBlock + blockCount <= allocLimit) { + /* The journal file does not require relocation */ + return 0; + } + + error = hfs_relocate_journal_file(hfsmp, hfs_blk_to_bytes(blockCount, hfsmp->blockSize), + HFS_RESIZE_TRUNCATE, context); + if (error == 0) { + hfsmp->hfs_resize_blocksmoved += blockCount; + hfs_truncatefs_progress(hfsmp); + printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n", + blockCount, hfsmp->vcbVN); + } + + return error; +} + + +/* + * Move the journal info block to a new location. We have to make sure the + * new copy of the journal info block gets to the media first, then change + * the field in the volume header and the catalog record. 
+ */ +static int +hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) +{ + int error; + int journal_err; + int lockflags; + u_int32_t oldBlock; + u_int32_t newBlock; + u_int32_t blockCount; + struct cat_desc jib_desc; + struct cat_attr jib_attr; + struct cat_fork jib_fork; + buf_t old_bp, new_bp; + + if (hfsmp->vcbJinfoBlock <= allocLimit) { + /* The journal info block does not require relocation */ + return 0; + } + + error = hfs_start_transaction(hfsmp); + if (error) { + printf("hfs_reclaim_journal_info_block: hfs_start_transaction returned %d\n", error); + return error; + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + error = BlockAllocate(hfsmp, 1, 1, 1, + HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS | HFS_ALLOC_FLUSHTXN, + &newBlock, &blockCount); + if (error) { + printf("hfs_reclaim_journal_info_block: BlockAllocate returned %d\n", error); + goto fail; + } + if (blockCount != 1) { + printf("hfs_reclaim_journal_info_block: blockCount != 1 (%u)\n", blockCount); + goto free_fail; + } + + /* Copy the old journal info block content to the new location */ + error = buf_meta_bread(hfsmp->hfs_devvp, + (uint64_t)hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), + hfsmp->blockSize, vfs_context_ucred(context), &old_bp); + if (error) { + printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error); + if (old_bp) { + buf_brelse(old_bp); + } + goto free_fail; + } + new_bp = buf_getblk(hfsmp->hfs_devvp, + (uint64_t)newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size), + hfsmp->blockSize, 0, 0, BLK_META); + bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize); + buf_brelse(old_bp); + if (journal_uses_fua(hfsmp->jnl)) + buf_markfua(new_bp); + error = buf_bwrite(new_bp); + if (error) { + printf("hfs_reclaim_journal_info_block: failed to write new JIB (%d)\n", error); + goto free_fail; + } + if (!journal_uses_fua(hfsmp->jnl)) { + error = hfs_flush(hfsmp, HFS_FLUSH_CACHE); + if (error) { + printf("hfs_reclaim_journal_info_block: hfs_flush failed (%d)\n", error); + /* Don't fail the operation. */ + } + } + + /* Deallocate the old block once the new one has the new valid content */ + error = BlockDeallocate(hfsmp, hfsmp->vcbJinfoBlock, 1, HFS_ALLOC_SKIPFREEBLKS); + if (error) { + printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error); + goto free_fail; + } + + + /* Update the catalog record for .journal_info_block */ + error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, 0, &jib_desc, &jib_attr, &jib_fork); + if (error) { + printf("hfs_reclaim_journal_info_block: cat_idlookup returned %d\n", error); + goto fail; + } + oldBlock = jib_fork.cf_extents[0].startBlock; + jib_fork.cf_size = hfsmp->blockSize; + jib_fork.cf_extents[0].startBlock = newBlock; + jib_fork.cf_extents[0].blockCount = 1; + jib_fork.cf_blocks = 1; + error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL); + cat_releasedesc(&jib_desc); /* all done with cat descriptor */ + if (error) { + printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error); + goto fail; + } + + /* Update the pointer to the journal info block in the volume header. 
*/ + hfsmp->vcbJinfoBlock = newBlock; + error = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + if (error) { + printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error); + goto fail; + } + hfs_systemfile_unlock(hfsmp, lockflags); + error = hfs_end_transaction(hfsmp); + if (error) { + printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error); + } + error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); + if (error) { + printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error); + } + + /* Account for the block relocated and print progress */ + hfsmp->hfs_resize_blocksmoved += 1; + hfs_truncatefs_progress(hfsmp); + if (!error) { + printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n", + hfsmp->vcbVN); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount); + } + } + return error; + +free_fail: + journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS); + if (journal_err) { + printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error); + hfs_mark_inconsistent(hfsmp, HFS_ROLLBACK_FAILED); + } + +fail: + hfs_systemfile_unlock(hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error); + } + return error; +} + + +static u_int64_t +calculate_journal_size(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count) +{ + u_int64_t journal_size; + u_int32_t journal_scale; + +#define DEFAULT_JOURNAL_SIZE (8*1024*1024) +#define MAX_JOURNAL_SIZE (512*1024*1024) + + /* Calculate the journal size for this volume. We want + * at least 8 MB of journal for each 100 GB of disk space. + * We cap the size at 512 MB, unless the allocation block + * size is larger, in which case, we use one allocation + * block. + */ + journal_scale = (sector_size * sector_count) / ((u_int64_t)100 * 1024 * 1024 * 1024); + journal_size = DEFAULT_JOURNAL_SIZE * (journal_scale + 1); + if (journal_size > MAX_JOURNAL_SIZE) { + journal_size = MAX_JOURNAL_SIZE; + } + if (journal_size < hfsmp->blockSize) { + journal_size = hfsmp->blockSize; + } + return journal_size; +} + + +/* + * Calculate the expected journal size based on current partition size. + * If the size of the current journal is less than the calculated size, + * force journal relocation with the new journal size. 
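+ * For example (illustrative arithmetic): on a partition of roughly 1 TB
+ * (sector_size 512, sector_count 2 * 10^9), calculate_journal_size() above
+ * yields journal_scale 9, so the expected size is (9 + 1) * 8 MB = 80 MB,
+ * well under the 512 MB cap; a current journal smaller than that would be
+ * extended here.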
+ */ +static int +hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context) +{ + int error = 0; + u_int64_t calc_journal_size; + + if (hfsmp->jvp != hfsmp->hfs_devvp) { + if (hfs_resize_debug) { + printf("hfs_extend_journal: not resizing the journal because it is on an external device.\n"); + } + return 0; + } + + calc_journal_size = calculate_journal_size(hfsmp, sector_size, sector_count); + if (calc_journal_size <= hfsmp->jnl_size) { + /* The journal size requires no modification */ + goto out; + } + + if (hfs_resize_debug) { + printf ("hfs_extend_journal: journal old=%u, new=%qd\n", hfsmp->jnl_size, calc_journal_size); + } + + /* Extend the journal to the new calculated size */ + error = hfs_relocate_journal_file(hfsmp, calc_journal_size, HFS_RESIZE_EXTEND, context); + if (error == 0) { + printf ("hfs_extend_journal: Extended journal size to %u bytes on \"%s\"\n", + hfsmp->jnl_size, hfsmp->vcbVN); + } +out: + return error; +} + + +/* + * This function traverses through all extended attribute records for a given + * fileID, and calls function that reclaims data blocks that exist in the + * area of the disk being reclaimed which in turn is responsible for allocating + * new space, copying extent data, deallocating new space, and if required, + * splitting the extent. + * + * Note: The caller has already acquired the cnode lock on the file. Therefore + * we are assured that no other thread would be creating/deleting/modifying + * extended attributes for this file. + * + * Side Effects: + * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation + * blocks that were relocated. + * + * Returns: + * 0 on success, non-zero on failure. + */ +static int +hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context) +{ + int error = 0; + struct hfs_reclaim_extent_info *extent_info; + int i; + HFSPlusAttrKey *key; + int *lockflags; + + if (hfs_resize_debug) { + printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID); + } + + extent_info = hfs_mallocz(sizeof(struct hfs_reclaim_extent_info)); + extent_info->vp = vp; + extent_info->fileID = fileID; + extent_info->is_xattr = true; + extent_info->is_sysfile = vnode_issystem(vp); + extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp); + lockflags = &(extent_info->lockflags); + *lockflags = SFL_ATTRIBUTE | SFL_BITMAP; + + /* Initialize iterator from the extent_info structure */ + extent_info->iterator = hfs_mallocz(sizeof(struct BTreeIterator)); + + /* Build attribute key */ + key = (HFSPlusAttrKey *)&(extent_info->iterator->key); + error = hfs_buildattrkey(fileID, NULL, key); + if (error) { + goto out; + } + + /* Initialize btdata from extent_info structure. Note that the + * buffer pointer actually points to the xattr record from the + * extent_info structure itself. + */ + extent_info->btdata.bufferAddress = &(extent_info->record.xattr); + extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord); + extent_info->btdata.itemCount = 1; + + /* + * Sync all extent-based attribute data to the disk. + * + * All extent-based attribute data I/O is performed via cluster + * I/O using a virtual file that spans across entire file system + * space. 
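+ * That virtual file is hfsmp->hfs_attrdata_vp; pushing its dirty pages and
+ * waiting for in-flight writes below ensures the attribute data is stable
+ * on disk before its extents are relocated.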
+ */ + hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + (void)cluster_push(hfsmp->hfs_attrdata_vp, 0); + error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr"); + hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_LOCK_DEFAULT); + if (error) { + goto out; + } + + /* Search for extended attribute for current file. This + * will place the iterator before the first matching record. + */ + *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK); + error = BTSearchRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), &(extent_info->recordlen), + extent_info->iterator); + hfs_systemfile_unlock(hfsmp, *lockflags); + if (error) { + if (error != btNotFound) { + goto out; + } + /* btNotFound is expected here, so just mask it */ + error = 0; + } + + while (1) { + /* Iterate to the next record */ + *lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK); + error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord, + extent_info->iterator, &(extent_info->btdata), + &(extent_info->recordlen)); + hfs_systemfile_unlock(hfsmp, *lockflags); + + /* Stop the iteration if we encounter end of btree or xattr with different fileID */ + if (error || key->fileID != fileID) { + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + + /* We only care about extent-based EAs */ + if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) && + (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) { + continue; + } + + if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) { + extent_info->overflow_count = 0; + extent_info->extents = extent_info->record.xattr.forkData.theFork.extents; + } else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) { + extent_info->overflow_count++; + extent_info->extents = extent_info->record.xattr.overflowExtents.extents; + } + + extent_info->recStartBlock = key->startBlock; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extent_info->extents[i].blockCount == 0) { + break; + } + extent_info->extent_index = i; + error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context); + if (error) { + printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error); + goto out; + } + } + } + +out: + /* If any blocks were relocated, account them and report progress */ + if (extent_info->blocks_relocated) { + hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated; + hfs_truncatefs_progress(hfsmp); + } + if (extent_info->iterator) { + hfs_free(extent_info->iterator, sizeof(*extent_info->iterator)); + } + if (extent_info) { + hfs_free(extent_info, sizeof(*extent_info)); + } + if (hfs_resize_debug) { + printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error); + } + return error; +} + +/* + * Reclaim any extent-based extended attributes allocation blocks from + * the area of the disk that is being truncated. + * + * The function traverses the attribute btree to find out the fileIDs + * of the extended attributes that need to be relocated. For every + * file whose large EA requires relocation, it looks up the cnode and + * calls hfs_reclaim_xattr() to do all the work for allocating + * new space, copying data, deallocating old space, and if required, + * splitting the extents. 
+ * + * Inputs: + * allocLimit - starting block of the area being reclaimed + * + * Returns: + * returns 0 on success, non-zero on failure. + */ +static int +hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) +{ + int error = 0; + FCB *fcb; + struct BTreeIterator *iterator = NULL; + struct FSBufferDescriptor btdata; + HFSPlusAttrKey *key; + HFSPlusAttrRecord rec; + int lockflags = 0; + cnid_t prev_fileid = 0; + struct vnode *vp; + int need_relocate; + int btree_operation; + u_int32_t files_moved = 0; + u_int32_t prev_blocksmoved; + int i; + + fcb = VTOF(hfsmp->hfs_attribute_vp); + /* Store the value to print total blocks moved by this function in end */ + prev_blocksmoved = hfsmp->hfs_resize_blocksmoved; + + iterator = hfs_mallocz(sizeof(*iterator)); + key = (HFSPlusAttrKey *)&iterator->key; + btdata.bufferAddress = &rec; + btdata.itemSize = sizeof(rec); + btdata.itemCount = 1; + + need_relocate = false; + btree_operation = kBTreeFirstRecord; + /* Traverse the attribute btree to find extent-based EAs to reclaim */ + while (1) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + btree_operation = kBTreeNextRecord; + + /* If the extents of current fileID were already relocated, skip it */ + if (prev_fileid == key->fileID) { + continue; + } + + /* Check if any of the extents in the current record need to be relocated */ + need_relocate = false; + switch(rec.recordType) { + case kHFSPlusAttrForkData: + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (rec.forkData.theFork.extents[i].blockCount == 0) { + break; + } + if ((rec.forkData.theFork.extents[i].startBlock + + rec.forkData.theFork.extents[i].blockCount) > allocLimit) { + need_relocate = true; + break; + } + } + break; + + case kHFSPlusAttrExtents: + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (rec.overflowExtents.extents[i].blockCount == 0) { + break; + } + if ((rec.overflowExtents.extents[i].startBlock + + rec.overflowExtents.extents[i].blockCount) > allocLimit) { + need_relocate = true; + break; + } + } + break; + }; + + /* Continue iterating to next attribute record */ + if (need_relocate == false) { + continue; + } + + /* Look up the vnode for corresponding file. The cnode + * will be locked which will ensure that no one modifies + * the xattrs when we are relocating them. + * + * We want to allow open-unlinked files to be moved, + * so provide allow_deleted == 1 for hfs_vget(). + */ + if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) { + continue; + } + + error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + if (error) { + printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error); + break; + } + prev_fileid = key->fileID; + files_moved++; + } + + if (files_moved) { + printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n", + (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved), + files_moved, hfsmp->vcbVN); + } + + hfs_free(iterator, sizeof(*iterator)); + return error; +} + +/* + * Reclaim blocks from regular files. + * + * This function iterates over all the record in catalog btree looking + * for files with extents that overlap into the space we're trying to + * free up. 
If a file extent requires relocation, it looks up the vnode + * and calls function to relocate the data. + * + * Returns: + * Zero on success, non-zero on failure. + */ +static int +hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) +{ + int error; + FCB *fcb; + struct BTreeIterator *iterator = NULL; + struct FSBufferDescriptor btdata; + int btree_operation; + int lockflags; + struct HFSPlusCatalogFile filerec; + struct vnode *vp; + struct vnode *rvp; + struct filefork *datafork; + u_int32_t files_moved = 0; + u_int32_t prev_blocksmoved; + +#if CONFIG_PROTECT + int keys_generated = 0; +#endif + + fcb = VTOF(hfsmp->hfs_catalog_vp); + /* Store the value to print total blocks moved by this function at the end */ + prev_blocksmoved = hfsmp->hfs_resize_blocksmoved; + +#if CONFIG_PROTECT + /* + * For content-protected filesystems, we may need to relocate files that + * are encrypted. If they use the new-style offset-based IVs, then + * we can move them regardless of the lock state. We create a temporary + * key here that we use to read/write the data, then we discard it at the + * end of the function. + */ + if (cp_fs_protected (hfsmp->hfs_mp)) { + error = cpx_gentempkeys(&hfsmp->hfs_resize_cpx, hfsmp); + if (error == 0) { + keys_generated = 1; + } + + if (error) { + printf("hfs_reclaimspace: Error generating temporary keys for resize (%d)\n", error); + goto reclaim_filespace_done; + } + } + +#endif + + iterator = hfs_mallocz(sizeof(*iterator)); + + btdata.bufferAddress = &filerec; + btdata.itemSize = sizeof(filerec); + btdata.itemCount = 1; + + btree_operation = kBTreeFirstRecord; + while (1) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + break; + } + btree_operation = kBTreeNextRecord; + + if (filerec.recordType != kHFSPlusFileRecord) { + continue; + } + + /* Check if any of the extents require relocation */ + bool overlaps; + error = hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec, &overlaps); + if (error) + break; + + if (!overlaps) + continue; + + /* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */ + if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) { + if (hfs_resize_debug) { + printf("hfs_reclaim_filespace: hfs_vget(%u) failed.\n", filerec.fileID); + } + continue; + } + + /* If data fork exists or item is a directory hard link, relocate blocks */ + datafork = VTOF(vp); + if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) { + error = hfs_reclaim_file(hfsmp, vp, filerec.fileID, + kHFSDataForkType, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; + } + } + + /* If resource fork exists or item is a directory hard link, relocate blocks */ + if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) { + if (vnode_isdir(vp)) { + /* Resource fork vnode lookup is invalid for directory hard link. + * So we fake data fork vnode as resource fork vnode. 
+ */ + rvp = vp; + } else { + error = hfs_vgetrsrc(hfsmp, vp, &rvp); + if (error) { + printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; + } + VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT; + } + + error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID, + kHFSResourceForkType, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; + } + } + + /* The file forks were relocated successfully, now drop the + * cnode lock and vnode reference, and continue iterating to + * next catalog record. + */ + hfs_unlock(VTOC(vp)); + vnode_put(vp); + files_moved++; + } + + if (files_moved) { + printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n", + (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved), + files_moved, hfsmp->vcbVN); + } + +#if CONFIG_PROTECT +reclaim_filespace_done: + + if (keys_generated) { + cpx_free(hfsmp->hfs_resize_cpx); + hfsmp->hfs_resize_cpx = NULL; + } +#endif + + hfs_free(iterator, sizeof(*iterator)); + + return error; +} + +/* + * Reclaim space at the end of a file system. + * + * Inputs - + * allocLimit - start block of the space being reclaimed + * reclaimblks - number of allocation blocks to reclaim + */ +static int +hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context) +{ + int error = 0; + + /* + * Preflight the bitmap to find out total number of blocks that need + * relocation. + * + * Note: Since allocLimit is set to the location of new alternate volume + * header, the check below does not account for blocks allocated for old + * alternate volume header. + */ + error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks)); + if (error) { + printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error); + return error; + } + if (hfs_resize_debug) { + printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks); + } + + /* Just to be safe, sync the content of the journal to the disk before we proceed */ + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + + /* First, relocate journal file blocks if they're in the way. + * Doing this first will make sure that journal relocate code + * gets access to contiguous blocks on disk first. The journal + * file has to be contiguous on the disk, otherwise resize will + * fail. + */ + error = hfs_reclaim_journal_file(hfsmp, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error); + return error; + } + + /* Relocate journal info block blocks if they're in the way. */ + error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error); + return error; + } + + /* Relocate extents of the Extents B-tree if they're in the way. + * Relocating extents btree before other btrees is important as + * this will provide access to largest contiguous block range on + * the disk for relocating extents btree. Note that extents btree + * can only have maximum of 8 extents. 
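+ * (Those eight extents are the kHFSPlusExtentDensity entries kept in the
+ * volume header's fork data for the extents file; the extents B-tree cannot
+ * spill into overflow records stored in itself.)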
+ */ + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID, + kHFSDataForkType, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error); + return error; + } + + /* Relocate extents of the Allocation file if they're in the way. */ + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID, + kHFSDataForkType, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error); + return error; + } + + /* Relocate extents of the Catalog B-tree if they're in the way. */ + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID, + kHFSDataForkType, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error); + return error; + } + + /* Relocate extents of the Attributes B-tree if they're in the way. */ + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID, + kHFSDataForkType, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error); + return error; + } + + /* Relocate extents of the Startup File if there is one and they're in the way. */ + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID, + kHFSDataForkType, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: reclaim startup file returned %d\n", error); + return error; + } + + /* + * We need to make sure the alternate volume header gets flushed if we moved + * any extents in the volume header. But we need to do that before + * shrinking the size of the volume, or else the journal code will panic + * with an invalid (too large) block number. + * + * Note that blks_moved will be set if ANY extent was moved, even + * if it was just an overflow extent. In this case, the journal_flush isn't + * strictly required, but shouldn't hurt. + */ + if (hfsmp->hfs_resize_blocksmoved) { + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + } + + /* Reclaim extents from catalog file records */ + error = hfs_reclaim_filespace(hfsmp, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error); + return error; + } + + /* Reclaim extents from extent-based extended attributes, if any */ + error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error); + return error; + } + + /* + * Make sure reserved ranges in the region we're to allocate don't + * overlap. + */ + struct rl_entry *range; +again:; + int lockf = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_SHARED_LOCK); + TAILQ_FOREACH(range, &hfsmp->hfs_reserved_ranges[HFS_LOCKED_BLOCKS], rl_link) { + if (rl_overlap(range, hfsmp->allocLimit, RL_INFINITY) != RL_NOOVERLAP) { + // Wait 100ms + hfs_systemfile_unlock(hfsmp, lockf); + msleep(hfs_reclaimspace, NULL, PINOD, "waiting on reserved blocks", + &(struct timespec){ 0, 100 * 1000000 }); + goto again; + } + } + hfs_systemfile_unlock(hfsmp, lockf); + + return error; +} + + +/* + * Check if there are any extents (including overflow extents) that overlap + * into the disk space that is being reclaimed. 
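+ * For example (illustrative numbers): with allocLimit = 1000000, an extent
+ * (startBlock 999990, blockCount 20) ends at block 1000010 > allocLimit and
+ * is therefore reported as overlapping.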
+ * + * Output - + * true - One of the extents need to be relocated + * false - No overflow extents need to be relocated, or there was an error + */ +static errno_t +hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, + struct HFSPlusCatalogFile *filerec, bool *overlaps) +{ + struct BTreeIterator * iterator = NULL; + struct FSBufferDescriptor btdata; + HFSPlusExtentRecord extrec; + HFSPlusExtentKey *extkeyptr; + FCB *fcb; + int i, j; + int error; + int lockflags = 0; + u_int32_t endblock; + errno_t ret = 0; + + /* Check if data fork overlaps the target space */ + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + if (filerec->dataFork.extents[i].blockCount == 0) { + break; + } + endblock = filerec->dataFork.extents[i].startBlock + + filerec->dataFork.extents[i].blockCount; + if (endblock > allocLimit) { + *overlaps = true; + goto out; + } + } + + /* Check if resource fork overlaps the target space */ + for (j = 0; j < kHFSPlusExtentDensity; ++j) { + if (filerec->resourceFork.extents[j].blockCount == 0) { + break; + } + endblock = filerec->resourceFork.extents[j].startBlock + + filerec->resourceFork.extents[j].blockCount; + if (endblock > allocLimit) { + *overlaps = true; + goto out; + } + } + + /* Return back if there are no overflow extents for this file */ + if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) { + *overlaps = false; + goto out; + } + + iterator = hfs_malloc(sizeof(*iterator)); + + bzero(iterator, sizeof(*iterator)); + extkeyptr = (HFSPlusExtentKey *)&iterator->key; + extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength; + extkeyptr->forkType = 0; + extkeyptr->fileID = filerec->fileID; + extkeyptr->startBlock = 0; + + btdata.bufferAddress = &extrec; + btdata.itemSize = sizeof(extrec); + btdata.itemCount = 1; + + fcb = VTOF(hfsmp->hfs_extents_vp); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK); + + /* This will position the iterator just before the first overflow + * extent record for given fileID. It will always return btNotFound, + * so we special case the error code. + */ + error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); + if (error && (error != btNotFound)) { + ret = MacToVFSError(error); + goto out; + } + + /* BTIterateRecord() might return error if the btree is empty, and + * therefore we return that the extent does not overflow to the caller + */ + error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + while (error == 0) { + /* Stop when we encounter a different file. */ + if (extkeyptr->fileID != filerec->fileID) { + break; + } + /* Check if any of the forks exist in the target space. */ + for (i = 0; i < kHFSPlusExtentDensity; ++i) { + if (extrec[i].blockCount == 0) { + break; + } + endblock = extrec[i].startBlock + extrec[i].blockCount; + if (endblock > allocLimit) { + *overlaps = true; + goto out; + } + } + /* Look for more records. */ + error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + } + + if (error && error != btNotFound) { + ret = MacToVFSError(error); + goto out; + } + + *overlaps = false; + +out: + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + + hfs_free(iterator, sizeof(*iterator)); + + return ret; +} + + +/* + * Calculate the progress of a file system resize operation. 
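+ * For example (illustrative numbers), 2500 of 10000 blocks moved is
+ * reported as 25 (percent).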
+ */ +int +hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress) +{ + if ((hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) == 0) { + return (ENXIO); + } + + if (hfsmp->hfs_resize_totalblocks > 0) { + *progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks); + } else { + *progress = 0; + } + + return (0); +} diff --git a/core/hfs_search.c b/core/hfs_search.c new file mode 100644 index 0000000..45aee7b --- /dev/null +++ b/core/hfs_search.c @@ -0,0 +1,1395 @@ +/* + * Copyright (c) 1997-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + * @(#)hfs_search.c + */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if CONFIG_MACF +#include +#endif + +#include "hfs.h" +#include "hfs_dbg.h" +#include "hfs_catalog.h" +#include "hfs_attrlist.h" +#include "hfs_endian.h" + +#include "FileMgrInternal.h" +#include "HFSUnicodeWrappers.h" +#include "BTreesPrivate.h" +#include "BTreeScanner.h" +#include "CatalogPrivate.h" + +#if CONFIG_SEARCHFS + +/* Search criterea. 
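+ *
+ * Each searchable attribute is described twice: searchparams1 is
+ * unpacked into the lower bound (searchInfo1) and searchparams2 into
+ * the upper bound (searchInfo2), and CheckCriteria() then performs an
+ * inclusive range test per attribute.  A hedged illustration (field
+ * names as defined below; the userspace attribute packing itself is
+ * not shown): to match files whose data fork is between 1 MiB and
+ * 10 MiB, set ATTR_FILE_DATALENGTH and
+ *
+ *	searchInfo1.f.dataLogicalLength = 1 * 1024 * 1024;	/* low  */
+ *	searchInfo2.f.dataLogicalLength = 10 * 1024 * 1024;	/* high */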
*/ +struct directoryInfoSpec +{ + u_int32_t numFiles; +}; + +struct fileInfoSpec +{ + off_t dataLogicalLength; + off_t dataPhysicalLength; + off_t resourceLogicalLength; + off_t resourcePhysicalLength; +}; + +struct searchinfospec +{ + u_char name[kHFSPlusMaxFileNameBytes]; + u_int32_t nameLength; + char attributes; // see IM:Files 2-100 + u_int32_t nodeID; + u_int32_t parentDirID; + struct timespec creationDate; + struct timespec modificationDate; + struct timespec changeDate; + struct timespec accessDate; + struct timespec lastBackupDate; + u_int8_t finderInfo[32]; + uid_t uid; + gid_t gid; + mode_t mask; + struct fileInfoSpec f; + struct directoryInfoSpec d; +}; +typedef struct searchinfospec searchinfospec_t; + +static void ResolveHardlink(struct hfsmount *hfsmp, HFSPlusCatalogFile *recp); + + +static int UnpackSearchAttributeBlock(struct hfsmount *hfsmp, struct attrlist *alist, + searchinfospec_t *searchInfo, void *attributeBuffer, int firstblock); + +static int CheckCriteria( ExtendedVCB *vcb, + u_long searchBits, + struct attrlist *attrList, + CatalogRecord *rec, + CatalogKey *key, + searchinfospec_t *searchInfo1, + searchinfospec_t *searchInfo2, + struct vfs_context *ctx); + +static int CheckAccess(ExtendedVCB *vcb, u_long searchBits, CatalogKey *key, struct vfs_context *ctx); + +static int InsertMatch(struct hfsmount *hfsmp, uio_t a_uio, CatalogRecord *rec, + CatalogKey *key, struct attrlist *returnAttrList, + void *attributesBuffer, void *variableBuffer, + uint32_t * nummatches ); + +static Boolean CompareRange(u_long val, u_long low, u_long high); +static Boolean CompareWideRange(u_int64_t val, u_int64_t low, u_int64_t high); + +static Boolean CompareRange( u_long val, u_long low, u_long high ) +{ + return( (val >= low) && (val <= high) ); +} + +static Boolean CompareWideRange( u_int64_t val, u_int64_t low, u_int64_t high ) +{ + return( (val >= low) && (val <= high) ); +} +//#define CompareRange(val, low, high) ((val >= low) && (val <= high)) + + +/************************************************************************/ +/* Entry for searchfs() */ +/************************************************************************/ + +#define errSearchBufferFull 101 /* Internal search errors */ +/* +# +#% searchfs vp L L L +# +vnop_searchfs { + IN struct vnode *vp; + IN off_t length; + IN int flags; + IN kauth_cred_t cred; + IN struct proc *p; +}; +*/ + +int +hfs_vnop_search(ap) + struct vnop_searchfs_args *ap; /* + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + void *a_searchparams1; + void *a_searchparams2; + struct attrlist *a_searchattrs; + u_long a_maxmatches; + struct timeval *a_timelimit; + struct attrlist *a_returnattrs; + u_long *a_nummatches; + u_long a_scriptcode; + u_long a_options; + struct uio *a_uio; + struct searchstate *a_searchstate; + vfs_context_t a_context; + */ +{ + ExtendedVCB *vcb = VTOVCB(ap->a_vp); + struct hfsmount *hfsmp; + FCB * catalogFCB; + searchinfospec_t searchInfo1; + searchinfospec_t searchInfo2; + void *attributesBuffer = NULL; + void *variableBuffer; + u_int32_t fixedBlockSize; + u_int32_t eachReturnBufferSize; + struct proc *p = current_proc(); + int err = E_NONE; + int isHFSPlus; + CatalogKey * myCurrentKeyPtr; + CatalogRecord * myCurrentDataPtr; + CatPosition * myCatPositionPtr; + BTScanState myBTScanState; + user_addr_t user_start = 0; + user_size_t user_len = 0; + int32_t searchTime; + int lockflags; + boolean_t timerExpired = FALSE; + + /* XXX Parameter check a_searchattrs? 
*/ + + *(ap->a_nummatches) = 0; + + if (ap->a_options & ~SRCHFS_VALIDOPTIONSMASK) { + return (EINVAL); + } + + /* + * Fail requests for attributes that HFS does not support for the + * items that match the search criteria. Note that these checks + * are for the OUTBOUND attributes to be returned (not search criteria). + */ + if ((ap->a_returnattrs->commonattr & ~HFS_ATTR_CMN_VALID) || + (ap->a_returnattrs->volattr != 0) || + (ap->a_returnattrs->dirattr & ~HFS_ATTR_DIR_VALID) || + (ap->a_returnattrs->fileattr & ~HFS_ATTR_FILE_VALID) || + (ap->a_returnattrs->forkattr != 0)) { + + return (EINVAL); + } + + /* SRCHFS_SKIPLINKS requires root access. + * This option cannot be used with either + * the ATTR_CMN_NAME or ATTR_CMN_PAROBJID + * attributes. + */ + if (ap->a_options & SRCHFS_SKIPLINKS) { + attrgroup_t attrs; + + attrs = ap->a_searchattrs->commonattr | ap->a_returnattrs->commonattr; + if (attrs & (ATTR_CMN_NAME | ATTR_CMN_PAROBJID)) { + return (EINVAL); + } + + if ((err = vfs_context_suser(ap->a_context))) { + return (err); + } + } + + // If both 32-bit and 64-bit parent ids or file ids are given + // then return an error. + + attrgroup_t test_attrs=ap->a_searchattrs->commonattr; + + if (((test_attrs & ATTR_CMN_OBJID) && (test_attrs & ATTR_CMN_FILEID)) || + ((test_attrs & ATTR_CMN_PARENTID) && (test_attrs & ATTR_CMN_PAROBJID))) { + return (EINVAL); + } + + if (uio_resid(ap->a_uio) <= 0) { + return (EINVAL); + } + + isHFSPlus = (vcb->vcbSigWord == kHFSPlusSigWord); + hfsmp = VTOHFS(ap->a_vp); + + searchTime = kMaxMicroSecsInKernel; + if (ap->a_timelimit->tv_sec == 0 && + ap->a_timelimit->tv_usec > 0 && + ap->a_timelimit->tv_usec < kMaxMicroSecsInKernel) { + searchTime = ap->a_timelimit->tv_usec; + } + + /* UnPack the search boundries, searchInfo1, searchInfo2 */ + err = UnpackSearchAttributeBlock(hfsmp, ap->a_searchattrs, + &searchInfo1, ap->a_searchparams1, 1); + if (err) { + return err; + } + err = UnpackSearchAttributeBlock(hfsmp, ap->a_searchattrs, + &searchInfo2, ap->a_searchparams2, 0); + if (err) { + return err; + } + //shadow search bits if 64-bit file/parent ids are used + if (ap->a_searchattrs->commonattr & ATTR_CMN_FILEID) + ap->a_searchattrs->commonattr |= ATTR_CMN_OBJID; + if (ap->a_searchattrs->commonattr & ATTR_CMN_PARENTID) + ap->a_searchattrs->commonattr |= ATTR_CMN_PAROBJID; + + fixedBlockSize = sizeof(u_int32_t) + hfs_attrblksize(ap->a_returnattrs); /* u_int32_t for length word */ + + eachReturnBufferSize = fixedBlockSize; + + if ( ap->a_returnattrs->commonattr & ATTR_CMN_NAME ) /* XXX should be more robust! */ + eachReturnBufferSize += kHFSPlusMaxFileNameBytes + 1; + + attributesBuffer = hfs_mallocz(eachReturnBufferSize); + variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize); + + // XXXdbg - have to lock the user's buffer so we don't fault + // while holding the shared catalog file lock. see the comment + // in hfs_readdir() for more details. + // + if (hfsmp->jnl && uio_isuserspace(ap->a_uio)) { + user_start = uio_curriovbase(ap->a_uio); + user_len = uio_curriovlen(ap->a_uio); + + if ((err = vslock(user_start, user_len)) != 0) { + user_start = 0; + goto ExitThisRoutine; + } + } + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + catalogFCB = GetFileControlBlock(vcb->catalogRefNum); + myCurrentKeyPtr = NULL; + myCurrentDataPtr = NULL; + myCatPositionPtr = (CatPosition *)ap->a_searchstate; + + if (ap->a_options & SRCHFS_START) { + /* Starting a new search. 
*/ + /* Make sure the on-disk Catalog file is current */ + (void) hfs_fsync(vcb->catalogRefNum, MNT_WAIT, 0, p); + if (hfsmp->jnl) { + hfs_systemfile_unlock(hfsmp, lockflags); + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + } + + ap->a_options &= ~SRCHFS_START; + bzero((caddr_t)myCatPositionPtr, sizeof(*myCatPositionPtr)); + err = BTScanInitialize(catalogFCB, 0, 0, 0, kCatSearchBufferSize, &myBTScanState); + if (err) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto ExitThisRoutine; + } + } else { + /* Resuming a search. */ + err = BTScanInitialize(catalogFCB, myCatPositionPtr->nextNode, + myCatPositionPtr->nextRecord, + myCatPositionPtr->recordsFound, + kCatSearchBufferSize, + &myBTScanState); + /* Make sure Catalog hasn't changed. */ + if (err == 0 + && myCatPositionPtr->writeCount != myBTScanState.btcb->writeCount) { + myCatPositionPtr->writeCount = myBTScanState.btcb->writeCount; + err = EBUSY; /* catChangedErr */ + } + } + hfs_systemfile_unlock(hfsmp, lockflags); + + if (err) + goto ExitThisRoutine; + + /* + * Check all the catalog btree records... + * return the attributes for matching items + */ + for (;;) { + struct timeval myCurrentTime; + struct timeval myElapsedTime; + + err = BTScanNextRecord(&myBTScanState, timerExpired, + (void **)&myCurrentKeyPtr, (void **)&myCurrentDataPtr, + NULL); + if (err) + break; + + /* Resolve any hardlinks */ + if (isHFSPlus && (ap->a_options & SRCHFS_SKIPLINKS) == 0) { + ResolveHardlink(vcb, (HFSPlusCatalogFile *)myCurrentDataPtr); + } + if (CheckCriteria( vcb, ap->a_options, ap->a_searchattrs, myCurrentDataPtr, + myCurrentKeyPtr, &searchInfo1, &searchInfo2, ap->a_context ) + && CheckAccess(vcb, ap->a_options, myCurrentKeyPtr, ap->a_context)) { + err = InsertMatch(hfsmp, ap->a_uio, myCurrentDataPtr, + myCurrentKeyPtr, ap->a_returnattrs, + attributesBuffer, variableBuffer, ap->a_nummatches); + if (err) { + /* + * The last match didn't fit so come back + * to this record on the next trip. + */ + --myBTScanState.recordsFound; + --myBTScanState.recordNum; + break; + } + + if (*(ap->a_nummatches) >= ap->a_maxmatches) + break; + } + if (timerExpired == FALSE) { + /* + * Check our elapsed time and bail if we've hit the max. + * The idea here is to throttle the amount of time we + * spend in the kernel. 
+ */ + microuptime(&myCurrentTime); + timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); + /* + * Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 + */ + if (myElapsedTime.tv_sec > 0 + || myElapsedTime.tv_usec >= searchTime) { + timerExpired = TRUE; + } else if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp))) + timerExpired = TRUE; + } + } + + /* Update catalog position */ + myCatPositionPtr->writeCount = myBTScanState.btcb->writeCount; + + BTScanTerminate(&myBTScanState, &myCatPositionPtr->nextNode, + &myCatPositionPtr->nextRecord, + &myCatPositionPtr->recordsFound); + + if ( err == E_NONE ) { + err = EAGAIN; /* signal to the user to call searchfs again */ + } else if ( err == errSearchBufferFull ) { + if ( *(ap->a_nummatches) > 0 ) + err = EAGAIN; + else + err = ENOBUFS; + } else if ( err == btNotFound ) { + err = E_NONE; /* the entire disk has been searched */ + } else if ( err == fsBTTimeOutErr ) { + err = EAGAIN; + } + +ExitThisRoutine: + if (attributesBuffer) + hfs_free(attributesBuffer, eachReturnBufferSize); + + if (user_start) { + vsunlock(user_start, user_len, TRUE); + } + + return (MacToVFSError(err)); +} + + +static void +ResolveHardlink(struct hfsmount *hfsmp, HFSPlusCatalogFile *recp) +{ + u_int32_t type, creator; + int isdirlink = 0; + int isfilelink = 0; + time_t filecreatedate; + + if (recp->recordType != kHFSPlusFileRecord) { + return; + } + type = SWAP_BE32(recp->userInfo.fdType); + creator = SWAP_BE32(recp->userInfo.fdCreator); + filecreatedate = to_bsd_time(recp->createDate); + + if ((type == kHardLinkFileType && creator == kHFSPlusCreator) && + (filecreatedate == (time_t)hfsmp->hfs_itime || + filecreatedate == (time_t)hfsmp->hfs_metadata_createdate)) { + isfilelink = 1; + } else if ((type == kHFSAliasType && creator == kHFSAliasCreator) && + (recp->flags & kHFSHasLinkChainMask) && + (filecreatedate == (time_t)hfsmp->hfs_itime || + filecreatedate == (time_t)hfsmp->hfs_metadata_createdate)) { + isdirlink = 1; + } + + if (isfilelink || isdirlink) { + cnid_t saved_cnid; + int lockflags; + + /* Export link's cnid (a unique value) instead of inode's cnid */ + saved_cnid = recp->fileID; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + (void) cat_resolvelink(hfsmp, recp->hl_linkReference, isdirlink, recp); + + recp->fileID = saved_cnid; + hfs_systemfile_unlock(hfsmp, lockflags); + } +} + + +static Boolean +CompareMasked(const u_int32_t *thisValue, const u_int32_t *compareData, + const u_int32_t *compareMask, u_int32_t count) +{ + Boolean matched; + u_int32_t i; + + matched = true; /* Assume it will all match */ + + for (i=0; i= f_len) { + *tsp = f_len; + + if (FastRelString(tsp++, find) == 0) + return TRUE; + } + + return FALSE; +} +#endif + + +/* + * Check to see if caller has access rights to this item + */ + +static int +CheckAccess(ExtendedVCB *theVCBPtr, u_long searchBits, CatalogKey *theKeyPtr, struct vfs_context *ctx) +{ + Boolean isHFSPlus; + int myErr; + int myResult; + HFSCatalogNodeID myNodeID; + hfsmount_t * hfsmp; + struct FndrDirInfo *finfop; + struct vnode * vp = NULL; + + myResult = 0; /* default to "no access" */ + + if (!vfs_context_suser(ctx)) { + myResult = 1; /* allow access */ + goto ExitThisRoutine; /* root always has access */ + } + + hfsmp = VCBTOHFS( theVCBPtr ); + isHFSPlus = ( theVCBPtr->vcbSigWord == kHFSPlusSigWord ); + if ( isHFSPlus ) + myNodeID = theKeyPtr->hfsPlus.parentID; +#if CONFIG_HFS_STD + else + myNodeID = theKeyPtr->hfs.parentID; +#endif + + while ( myNodeID >= kRootDirID ) { + 
cnode_t * cp; + + /* now go get catalog data for this directory */ + myErr = hfs_vget(hfsmp, myNodeID, &vp, 0, 0); + if ( myErr ) { + goto ExitThisRoutine; /* no access */ + } + + cp = VTOC(vp); + finfop = (struct FndrDirInfo *)&cp->c_attr.ca_finderinfo[0]; + + if ( searchBits & SRCHFS_SKIPPACKAGES ) { + if ( (SWAP_BE16(finfop->frFlags) & kHasBundle) + || (cp->c_desc.cd_nameptr != NULL + && is_package_name((const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen)) ) { + myResult = 0; + goto ExitThisRoutine; + } + } + + if ( searchBits & SRCHFS_SKIPINAPPROPRIATE ) { + if ( cp->c_parentcnid == kRootDirID && cp->c_desc.cd_nameptr != NULL && + vn_searchfs_inappropriate_name((const char *)cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) ) { + myResult = 0; + goto ExitThisRoutine; + } + } + + if ( (searchBits & SRCHFS_SKIPINVISIBLE) && + (SWAP_BE16(finfop->frFlags) & kIsInvisible) ) { + myResult = 0; + goto ExitThisRoutine; + } + + myNodeID = cp->c_parentcnid; /* move up the hierarchy */ + hfs_unlock(VTOC(vp)); + +#if CONFIG_MACF + if (vp->v_type == VDIR) { + myErr = mac_vnode_check_readdir(ctx, vp); + } else { + myErr = mac_vnode_check_stat(ctx, NOCRED, vp); + } + if (myErr) { + vnode_put(vp); + vp = NULL; + goto ExitThisRoutine; + } +#endif /* MAC */ + + if (vnode_vtype(vp) == VDIR) { + myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), ctx); + } else { + myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH), ctx); + } + vnode_put(vp); + vp = NULL; + if ( myErr ) { + goto ExitThisRoutine; /* no access */ + } + } + myResult = 1; /* allow access */ + +ExitThisRoutine: + if ( vp != NULL ) { + hfs_unlock(VTOC(vp)); + vnode_put(vp); + } + return ( myResult ); + +} + +static int +CheckCriteria( ExtendedVCB *vcb, + u_long searchBits, + struct attrlist *attrList, + CatalogRecord *rec, + CatalogKey *key, + searchinfospec_t *searchInfo1, + searchinfospec_t *searchInfo2, + struct vfs_context *ctx) +{ + Boolean matched, atleastone; + Boolean isHFSPlus; + attrgroup_t searchAttributes; + struct cat_attr c_attr; + struct cat_fork datafork; + struct cat_fork rsrcfork; + int force_case_sensitivity = proc_is_forcing_hfs_case_sensitivity(vfs_context_proc(ctx)); + + bzero(&c_attr, sizeof(c_attr)); + isHFSPlus = (vcb->vcbSigWord == kHFSPlusSigWord); + + switch (rec->recordType) { + +#if CONFIG_HFS_STD + case kHFSFolderRecord: + if ( (searchBits & SRCHFS_MATCHDIRS) == 0 ) { /* If we are NOT searching folders */ + matched = false; + goto TestDone; + } + break; + + case kHFSFileRecord: + if ( (searchBits & SRCHFS_MATCHFILES) == 0 ) { /* If we are NOT searching files */ + matched = false; + goto TestDone; + } + break; +#endif + + case kHFSPlusFolderRecord: + if ( (searchBits & SRCHFS_MATCHDIRS) == 0 ) { /* If we are NOT searching folders */ + matched = false; + goto TestDone; + } + break; + + case kHFSPlusFileRecord: + /* Check if hardlink links should be skipped. 
*/ + if (searchBits & SRCHFS_SKIPLINKS) { + cnid_t parid = key->hfsPlus.parentID; + HFSPlusCatalogFile *filep = (HFSPlusCatalogFile *)rec; + + if ((SWAP_BE32(filep->userInfo.fdType) == kHardLinkFileType) && + (SWAP_BE32(filep->userInfo.fdCreator) == kHFSPlusCreator)) { + return (false); /* skip over file link records */ + } else if ((parid == vcb->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + (filep->bsdInfo.special.linkCount == 0)) { + return (false); /* skip over unlinked files */ + } else if ((SWAP_BE32(filep->userInfo.fdType) == kHFSAliasType) && + (SWAP_BE32(filep->userInfo.fdCreator) == kHFSAliasCreator) && + (filep->flags & kHFSHasLinkChainMask)) { + return (false); /* skip over dir link records */ + } + } else if (key->hfsPlus.parentID == vcb->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { + return (false); /* skip over private files */ + } else if (key->hfsPlus.parentID == vcb->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + return (false); /* skip over private files */ + } + + if ( (searchBits & SRCHFS_MATCHFILES) == 0 ) { /* If we are NOT searching files */ + matched = false; + goto TestDone; + } + break; + + default: /* Never match a thread record or any other type. */ + return( false ); /* Not a file or folder record, so can't search it */ + } + + matched = true; /* Assume we got a match */ + atleastone = false; /* Dont insert unless we match at least one criteria */ + + /* First, attempt to match the name -- either partial or complete */ + if ( attrList->commonattr & ATTR_CMN_NAME ) { + if (isHFSPlus) { + int case_sensitive = 0; + + /* + * Longstanding default behavior here is to use a non-case-sensitive + * search, even on case-sensitive filesystems. + * + * We only force case sensitivity if the controlling process has explicitly + * asked for it in the proc flags, and only if they are not doing + * a partial name match. Consider that if you are doing a partial + * name match ("all files that begin with 'image'"), the likelihood is + * high that you would want to see all matches, even those that do not + * explicitly match the case. + */ + if (force_case_sensitivity) { + case_sensitive = 1; + } + + /* Check for partial/full HFS Plus name match */ + + if ( searchBits & SRCHFS_MATCHPARTIALNAMES ) { + /* always use a case-INSENSITIVE search here */ + matched = ComparePartialUnicodeName(key->hfsPlus.nodeName.unicode, + key->hfsPlus.nodeName.length, + (UniChar*)searchInfo1->name, + searchInfo1->nameLength, 0); + } + else { + /* Full name match. Are we HFSX (case sensitive) or HFS+ ? */ + if (case_sensitive) { + matched = (UnicodeBinaryCompare(key->hfsPlus.nodeName.unicode, + key->hfsPlus.nodeName.length, + (UniChar*)searchInfo1->name, + searchInfo1->nameLength ) == 0); + } + else { + matched = (FastUnicodeCompare(key->hfsPlus.nodeName.unicode, + key->hfsPlus.nodeName.length, + (UniChar*)searchInfo1->name, + searchInfo1->nameLength ) == 0); + } + } + } +#if CONFIG_HFS_STD + else { + /* Check for partial/full HFS name match */ + + if ( searchBits & SRCHFS_MATCHPARTIALNAMES ) + matched = ComparePartialPascalName(key->hfs.nodeName, (u_char*)searchInfo1->name); + else /* full HFS name match */ + matched = (FastRelString(key->hfs.nodeName, (u_char*)searchInfo1->name) == 0); + } +#endif + + if ( matched == false || (searchBits & ~SRCHFS_MATCHPARTIALNAMES) == 0 ) + goto TestDone; /* no match, or nothing more to compare */ + + atleastone = true; + } + + /* Convert catalog record into cat_attr format. 
*/ + cat_convertattr(VCBTOHFS(vcb), rec, &c_attr, &datafork, &rsrcfork); + + if (searchBits & SRCHFS_SKIPINVISIBLE) { + int flags; + + switch (rec->recordType) { +#if CONFIG_HFS_STD + case kHFSFolderRecord: + { + struct FndrDirInfo *finder_info; + + finder_info = (struct FndrDirInfo *)&c_attr.ca_finderinfo[0]; + flags = SWAP_BE16(finder_info->frFlags); + break; + } + + case kHFSFileRecord: + { + struct FndrFileInfo *finder_info; + + finder_info = (struct FndrFileInfo *)&c_attr.ca_finderinfo[0]; + flags = SWAP_BE16(finder_info->fdFlags); + break; + } +#endif + + case kHFSPlusFolderRecord: + { + struct FndrDirInfo *finder_info; + + finder_info = (struct FndrDirInfo *)&c_attr.ca_finderinfo[0]; + flags = SWAP_BE16(finder_info->frFlags); + break; + } + + case kHFSPlusFileRecord: + { + struct FndrFileInfo *finder_info; + + finder_info = (struct FndrFileInfo *)&c_attr.ca_finderinfo[0]; + flags = SWAP_BE16(finder_info->fdFlags); + break; + } + + default: + { + flags = kIsInvisible; + break; + } + } + + if (flags & kIsInvisible) { + matched = false; + goto TestDone; + } + } + + + + /* Now that we have a record worth searching, see if it matches the search attributes */ +#if CONFIG_HFS_STD + if (rec->recordType == kHFSFileRecord || + rec->recordType == kHFSPlusFileRecord) { +#else + if (rec->recordType == kHFSPlusFileRecord) { +#endif + + if ((attrList->fileattr & ~ATTR_FILE_VALIDMASK) != 0) { /* attr we do know about */ + matched = false; + goto TestDone; + } + else if ((attrList->fileattr & ATTR_FILE_VALIDMASK) != 0) { + searchAttributes = attrList->fileattr; + +#if HFS_COMPRESSION + if ( c_attr.ca_flags & UF_COMPRESSED ) { + /* for compressed files, set the data length to the uncompressed data size */ + if (( searchAttributes & ATTR_FILE_DATALENGTH ) || + ( searchAttributes & ATTR_FILE_DATAALLOCSIZE ) ) { + if ( 0 == hfs_uncompressed_size_of_compressed_file(vcb, NULL, c_attr.ca_fileid, &datafork.cf_size, 1) ) { /* 1 == don't take the cnode lock */ + datafork.cf_blocks = rsrcfork.cf_blocks; + } + } + /* treat compressed files as if their resource fork is empty */ + if (( searchAttributes & ATTR_FILE_RSRCLENGTH ) || + ( searchAttributes & ATTR_FILE_RSRCALLOCSIZE ) ) { + rsrcfork.cf_size = 0; + rsrcfork.cf_blocks = 0; + } + } +#endif /* HFS_COMPRESSION */ + + /* File logical length (data fork) */ + if ( searchAttributes & ATTR_FILE_DATALENGTH ) { + matched = CompareWideRange( + datafork.cf_size, + searchInfo1->f.dataLogicalLength, + searchInfo2->f.dataLogicalLength); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* File physical length (data fork) */ + if ( searchAttributes & ATTR_FILE_DATAALLOCSIZE ) { + matched = CompareWideRange( + (u_int64_t)datafork.cf_blocks * (u_int64_t)vcb->blockSize, + searchInfo1->f.dataPhysicalLength, + searchInfo2->f.dataPhysicalLength); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* File logical length (resource fork) */ + if ( searchAttributes & ATTR_FILE_RSRCLENGTH ) { + matched = CompareWideRange( + rsrcfork.cf_size, + searchInfo1->f.resourceLogicalLength, + searchInfo2->f.resourceLogicalLength); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* File physical length (resource fork) */ + if ( searchAttributes & ATTR_FILE_RSRCALLOCSIZE ) { + matched = CompareWideRange( + (u_int64_t)rsrcfork.cf_blocks * (u_int64_t)vcb->blockSize, + searchInfo1->f.resourcePhysicalLength, + searchInfo2->f.resourcePhysicalLength); + if (matched == false) goto TestDone; + atleastone = true; + } + } + else { + atleastone = 
true; /* to match SRCHFS_MATCHFILES */ + } + } + /* + * Check the directory attributes + */ +#if CONFIG_HFS_STD + else if (rec->recordType == kHFSFolderRecord || + rec->recordType == kHFSPlusFolderRecord) { +#else + else if (rec->recordType == kHFSPlusFolderRecord) { +#endif + if ((attrList->dirattr & ~ATTR_DIR_VALIDMASK) != 0) { /* attr we do know about */ + matched = false; + goto TestDone; + } + else if ((attrList->dirattr & ATTR_DIR_VALIDMASK) != 0) { + searchAttributes = attrList->dirattr; + + /* Directory valence */ + if ( searchAttributes & ATTR_DIR_ENTRYCOUNT ) { + matched = CompareRange(c_attr.ca_entries, + searchInfo1->d.numFiles, + searchInfo2->d.numFiles ); + if (matched == false) goto TestDone; + atleastone = true; + } + } + else { + atleastone = true; /* to match SRCHFS_MATCHDIRS */ + } + } + + /* + * Check the common attributes + */ + searchAttributes = attrList->commonattr; + if ( (searchAttributes & ATTR_CMN_VALIDMASK) != 0 ) { + /* node ID */ + if ( searchAttributes & ATTR_CMN_OBJID ) { + matched = CompareRange(c_attr.ca_fileid, + searchInfo1->nodeID, + searchInfo2->nodeID ); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Parent ID */ + if ( searchAttributes & ATTR_CMN_PAROBJID ) { + HFSCatalogNodeID parentID; + + if (isHFSPlus) + parentID = key->hfsPlus.parentID; +#if CONFIG_HFS_STD + else + parentID = key->hfs.parentID; +#endif + + matched = CompareRange(parentID, searchInfo1->parentDirID, + searchInfo2->parentDirID ); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Finder Info & Extended Finder Info where extFinderInfo is last 32 bytes */ + if ( searchAttributes & ATTR_CMN_FNDRINFO ) { + u_int32_t *thisValue; + thisValue = (u_int32_t *) &c_attr.ca_finderinfo; + + /* + * Note: ioFlFndrInfo and ioDrUsrWds have the same offset in search info, so + * no need to test the object type here. 
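+ *
+ * In this call searchInfo1->finderInfo supplies the wanted bytes and
+ * searchInfo2->finderInfo supplies a bit mask, compared as eight
+ * 32-bit words; a zero mask word means "don't care" for that word.
+ * A hedged illustration (byte order and the exact packing of the
+ * 32-byte Finder info are not shown): to match only items whose
+ * first Finder-info word equals 'TEXT', userspace would set the
+ * first four bytes of params1's finderInfo to 'T','E','X','T', the
+ * first mask word in params2 to 0xFFFFFFFF, and every other mask
+ * word to 0.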
+ */ + matched = CompareMasked(thisValue, + (u_int32_t *)&searchInfo1->finderInfo, + (u_int32_t *) &searchInfo2->finderInfo, 8); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Create date */ + if ( searchAttributes & ATTR_CMN_CRTIME ) { + matched = CompareRange(c_attr.ca_itime, + searchInfo1->creationDate.tv_sec, + searchInfo2->creationDate.tv_sec); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Mod date */ + if ( searchAttributes & ATTR_CMN_MODTIME ) { + matched = CompareRange(c_attr.ca_mtime, + searchInfo1->modificationDate.tv_sec, + searchInfo2->modificationDate.tv_sec); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Change Time */ + if ( searchAttributes & ATTR_CMN_CHGTIME ) { + matched = CompareRange(c_attr.ca_ctime, + searchInfo1->changeDate.tv_sec, + searchInfo2->changeDate.tv_sec); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Access date */ + if ( searchAttributes & ATTR_CMN_ACCTIME ) { + matched = CompareRange(c_attr.ca_atime, + searchInfo1->accessDate.tv_sec, + searchInfo2->accessDate.tv_sec); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Backup date */ + if ( searchAttributes & ATTR_CMN_BKUPTIME ) { + matched = CompareRange(c_attr.ca_btime, + searchInfo1->lastBackupDate.tv_sec, + searchInfo2->lastBackupDate.tv_sec); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* User ID */ + if ( searchAttributes & ATTR_CMN_OWNERID ) { + matched = CompareRange(c_attr.ca_uid, + searchInfo1->uid, searchInfo2->uid); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* Group ID */ + if ( searchAttributes & ATTR_CMN_GRPID ) { + matched = CompareRange(c_attr.ca_gid, + searchInfo1->gid, searchInfo2->gid); + if (matched == false) goto TestDone; + atleastone = true; + } + + /* mode */ + if ( searchAttributes & ATTR_CMN_ACCESSMASK ) { + matched = CompareRange((u_int32_t)c_attr.ca_mode, + (u_int32_t)searchInfo1->mask, + (u_int32_t)searchInfo2->mask); + if (matched == false) goto TestDone; + atleastone = true; + } + } + + /* If we got here w/o matching any, then set to false */ + if (! atleastone) + matched = false; + +TestDone: + /* + * Finally, determine whether we need to negate the sense of the match + * (i.e. find all objects that DON'T match). + */ + if ( searchBits & SRCHFS_NEGATEPARAMS ) + matched = !matched; + + return( matched ); +} + + +/* + * Adds another record to the packed array for output + */ +static int +InsertMatch(struct hfsmount *hfsmp, uio_t a_uio, CatalogRecord *rec, + CatalogKey *key, struct attrlist *returnAttrList, + void *attributesBuffer, void *variableBuffer, uint32_t * nummatches) +{ + int err; + void *rovingAttributesBuffer; + void *rovingVariableBuffer; + long packedBufferSize; + struct attrblock attrblk; + struct cat_desc c_desc; + struct cat_attr c_attr; + struct cat_fork datafork; + struct cat_fork rsrcfork; + + bzero(&c_desc, sizeof(c_desc)); + bzero(&c_attr, sizeof(c_attr)); + rovingAttributesBuffer = (char*)attributesBuffer + sizeof(u_int32_t); /* Reserve space for length field */ + rovingVariableBuffer = variableBuffer; + + /* Convert catalog record into cat_attr format. 
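+ *
+ * Each match is copied out as one self-describing entry assembled
+ * below; the layout is
+ *
+ *	+------------------+-----------------------+------------------+
+ *	| u_int32_t length | fixed-size attributes | variable-length  |
+ *	| (whole entry)    | (hfs_attrblksize())   | data, e.g. name  |
+ *	+------------------+-----------------------+------------------+
+ *
+ * so a reader walks the output buffer by reading each leading length
+ * word and skipping forward that many bytes.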
*/ + cat_convertattr(hfsmp, rec, &c_attr, &datafork, &rsrcfork); + + /* Hide our private meta data directories */ + if (c_attr.ca_fileid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + c_attr.ca_fileid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + err = 0; + goto exit; + } + + /* Hide the private journal files */ + if (hfsmp->jnl && + ((c_attr.ca_fileid == hfsmp->hfs_jnlfileid) || + (c_attr.ca_fileid == hfsmp->hfs_jnlinfoblkid))) { + err = 0; + goto exit; + } + + if (returnAttrList->commonattr & ATTR_CMN_NAME) { + err = cat_convertkey(hfsmp, key, rec, &c_desc); + if (err) { + /* This means that we probably had a CNID error */ + goto exit; + } + } else { + c_desc.cd_cnid = c_attr.ca_fileid; + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) + c_desc.cd_parentcnid = key->hfsPlus.parentID; +#if CONFIG_HFS_STD + else + c_desc.cd_parentcnid = key->hfs.parentID; +#endif + + } + + attrblk.ab_attrlist = returnAttrList; + attrblk.ab_attrbufpp = &rovingAttributesBuffer; + attrblk.ab_varbufpp = &rovingVariableBuffer; + attrblk.ab_flags = 0; + attrblk.ab_blocksize = 0; + attrblk.ab_context = vfs_context_current(); + + hfs_packattrblk(&attrblk, hfsmp, NULL, &c_desc, &c_attr, &datafork, &rsrcfork, vfs_context_current()); + + packedBufferSize = (char*)rovingVariableBuffer - (char*)attributesBuffer; + + if ( packedBufferSize > uio_resid(a_uio) ) + return( errSearchBufferFull ); + + (* nummatches)++; + + *((u_int32_t *)attributesBuffer) = packedBufferSize; /* Store length of fixed + var block */ + + err = uiomove( (caddr_t)attributesBuffer, packedBufferSize, a_uio ); +exit: + cat_releasedesc(&c_desc); + + return( err ); +} + + +static int +UnpackSearchAttributeBlock( struct hfsmount *hfsmp, struct attrlist *alist, + searchinfospec_t *searchInfo, void *attributeBuffer, int firstblock) +{ + attrgroup_t a; + u_int32_t bufferSize; + boolean_t is_64_bit; + + hfs_assert(searchInfo != NULL); + + is_64_bit = proc_is64bit(current_proc()); + + bufferSize = *((u_int32_t *)attributeBuffer); + if (bufferSize == 0) + return (EINVAL); /* XXX -DJB is a buffer size of zero ever valid for searchfs? 
*/ + + attributeBuffer = (u_int32_t *)attributeBuffer + 1; /* advance past the size */ + + /* + * UnPack common attributes + */ + a = alist->commonattr; + if ( a != 0 ) { + if ( a & ATTR_CMN_NAME ) { + if (firstblock) { + /* Only use the attrreference_t for the first searchparams */ + char *s; + u_int32_t len; + + s = (char*) attributeBuffer + ((attrreference_t *) attributeBuffer)->attr_dataoffset; + len = ((attrreference_t *) attributeBuffer)->attr_length; + + if (len > sizeof(searchInfo->name)) + return (EINVAL); + + + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { + size_t ucslen; + /* Convert name to Unicode to match HFS Plus B-Tree names */ + + if (len > 0) { + if (utf8_decodestr((u_int8_t *)s, len-1, (UniChar*)searchInfo->name, &ucslen, + sizeof(searchInfo->name), ':', UTF_DECOMPOSED | UTF_ESCAPE_ILLEGAL)) + return (EINVAL); + + searchInfo->nameLength = ucslen / sizeof(UniChar); + } else { + searchInfo->nameLength = 0; + } + } +#if CONFIG_HFS_STD + else { + /* Convert name to pascal string to match HFS (Standard) B-Tree names */ + + if (len > 0) { + if (utf8_to_hfs(HFSTOVCB(hfsmp), len-1, (u_char *)s, (u_char*)searchInfo->name) != 0) + return (EINVAL); + + searchInfo->nameLength = searchInfo->name[0]; + } else { + searchInfo->name[0] = searchInfo->nameLength = 0; + } + } +#endif + } + attributeBuffer = (attrreference_t*) attributeBuffer +1; + } + if ( a & ATTR_CMN_OBJID ) { + searchInfo->nodeID = ((fsobj_id_t *) attributeBuffer)->fid_objno; /* ignore fid_generation */ + attributeBuffer = (fsobj_id_t *)attributeBuffer + 1; + } + if ( a & ATTR_CMN_PAROBJID ) { + searchInfo->parentDirID = ((fsobj_id_t *) attributeBuffer)->fid_objno; /* ignore fid_generation */ + attributeBuffer = (fsobj_id_t *)attributeBuffer + 1; + } + + if ( a & ATTR_CMN_CRTIME ) { + if (is_64_bit) { + struct user64_timespec tmp; + tmp = *((struct user64_timespec *)attributeBuffer); + searchInfo->creationDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->creationDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; + } + else { + struct user32_timespec tmp; + tmp = *((struct user32_timespec *)attributeBuffer); + searchInfo->creationDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->creationDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; + } + } + if ( a & ATTR_CMN_MODTIME ) { + if (is_64_bit) { + struct user64_timespec tmp; + tmp = *((struct user64_timespec *)attributeBuffer); + searchInfo->modificationDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->modificationDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; + } + else { + struct user32_timespec tmp; + tmp = *((struct user32_timespec *)attributeBuffer); + searchInfo->modificationDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->modificationDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; + } + } + if ( a & ATTR_CMN_CHGTIME ) { + if (is_64_bit) { + struct user64_timespec tmp; + tmp = *((struct user64_timespec *)attributeBuffer); + searchInfo->changeDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->changeDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; + } + else { + struct user32_timespec tmp; + tmp = *((struct user32_timespec *)attributeBuffer); + searchInfo->changeDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->changeDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; + } + } + if ( a & ATTR_CMN_ACCTIME ) { + if 
(is_64_bit) { + struct user64_timespec tmp; + tmp = *((struct user64_timespec *)attributeBuffer); + searchInfo->accessDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->accessDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; + } + else { + struct user32_timespec tmp; + tmp = *((struct user32_timespec *)attributeBuffer); + searchInfo->accessDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->accessDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; + } + } + if ( a & ATTR_CMN_BKUPTIME ) { + if (is_64_bit) { + struct user64_timespec tmp; + tmp = *((struct user64_timespec *)attributeBuffer); + searchInfo->lastBackupDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->lastBackupDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user64_timespec *)attributeBuffer + 1; + } + else { + struct user32_timespec tmp; + tmp = *((struct user32_timespec *)attributeBuffer); + searchInfo->lastBackupDate.tv_sec = (time_t)tmp.tv_sec; + searchInfo->lastBackupDate.tv_nsec = tmp.tv_nsec; + attributeBuffer = (struct user32_timespec *)attributeBuffer + 1; + } + } + if ( a & ATTR_CMN_FNDRINFO ) { + bcopy( attributeBuffer, searchInfo->finderInfo, sizeof(searchInfo->finderInfo) ); + attributeBuffer = (u_int8_t *)attributeBuffer + 32; + } + if ( a & ATTR_CMN_OWNERID ) { + searchInfo->uid = *((uid_t *)attributeBuffer); + attributeBuffer = (uid_t *)attributeBuffer + 1; + } + if ( a & ATTR_CMN_GRPID ) { + searchInfo->gid = *((gid_t *)attributeBuffer); + attributeBuffer = (gid_t *)attributeBuffer + 1; + } + if ( a & ATTR_CMN_ACCESSMASK ) { + searchInfo->mask = *((mode_t *)attributeBuffer); + attributeBuffer = (mode_t *)attributeBuffer + 1; + } + if ( a & ATTR_CMN_FILEID ) { + searchInfo->nodeID = (u_int32_t)*((u_int64_t *) attributeBuffer); + attributeBuffer = (u_int64_t *)attributeBuffer + 1; + } + if ( a & ATTR_CMN_PARENTID ) { + searchInfo->parentDirID = (u_int32_t)*((u_int64_t *) attributeBuffer); + attributeBuffer = (u_int64_t *)attributeBuffer + 1; + } + } + + a = alist->dirattr; + if ( a != 0 ) { + if ( a & ATTR_DIR_ENTRYCOUNT ) { + searchInfo->d.numFiles = *((u_int32_t *)attributeBuffer); + attributeBuffer = (u_int32_t *)attributeBuffer + 1; + } + } + + a = alist->fileattr; + if ( a != 0 ) { + if ( a & ATTR_FILE_DATALENGTH ) { + searchInfo->f.dataLogicalLength = *((off_t *)attributeBuffer); + attributeBuffer = (off_t *)attributeBuffer + 1; + } + if ( a & ATTR_FILE_DATAALLOCSIZE ) { + searchInfo->f.dataPhysicalLength = *((off_t *)attributeBuffer); + attributeBuffer = (off_t *)attributeBuffer + 1; + } + if ( a & ATTR_FILE_RSRCLENGTH ) { + searchInfo->f.resourceLogicalLength = *((off_t *)attributeBuffer); + attributeBuffer = (off_t *)attributeBuffer + 1; + } + if ( a & ATTR_FILE_RSRCALLOCSIZE ) { + searchInfo->f.resourcePhysicalLength = *((off_t *)attributeBuffer); + attributeBuffer = (off_t *)attributeBuffer + 1; + } + } + + return (0); +} +#endif /* CONFIG_SEARCHFS */ diff --git a/core/hfs_unistr.h b/core/hfs_unistr.h new file mode 100644 index 0000000..5b300a2 --- /dev/null +++ b/core/hfs_unistr.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2013 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __HFS_UNISTR__ +#define __HFS_UNISTR__ + +#include + +/* + * hfs_unitstr.h + * + * This file contains definition of the unicode string used for HFS Plus + * files and folder names, as described by the on-disk format. + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifndef _HFSUNISTR255_DEFINED_ +#define _HFSUNISTR255_DEFINED_ +/* Unicode strings are used for HFS Plus file and folder names */ +struct HFSUniStr255 { + u_int16_t length; /* number of unicode characters */ + u_int16_t unicode[255]; /* unicode characters */ +} __attribute__((aligned(2), packed)); +typedef struct HFSUniStr255 HFSUniStr255; +typedef const HFSUniStr255 *ConstHFSUniStr255Param; +#endif /* _HFSUNISTR255_DEFINED_ */ + + +#ifdef __cplusplus +} +#endif + + +#endif /* __HFS_UNISTR__ */ diff --git a/core/hfs_vfsops.c b/core/hfs_vfsops.c new file mode 100644 index 0000000..6ceab54 --- /dev/null +++ b/core/hfs_vfsops.c @@ -0,0 +1,4751 @@ +/* + * Copyright (c) 1999-2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * hfs_vfsops.c + * derived from @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95 + * + * (c) Copyright 1997-2002 Apple Inc. All rights reserved. + * + * hfs_vfsops.c -- VFS layer for loadable HFS file system. 
+ * + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* for parsing boot-args */ +#include + + +#include + +#include "hfs_journal.h" + +#include +#include "hfs_mount.h" + +#include +#include + +#include "hfs_iokit.h" +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" +#include "hfs_dbg.h" +#include "hfs_endian.h" +#include "hfs_hotfiles.h" +#include "hfs_quota.h" +#include "hfs_btreeio.h" +#include "hfs_kdebug.h" +#include "hfs_cprotect.h" + +#include "FileMgrInternal.h" +#include "BTreesInternal.h" + +#define HFS_MOUNT_DEBUG 1 + +/* Enable/disable debugging code for live volume resizing, defined in hfs_resize.c */ +extern int hfs_resize_debug; + +lck_grp_attr_t * hfs_group_attr; +lck_attr_t * hfs_lock_attr; +lck_grp_t * hfs_mutex_group; +lck_grp_t * hfs_rwlock_group; +lck_grp_t * hfs_spinlock_group; + +// variables to manage HFS kext retain count -- only supported on Macs +#if TARGET_OS_OSX +int hfs_active_mounts = 0; +#endif + +extern struct vnodeopv_desc hfs_vnodeop_opv_desc; + +#if CONFIG_HFS_STD +extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; +static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush); +#endif + +/* not static so we can re-use in hfs_readwrite.c for build_path calls */ +int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context); + +static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args); +static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context); +static int hfs_flushfiles(struct mount *, int, struct proc *); +static int hfs_init(struct vfsconf *vfsp); +static void hfs_locks_destroy(struct hfsmount *hfsmp); +static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context); +static int hfs_start(struct mount *mp, int flags, vfs_context_t context); +static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context); +static void hfs_syncer_free(struct hfsmount *hfsmp); + +void hfs_initialize_allocator (struct hfsmount *hfsmp); +int hfs_teardown_allocator (struct hfsmount *hfsmp); + +int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); +int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context); +int hfs_reload(struct mount *mp); +int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context); +int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context); +int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context); +int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); + +static int hfs_journal_replay(vnode_t devvp, vfs_context_t context); + +#if HFS_LEAK_DEBUG +#include +#endif + +/* + * VFS Operations. 
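+ *
+ * These entry points are exported to the VFS layer through a
+ * struct vfsops table.  A hedged sketch of that wiring (the actual
+ * table may name additional operations):
+ *
+ *	struct vfsops hfs_vfsops = {
+ *		.vfs_mount   = hfs_mount,
+ *		.vfs_start   = hfs_start,
+ *		.vfs_unmount = hfs_unmount,
+ *		.vfs_sync    = hfs_sync,
+ *		.vfs_vget    = hfs_vfs_vget,
+ *		.vfs_fhtovp  = hfs_fhtovp,
+ *		.vfs_vptofh  = hfs_vptofh,
+ *		.vfs_init    = hfs_init,
+ *		.vfs_sysctl  = hfs_sysctl,
+ *	};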
+ * + * mount system call + */ + +int +hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context) +{ + +#if HFS_LEAK_DEBUG + +#warning HFS_LEAK_DEBUG is on + + hfs_alloc_trace_enable(); + +#endif + + struct proc *p = vfs_context_proc(context); + struct hfsmount *hfsmp = NULL; + struct hfs_mount_args args; + int retval = E_NONE; + u_int32_t cmdflags; + + if (data && (retval = copyin(data, (caddr_t)&args, sizeof(args)))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: copyin returned %d for fs\n", retval); + } + return (retval); + } + cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS; + if (cmdflags & MNT_UPDATE) { + hfs_assert(data); + + hfsmp = VFSTOHFS(mp); + + /* Reload incore data after an fsck. */ + if (cmdflags & MNT_RELOAD) { + if (vfs_isrdonly(mp)) { + int error = hfs_reload(mp); + if (error && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN); + } + return error; + } + else { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN); + } + return (EINVAL); + } + } + + /* Change to a read-only file system. */ + if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && + vfs_isrdonly(mp)) { + int flags; + + /* Set flag to indicate that a downgrade to read-only + * is in progress and therefore block any further + * modifications to the file system. + */ + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_thread = current_thread(); + hfs_unlock_global (hfsmp); + hfs_syncer_free(hfsmp); + + /* use hfs_sync to push out System (btree) files */ + retval = hfs_sync(mp, MNT_WAIT, context); + if (retval && ((cmdflags & MNT_FORCE) == 0)) { + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_thread = NULL; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN); + } + goto out; + } + + flags = WRITECLOSE; + if (cmdflags & MNT_FORCE) + flags |= FORCECLOSE; + + if ((retval = hfs_flushfiles(mp, flags, p))) { + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_thread = NULL; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN); + } + goto out; + } + + /* mark the volume cleanly unmounted */ + hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask; + retval = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); + hfsmp->hfs_flags |= HFS_READ_ONLY; + + /* + * Close down the journal. + * + * NOTE: It is critically important to close down the journal + * and have it issue all pending I/O prior to calling VNOP_FSYNC below. + * In a journaled environment it is expected that the journal be + * the only actor permitted to issue I/O for metadata blocks in HFS. + * If we were to call VNOP_FSYNC prior to closing down the journal, + * we would inadvertantly issue (and wait for) the I/O we just + * initiated above as part of the flushvolumeheader call. + * + * To avoid this, we follow the same order of operations as in + * unmount and issue the journal_close prior to calling VNOP_FSYNC. + */ + + if (hfsmp->jnl) { + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + + journal_close(hfsmp->jnl); + hfsmp->jnl = NULL; + + // Note: we explicitly don't want to shutdown + // access to the jvp because we may need + // it later if we go back to being read-write. 
+ + hfs_unlock_global (hfsmp); + + vfs_clearflags(hfsmp->hfs_mp, MNT_JOURNALED); + } + + /* + * Write out any pending I/O still outstanding against the device node + * now that the journal has been closed. + */ + if (retval == 0) { + vnode_get(hfsmp->hfs_devvp); + retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context); + vnode_put(hfsmp->hfs_devvp); + } + + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN); + } + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_thread = NULL; + hfsmp->hfs_flags &= ~HFS_READ_ONLY; + goto out; + } + + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + if (hfsmp->hfs_summary_table) { + int err = 0; + /* + * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress + */ + if (hfsmp->hfs_allocation_vp) { + err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + } + hfs_free(hfsmp->hfs_summary_table, hfsmp->hfs_summary_bytes); + hfsmp->hfs_summary_table = NULL; + hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE; + if (err == 0 && hfsmp->hfs_allocation_vp){ + hfs_unlock (VTOC(hfsmp->hfs_allocation_vp)); + } + } + } + + hfsmp->hfs_downgrading_thread = NULL; + } + + /* Change to a writable file system. */ + if (vfs_iswriteupgrade(mp)) { + /* + * On inconsistent disks, do not allow read-write mount + * unless it is the boot volume being mounted. + */ + if (!(vfs_flags(mp) & MNT_ROOTFS) && + (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n", (hfsmp->vcbVN)); + } + retval = EINVAL; + goto out; + } + + // If the journal was shut-down previously because we were + // asked to be read-only, let's start it back up again now + + if ( (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) + && hfsmp->jnl == NULL + && hfsmp->jvp != NULL) { + int jflags; + + if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) { + jflags = JOURNAL_RESET; + } else { + jflags = 0; + } + + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + + /* We provide the mount point twice here: The first is used as + * an opaque argument to be passed back when hfs_sync_metadata + * is called. The second is provided to the throttling code to + * indicate which mount's device should be used when accounting + * for metadata writes. + */ + hfsmp->jnl = journal_open(hfsmp->jvp, + hfs_blk_to_bytes(hfsmp->jnl_start, HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, + hfsmp->jnl_size, + hfsmp->hfs_devvp, + hfsmp->hfs_logical_block_size, + jflags, + 0, + hfs_sync_metadata, hfsmp->hfs_mp, + hfsmp->hfs_mp); + + /* + * Set up the trim callback function so that we can add + * recently freed extents to the free extent cache once + * the transaction that freed them is written to the + * journal on disk. + */ + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); + + hfs_unlock_global (hfsmp); + + if (hfsmp->jnl == NULL) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN)); + } + retval = EINVAL; + goto out; + } else { + hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; + vfs_setflags(hfsmp->hfs_mp, MNT_JOURNALED); + } + } + + /* See if we need to erase unused Catalog nodes due to . 
*/ + retval = hfs_erase_unused_nodes(hfsmp); + if (retval != E_NONE) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN); + } + goto out; + } + + /* If this mount point was downgraded from read-write + * to read-only, clear that information as we are now + * moving back to read-write. + */ + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_thread = NULL; + + /* mark the volume dirty (clear clean unmount bit) */ + hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask; + + retval = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); + if (retval != E_NONE) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN); + } + goto out; + } + + /* Only clear HFS_READ_ONLY after a successful write */ + hfsmp->hfs_flags &= ~HFS_READ_ONLY; + + + if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) { + /* Setup private/hidden directories for hardlinks. */ + hfs_privatedir_init(hfsmp, FILE_HARDLINKS); + hfs_privatedir_init(hfsmp, DIR_HARDLINKS); + + hfs_remove_orphans(hfsmp); + + /* + * Since we're upgrading to a read-write mount, allow + * hot file clustering if conditions allow. + * + * Note: this normally only would happen if you booted + * single-user and upgraded the mount to read-write + * + * Note: at this point we are not allowed to fail the + * mount operation because the HotFile init code + * in hfs_recording_init() will lookup vnodes with + * VNOP_LOOKUP() which hangs vnodes off the mount + * (and if we were to fail, VFS is not prepared to + * clean that up at this point. Since HotFiles are + * optional, this is not a big deal. + */ + if (ISSET(hfsmp->hfs_flags, HFS_METADATA_ZONE) + && (!ISSET(hfsmp->hfs_flags, HFS_SSD) + || ISSET(hfsmp->hfs_flags, HFS_CS_HOTFILE_PIN))) { + hfs_recording_init(hfsmp); + } + /* Force ACLs on HFS+ file systems. */ + if (vfs_extendedsecurity(HFSTOVFS(hfsmp)) == 0) { + vfs_setextendedsecurity(HFSTOVFS(hfsmp)); + } + } + } + + /* Update file system parameters. */ + retval = hfs_changefs(mp, &args); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN); + } + + } else /* not an update request */ { + if (devvp == NULL) { + retval = EINVAL; + goto out; + } + /* Set the mount flag to indicate that we support volfs */ + vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS)); + + retval = hfs_mountfs(devvp, mp, data ? &args : NULL, 0, context); + if (retval) { + const char *name = vnode_getname(devvp); + printf("hfs_mount: hfs_mountfs returned error=%d for device %s\n", retval, (name ? name : "unknown-dev")); + if (name) { + vnode_putname(name); + } + goto out; + } + + /* After hfs_mountfs succeeds, we should have valid hfsmp */ + hfsmp = VFSTOHFS(mp); + + /* Set up the maximum defrag file size */ + hfsmp->hfs_defrag_max = HFS_INITIAL_DEFRAG_SIZE; + + + if (!data) { + // Root mount + + hfsmp->hfs_uid = UNKNOWNUID; + hfsmp->hfs_gid = UNKNOWNGID; + hfsmp->hfs_dir_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */ + hfsmp->hfs_file_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */ + + /* Establish the free block reserve. 
*/ + hfsmp->reserveBlocks = ((u_int64_t)hfsmp->totalBlocks * HFS_MINFREE) / 100; + hfsmp->reserveBlocks = MIN(hfsmp->reserveBlocks, HFS_MAXRESERVE / hfsmp->blockSize); + } +#if TARGET_OS_OSX + // increment kext retain count + OSIncrementAtomic(&hfs_active_mounts); + OSKextRetainKextWithLoadTag(OSKextGetCurrentLoadTag()); + if (hfs_active_mounts <= 0 && panic_on_assert) + panic("hfs_mount: error - kext resource count is non-positive: %d but at least one active mount\n", hfs_active_mounts); +#endif + } + +out: + if (retval == 0) { + (void)hfs_statfs(mp, vfs_statfs(mp), context); + } + return (retval); +} + + +struct hfs_changefs_cargs { + struct hfsmount *hfsmp; + int namefix; + int permfix; + int permswitch; +}; + +static int +hfs_changefs_callback(struct vnode *vp, void *cargs) +{ + ExtendedVCB *vcb; + struct cnode *cp; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct hfs_changefs_cargs *args; + int lockflags; + int error; + + args = (struct hfs_changefs_cargs *)cargs; + + cp = VTOC(vp); + vcb = HFSTOVCB(args->hfsmp); + + lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_lookup(args->hfsmp, &cp->c_desc, 0, 0, &cndesc, &cnattr, NULL, NULL); + hfs_systemfile_unlock(args->hfsmp, lockflags); + if (error) { + /* + * If we couldn't find this guy skip to the next one + */ + if (args->namefix) + cache_purge(vp); + + return (VNODE_RETURNED); + } + /* + * Get the real uid/gid and perm mask from disk. + */ + if (args->permswitch || args->permfix) { + cp->c_uid = cnattr.ca_uid; + cp->c_gid = cnattr.ca_gid; + cp->c_mode = cnattr.ca_mode; + } + /* + * If we're switching name converters then... + * Remove the existing entry from the namei cache. + * Update name to one based on new encoder. + */ + if (args->namefix) { + cache_purge(vp); + replace_desc(cp, &cndesc); + + if (cndesc.cd_cnid == kHFSRootFolderID) { + strlcpy((char *)vcb->vcbVN, (const char *)cp->c_desc.cd_nameptr, NAME_MAX+1); + cp->c_desc.cd_encoding = args->hfsmp->hfs_encoding; + } + } else { + cat_releasedesc(&cndesc); + } + return (VNODE_RETURNED); +} + +/* Change fs mount parameters */ +static int +hfs_changefs(struct mount *mp, struct hfs_mount_args *args) +{ + int retval = 0; + int namefix, permfix, permswitch; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + struct hfs_changefs_cargs cargs; + u_int32_t mount_flags; + +#if CONFIG_HFS_STD + u_int32_t old_encoding = 0; + hfs_to_unicode_func_t get_unicode_func; + unicode_to_hfs_func_t get_hfsname_func = NULL; +#endif + + hfsmp = VFSTOHFS(mp); + vcb = HFSTOVCB(hfsmp); + mount_flags = (unsigned int)vfs_flags(mp); + + hfsmp->hfs_flags |= HFS_IN_CHANGEFS; + + permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) && + ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) || + (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) && + (mount_flags & MNT_UNKNOWNPERMISSIONS))); + + /* The root filesystem must operate with actual permissions: */ + if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) { + vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS)); /* Just say "No". */ + retval = EINVAL; + goto exit; + } + if (mount_flags & MNT_UNKNOWNPERMISSIONS) + hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; + else + hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS; + + namefix = permfix = 0; + + /* + * Tracking of hot files requires up-to-date access times. So if + * access time updates are disabled, we must also disable hot files. 
+ */ + if (mount_flags & MNT_NOATIME) { + (void) hfs_recording_suspend(hfsmp); + } + + /* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */ + if (args->hfs_timezone.tz_minuteswest != VNOVAL) { + gTimeZone = args->hfs_timezone; + } + + /* Change the default uid, gid and/or mask */ + if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) { + hfsmp->hfs_uid = args->hfs_uid; + if (vcb->vcbSigWord == kHFSPlusSigWord) + ++permfix; + } + if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) { + hfsmp->hfs_gid = args->hfs_gid; + if (vcb->vcbSigWord == kHFSPlusSigWord) + ++permfix; + } + if (args->hfs_mask != (mode_t)VNOVAL) { + if (hfsmp->hfs_dir_mask != (args->hfs_mask & ALLPERMS)) { + hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS; + hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS; + if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES)) + hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE); + if (vcb->vcbSigWord == kHFSPlusSigWord) + ++permfix; + } + } + +#if CONFIG_HFS_STD + /* Change the hfs encoding value (hfs only) */ + if ((vcb->vcbSigWord == kHFSSigWord) && + (args->hfs_encoding != (u_int32_t)VNOVAL) && + (hfsmp->hfs_encoding != args->hfs_encoding)) { + + retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func); + if (retval) + goto exit; + + /* + * Connect the new hfs_get_unicode converter but leave + * the old hfs_get_hfsname converter in place so that + * we can lookup existing vnodes to get their correctly + * encoded names. + * + * When we're all finished, we can then connect the new + * hfs_get_hfsname converter and release our interest + * in the old converters. + */ + hfsmp->hfs_get_unicode = get_unicode_func; + old_encoding = hfsmp->hfs_encoding; + hfsmp->hfs_encoding = args->hfs_encoding; + ++namefix; + } +#endif + + if (!(namefix || permfix || permswitch)) + goto exit; + + /* XXX 3762912 hack to support HFS filesystem 'owner' */ + if (permfix) { + vfs_setowner(mp, + hfsmp->hfs_uid == UNKNOWNUID ? KAUTH_UID_NONE : hfsmp->hfs_uid, + hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid); + } + + /* + * For each active vnode fix things that changed + * + * Note that we can visit a vnode more than once + * and we can race with fsync. + * + * hfs_changefs_callback will be called for each vnode + * hung off of this mount point + * + * The vnode will be properly referenced and unreferenced + * around the callback + */ + cargs.hfsmp = hfsmp; + cargs.namefix = namefix; + cargs.permfix = permfix; + cargs.permswitch = permswitch; + + vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs); + +#if CONFIG_HFS_STD + /* + * If we're switching name converters we can now + * connect the new hfs_get_hfsname converter and + * release our interest in the old converters. 
+ */ + if (namefix) { + /* HFS standard only */ + hfsmp->hfs_get_hfsname = get_hfsname_func; + vcb->volumeNameEncodingHint = args->hfs_encoding; + (void) hfs_relconverter(old_encoding); + } +#endif + +exit: + hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS; + return (retval); +} + + +struct hfs_reload_cargs { + struct hfsmount *hfsmp; + int error; +}; + +static int +hfs_reload_callback(struct vnode *vp, void *cargs) +{ + struct cnode *cp; + struct hfs_reload_cargs *args; + int lockflags; + + args = (struct hfs_reload_cargs *)cargs; + /* + * flush all the buffers associated with this node + */ + (void) buf_invalidateblks(vp, 0, 0, 0); + + cp = VTOC(vp); + /* + * Remove any directory hints + */ + if (vnode_isdir(vp)) + hfs_reldirhints(cp, 0); + + /* + * Re-read cnode data for all active vnodes (non-metadata files). + */ + if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) { + struct cat_fork *datafork; + struct cat_desc desc; + + datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL; + + /* lookup by fileID since name could have changed */ + lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + args->error = cat_idlookup(args->hfsmp, cp->c_fileid, 0, 0, &desc, &cp->c_attr, datafork); + hfs_systemfile_unlock(args->hfsmp, lockflags); + if (args->error) { + return (VNODE_RETURNED_DONE); + } + + /* update cnode's catalog descriptor */ + (void) replace_desc(cp, &desc); + } + return (VNODE_RETURNED); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). The filesystem must + * be mounted read-only. + * + * Things to do to update the mount: + * invalidate all cached meta-data. + * invalidate all inactive vnodes. + * invalidate all cached file data. + * re-read volume header from disk. + * re-load meta-file info (extents, file size). + * re-load B-tree header data. + * re-read cnode data for all active vnodes. + */ +int +hfs_reload(struct mount *mountp) +{ + register struct vnode *devvp; + struct buf *bp; + int error, i; + struct hfsmount *hfsmp; + struct HFSPlusVolumeHeader *vhp; + ExtendedVCB *vcb; + struct filefork *forkp; + struct cat_desc cndesc; + struct hfs_reload_cargs args; + daddr64_t priIDSector; + + hfsmp = VFSTOHFS(mountp); + vcb = HFSTOVCB(hfsmp); + + if (vcb->vcbSigWord == kHFSSigWord) + return (EINVAL); /* rooting from HFS is not supported! */ + + /* + * Invalidate all cached meta-data. + */ + devvp = hfsmp->hfs_devvp; + if (buf_invalidateblks(devvp, 0, 0, 0)) + panic("hfs_reload: dirty1"); + + args.hfsmp = hfsmp; + args.error = 0; + /* + * hfs_reload_callback will be called for each vnode + * hung off of this mount point that can't be recycled... + * vnode_iterate will recycle those that it can (the VNODE_RELOAD option) + * the vnode will be in an 'unbusy' state (VNODE_WAIT) and + * properly referenced and unreferenced around the callback + */ + vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args); + + if (args.error) + return (args.error); + + /* + * Re-read VolumeHeader from disk. 
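+	 * For reference: the HFS+ volume header lives at byte offset 1024
+	 * from the start of the volume, so on a pure HFS+ volume
+	 * (hfsPlusIOPosOffset == 0) with 512-byte logical blocks the
+	 * primary header is logical sector 2; hfsPlusIOPosOffset is only
+	 * non-zero for volumes embedded inside an HFS wrapper.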
+ */ + priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); + + error = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); + if (error) { + if (bp != NULL) + buf_brelse(bp); + return (error); + } + + vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); + + /* Do a quick sanity check */ + if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && + SWAP_BE16(vhp->signature) != kHFSXSigWord) || + (SWAP_BE16(vhp->version) != kHFSPlusVersion && + SWAP_BE16(vhp->version) != kHFSXVersion) || + SWAP_BE32(vhp->blockSize) != vcb->blockSize) { + buf_brelse(bp); + return (EIO); + } + + vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); + vcb->vcbAtrb = SWAP_BE32 (vhp->attributes); + vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); + vcb->vcbClpSiz = SWAP_BE32 (vhp->rsrcClumpSize); + vcb->vcbNxtCNID = SWAP_BE32 (vhp->nextCatalogID); + vcb->vcbVolBkUp = to_bsd_time(SWAP_BE32(vhp->backupDate)); + vcb->vcbWrCnt = SWAP_BE32 (vhp->writeCount); + vcb->vcbFilCnt = SWAP_BE32 (vhp->fileCount); + vcb->vcbDirCnt = SWAP_BE32 (vhp->folderCount); + HFS_UPDATE_NEXT_ALLOCATION(vcb, SWAP_BE32 (vhp->nextAllocation)); + vcb->totalBlocks = SWAP_BE32 (vhp->totalBlocks); + vcb->freeBlocks = SWAP_BE32 (vhp->freeBlocks); + vcb->encodingsBitmap = SWAP_BE64 (vhp->encodingsBitmap); + bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo)); + vcb->localCreateDate = SWAP_BE32 (vhp->createDate); /* hfs+ create date is in local time */ + + /* + * Re-load meta-file vnode data (extent info, file size, etc). + */ + forkp = VTOF((struct vnode *)vcb->extentsRefNum); + for (i = 0; i < kHFSPlusExtentDensity; i++) { + forkp->ff_extents[i].startBlock = + SWAP_BE32 (vhp->extentsFile.extents[i].startBlock); + forkp->ff_extents[i].blockCount = + SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); + } + forkp->ff_size = SWAP_BE64 (vhp->extentsFile.logicalSize); + forkp->ff_blocks = SWAP_BE32 (vhp->extentsFile.totalBlocks); + forkp->ff_clumpsize = SWAP_BE32 (vhp->extentsFile.clumpSize); + + + forkp = VTOF((struct vnode *)vcb->catalogRefNum); + for (i = 0; i < kHFSPlusExtentDensity; i++) { + forkp->ff_extents[i].startBlock = + SWAP_BE32 (vhp->catalogFile.extents[i].startBlock); + forkp->ff_extents[i].blockCount = + SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); + } + forkp->ff_size = SWAP_BE64 (vhp->catalogFile.logicalSize); + forkp->ff_blocks = SWAP_BE32 (vhp->catalogFile.totalBlocks); + forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize); + + if (hfsmp->hfs_attribute_vp) { + forkp = VTOF(hfsmp->hfs_attribute_vp); + for (i = 0; i < kHFSPlusExtentDensity; i++) { + forkp->ff_extents[i].startBlock = + SWAP_BE32 (vhp->attributesFile.extents[i].startBlock); + forkp->ff_extents[i].blockCount = + SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); + } + forkp->ff_size = SWAP_BE64 (vhp->attributesFile.logicalSize); + forkp->ff_blocks = SWAP_BE32 (vhp->attributesFile.totalBlocks); + forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize); + } + + forkp = VTOF((struct vnode *)vcb->allocationsRefNum); + for (i = 0; i < kHFSPlusExtentDensity; i++) { + forkp->ff_extents[i].startBlock = + SWAP_BE32 (vhp->allocationFile.extents[i].startBlock); + forkp->ff_extents[i].blockCount = + SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); + } + forkp->ff_size = SWAP_BE64 
(vhp->allocationFile.logicalSize); + forkp->ff_blocks = SWAP_BE32 (vhp->allocationFile.totalBlocks); + forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize); + + buf_brelse(bp); + vhp = NULL; + + /* + * Re-load B-tree header data + */ + forkp = VTOF((struct vnode *)vcb->extentsRefNum); + if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) + return (error); + + forkp = VTOF((struct vnode *)vcb->catalogRefNum); + if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) + return (error); + + if (hfsmp->hfs_attribute_vp) { + forkp = VTOF(hfsmp->hfs_attribute_vp); + if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) ) + return (error); + } + + /* Reload the volume name */ + if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, NULL, NULL))) + return (error); + vcb->volumeNameEncodingHint = cndesc.cd_encoding; + bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen)); + cat_releasedesc(&cndesc); + + /* Re-establish private/hidden directories. */ + hfs_privatedir_init(hfsmp, FILE_HARDLINKS); + hfs_privatedir_init(hfsmp, DIR_HARDLINKS); + + /* In case any volume information changed to trigger a notification */ + hfs_generate_volume_notifications(hfsmp); + + return (0); +} + +__unused +static uint64_t tv_to_usecs(struct timeval *tv) +{ + return tv->tv_sec * 1000000ULL + tv->tv_usec; +} + +// Returns TRUE if b - a >= usecs +static bool hfs_has_elapsed (const struct timeval *a, + const struct timeval *b, + uint64_t usecs) +{ + struct timeval diff; + timersub(b, a, &diff); + return diff.tv_sec * 1000000ULL + diff.tv_usec >= usecs; +} + +void hfs_syncer(void *arg, __unused wait_result_t wr) +{ + struct hfsmount *hfsmp = arg; + struct timeval now; + + KDBG(HFSDBG_SYNCER | DBG_FUNC_START, obfuscate_addr(hfsmp)); + + hfs_syncer_lock(hfsmp); + + while (ISSET(hfsmp->hfs_flags, HFS_RUN_SYNCER) + && timerisset(&hfsmp->hfs_sync_req_oldest)) { + + hfs_syncer_wait(hfsmp, &HFS_META_DELAY_TS); + + if (!ISSET(hfsmp->hfs_flags, HFS_RUN_SYNCER) + || !timerisset(&hfsmp->hfs_sync_req_oldest)) { + break; + } + + /* Check to see whether we should flush now: either the oldest + is > HFS_MAX_META_DELAY or HFS_META_DELAY has elapsed since + the request and there are no pending writes. */ + + microuptime(&now); + uint64_t idle_time = vfs_idle_time(hfsmp->hfs_mp); + + if (!hfs_has_elapsed(&hfsmp->hfs_sync_req_oldest, &now, + HFS_MAX_META_DELAY) + && idle_time < HFS_META_DELAY) { + continue; + } + + timerclear(&hfsmp->hfs_sync_req_oldest); + + hfs_syncer_unlock(hfsmp); + + KDBG(HFSDBG_SYNCER_TIMED | DBG_FUNC_START, obfuscate_addr(hfsmp)); + + /* + * We intentionally do a synchronous flush (of the journal or entire volume) here. + * For journaled volumes, this means we wait until the metadata blocks are written + * to both the journal and their final locations (in the B-trees, etc.). + * + * This tends to avoid interleaving the metadata writes with other writes (for + * example, user data, or to the journal when a later transaction notices that + * an earlier transaction has finished its async writes, and then updates the + * journal start in the journal header). Avoiding interleaving of writes is + * very good for performance on simple flash devices like SD cards, thumb drives; + * and on devices like floppies. Since removable devices tend to be this kind of + * simple device, doing a synchronous flush actually improves performance in + * practice. + * + * NOTE: For non-journaled volumes, the call to hfs_sync will also cause dirty + * user data to be written. 
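+		 *
+		 * Rough timing sketch (assuming HFS_META_DELAY is on the order
+		 * of 100 ms and HFS_MAX_META_DELAY on the order of 5 seconds):
+		 * the loop above keeps deferring while writes are still
+		 * arriving, and only flushes once the device has been idle for
+		 * the short delay, or unconditionally once the oldest pending
+		 * sync request has aged past the long delay.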
+ */ + if (hfsmp->jnl) { + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META); + } else { + hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_current()); + } + + KDBG(HFSDBG_SYNCER_TIMED | DBG_FUNC_END); + + hfs_syncer_lock(hfsmp); + } // while (...) + + hfsmp->hfs_syncer_thread = NULL; + hfs_syncer_unlock(hfsmp); + hfs_syncer_wakeup(hfsmp); + + /* BE CAREFUL WHAT YOU ADD HERE: at this point hfs_unmount is free + to continue and therefore hfsmp might be invalid. */ + + KDBG(HFSDBG_SYNCER | DBG_FUNC_END); +} + +/* + * Call into the allocator code and perform a full scan of the bitmap file. + * + * This allows us to TRIM unallocated ranges if needed, and also to build up + * an in-memory summary table of the state of the allocated blocks. + */ +void hfs_scan_blocks (struct hfsmount *hfsmp) { + /* + * Take the allocation file lock. Journal transactions will block until + * we're done here. + */ + + int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* + * We serialize here with the HFS mount lock as we're mounting. + * + * The mount can only proceed once this thread has acquired the bitmap + * lock, since we absolutely do not want someone else racing in and + * getting the bitmap lock, doing a read/write of the bitmap file, + * then us getting the bitmap lock. + * + * To prevent this, the mount thread takes the HFS mount mutex, starts us + * up, then immediately msleeps on the scan_var variable in the mount + * point as a condition variable. This serialization is safe since + * if we race in and try to proceed while they're still holding the lock, + * we'll block trying to acquire the global lock. Since the mount thread + * acquires the HFS mutex before starting this function in a new thread, + * any lock acquisition on our part must be linearizably AFTER the mount thread's. + * + * Note that the HFS mount mutex is always taken last, and always for only + * a short time. In this case, we just take it long enough to mark the + * scan-in-flight bit. + */ + (void) hfs_lock_mount (hfsmp); + hfsmp->scan_var |= HFS_ALLOCATOR_SCAN_INFLIGHT; + wakeup((caddr_t) &hfsmp->scan_var); + hfs_unlock_mount (hfsmp); + + /* Initialize the summary table */ + if (hfs_init_summary (hfsmp)) { + printf("hfs: could not initialize summary table for %s\n", hfsmp->vcbVN); + } + + /* + * ScanUnmapBlocks assumes that the bitmap lock is held when you + * call the function. We don't care if there were any errors issuing unmaps. + * + * It will also attempt to build up the summary table for subsequent + * allocator use, as configured. 
+ */ + (void) ScanUnmapBlocks(hfsmp); + + (void) hfs_lock_mount (hfsmp); + hfsmp->scan_var &= ~HFS_ALLOCATOR_SCAN_INFLIGHT; + hfsmp->scan_var |= HFS_ALLOCATOR_SCAN_COMPLETED; + wakeup((caddr_t) &hfsmp->scan_var); + hfs_unlock_mount (hfsmp); + + buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0); + + hfs_systemfile_unlock(hfsmp, flags); + +} + +/* + * Common code for mount and mountroot + */ +int +hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, + int journal_replay_only, vfs_context_t context) +{ + struct proc *p = vfs_context_proc(context); + int retval = E_NONE; + struct hfsmount *hfsmp = NULL; + struct buf *bp; + dev_t dev; + HFSMasterDirectoryBlock *mdbp = NULL; + int ronly; +#if QUOTA + int i; +#endif + int mntwrapper; + kauth_cred_t cred; + u_int64_t disksize; + daddr64_t log_blkcnt; + u_int32_t log_blksize; + u_int32_t phys_blksize; + u_int32_t minblksize; + u_int32_t iswritable; + daddr64_t mdb_offset; + int isvirtual = 0; + int isroot = !journal_replay_only && args == NULL; + u_int32_t device_features = 0; + int isssd; + + ronly = mp && vfs_isrdonly(mp); + dev = vnode_specrdev(devvp); + cred = p ? vfs_context_ucred(context) : NOCRED; + mntwrapper = 0; + + bp = NULL; + hfsmp = NULL; + mdbp = NULL; + minblksize = kHFSBlockSize; + + /* Advisory locking should be handled at the VFS layer */ + if (mp) + vfs_setlocklocal(mp); + + /* Get the logical block size (treated as physical block size everywhere) */ + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n"); + } + retval = ENXIO; + goto error_exit; + } + if (log_blksize == 0 || log_blksize > 1024*1024*1024) { + printf("hfs: logical block size 0x%x looks bad. Not mounting.\n", log_blksize); + retval = ENXIO; + goto error_exit; + } + + /* Get the physical block size. */ + retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context); + if (retval) { + if ((retval != ENOTSUP) && (retval != ENOTTY)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n"); + } + retval = ENXIO; + goto error_exit; + } + /* If device does not support this ioctl, assume that physical + * block size is same as logical block size + */ + phys_blksize = log_blksize; + } + if (phys_blksize == 0 || phys_blksize > MAXBSIZE) { + printf("hfs: physical block size 0x%x looks bad. Not mounting.\n", phys_blksize); + retval = ENXIO; + goto error_exit; + } + + /* Switch to 512 byte sectors (temporarily) */ + if (log_blksize > 512) { + u_int32_t size512 = 512; + + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n"); + } + retval = ENXIO; + goto error_exit; + } + } + /* Get the number of 512 byte physical blocks. */ + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + /* resetting block size may fail if getting block count did */ + (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context); + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n"); + } + retval = ENXIO; + goto error_exit; + } + /* Compute an accurate disk size (i.e. 
within 512 bytes) */ + disksize = (u_int64_t)log_blkcnt * (u_int64_t)512; + + /* + * On Tiger it is not necessary to switch the device + * block size to be 4k if there are more than 31-bits + * worth of blocks but to insure compatibility with + * pre-Tiger systems we have to do it. + * + * If the device size is not a multiple of 4K (8 * 512), then + * switching the logical block size isn't going to help because + * we will be unable to write the alternate volume header. + * In this case, just leave the logical block size unchanged. + */ + if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) { + minblksize = log_blksize = 4096; + if (phys_blksize < log_blksize) + phys_blksize = log_blksize; + } + + /* + * The cluster layer is not currently prepared to deal with a logical + * block size larger than the system's page size. (It can handle + * blocks per page, but not multiple pages per block.) So limit the + * logical block size to the page size. + */ + if (log_blksize > PAGE_SIZE) { + log_blksize = PAGE_SIZE; + } + + /* Now switch to our preferred physical block size. */ + if (log_blksize > 512) { + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n"); + } + retval = ENXIO; + goto error_exit; + } + /* Get the count of physical blocks. */ + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n"); + } + retval = ENXIO; + goto error_exit; + } + } + /* + * At this point: + * minblksize is the minimum physical block size + * log_blksize has our preferred physical block size + * log_blkcnt has the total number of physical blocks + */ + + mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize); + if ((retval = (int)buf_meta_bread(devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)), + phys_blksize, cred, &bp))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval); + } + goto error_exit; + } + mdbp = hfs_malloc(kMDBSize); + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize); + buf_brelse(bp); + bp = NULL; + + hfsmp = hfs_mallocz(sizeof(struct hfsmount)); + + hfs_chashinit_finish(hfsmp); + + /* Init the ID lookup hashtable */ + hfs_idhash_init (hfsmp); + + /* + * See if the disk supports unmap (trim). + * + * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field + * returned by vfs_ioattr. We need to call VNOP_IOCTL ourselves. + */ + if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) { + if (device_features & DK_FEATURE_UNMAP) { + hfsmp->hfs_flags |= HFS_UNMAP; + } + + if(device_features & DK_FEATURE_BARRIER) + hfsmp->hfs_flags |= HFS_FEATURE_BARRIER; + } + + /* + * See if the disk is a solid state device, too. We need this to decide what to do about + * hotfiles. 
+ */ + if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) { + if (isssd) { + hfsmp->hfs_flags |= HFS_SSD; + } + } + + /* See if the underlying device is Core Storage or not */ + dk_corestorage_info_t cs_info; + memset(&cs_info, 0, sizeof(dk_corestorage_info_t)); + if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, 0, context) == 0) { + hfsmp->hfs_flags |= HFS_CS; + if (isroot && (cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA)) { + hfsmp->hfs_flags |= HFS_CS_METADATA_PIN; + } + if (isroot && (cs_info.flags & DK_CORESTORAGE_ENABLE_HOTFILES)) { + hfsmp->hfs_flags |= HFS_CS_HOTFILE_PIN; + hfsmp->hfs_cs_hotfile_size = cs_info.hotfile_size; + } + if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_SWAPFILE)) { + hfsmp->hfs_flags |= HFS_CS_SWAPFILE_PIN; + + struct vfsioattr ioattr; + vfs_ioattr(mp, &ioattr); + ioattr.io_flags |= VFS_IOATTR_FLAGS_SWAPPIN_SUPPORTED; + ioattr.io_max_swappin_available = cs_info.swapfile_pinning; + vfs_setioattr(mp, &ioattr); + } + } + + /* + * Init the volume information structure + */ + + lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr); + lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr); + lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr); + lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr); + + if (mp) + vfs_setfsprivate(mp, hfsmp); + hfsmp->hfs_mp = mp; /* Make VFSTOHFS work */ + hfsmp->hfs_raw_dev = vnode_specrdev(devvp); + hfsmp->hfs_devvp = devvp; + vnode_ref(devvp); /* Hold a ref on the device, dropped when hfsmp is freed. */ + hfsmp->hfs_logical_block_size = log_blksize; + hfsmp->hfs_logical_block_count = log_blkcnt; + hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt; + hfsmp->hfs_physical_block_size = phys_blksize; + hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize); + hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; + if (ronly) + hfsmp->hfs_flags |= HFS_READ_ONLY; + if (mp && ((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) + hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS; + +#if QUOTA + for (i = 0; i < MAXQUOTAS; i++) + dqfileinit(&hfsmp->hfs_qfiles[i]); +#endif + + if (args) { + hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid; + if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID; + hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ? UNKNOWNGID : args->hfs_gid; + if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID; + vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */ + if (args->hfs_mask != (mode_t)VNOVAL) { + hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS; + if (args->flags & HFSFSMNT_NOXONFILES) { + hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE); + } else { + hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS; + } + } else { + hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS; /* 0777: rwx---rwx */ + hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE; /* 0666: no --x by default? */ + } + if ((args->flags != (int)VNOVAL) && (args->flags & HFSFSMNT_WRAPPER)) + mntwrapper = 1; + } else { + /* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */ + if (mp && ((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) { + hfsmp->hfs_uid = UNKNOWNUID; + hfsmp->hfs_gid = UNKNOWNGID; + vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid); /* tell the VFS */ + hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS; /* 0777: rwx---rwx */ + hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE; /* 0666: no --x by default? 
*/ + } + } + + /* Find out if disk media is writable. */ + if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) { + if (iswritable) + hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA; + else + hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; + } + + // Reservations + rl_init(&hfsmp->hfs_reserved_ranges[0]); + rl_init(&hfsmp->hfs_reserved_ranges[1]); + + // record the current time at which we're mounting this volume + struct timeval tv; + microtime(&tv); + hfsmp->hfs_mount_time = tv.tv_sec; + + /* Mount a standard HFS disk */ + if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) && + (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) { +#if CONFIG_HFS_STD + /* If only journal replay is requested, exit immediately */ + if (journal_replay_only) { + retval = 0; + goto error_exit; + } + + /* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */ + if (vfs_isrdwr(mp)) { + retval = EROFS; + goto error_exit; + } + + printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n"); + + /* Treat it as if it's read-only and not writeable */ + hfsmp->hfs_flags |= HFS_READ_ONLY; + hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; + + if ((vfs_flags(mp) & MNT_ROOTFS)) { + retval = EINVAL; /* Cannot root from HFS standard disks */ + goto error_exit; + } + /* HFS disks can only use 512 byte physical blocks */ + if (log_blksize > kHFSBlockSize) { + log_blksize = kHFSBlockSize; + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + retval = ENXIO; + goto error_exit; + } + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + retval = ENXIO; + goto error_exit; + } + hfsmp->hfs_logical_block_size = log_blksize; + hfsmp->hfs_logical_block_count = log_blkcnt; + hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt; + hfsmp->hfs_physical_block_size = log_blksize; + hfsmp->hfs_log_per_phys = 1; + } + if (args) { + hfsmp->hfs_encoding = args->hfs_encoding; + HFSTOVCB(hfsmp)->volumeNameEncodingHint = args->hfs_encoding; + + /* establish the timezone */ + gTimeZone = args->hfs_timezone; + } + + retval = hfs_getconverter(hfsmp->hfs_encoding, &hfsmp->hfs_get_unicode, + &hfsmp->hfs_get_hfsname); + if (retval) + goto error_exit; + + retval = hfs_MountHFSVolume(hfsmp, mdbp, p); + if (retval) + (void) hfs_relconverter(hfsmp->hfs_encoding); +#else + /* On platforms where HFS Standard is not supported, deny the mount altogether */ + retval = EINVAL; + goto error_exit; +#endif + + } + else { /* Mount an HFS Plus disk */ + HFSPlusVolumeHeader *vhp; + off_t embeddedOffset; + int jnl_disable = 0; + + /* Get the embedded Volume Header */ + if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) { + embeddedOffset = SWAP_BE16(mdbp->drAlBlSt) * kHFSBlockSize; + embeddedOffset += (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.startBlock) * + (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz); + + /* + * Cooperative Fusion is not allowed on embedded HFS+ + * filesystems (HFS+ inside HFS standard wrapper) + */ + hfsmp->hfs_flags &= ~HFS_CS_METADATA_PIN; + + /* + * If the embedded volume doesn't start on a block + * boundary, then switch the device to a 512-byte + * block size so everything will line up on a block + * boundary. 
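+			 *
+			 * For example (values chosen for illustration only):
+			 * with drAlBlSt = 28 (512-byte blocks), drAlBlkSiz =
+			 * 8192 and drEmbedExtent.startBlock = 3, the offset
+			 * computed above is 28*512 + 3*8192 = 38912 bytes.
+			 * That is not a multiple of a 4096-byte device block,
+			 * so the code below drops to 512-byte logical blocks,
+			 * which always divide the offset evenly.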
+ */ + if ((embeddedOffset % log_blksize) != 0) { + printf("hfs_mountfs: embedded volume offset not" + " a multiple of physical block size (%d);" + " switching to 512\n", log_blksize); + log_blksize = 512; + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, + (caddr_t)&log_blksize, FWRITE, context)) { + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n"); + } + retval = ENXIO; + goto error_exit; + } + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, + (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n"); + } + retval = ENXIO; + goto error_exit; + } + /* Note: relative block count adjustment */ + hfsmp->hfs_logical_block_count *= + hfsmp->hfs_logical_block_size / log_blksize; + + /* Update logical /physical block size */ + hfsmp->hfs_logical_block_size = log_blksize; + hfsmp->hfs_physical_block_size = log_blksize; + + phys_blksize = log_blksize; + hfsmp->hfs_log_per_phys = 1; + } + + disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) * + (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz); + + hfsmp->hfs_logical_block_count = disksize / log_blksize; + + hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size; + + mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); + + if (bp) { + buf_markinvalid(bp); + buf_brelse(bp); + bp = NULL; + } + retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + phys_blksize, cred, &bp); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval); + } + goto error_exit; + } + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512); + buf_brelse(bp); + bp = NULL; + vhp = (HFSPlusVolumeHeader*) mdbp; + + } + else { /* pure HFS+ */ + embeddedOffset = 0; + vhp = (HFSPlusVolumeHeader*) mdbp; + } + + retval = hfs_ValidateHFSPlusVolumeHeader(hfsmp, vhp); + if (retval) + goto error_exit; + + /* + * If allocation block size is less than the physical block size, + * invalidate the buffer read in using native physical block size + * to ensure data consistency. + * + * HFS Plus reserves one allocation block for the Volume Header. + * If the physical size is larger, then when we read the volume header, + * we will also end up reading in the next allocation block(s). + * If those other allocation block(s) is/are modified, and then the volume + * header is modified, the write of the volume header's buffer will write + * out the old contents of the other allocation blocks. + * + * We assume that the physical block size is same as logical block size. + * The physical block size value is used to round down the offsets for + * reading and writing the primary and alternate volume headers. + * + * The same logic is also in hfs_MountHFSPlusVolume to ensure that + * hfs_mountfs, hfs_MountHFSPlusVolume and later are doing the I/Os + * using same block size. + */ + if (SWAP_BE32(vhp->blockSize) < hfsmp->hfs_physical_block_size) { + phys_blksize = hfsmp->hfs_logical_block_size; + hfsmp->hfs_physical_block_size = hfsmp->hfs_logical_block_size; + hfsmp->hfs_log_per_phys = 1; + // There should be one bp associated with devvp in buffer cache. 
+ retval = buf_invalidateblks(devvp, 0, 0, 0); + if (retval) + goto error_exit; + } + + if (isroot && ((SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0)) { + vfs_set_root_unmounted_cleanly(); + } + + /* + * On inconsistent disks, do not allow read-write mount + * unless it is the boot volume being mounted. We also + * always want to replay the journal if the journal_replay_only + * flag is set because that will (most likely) get the + * disk into a consistent state before fsck_hfs starts + * looking at it. + */ + if (!journal_replay_only + && !(vfs_flags(mp) & MNT_ROOTFS) + && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask) + && !(hfsmp->hfs_flags & HFS_READ_ONLY)) { + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: failed to mount non-root inconsistent disk\n"); + } + retval = EINVAL; + goto error_exit; + } + + + // XXXdbg + // + hfsmp->jnl = NULL; + hfsmp->jvp = NULL; + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) && + args->journal_disable) { + jnl_disable = 1; + } + + // + // We only initialize the journal here if the last person + // to mount this volume was journaling aware. Otherwise + // we delay journal initialization until later at the end + // of hfs_MountHFSPlusVolume() because the last person who + // mounted it could have messed things up behind our back + // (so we need to go find the .journal file, make sure it's + // the right size, re-sync up if it was moved, etc). + // + if ( (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion) + && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask) + && !jnl_disable) { + + // if we're able to init the journal, mark the mount + // point as journaled. + // + if ((retval = hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred)) == 0) { + if (mp) + vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); + } else { + if (retval == EROFS) { + // EROFS is a special error code that means the volume has an external + // journal which we couldn't find. in that case we do not want to + // rewrite the volume header - we'll just refuse to mount the volume. + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n"); + } + retval = EINVAL; + goto error_exit; + } + + // if the journal failed to open, then set the lastMountedVersion + // to be "FSK!" which fsck_hfs will see and force the fsck instead + // of just bailing out because the volume is journaled. + if (!ronly) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n"); + } + + HFSPlusVolumeHeader *jvhp; + + hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; + + if (mdb_offset == 0) { + mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); + } + + bp = NULL; + retval = (int)buf_meta_bread(devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + phys_blksize, cred, &bp); + if (retval == 0) { + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize)); + + if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { + printf ("hfs(1): Journal replay fail. Writing lastMountVersion as FSK!\n"); + jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); + buf_bwrite(bp); + } else { + buf_brelse(bp); + } + bp = NULL; + } else if (bp) { + buf_brelse(bp); + // clear this so the error exit path won't try to use it + bp = NULL; + } + } + + // if this isn't the root device just bail out. 
+ // If it is the root device we just continue on + // in the hopes that fsck_hfs will be able to + // fix any damage that exists on the volume. + if (mp && !(vfs_flags(mp) & MNT_ROOTFS)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n"); + } + retval = EINVAL; + goto error_exit; + } + } + } + + /* Either the journal is replayed successfully, or there + * was nothing to replay, or no journal exists. In any case, + * return success. + */ + if (journal_replay_only) { + retval = 0; + goto error_exit; + } + +#if CONFIG_HFS_STD + (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname); +#endif + + retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); + /* + * If the backend didn't like our physical blocksize + * then retry with physical blocksize of 512. + */ + if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) { + printf("hfs_mountfs: could not use physical block size " + "(%d) switching to 512\n", log_blksize); + log_blksize = 512; + if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n"); + } + retval = ENXIO; + goto error_exit; + } + if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n"); + } + retval = ENXIO; + goto error_exit; + } + set_fsblocksize(devvp); + /* Note: relative block count adjustment (in case this is an embedded volume). */ + hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize; + hfsmp->hfs_logical_block_size = log_blksize; + hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize; + + hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size; + + if (hfsmp->jnl && hfsmp->jvp == devvp) { + // close and re-open this with the new block size + journal_close(hfsmp->jnl); + hfsmp->jnl = NULL; + if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) { + vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); + } else { + // if the journal failed to open, then set the lastMountedVersion + // to be "FSK!" which fsck_hfs will see and force the fsck instead + // of just bailing out because the volume is journaled. + if (!ronly) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n"); + } + HFSPlusVolumeHeader *jvhp; + + hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; + + if (mdb_offset == 0) { + mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); + } + + bp = NULL; + retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + phys_blksize, cred, &bp); + if (retval == 0) { + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize)); + + if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { + printf ("hfs(2): Journal replay fail. Writing lastMountVersion as FSK!\n"); + jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); + buf_bwrite(bp); + } else { + buf_brelse(bp); + } + bp = NULL; + } else if (bp) { + buf_brelse(bp); + // clear this so the error exit path won't try to use it + bp = NULL; + } + } + + // if this isn't the root device just bail out. 
+ // If it is the root device we just continue on + // in the hopes that fsck_hfs will be able to + // fix any damage that exists on the volume. + if ( !(vfs_flags(mp) & MNT_ROOTFS)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init (2) failed \n"); + } + retval = EINVAL; + goto error_exit; + } + } + } + + /* Try again with a smaller block size... */ + retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval); + } + } +#if CONFIG_HFS_STD + if (retval) + (void) hfs_relconverter(0); +#endif + } + + // save off a snapshot of the mtime from the previous mount + // (for matador). + hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime; + + if ( retval ) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: encountered failure %d \n", retval); + } + goto error_exit; + } + + struct vfsstatfs *vsfs = vfs_statfs(mp); + vsfs->f_fsid.val[0] = dev; + vsfs->f_fsid.val[1] = vfs_typenum(mp); + + vfs_setmaxsymlen(mp, 0); + +#if CONFIG_HFS_STD + if (ISSET(hfsmp->hfs_flags, HFS_STANDARD)) { + /* HFS standard doesn't support extended readdir! */ + mount_set_noreaddirext (mp); + } +#endif + + if (args) { + /* + * Set the free space warning levels for a non-root volume: + * + * Set the "danger" limit to 1% of the volume size or 150MB, whichever is less. + * Set the "warning" limit to 2% of the volume size or 500MB, whichever is less. + * Set the "near warning" limit to 10% of the volume size or 1GB, whichever is less. + * And last, set the "desired" freespace level to to 12% of the volume size or 1.2GB, + * whichever is less. + */ + hfsmp->hfs_freespace_notify_dangerlimit = + MIN(HFS_VERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_VERYLOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_warninglimit = + MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_nearwarninglimit = + MIN(HFS_NEARLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_NEARLOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_desiredlevel = + MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION); + } else { + /* + * Set the free space warning levels for the root volume: + * + * Set the "danger" limit to 5% of the volume size or 512MB, whichever is less. + * Set the "warning" limit to 10% of the volume size or 1GB, whichever is less. + * Set the "near warning" limit to 10.5% of the volume size or 1.1GB, whichever is less. + * And last, set the "desired" freespace level to to 11% of the volume size or 1.25GB, + * whichever is less. 
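+		 *
+		 * For example, on a 500 GiB root volume with 4 KiB allocation
+		 * blocks totalBlocks is 131,072,000, so the 5% "danger"
+		 * fraction would be 6,553,600 blocks (~25 GiB) while the
+		 * 512 MB cap is only about 131,000 blocks; the MIN() calls
+		 * below therefore pick the byte-based limits, and the
+		 * percentage limits only matter on small volumes.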
+ * + * NOTE: While those are the default limits, KernelEventAgent (as of 3/2016) + * will unilaterally override these to the following on OSX only: + * Danger: 3GB + * Warning: Min (2% of root volume, 10GB), with a floor of 10GB + * Desired: Warning Threshold + 1.5GB + */ + hfsmp->hfs_freespace_notify_dangerlimit = + MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTVERYLOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_warninglimit = + MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_nearwarninglimit = + MIN(HFS_ROOTNEARLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTNEARLOWDISKTRIGGERFRACTION); + hfsmp->hfs_freespace_notify_desiredlevel = + MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize, + (HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION); + }; + + /* Check if the file system exists on virtual device, like disk image */ + if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, context) == 0) { + if (isvirtual) { + hfsmp->hfs_flags |= HFS_VIRTUAL_DEVICE; + } + } + + if (!isroot + && !ISSET(hfsmp->hfs_flags, HFS_VIRTUAL_DEVICE) + && hfs_is_ejectable(vfs_statfs(mp)->f_mntfromname)) { + SET(hfsmp->hfs_flags, HFS_RUN_SYNCER); + } + + const char *dev_name = (hfsmp->hfs_devvp + ? vnode_getname_printable(hfsmp->hfs_devvp) : NULL); + + printf("hfs: mounted %s on device %s\n", + (hfsmp->vcbVN[0] ? (const char*) hfsmp->vcbVN : "unknown"), + dev_name ?: "unknown device"); + + if (dev_name) + vnode_putname_printable(dev_name); + + /* + * Start looking for free space to drop below this level and generate a + * warning immediately if needed: + */ + hfsmp->hfs_notification_conditions = 0; + hfs_generate_volume_notifications(hfsmp); + + if (ronly == 0) { + (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); + } + hfs_free(mdbp, kMDBSize); + return (0); + +error_exit: + if (bp) + buf_brelse(bp); + + hfs_free(mdbp, kMDBSize); + + hfs_close_jvp(hfsmp); + + if (hfsmp) { + if (hfsmp->hfs_devvp) { + vnode_rele(hfsmp->hfs_devvp); + } + hfs_locks_destroy(hfsmp); + hfs_delete_chash(hfsmp); + hfs_idhash_destroy (hfsmp); + + hfs_free(hfsmp, sizeof(*hfsmp)); + if (mp) + vfs_setfsprivate(mp, NULL); + } + return (retval); +} + + +/* + * Make a filesystem operational. + * Nothing to do at the moment. + */ +/* ARGSUSED */ +static int +hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context) +{ + return (0); +} + + +/* + * unmount system call + */ +int +hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) +{ + struct proc *p = vfs_context_proc(context); + struct hfsmount *hfsmp = VFSTOHFS(mp); + int retval = E_NONE; + int flags; + int force; + int started_tr = 0; + + flags = 0; + force = 0; + if (mntflags & MNT_FORCE) { + flags |= FORCECLOSE; + force = 1; + } + + const char *dev_name = (hfsmp->hfs_devvp + ? vnode_getname_printable(hfsmp->hfs_devvp) : NULL); + + printf("hfs: unmount initiated on %s on device %s\n", + (hfsmp->vcbVN[0] ? 
(const char*) hfsmp->vcbVN : "unknown"), + dev_name ?: "unknown device"); + + if (dev_name) + vnode_putname_printable(dev_name); + + if ((retval = hfs_flushfiles(mp, flags, p)) && !force) + return (retval); + + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) + (void) hfs_recording_suspend(hfsmp); + + hfs_syncer_free(hfsmp); + + if (hfsmp->hfs_flags & HFS_SUMMARY_TABLE) { + if (hfsmp->hfs_summary_table) { + int err = 0; + /* + * Take the bitmap lock to serialize against a concurrent bitmap scan still in progress + */ + if (hfsmp->hfs_allocation_vp) { + err = hfs_lock (VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + } + hfs_free(hfsmp->hfs_summary_table, hfsmp->hfs_summary_bytes); + hfsmp->hfs_summary_table = NULL; + hfsmp->hfs_flags &= ~HFS_SUMMARY_TABLE; + + if (err == 0 && hfsmp->hfs_allocation_vp){ + hfs_unlock (VTOC(hfsmp->hfs_allocation_vp)); + } + + } + } + + /* + * Flush out the b-trees, volume bitmap and Volume Header + */ + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { + retval = hfs_start_transaction(hfsmp); + if (retval == 0) { + started_tr = 1; + } else if (!force) { + goto err_exit; + } + + if (hfsmp->hfs_startup_vp) { + (void) hfs_lock(VTOC(hfsmp->hfs_startup_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + retval = hfs_fsync(hfsmp->hfs_startup_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_startup_vp)); + if (retval && !force) + goto err_exit; + } + + if (hfsmp->hfs_attribute_vp) { + (void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_attribute_vp)); + if (retval && !force) + goto err_exit; + } + + (void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + if (retval && !force) + goto err_exit; + + (void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + if (retval && !force) + goto err_exit; + + if (hfsmp->hfs_allocation_vp) { + (void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p); + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + if (retval && !force) + goto err_exit; + } + + if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) { + retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p); + if (retval && !force) + goto err_exit; + } + + /* If runtime corruption was detected, indicate that the volume + * was not unmounted cleanly. 
+ */ + if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) { + HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; + } else { + HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; + } + + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + int i; + u_int32_t min_start = hfsmp->totalBlocks; + + // set the nextAllocation pointer to the smallest free block number + // we've seen so on the next mount we won't rescan unnecessarily + lck_spin_lock(&hfsmp->vcbFreeExtLock); + for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) { + if (hfsmp->vcbFreeExt[i].startBlock < min_start) { + min_start = hfsmp->vcbFreeExt[i].startBlock; + } + } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + if (min_start < hfsmp->nextAllocation) { + hfsmp->nextAllocation = min_start; + } + } + + retval = hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); + if (retval) { + HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; + if (!force) + goto err_exit; /* could not flush everything */ + } + + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; + } + } + + if (hfsmp->jnl) { + hfs_flush(hfsmp, HFS_FLUSH_FULL); + } + + /* + * Invalidate our caches and release metadata vnodes + */ + (void) hfsUnmount(hfsmp, p); + +#if CONFIG_HFS_STD + if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) { + (void) hfs_relconverter(hfsmp->hfs_encoding); + } +#endif + + // XXXdbg + if (hfsmp->jnl) { + journal_close(hfsmp->jnl); + hfsmp->jnl = NULL; + } + + VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context); + + hfs_close_jvp(hfsmp); + + /* + * Last chance to dump unreferenced system files. + */ + (void) vflush(mp, NULLVP, FORCECLOSE); + +#if HFS_SPARSE_DEV + /* Drop our reference on the backing fs (if any). */ + if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingvp) { + struct vnode * tmpvp; + + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + tmpvp = hfsmp->hfs_backingvp; + hfsmp->hfs_backingvp = NULLVP; + vnode_rele(tmpvp); + } +#endif /* HFS_SPARSE_DEV */ + + vnode_rele(hfsmp->hfs_devvp); + + hfs_locks_destroy(hfsmp); + hfs_delete_chash(hfsmp); + hfs_idhash_destroy(hfsmp); + + hfs_assert(TAILQ_EMPTY(&hfsmp->hfs_reserved_ranges[HFS_TENTATIVE_BLOCKS]) + && TAILQ_EMPTY(&hfsmp->hfs_reserved_ranges[HFS_LOCKED_BLOCKS])); + hfs_assert(!hfsmp->lockedBlocks); + + hfs_free(hfsmp, sizeof(*hfsmp)); + + // decrement kext retain count +#if TARGET_OS_OSX + OSDecrementAtomic(&hfs_active_mounts); + OSKextReleaseKextWithLoadTag(OSKextGetCurrentLoadTag()); +#endif + +#if HFS_LEAK_DEBUG && TARGET_OS_OSX + if (hfs_active_mounts == 0) { + if (hfs_dump_allocations()) + Debugger(NULL); + else { + printf("hfs: last unmount and nothing was leaked!\n"); + msleep(hfs_unmount, NULL, PINOD, "hfs_unmount", + &(struct timespec){ 5, 0 }); + } + } +#endif + + return (0); + + err_exit: + if (started_tr) { + hfs_end_transaction(hfsmp); + } + return retval; +} + + +/* + * Return the root of a filesystem. 
+ */ +int hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context) +{ + return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0); +} + + +/* + * Do operations associated with quotas + */ +#if !QUOTA +static int +hfs_quotactl(__unused struct mount *mp, __unused int cmds, __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context) +{ + return (ENOTSUP); +} +#else +static int +hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context) +{ + struct proc *p = vfs_context_proc(context); + int cmd, type, error; + + if (uid == ~0U) + uid = kauth_cred_getuid(vfs_context_ucred(context)); + cmd = cmds >> SUBCMDSHIFT; + + switch (cmd) { + case Q_SYNC: + case Q_QUOTASTAT: + break; + case Q_GETQUOTA: + if (uid == kauth_cred_getuid(vfs_context_ucred(context))) + break; + /* fall through */ + default: + if ( (error = vfs_context_suser(context)) ) + return (error); + } + + type = cmds & SUBCMDMASK; + if ((u_int)type >= MAXQUOTAS) + return (EINVAL); + if ((error = vfs_busy(mp, LK_NOWAIT)) != 0) + return (error); + + switch (cmd) { + + case Q_QUOTAON: + error = hfs_quotaon(p, mp, type, datap); + break; + + case Q_QUOTAOFF: + error = hfs_quotaoff(p, mp, type); + break; + + case Q_SETQUOTA: + error = hfs_setquota(mp, uid, type, datap); + break; + + case Q_SETUSE: + error = hfs_setuse(mp, uid, type, datap); + break; + + case Q_GETQUOTA: + error = hfs_getquota(mp, uid, type, datap); + break; + + case Q_SYNC: + error = hfs_qsync(mp); + break; + + case Q_QUOTASTAT: + error = hfs_quotastat(mp, type, datap); + break; + + default: + error = EINVAL; + break; + } + vfs_unbusy(mp); + + return (error); +} +#endif /* QUOTA */ + +/* Subtype is composite of bits */ +#define HFS_SUBTYPE_JOURNALED 0x01 +#define HFS_SUBTYPE_CASESENSITIVE 0x02 +/* bits 2 - 6 reserved */ +#define HFS_SUBTYPE_STANDARDHFS 0x80 + +/* + * Get file system statistics. + */ +int +hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context) +{ + ExtendedVCB *vcb = VFSTOVCB(mp); + struct hfsmount *hfsmp = VFSTOHFS(mp); + u_int16_t subtype = 0; + + sbp->f_bsize = (u_int32_t)vcb->blockSize; + sbp->f_iosize = (size_t)cluster_max_io_size(mp, 0); + sbp->f_blocks = (u_int64_t)((u_int32_t)vcb->totalBlocks); + sbp->f_bfree = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 0)); + sbp->f_bavail = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 1)); + sbp->f_files = (u_int64_t)HFS_MAX_FILES; + sbp->f_ffree = (u_int64_t)hfs_free_cnids(hfsmp); + + /* + * Subtypes (flavors) for HFS + * 0: Mac OS Extended + * 1: Mac OS Extended (Journaled) + * 2: Mac OS Extended (Case Sensitive) + * 3: Mac OS Extended (Case Sensitive, Journaled) + * 4 - 127: Reserved + * 128: Mac OS Standard + * + */ + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { + /* HFS+ & variants */ + if (hfsmp->jnl) { + subtype |= HFS_SUBTYPE_JOURNALED; + } + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) { + subtype |= HFS_SUBTYPE_CASESENSITIVE; + } + } +#if CONFIG_HFS_STD + else { + /* HFS standard */ + subtype = HFS_SUBTYPE_STANDARDHFS; + } +#endif + sbp->f_fssubtype = subtype; + + return (0); +} + + +// +// XXXdbg -- this is a callback to be used by the journal to +// get meta data blocks flushed out to disk. +// +// XXXdbg -- be smarter and don't flush *every* block on each +// call. try to only flush some so we don't wind up +// being too synchronous. 
+// +void +hfs_sync_metadata(void *arg) +{ + struct mount *mp = (struct mount *)arg; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + buf_t bp; + int retval; + daddr64_t priIDSector; + hfsmp = VFSTOHFS(mp); + vcb = HFSTOVCB(hfsmp); + + // now make sure the super block is flushed + priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size)); + + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); + if ((retval != 0 ) && (retval != ENXIO)) { + printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n", + (int)priIDSector, retval); + } + + if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { + buf_bwrite(bp); + } else if (bp) { + buf_brelse(bp); + } + + /* Note that these I/Os bypass the journal (no calls to journal_start_modify_block) */ + + // the alternate super block... + // XXXdbg - we probably don't need to do this each and every time. + // hfs_btreeio.c:FlushAlternate() should flag when it was + // written... + if (hfsmp->hfs_partition_avh_sector) { + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_partition_avh_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); + if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { + /* + * note this I/O can fail if the partition shrank behind our backs! + * So failure should be OK here. + */ + buf_bwrite(bp); + } else if (bp) { + buf_brelse(bp); + } + } + + /* Is the FS's idea of the AVH different than the partition ? */ + if ((hfsmp->hfs_fs_avh_sector) && (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector)) { + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_fs_avh_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp); + if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) { + buf_bwrite(bp); + } else if (bp) { + buf_brelse(bp); + } + } + +} + + +struct hfs_sync_cargs { + kauth_cred_t cred; + struct proc *p; + int waitfor; + int error; + int atime_only_syncs; + time_t sync_start_time; +}; + + +static int +hfs_sync_callback(struct vnode *vp, void *cargs) +{ + struct cnode *cp = VTOC(vp); + struct hfs_sync_cargs *args; + int error; + + args = (struct hfs_sync_cargs *)cargs; + + if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + return (VNODE_RETURNED); + } + + hfs_dirty_t dirty_state = hfs_is_dirty(cp); + + bool sync = dirty_state == HFS_DIRTY || vnode_hasdirtyblks(vp); + + if (!sync && dirty_state == HFS_DIRTY_ATIME + && args->atime_only_syncs < 256) { + // We only update if the atime changed more than 60s ago + if (args->sync_start_time - cp->c_attr.ca_atime > 60) { + sync = true; + ++args->atime_only_syncs; + } + } + + if (sync) { + error = hfs_fsync(vp, args->waitfor, 0, args->p); + + if (error) + args->error = error; + } else if (cp->c_touch_acctime) + hfs_touchtimes(VTOHFS(vp), cp); + + hfs_unlock(cp); + return (VNODE_RETURNED); +} + + + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. 
+ */ +int +hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) +{ + struct proc *p = vfs_context_proc(context); + struct cnode *cp; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + struct vnode *meta_vp[4]; + int i; + int error, allerror = 0; + struct hfs_sync_cargs args; + + hfsmp = VFSTOHFS(mp); + + // Back off if hfs_changefs or a freeze is underway + hfs_lock_mount(hfsmp); + if ((hfsmp->hfs_flags & HFS_IN_CHANGEFS) + || hfsmp->hfs_freeze_state != HFS_THAWED) { + hfs_unlock_mount(hfsmp); + return 0; + } + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + hfs_unlock_mount(hfsmp); + return (EROFS); + } + + ++hfsmp->hfs_syncers; + hfs_unlock_mount(hfsmp); + + args.cred = kauth_cred_get(); + args.waitfor = waitfor; + args.p = p; + args.error = 0; + args.atime_only_syncs = 0; + + struct timeval tv; + microtime(&tv); + + args.sync_start_time = tv.tv_sec; + + /* + * hfs_sync_callback will be called for each vnode + * hung off of this mount point... the vnode will be + * properly referenced and unreferenced around the callback + */ + vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args); + + if (args.error) + allerror = args.error; + + vcb = HFSTOVCB(hfsmp); + + meta_vp[0] = vcb->extentsRefNum; + meta_vp[1] = vcb->catalogRefNum; + meta_vp[2] = vcb->allocationsRefNum; /* This is NULL for standard HFS */ + meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */ + + /* Now sync our three metadata files */ + for (i = 0; i < 4; ++i) { + struct vnode *btvp; + + btvp = meta_vp[i];; + if ((btvp==0) || (vnode_mount(btvp) != mp)) + continue; + + /* XXX use hfs_systemfile_lock instead ? */ + (void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + cp = VTOC(btvp); + + if (!hfs_is_dirty(cp) && !vnode_hasdirtyblks(btvp)) { + hfs_unlock(VTOC(btvp)); + continue; + } + error = vnode_get(btvp); + if (error) { + hfs_unlock(VTOC(btvp)); + continue; + } + if ((error = hfs_fsync(btvp, waitfor, 0, p))) + allerror = error; + + hfs_unlock(cp); + vnode_put(btvp); + }; + + +#if CONFIG_HFS_STD + /* + * Force stale file system control information to be flushed. + */ + if (vcb->vcbSigWord == kHFSSigWord) { + if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) { + allerror = error; + } + } +#endif + +#if QUOTA + hfs_qsync(mp); +#endif /* QUOTA */ + + hfs_hotfilesync(hfsmp, vfs_context_kernel()); + + /* + * Write back modified superblock. + */ + if (IsVCBDirty(vcb)) { + error = hfs_flushvolumeheader(hfsmp, waitfor == MNT_WAIT ? HFS_FVH_WAIT : 0); + if (error) + allerror = error; + } + + if (hfsmp->jnl) { + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL); + } + + hfs_lock_mount(hfsmp); + boolean_t wake = (!--hfsmp->hfs_syncers + && hfsmp->hfs_freeze_state == HFS_WANT_TO_FREEZE); + hfs_unlock_mount(hfsmp); + if (wake) + wakeup(&hfsmp->hfs_freeze_state); + + return (allerror); +} + + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the cnode id is valid + * - call hfs_vget() to get the locked cnode + * - check for an unallocated cnode (i_mode == 0) + * - check that the given client host has export rights and return + * those rights via. 
exflagsp and credanonp + */ +static int +hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, __unused vfs_context_t context) +{ + struct hfsfid *hfsfhp; + struct vnode *nvp; + int result; + + *vpp = NULL; + hfsfhp = (struct hfsfid *)fhp; + + if (fhlen < (int)sizeof(struct hfsfid)) + return (EINVAL); + + result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0); + if (result) { + if (result == ENOENT) + result = ESTALE; + return result; + } + + /* + * We used to use the create time as the gen id of the file handle, + * but it is not static enough because it can change at any point + * via system calls. We still don't have another volume ID or other + * unique identifier to use for a generation ID across reboots that + * persists until the file is removed. Using only the CNID exposes + * us to the potential wrap-around case, but as of 2/2008, it would take + * over 2 months to wrap around if the machine did nothing but allocate + * CNIDs. Using some kind of wrap counter would only be effective if + * each file had the wrap counter associated with it. For now, + * we use only the CNID to identify the file as it's good enough. + */ + + *vpp = nvp; + + hfs_unlock(VTOC(nvp)); + return (0); +} + + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +static int +hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context) +{ + struct cnode *cp; + struct hfsfid *hfsfhp; + + if (ISHFS(VTOVCB(vp))) + return (ENOTSUP); /* hfs standard is not exportable */ + + if (*fhlenp < (int)sizeof(struct hfsfid)) + return (EOVERFLOW); + + cp = VTOC(vp); + hfsfhp = (struct hfsfid *)fhp; + /* only the CNID is used to identify the file now */ + hfsfhp->hfsfid_cnid = htonl(cp->c_fileid); + hfsfhp->hfsfid_gen = htonl(cp->c_fileid); + *fhlenp = sizeof(struct hfsfid); + + return (0); +} + + +/* + * Initialize HFS filesystems, done only once per boot. + * + * HFS is not a kext-based file system. This makes it difficult to find + * out when the last HFS file system was unmounted and call hfs_uninit() + * to deallocate data structures allocated in hfs_init(). Therefore we + * never deallocate memory allocated by lock attribute and group initializations + * in this function. 
+ */ +static int +hfs_init(__unused struct vfsconf *vfsp) +{ + static int done = 0; + + if (done) + return (0); + done = 1; + hfs_chashinit(); + + BTReserveSetup(); + + hfs_lock_attr = lck_attr_alloc_init(); + hfs_group_attr = lck_grp_attr_alloc_init(); + hfs_mutex_group = lck_grp_alloc_init("hfs-mutex", hfs_group_attr); + hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr); + hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr); + +#if HFS_COMPRESSION + decmpfs_init(); +#endif + + journal_init(); + + return (0); +} + + +/* + * Destroy all locks, mutexes and spinlocks in hfsmp on unmount or failed mount + */ +static void +hfs_locks_destroy(struct hfsmount *hfsmp) +{ + + lck_mtx_destroy(&hfsmp->hfs_mutex, hfs_mutex_group); + lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group); + lck_rw_destroy(&hfsmp->hfs_global_lock, hfs_rwlock_group); + lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group); + + return; +} + + +static int +hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp) +{ + struct hfsmount * hfsmp; + char fstypename[MFSNAMELEN]; + + if (vp == NULL) + return (EINVAL); + + if (!vnode_isvroot(vp)) + return (EINVAL); + + vnode_vfsname(vp, fstypename); + if (strncmp(fstypename, "hfs", sizeof(fstypename)) != 0) + return (EINVAL); + + hfsmp = VTOHFS(vp); + + if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) + return (EINVAL); + + *hfsmpp = hfsmp; + + return (0); +} + +// Replace user-space value +static errno_t ureplace(user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, + void *data, size_t len) +{ + errno_t error; + if (!oldlenp) + return EFAULT; + if (oldp && *oldlenp < len) + return ENOMEM; + if (newp && newlen != len) + return EINVAL; + *oldlenp = len; + if (oldp) { + error = copyout(data, oldp, len); + if (error) + return error; + } + return newp ? copyin(newp, data, len) : 0; +} + +#define UREPLACE(oldp, oldlenp, newp, newlenp, v) \ + ureplace(oldp, oldlenp, newp, newlenp, &v, sizeof(v)) + +static hfsmount_t *hfs_mount_from_cwd(vfs_context_t ctx) +{ + vnode_t vp = vfs_context_cwd(ctx); + + if (!vp) + return NULL; + + /* + * We could use vnode_tag, but it is probably more future proof to + * compare fstypename. + */ + char fstypename[MFSNAMELEN]; + vnode_vfsname(vp, fstypename); + + if (strcmp(fstypename, "hfs")) + return NULL; + + return VTOHFS(vp); +} + +/* + * HFS filesystem related variables. 
+ */ +int +hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context) +{ + int error; + struct hfsmount *hfsmp; + struct proc *p = NULL; + + /* all sysctl names at this level are terminal */ +#if TARGET_OS_OSX + p = vfs_context_proc(context); + if (name[0] == HFS_ENCODINGBIAS) { + int bias; + + bias = hfs_getencodingbias(); + + error = UREPLACE(oldp, oldlenp, newp, newlen, bias); + if (error || !newp) + return error; + + hfs_setencodingbias(bias); + + return 0; + } else +#endif //OSX + if (name[0] == HFS_EXTEND_FS) { + u_int64_t newsize = 0; + vnode_t vp = vfs_context_cwd(context); + + if (newp == USER_ADDR_NULL || vp == NULLVP + || newlen != sizeof(quad_t) || !oldlenp) + return EINVAL; + if ((error = hfs_getmountpoint(vp, &hfsmp))) + return (error); + + /* Start with the 'size' set to the current number of bytes in the filesystem */ + newsize = ((uint64_t)hfsmp->totalBlocks) * ((uint64_t)hfsmp->blockSize); + + error = UREPLACE(oldp, oldlenp, newp, newlen, newsize); + if (error) + return error; + + return hfs_extendfs(hfsmp, newsize, context); + } else if (name[0] == HFS_ENABLE_JOURNALING) { + // make the file system journaled... + vnode_t jvp; + ExtendedVCB *vcb; + struct cat_attr jnl_attr; + struct cat_attr jinfo_attr; + struct cat_fork jnl_fork; + struct cat_fork jinfo_fork; + buf_t jib_buf; + uint64_t jib_blkno; + uint32_t tmpblkno; + uint64_t journal_byte_offset; + uint64_t journal_size; + vnode_t jib_vp = NULLVP; + struct JournalInfoBlock local_jib; + int err = 0; + void *jnl = NULL; + int lockflags; + + /* Only root can enable journaling */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return (EPERM); + } + if (namelen != 4) + return EINVAL; + hfsmp = hfs_mount_from_cwd(context); + if (!hfsmp) + return EINVAL; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return EROFS; + } + if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) { + printf("hfs: can't make a plain hfs volume journaled.\n"); + return EINVAL; + } + + if (hfsmp->jnl) { + printf("hfs: volume %s is already journaled!\n", hfsmp->vcbVN); + return EAGAIN; + } + vcb = HFSTOVCB(hfsmp); + + /* Set up local copies of the initialization info */ + tmpblkno = (uint32_t) name[1]; + jib_blkno = (uint64_t) tmpblkno; + journal_byte_offset = (uint64_t) name[2]; + journal_byte_offset *= hfsmp->blockSize; + journal_byte_offset += hfsmp->hfsPlusIOPosOffset; + journal_size = (uint64_t)((unsigned)name[3]); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK); + if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 || + BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) { + + printf("hfs: volume has a btree w/non-contiguous nodes. can not enable journaling.\n"); + hfs_systemfile_unlock(hfsmp, lockflags); + return EINVAL; + } + hfs_systemfile_unlock(hfsmp, lockflags); + + // make sure these both exist! + if ( GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0 + || GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) { + + return EINVAL; + } + + /* + * At this point, we have a copy of the metadata that lives in the catalog for the + * journal info block. Compare that the journal info block's single extent matches + * that which was passed into this sysctl. + * + * If it is different, deny the journal enable call. 
+ */ + if (jinfo_fork.cf_blocks > 1) { + /* too many blocks */ + return EINVAL; + } + + if (jinfo_fork.cf_extents[0].startBlock != jib_blkno) { + /* Wrong block */ + return EINVAL; + } + + /* + * We want to immediately purge the vnode for the JIB. + * + * Because it was written to from userland, there's probably + * a vnode somewhere in the vnode cache (possibly with UBC backed blocks). + * So we bring the vnode into core, then immediately do whatever + * we can to flush/vclean it out. This is because those blocks will be + * interpreted as user data, which may be treated separately on some platforms + * than metadata. If the vnode is gone, then there cannot be backing blocks + * in the UBC. + */ + if (hfs_vget (hfsmp, jinfo_attr.ca_fileid, &jib_vp, 1, 0)) { + return EINVAL; + } + /* + * Now we have a vnode for the JIB. recycle it. Because we hold an iocount + * on the vnode, we'll just mark it for termination when the last iocount + * (hopefully ours), is dropped. + */ + vnode_recycle (jib_vp); + err = vnode_put (jib_vp); + if (err) { + return EINVAL; + } + + /* Initialize the local copy of the JIB (just like hfs.util) */ + memset (&local_jib, 'Z', sizeof(struct JournalInfoBlock)); + local_jib.flags = SWAP_BE32(kJIJournalInFSMask); + /* Note that the JIB's offset is in bytes */ + local_jib.offset = SWAP_BE64(journal_byte_offset); + local_jib.size = SWAP_BE64(journal_size); + + /* + * Now write out the local JIB. This essentially overwrites the userland + * copy of the JIB. Read it as BLK_META to treat it as a metadata read/write. + */ + jib_buf = buf_getblk (hfsmp->hfs_devvp, + jib_blkno * (hfsmp->blockSize / hfsmp->hfs_logical_block_size), + hfsmp->blockSize, 0, 0, BLK_META); + char* buf_ptr = (char*) buf_dataptr (jib_buf); + + /* Zero out the portion of the block that won't contain JIB data */ + memset (buf_ptr, 0, hfsmp->blockSize); + + bcopy(&local_jib, buf_ptr, sizeof(local_jib)); + if (buf_bwrite (jib_buf)) { + return EIO; + } + + /* Force a flush track cache */ + hfs_flush(hfsmp, HFS_FLUSH_CACHE); + + /* Now proceed with full volume sync */ + hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context); + + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + (off_t)name[2], (off_t)name[3]); + + // + // XXXdbg - note that currently (Sept, 08) hfs_util does not support + // enabling the journal on a separate device so it is safe + // to just copy hfs_devvp here. If hfs_util gets the ability + // to dynamically enable the journal on a separate device then + // we will have to do the same thing as hfs_early_journal_init() + // to locate and open the journal device. + // + jvp = hfsmp->hfs_devvp; + jnl = journal_create(jvp, journal_byte_offset, journal_size, + hfsmp->hfs_devvp, + hfsmp->hfs_logical_block_size, + 0, + 0, + hfs_sync_metadata, hfsmp->hfs_mp, + hfsmp->hfs_mp); + + /* + * Set up the trim callback function so that we can add + * recently freed extents to the free extent cache once + * the transaction that freed them is written to the + * journal on disk. + */ + if (jnl) + journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp); + + if (jnl == NULL) { + printf("hfs: FAILED to create the journal!\n"); + return EIO; + } + + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + + /* + * Flush all dirty metadata buffers. 
+ */ + buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl"); + if (hfsmp->hfs_attribute_vp) + buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl"); + + HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1]; + HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask; + hfsmp->jvp = jvp; + hfsmp->jnl = jnl; + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = (u_int32_t)name[2]; + hfsmp->jnl_size = (off_t)((unsigned)name[3]); + hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid; + hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid; + + vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); + + hfs_unlock_global (hfsmp); + hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + + { + fsid_t fsid; + + fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev; + fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp)); + vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL); + } + return 0; + } else if (name[0] == HFS_DISABLE_JOURNALING) { + // clear the journaling bit + + /* Only root can disable journaling */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return (EPERM); + } + + hfsmp = hfs_mount_from_cwd(context); + if (!hfsmp) + return EINVAL; + + /* + * Disabling journaling is disallowed on volumes with directory hard links + * because we have not tested the relevant code path. + */ + if (hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries != 0){ + printf("hfs: cannot disable journaling on volumes with directory hardlinks\n"); + return EPERM; + } + + printf("hfs: disabling journaling for %s\n", hfsmp->vcbVN); + + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + + // Lights out for you buddy! + journal_close(hfsmp->jnl); + hfsmp->jnl = NULL; + + hfs_close_jvp(hfsmp); + vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); + hfsmp->jnl_start = 0; + hfsmp->hfs_jnlinfoblkid = 0; + hfsmp->hfs_jnlfileid = 0; + + HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask; + + hfs_unlock_global (hfsmp); + + hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT); + + { + fsid_t fsid; + + fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev; + fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp)); + vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL); + } + return 0; + } else if (name[0] == VFS_CTL_QUERY) { +#if TARGET_OS_IPHONE + return EPERM; +#else //!TARGET_OS_IPHONE + struct sysctl_req *req; + union union_vfsidctl vc; + struct mount *mp; + struct vfsquery vq; + + req = CAST_DOWN(struct sysctl_req *, oldp); /* we're new style vfs sysctl. */ + if (req == NULL) { + return EFAULT; + } + + error = SYSCTL_IN(req, &vc, proc_is64bit(p)? 
sizeof(vc.vc64):sizeof(vc.vc32)); + if (error) return (error); + + mp = vfs_getvfs(&vc.vc32.vc_fsid); /* works for 32 and 64 */ + if (mp == NULL) return (ENOENT); + + hfsmp = VFSTOHFS(mp); + bzero(&vq, sizeof(vq)); + vq.vq_flags = hfsmp->hfs_notification_conditions; + return SYSCTL_OUT(req, &vq, sizeof(vq));; +#endif // TARGET_OS_IPHONE + } else if (name[0] == HFS_REPLAY_JOURNAL) { + vnode_t devvp = NULL; + int device_fd; + if (namelen != 2) { + return (EINVAL); + } + device_fd = name[1]; + error = file_vnode(device_fd, &devvp); + if (error) { + return error; + } + error = vnode_getwithref(devvp); + if (error) { + file_drop(device_fd); + return error; + } + error = hfs_journal_replay(devvp, context); + file_drop(device_fd); + vnode_put(devvp); + return error; + } +#if DEBUG || TARGET_OS_OSX + else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) { + if (!kauth_cred_issuser(kauth_cred_get())) { + return (EPERM); + } + + int old = hfs_resize_debug; + + int res = UREPLACE(oldp, oldlenp, newp, newlen, hfs_resize_debug); + + if (old != hfs_resize_debug) { + printf("hfs: %s resize debug\n", + hfs_resize_debug ? "enabled" : "disabled"); + } + + return res; + } +#endif // DEBUG || OSX + + return (ENOTSUP); +} + +/* + * hfs_vfs_vget is not static since it is used in hfs_readwrite.c to support + * the build_path ioctl. We use it to leverage the code below that updates + * the origin list cache if necessary + */ + +int +hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context) +{ + int error; + int lockflags; + struct hfsmount *hfsmp; + + hfsmp = VFSTOHFS(mp); + + error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0); + if (error) + return error; + + /* + * If the look-up was via the object ID (rather than the link ID), + * then we make sure there's a parent here. We can't leave this + * until hfs_vnop_getattr because if there's a problem getting the + * parent at that point, all the caller will do is call + * hfs_vfs_vget again and we'll end up in an infinite loop. + */ + + cnode_t *cp = VTOC(*vpp); + + if (ISSET(cp->c_flag, C_HARDLINK) && ino == cp->c_fileid) { + hfs_lock_always(cp, HFS_SHARED_LOCK); + + if (!hfs_haslinkorigin(cp)) { + if (!hfs_lock_upgrade(cp)) + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + + if (cp->c_cnid == cp->c_fileid) { + /* + * Descriptor is stale, so we need to refresh it. We + * pick the first link. + */ + cnid_t link_id; + + error = hfs_first_link(hfsmp, cp, &link_id); + + if (!error) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_findname(hfsmp, link_id, &cp->c_desc); + hfs_systemfile_unlock(hfsmp, lockflags); + } + } else { + // We'll use whatever link the descriptor happens to have + error = 0; + } + if (!error) + hfs_savelinkorigin(cp, cp->c_parentcnid); + } + + hfs_unlock(cp); + + if (error) { + vnode_put(*vpp); + *vpp = NULL; + } + } + + return error; +} + + +/* + * Look up an HFS object by ID. + * + * The object is returned with an iocount reference and the cnode locked. + * + * If the object is a file then it will represent the data fork. + */ +int +hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted) +{ + struct vnode *vp = NULLVP; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct cat_fork cnfork; + u_int32_t linkref = 0; + int error; + + /* Check for cnids that should't be exported. 
*/ + if ((cnid < kHFSFirstUserCatalogNodeID) && + (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) { + return (ENOENT); + } + /* Don't export our private directories. */ + if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + return (ENOENT); + } + /* + * Check the hash first + */ + vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted); + if (vp) { + *vpp = vp; + return(0); + } + + bzero(&cndesc, sizeof(cndesc)); + bzero(&cnattr, sizeof(cnattr)); + bzero(&cnfork, sizeof(cnfork)); + + /* + * Not in hash, lookup in catalog + */ + if (cnid == kHFSRootParentID) { + static char hfs_rootname[] = "/"; + + cndesc.cd_nameptr = (const u_int8_t *)&hfs_rootname[0]; + cndesc.cd_namelen = 1; + cndesc.cd_parentcnid = kHFSRootParentID; + cndesc.cd_cnid = kHFSRootFolderID; + cndesc.cd_flags = CD_ISDIR; + + cnattr.ca_fileid = kHFSRootFolderID; + cnattr.ca_linkcount = 1; + cnattr.ca_entries = 1; + cnattr.ca_dircount = 1; + cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO); + } else { + int lockflags; + cnid_t pid; + const char *nameptr; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_idlookup(hfsmp, cnid, 0, 0, &cndesc, &cnattr, &cnfork); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { + *vpp = NULL; + return (error); + } + + /* + * Check for a raw hardlink inode and save its linkref. + */ + pid = cndesc.cd_parentcnid; + nameptr = (const char *)cndesc.cd_nameptr; + + if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + cndesc.cd_namelen > HFS_INODE_PREFIX_LEN && + (bcmp(nameptr, HFS_INODE_PREFIX, HFS_INODE_PREFIX_LEN) == 0)) { + linkref = strtoul(&nameptr[HFS_INODE_PREFIX_LEN], NULL, 10); + + } else if ((pid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) && + cndesc.cd_namelen > HFS_DIRINODE_PREFIX_LEN && + (bcmp(nameptr, HFS_DIRINODE_PREFIX, HFS_DIRINODE_PREFIX_LEN) == 0)) { + linkref = strtoul(&nameptr[HFS_DIRINODE_PREFIX_LEN], NULL, 10); + + } else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + cndesc.cd_namelen > HFS_DELETE_PREFIX_LEN && + (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) { + *vpp = NULL; + cat_releasedesc(&cndesc); + return (ENOENT); /* open unlinked file */ + } + } + + /* + * Finish initializing cnode descriptor for hardlinks. + * + * We need a valid name and parent for reverse lookups. + */ + if (linkref) { + cnid_t lastid; + struct cat_desc linkdesc; + int linkerr = 0; + + cnattr.ca_linkref = linkref; + bzero (&linkdesc, sizeof (linkdesc)); + + /* + * If the caller supplied the raw inode value, then we don't know exactly + * which hardlink they wanted. It's likely that they acquired the raw inode + * value BEFORE the item became a hardlink, in which case, they probably + * want the oldest link. So request the oldest link from the catalog. + * + * Unfortunately, this requires that we iterate through all N hardlinks. On the plus + * side, since we know that we want the last linkID, we can also have this one + * call give us back the name of the last ID, since it's going to have it in-hand... + */ + linkerr = hfs_lookup_lastlink (hfsmp, linkref, &lastid, &linkdesc); + if ((linkerr == 0) && (lastid != 0)) { + /* + * Release any lingering buffers attached to our local descriptor. 
+ * Then copy the name and other business into the cndesc + */ + cat_releasedesc (&cndesc); + bcopy (&linkdesc, &cndesc, sizeof(linkdesc)); + } + /* If it failed, the linkref code will just use whatever it had in-hand below. */ + } + + if (linkref) { + int newvnode_flags = 0; + + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, + &cnfork, &vp, &newvnode_flags); + if (error == 0) { + VTOC(vp)->c_flag |= C_HARDLINK; + vnode_setmultipath(vp); + } + } else { + int newvnode_flags = 0; + + void *buf = hfs_malloc(MAXPATHLEN); + + /* Supply hfs_getnewvnode with a component name. */ + struct componentname cn = { + .cn_nameiop = LOOKUP, + .cn_flags = ISLASTCN, + .cn_pnlen = MAXPATHLEN, + .cn_namelen = cndesc.cd_namelen, + .cn_pnbuf = buf, + .cn_nameptr = buf + }; + + bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1); + + error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr, + &cnfork, &vp, &newvnode_flags); + + if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) { + hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid); + } + + hfs_free(buf, MAXPATHLEN); + } + cat_releasedesc(&cndesc); + + *vpp = vp; + if (vp && skiplock) { + hfs_unlock(VTOC(vp)); + } + return (error); +} + + +/* + * Flush out all the files in a filesystem. + */ +static int +#if QUOTA +hfs_flushfiles(struct mount *mp, int flags, struct proc *p) +#else +hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p) +#endif /* QUOTA */ +{ + struct hfsmount *hfsmp; + struct vnode *skipvp = NULLVP; + int error; + int accounted_root_usecounts; +#if QUOTA + int i; +#endif + + hfsmp = VFSTOHFS(mp); + + accounted_root_usecounts = 0; +#if QUOTA + /* + * The open quota files have an indirect reference on + * the root directory vnode. We must account for this + * extra reference when doing the intial vflush. + */ + if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) { + /* Find out how many quota files we have open. */ + for (i = 0; i < MAXQUOTAS; i++) { + if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP) + ++accounted_root_usecounts; + } + } +#endif /* QUOTA */ + + if (accounted_root_usecounts > 0) { + /* Obtain the root vnode so we can skip over it. */ + skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0); + } + + error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags); + if (error != 0) + return(error); + + error = vflush(mp, skipvp, SKIPSYSTEM | flags); + + if (skipvp) { + /* + * See if there are additional references on the + * root vp besides the ones obtained from the open + * quota files and CoreStorage. + */ + if ((error == 0) && + (vnode_isinuse(skipvp, accounted_root_usecounts))) { + error = EBUSY; /* root directory is still open */ + } + hfs_unlock(VTOC(skipvp)); + /* release the iocount from the hfs_chash_getvnode call above. */ + vnode_put(skipvp); + } + if (error && (flags & FORCECLOSE) == 0) + return (error); + +#if QUOTA + if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) { + for (i = 0; i < MAXQUOTAS; i++) { + if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP) + continue; + hfs_quotaoff(p, mp, i); + } + } +#endif /* QUOTA */ + + if (skipvp) { + error = vflush(mp, NULLVP, SKIPSYSTEM | flags); + } + + return (error); +} + +/* + * Update volume encoding bitmap (HFS Plus only) + * + * Mark a legacy text encoding as in-use (as needed) + * in the volume header of this HFS+ filesystem. 
+ */ +void +hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding) +{ +#define kIndexMacUkrainian 48 /* MacUkrainian encoding is 152 */ +#define kIndexMacFarsi 49 /* MacFarsi encoding is 140 */ + + u_int32_t index; + + switch (encoding) { + case kTextEncodingMacUkrainian: + index = kIndexMacUkrainian; + break; + case kTextEncodingMacFarsi: + index = kIndexMacFarsi; + break; + default: + index = encoding; + break; + } + + /* Only mark the encoding as in-use if it wasn't already set */ + if (index < 64 && (hfsmp->encodingsBitmap & (u_int64_t)(1ULL << index)) == 0) { + hfs_lock_mount (hfsmp); + hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index); + MarkVCBDirty(hfsmp); + hfs_unlock_mount(hfsmp); + } +} + +/* + * Update volume stats + * + * On journal volumes this will cause a volume header flush + */ +int +hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot) +{ + struct timeval tv; + + microtime(&tv); + + hfs_lock_mount (hfsmp); + + MarkVCBDirty(hfsmp); + hfsmp->hfs_mtime = tv.tv_sec; + + switch (op) { + case VOL_UPDATE: + break; + case VOL_MKDIR: + if (hfsmp->hfs_dircount != 0xFFFFFFFF) + ++hfsmp->hfs_dircount; + if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF) + ++hfsmp->vcbNmRtDirs; + break; + case VOL_RMDIR: + if (hfsmp->hfs_dircount != 0) + --hfsmp->hfs_dircount; + if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF) + --hfsmp->vcbNmRtDirs; + break; + case VOL_MKFILE: + if (hfsmp->hfs_filecount != 0xFFFFFFFF) + ++hfsmp->hfs_filecount; + if (inroot && hfsmp->vcbNmFls != 0xFFFF) + ++hfsmp->vcbNmFls; + break; + case VOL_RMFILE: + if (hfsmp->hfs_filecount != 0) + --hfsmp->hfs_filecount; + if (inroot && hfsmp->vcbNmFls != 0xFFFF) + --hfsmp->vcbNmFls; + break; + } + + hfs_unlock_mount (hfsmp); + + if (hfsmp->jnl) { + hfs_flushvolumeheader(hfsmp, 0); + } + + return (0); +} + + +#if CONFIG_HFS_STD +/* HFS Standard MDB flush */ +static int +hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) +{ + ExtendedVCB *vcb = HFSTOVCB(hfsmp); + struct filefork *fp; + HFSMasterDirectoryBlock *mdb; + struct buf *bp = NULL; + int retval; + int sector_size; + ByteCount namelen; + + sector_size = hfsmp->hfs_logical_block_size; + retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sector_size), sector_size, NOCRED, &bp); + if (retval) { + if (bp) + buf_brelse(bp); + return retval; + } + + hfs_lock_mount (hfsmp); + + mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sector_size)); + + mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime))); + mdb->drLsMod = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod))); + mdb->drAtrb = SWAP_BE16 (vcb->vcbAtrb); + mdb->drNmFls = SWAP_BE16 (vcb->vcbNmFls); + mdb->drAllocPtr = SWAP_BE16 (vcb->nextAllocation); + mdb->drClpSiz = SWAP_BE32 (vcb->vcbClpSiz); + mdb->drNxtCNID = SWAP_BE32 (vcb->vcbNxtCNID); + mdb->drFreeBks = SWAP_BE16 (vcb->freeBlocks); + + namelen = strlen((char *)vcb->vcbVN); + retval = utf8_to_hfs(vcb, namelen, vcb->vcbVN, mdb->drVN); + /* Retry with MacRoman in case that's how it was exported. 
*/ + if (retval) + retval = utf8_to_mac_roman(namelen, vcb->vcbVN, mdb->drVN); + + mdb->drVolBkUp = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbVolBkUp))); + mdb->drWrCnt = SWAP_BE32 (vcb->vcbWrCnt); + mdb->drNmRtDirs = SWAP_BE16 (vcb->vcbNmRtDirs); + mdb->drFilCnt = SWAP_BE32 (vcb->vcbFilCnt); + mdb->drDirCnt = SWAP_BE32 (vcb->vcbDirCnt); + + bcopy(vcb->vcbFndrInfo, mdb->drFndrInfo, sizeof(mdb->drFndrInfo)); + + fp = VTOF(vcb->extentsRefNum); + mdb->drXTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock); + mdb->drXTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount); + mdb->drXTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock); + mdb->drXTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount); + mdb->drXTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock); + mdb->drXTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount); + mdb->drXTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize); + mdb->drXTClpSiz = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + + fp = VTOF(vcb->catalogRefNum); + mdb->drCTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock); + mdb->drCTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount); + mdb->drCTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock); + mdb->drCTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount); + mdb->drCTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock); + mdb->drCTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount); + mdb->drCTFlSize = SWAP_BE32 (fp->ff_blocks * vcb->blockSize); + mdb->drCTClpSiz = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + + MarkVCBClean( vcb ); + + hfs_unlock_mount (hfsmp); + + /* If requested, flush out the alternate MDB */ + if (altflush) { + struct buf *alt_bp = NULL; + + if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_partition_avh_sector, sector_size, NOCRED, &alt_bp) == 0) { + bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sector_size), kMDBSize); + + (void) VNOP_BWRITE(alt_bp); + } else if (alt_bp) + buf_brelse(alt_bp); + } + + if (waitfor != MNT_WAIT) + buf_bawrite(bp); + else + retval = VNOP_BWRITE(bp); + + return (retval); +} +#endif + +/* + * Flush any dirty in-memory mount data to the on-disk + * volume header. + * + * Note: the on-disk volume signature is intentionally + * not flushed since the on-disk "H+" and "HX" signatures + * are always stored in-memory as "H+". + */ +int +hfs_flushvolumeheader(struct hfsmount *hfsmp, + hfs_flush_volume_header_options_t options) +{ + ExtendedVCB *vcb = HFSTOVCB(hfsmp); + struct filefork *fp; + HFSPlusVolumeHeader *volumeHeader, *altVH; + int retval; + struct buf *bp, *alt_bp; + int i; + daddr64_t priIDSector; + bool critical = false; + u_int16_t signature; + u_int16_t hfsversion; + daddr64_t avh_sector; + bool altflush = ISSET(options, HFS_FVH_WRITE_ALT); + + if (ISSET(options, HFS_FVH_FLUSH_IF_DIRTY) + && !hfs_header_needs_flushing(hfsmp)) { + return 0; + } + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return(0); + } +#if CONFIG_HFS_STD + if (hfsmp->hfs_flags & HFS_STANDARD) { + return hfs_flushMDB(hfsmp, ISSET(options, HFS_FVH_WAIT) ? 
MNT_WAIT : 0, altflush);
+ }
+#endif
+ priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
+ HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
+
+ if (hfs_start_transaction(hfsmp) != 0) {
+ return EINVAL;
+ }
+
+ bp = NULL;
+ alt_bp = NULL;
+
+ retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
+ HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
+ hfsmp->hfs_physical_block_size, NOCRED, &bp);
+ if (retval) {
+ printf("hfs: err %d reading VH blk (vol=%s)\n", retval, vcb->vcbVN);
+ goto err_exit;
+ }
+
+ volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) +
+ HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
+
+ /*
+ * Sanity check what we just read. If it's bad, try the alternate
+ * instead.
+ */
+ signature = SWAP_BE16 (volumeHeader->signature);
+ hfsversion = SWAP_BE16 (volumeHeader->version);
+ if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
+ (hfsversion < kHFSPlusVersion) || (hfsversion > 100) ||
+ (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) {
+ printf("hfs: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d\n",
+ vcb->vcbVN, signature, hfsversion,
+ SWAP_BE32 (volumeHeader->blockSize));
+ hfs_mark_inconsistent(hfsmp, HFS_INCONSISTENCY_DETECTED);
+
+ /* Almost always we read AVH relative to the partition size */
+ avh_sector = hfsmp->hfs_partition_avh_sector;
+
+ if (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector) {
+ /*
+ * The two altVH offsets do not match --- which means that a smaller file
+ * system exists in a larger partition. Verify that we have the correct
+ * alternate volume header sector as per the current partition size.
+ * The GPT device that we are mounted on top could have changed sizes
+ * without us knowing.
+ *
+ * We're in a transaction, so it's safe to modify the partition_avh_sector
+ * field if necessary.
+ */
+
+ uint64_t sector_count;
+
+ /* Get underlying device block count */
+ if ((retval = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCGETBLOCKCOUNT,
+ (caddr_t)&sector_count, 0, vfs_context_current()))) {
+ printf("hfs_flushVH: err %d getting block count (%s) \n", retval, vcb->vcbVN);
+ retval = ENXIO;
+ goto err_exit;
+ }
+
+ /* Partition size was changed without our knowledge */
+ if (sector_count != (uint64_t)hfsmp->hfs_logical_block_count) {
+ hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
+ HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, sector_count);
+ /* Note: hfs_fs_avh_sector will remain unchanged */
+ printf ("hfs_flushVH: partition size changed, partition_avh_sector=%qu, fs_avh_sector=%qu\n",
+ hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector);
+
+ /*
+ * We just updated the offset for AVH relative to
+ * the partition size, so the content of that AVH
+ * will be invalid. But since we are also maintaining
+ * a valid AVH relative to the file system size, we
+ * can read it since primary VH and partition AVH
+ * are not valid.
+ */
+ avh_sector = hfsmp->hfs_fs_avh_sector;
+ }
+ }
+
+ printf ("hfs: trying alternate (for %s) avh_sector=%qu\n",
+ (avh_sector == hfsmp->hfs_fs_avh_sector) ?
"file system" : "partition", avh_sector); + + if (avh_sector) { + retval = buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(avh_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &alt_bp); + if (retval) { + printf("hfs: err %d reading alternate VH (%s)\n", retval, vcb->vcbVN); + goto err_exit; + } + + altVH = (HFSPlusVolumeHeader *)((char *)buf_dataptr(alt_bp) + + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)); + signature = SWAP_BE16(altVH->signature); + hfsversion = SWAP_BE16(altVH->version); + + if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) || + (hfsversion < kHFSPlusVersion) || (kHFSPlusVersion > 100) || + (SWAP_BE32(altVH->blockSize) != vcb->blockSize)) { + printf("hfs: corrupt alternate VH on %s, sig 0x%04x, ver %d, blksize %d\n", + vcb->vcbVN, signature, hfsversion, + SWAP_BE32(altVH->blockSize)); + retval = EIO; + goto err_exit; + } + + /* The alternate is plausible, so use it. */ + bcopy(altVH, volumeHeader, kMDBSize); + buf_brelse(alt_bp); + alt_bp = NULL; + } else { + /* No alternate VH, nothing more we can do. */ + retval = EIO; + goto err_exit; + } + } + + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp); + } + + /* + * For embedded HFS+ volumes, update create date if it changed + * (ie from a setattrlist call) + */ + if ((vcb->hfsPlusIOPosOffset != 0) && + (SWAP_BE32 (volumeHeader->createDate) != vcb->localCreateDate)) { + struct buf *bp2; + HFSMasterDirectoryBlock *mdb; + + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &bp2); + if (retval) { + if (bp2) + buf_brelse(bp2); + retval = 0; + } else { + mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) + + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); + + if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate ) + { + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp2); + } + + mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate); /* pick up the new create date */ + + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp2, NULL, NULL); + } else { + (void) VNOP_BWRITE(bp2); /* write out the changes */ + } + } + else + { + buf_brelse(bp2); /* just release it */ + } + } + } + + hfs_lock_mount (hfsmp); + + /* Note: only update the lower 16 bits worth of attributes */ + volumeHeader->attributes = SWAP_BE32 (vcb->vcbAtrb); + volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock); + if (hfsmp->jnl) { + volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion); + } else { + volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion); + } + volumeHeader->createDate = SWAP_BE32 (vcb->localCreateDate); /* volume create date is in local time */ + volumeHeader->modifyDate = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod)); + volumeHeader->backupDate = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp)); + volumeHeader->fileCount = SWAP_BE32 (vcb->vcbFilCnt); + volumeHeader->folderCount = SWAP_BE32 (vcb->vcbDirCnt); + volumeHeader->totalBlocks = SWAP_BE32 (vcb->totalBlocks); + volumeHeader->freeBlocks = SWAP_BE32 (vcb->freeBlocks + vcb->reclaimBlocks); + volumeHeader->nextAllocation = SWAP_BE32 (vcb->nextAllocation); + volumeHeader->rsrcClumpSize = SWAP_BE32 (vcb->vcbClpSiz); + volumeHeader->dataClumpSize = SWAP_BE32 (vcb->vcbClpSiz); + volumeHeader->nextCatalogID = SWAP_BE32 (vcb->vcbNxtCNID); + volumeHeader->writeCount = SWAP_BE32 (vcb->vcbWrCnt); + volumeHeader->encodingsBitmap = SWAP_BE64 (vcb->encodingsBitmap); + + if 
(bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) { + bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)); + critical = true; + } + + if (!altflush && !ISSET(options, HFS_FVH_FLUSH_IF_DIRTY)) { + goto done; + } + + /* Sync Extents over-flow file meta data */ + fp = VTOF(vcb->extentsRefNum); + if (FTOC(fp)->c_flag & C_MODIFIED) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->extentsFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->extentsFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->extentsFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + altflush = true; + } + + /* Sync Catalog file meta data */ + fp = VTOF(vcb->catalogRefNum); + if (FTOC(fp)->c_flag & C_MODIFIED) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->catalogFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->catalogFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->catalogFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + altflush = true; + } + + /* Sync Allocation file meta data */ + fp = VTOF(vcb->allocationsRefNum); + if (FTOC(fp)->c_flag & C_MODIFIED) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->allocationFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->allocationFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->allocationFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + altflush = true; + } + + /* Sync Attribute file meta data */ + if (hfsmp->hfs_attribute_vp) { + fp = VTOF(hfsmp->hfs_attribute_vp); + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->attributesFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->attributesFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + if (ISSET(FTOC(fp)->c_flag, C_MODIFIED)) { + FTOC(fp)->c_flag &= ~C_MODIFIED; + altflush = true; + } + volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->attributesFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + } + + /* Sync Startup file meta data */ + if (hfsmp->hfs_startup_vp) { + fp = VTOF(hfsmp->hfs_startup_vp); + if (FTOC(fp)->c_flag & C_MODIFIED) { + for (i = 0; i < kHFSPlusExtentDensity; i++) { + volumeHeader->startupFile.extents[i].startBlock = + SWAP_BE32 (fp->ff_extents[i].startBlock); + volumeHeader->startupFile.extents[i].blockCount = + SWAP_BE32 (fp->ff_extents[i].blockCount); + } + volumeHeader->startupFile.logicalSize = SWAP_BE64 (fp->ff_size); + volumeHeader->startupFile.totalBlocks = SWAP_BE32 (fp->ff_blocks); + volumeHeader->startupFile.clumpSize = SWAP_BE32 (fp->ff_clumpsize); + FTOC(fp)->c_flag &= ~C_MODIFIED; + altflush = true; + } + } + + if (altflush) + critical = true; + +done: 
+ MarkVCBClean(hfsmp);
+ hfs_unlock_mount (hfsmp);
+
+ /* If requested, flush out the alternate volume header */
+ if (altflush) {
+ /*
+ * The two altVH offsets do not match --- which means that a smaller file
+ * system exists in a larger partition. Verify that we have the correct
+ * alternate volume header sector as per the current partition size.
+ * The GPT device that we are mounted on top could have changed sizes
+ * without us knowing.
+ *
+ * We're in a transaction, so it's safe to modify the partition_avh_sector
+ * field if necessary.
+ */
+ if (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector) {
+ uint64_t sector_count;
+
+ /* Get underlying device block count */
+ if ((retval = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCGETBLOCKCOUNT,
+ (caddr_t)&sector_count, 0, vfs_context_current()))) {
+ printf("hfs_flushVH: err %d getting block count (%s) \n", retval, vcb->vcbVN);
+ retval = ENXIO;
+ goto err_exit;
+ }
+
+ /* Partition size was changed without our knowledge */
+ if (sector_count != (uint64_t)hfsmp->hfs_logical_block_count) {
+ hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
+ HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, sector_count);
+ /* Note: hfs_fs_avh_sector will remain unchanged */
+ printf ("hfs_flushVH: altflush: partition size changed, partition_avh_sector=%qu, fs_avh_sector=%qu\n",
+ hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector);
+ }
+ }
+
+ /*
+ * First see if we need to write I/O to the "secondary" AVH
+ * located at FS Size - 1024 bytes, because this one will
+ * always go into the journal. We put this AVH into the journal
+ * because even if the filesystem size has shrunk, this LBA should be
+ * reachable after the partition-size modification has occurred.
+ * The one where we need to be careful is partitionsize-1024, since the
+ * partition size should hopefully shrink.
+ *
+ * Most of the time this block will not execute.
+ */
+ if ((hfsmp->hfs_fs_avh_sector) &&
+ (hfsmp->hfs_partition_avh_sector != hfsmp->hfs_fs_avh_sector)) {
+ if (buf_meta_bread(hfsmp->hfs_devvp,
+ HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_fs_avh_sector, hfsmp->hfs_log_per_phys),
+ hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) {
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, alt_bp);
+ }
+
+ bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) +
+ HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size),
+ kMDBSize);
+
+ if (hfsmp->jnl) {
+ journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL);
+ } else {
+ (void) VNOP_BWRITE(alt_bp);
+ }
+ } else if (alt_bp) {
+ buf_brelse(alt_bp);
+ }
+ }
+
+ /*
+ * Flush out alternate volume header located at 1024 bytes before
+ * end of the partition as part of journal transaction. In
+ * most cases, this will be the only alternate volume header
+ * that we need to worry about because the file system size is
+ * same as the partition size, therefore hfs_fs_avh_sector is
+ * same as hfs_partition_avh_sector. This is the "priority" AVH.
+ *
+ * However, do not always put this I/O into the journal. If we skipped the
+ * FS-Size AVH write above, then we will put this I/O into the journal as
+ * that indicates the two were in sync. However, if the FS size is
+ * not the same as the partition size, we are tracking two. We don't
+ * put it in the journal in that case, since if the partition
+ * size changes between uptimes, and we need to replay the journal,
+ * this I/O could generate an EIO if during replay it is now trying
+ * to access blocks beyond the device EOF.
+ */ + if (hfsmp->hfs_partition_avh_sector) { + if (buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_partition_avh_sector, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) { + + /* only one AVH, put this I/O in the journal. */ + if ((hfsmp->jnl) && (hfsmp->hfs_partition_avh_sector == hfsmp->hfs_fs_avh_sector)) { + journal_modify_block_start(hfsmp->jnl, alt_bp); + } + + bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) + + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), + kMDBSize); + + /* If journaled and we only have one AVH to track */ + if ((hfsmp->jnl) && (hfsmp->hfs_partition_avh_sector == hfsmp->hfs_fs_avh_sector)) { + journal_modify_block_end (hfsmp->jnl, alt_bp, NULL, NULL); + } else { + /* + * If we don't have a journal or there are two AVH's at the + * moment, then this one doesn't go in the journal. Note that + * this one may generate I/O errors, since the partition + * can be resized behind our backs at any moment and this I/O + * may now appear to be beyond the device EOF. + */ + (void) VNOP_BWRITE(alt_bp); + hfs_flush(hfsmp, HFS_FLUSH_CACHE); + } + } else if (alt_bp) { + buf_brelse(alt_bp); + } + } + } + + /* Finish modifying the block for the primary VH */ + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); + } else { + if (!ISSET(options, HFS_FVH_WAIT)) { + buf_bawrite(bp); + } else { + retval = VNOP_BWRITE(bp); + /* When critical data changes, flush the device cache */ + if (critical && (retval == 0)) { + hfs_flush(hfsmp, HFS_FLUSH_CACHE); + } + } + } + hfs_end_transaction(hfsmp); + + return (retval); + +err_exit: + if (alt_bp) + buf_brelse(alt_bp); + if (bp) + buf_brelse(bp); + hfs_end_transaction(hfsmp); + return retval; +} + + +/* + * Creates a UUID from a unique "name" in the HFS UUID Name space. + * See version 3 UUID. + */ +void +hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result_uuid) +{ + + if (uuid_is_null(hfsmp->hfs_full_uuid)) { + uuid_t result; + + MD5_CTX md5c; + uint8_t rawUUID[8]; + + ((uint32_t *)rawUUID)[0] = hfsmp->vcbFndrInfo[6]; + ((uint32_t *)rawUUID)[1] = hfsmp->vcbFndrInfo[7]; + + MD5Init( &md5c ); + MD5Update( &md5c, HFS_UUID_NAMESPACE_ID, sizeof( uuid_t ) ); + MD5Update( &md5c, rawUUID, sizeof (rawUUID) ); + MD5Final( result, &md5c ); + + result[6] = 0x30 | ( result[6] & 0x0F ); + result[8] = 0x80 | ( result[8] & 0x3F ); + + uuid_copy(hfsmp->hfs_full_uuid, result); + } + uuid_copy (result_uuid, hfsmp->hfs_full_uuid); + +} + +/* + * Get file system attributes. 
+ */ +static int +hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context) +{ +#define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST | ATTR_FILE_CLUMPSIZE)) +#define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_DATA_PROTECT_FLAGS)) + + ExtendedVCB *vcb = VFSTOVCB(mp); + struct hfsmount *hfsmp = VFSTOHFS(mp); + + int searchfs_on = 0; + int exchangedata_on = 1; + +#if CONFIG_SEARCHFS + searchfs_on = 1; +#endif + +#if CONFIG_PROTECT + if (cp_fs_protected(mp)) { + exchangedata_on = 0; + } +#endif + + VFSATTR_RETURN(fsap, f_objcount, (u_int64_t)hfsmp->vcbFilCnt + (u_int64_t)hfsmp->vcbDirCnt); + VFSATTR_RETURN(fsap, f_filecount, (u_int64_t)hfsmp->vcbFilCnt); + VFSATTR_RETURN(fsap, f_dircount, (u_int64_t)hfsmp->vcbDirCnt); + VFSATTR_RETURN(fsap, f_maxobjcount, (u_int64_t)0xFFFFFFFF); + VFSATTR_RETURN(fsap, f_iosize, (size_t)cluster_max_io_size(mp, 0)); + VFSATTR_RETURN(fsap, f_blocks, (u_int64_t)hfsmp->totalBlocks); + VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)hfs_freeblks(hfsmp, 0)); + VFSATTR_RETURN(fsap, f_bavail, (u_int64_t)hfs_freeblks(hfsmp, 1)); + VFSATTR_RETURN(fsap, f_bsize, (u_int32_t)vcb->blockSize); + /* XXX needs clarification */ + VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1)); + VFSATTR_RETURN(fsap, f_files, (u_int64_t)HFS_MAX_FILES); + VFSATTR_RETURN(fsap, f_ffree, (u_int64_t)hfs_free_cnids(hfsmp)); + + fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev; + fsap->f_fsid.val[1] = vfs_typenum(mp); + VFSATTR_SET_SUPPORTED(fsap, f_fsid); + + VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord); + VFSATTR_RETURN(fsap, f_carbon_fsid, 0); + + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + vol_capabilities_attr_t *cap; + + cap = &fsap->f_capabilities; + + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { + /* HFS+ & variants */ + cap->capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_ZERO_RUNS | + (hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) | + (hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? VOL_CAP_FMT_CASE_SENSITIVE : 0) | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE | + VOL_CAP_FMT_HIDDEN_FILES | +#if HFS_COMPRESSION + VOL_CAP_FMT_DECMPFS_COMPRESSION | +#endif +#if CONFIG_HFS_DIRLINK + VOL_CAP_FMT_DIR_HARDLINKS | +#endif +#ifdef VOL_CAP_FMT_DOCUMENT_ID + VOL_CAP_FMT_DOCUMENT_ID | +#endif /* VOL_CAP_FMT_DOCUMENT_ID */ +#ifdef VOL_CAP_FMT_WRITE_GENERATION_COUNT + VOL_CAP_FMT_WRITE_GENERATION_COUNT | +#endif /* VOL_CAP_FMT_WRITE_GENERATION_COUNT */ + VOL_CAP_FMT_PATH_FROM_ID; + } +#if CONFIG_HFS_STD + else { + /* HFS standard */ + cap->capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_HIDDEN_FILES | + VOL_CAP_FMT_PATH_FROM_ID; + } +#endif + + /* + * The capabilities word in 'cap' tell you whether or not + * this particular filesystem instance has feature X enabled. 
+ */ + + cap->capabilities[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK | +#if VOL_CAP_INT_RENAME_EXCL + VOL_CAP_INT_RENAME_EXCL | +#endif +#if NAMEDSTREAMS + VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_NAMEDSTREAMS; +#else + VOL_CAP_INT_EXTENDED_ATTR; +#endif + + /* HFS may conditionally support searchfs and exchangedata depending on the runtime */ + + if (searchfs_on) { + cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_SEARCHFS; + } + if (exchangedata_on) { + cap->capabilities[VOL_CAPABILITIES_INTERFACES] |= VOL_CAP_INT_EXCHANGEDATA; + } + + cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0; + cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0; + + cap->valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE | + VOL_CAP_FMT_OPENDENYMODES | + VOL_CAP_FMT_HIDDEN_FILES | + VOL_CAP_FMT_PATH_FROM_ID | + VOL_CAP_FMT_DECMPFS_COMPRESSION | +#ifdef VOL_CAP_FMT_DOCUMENT_ID + VOL_CAP_FMT_DOCUMENT_ID | +#endif /* VOL_CAP_FMT_DOCUMENT_ID */ +#ifdef VOL_CAP_FMT_WRITE_GENERATION_COUNT + VOL_CAP_FMT_WRITE_GENERATION_COUNT | +#endif /* VOL_CAP_FMT_WRITE_GENERATION_COUNT */ + VOL_CAP_FMT_DIR_HARDLINKS; + + /* + * Bits in the "valid" field tell you whether or not the on-disk + * format supports feature X. + */ + + cap->valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK | + VOL_CAP_INT_MANLOCK | +#if VOL_CAP_INT_RENAME_EXCL + VOL_CAP_INT_RENAME_EXCL | +#endif + +#if NAMEDSTREAMS + VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_NAMEDSTREAMS; +#else + VOL_CAP_INT_EXTENDED_ATTR; +#endif + + /* HFS always supports exchangedata and searchfs in the on-disk format natively */ + cap->valid[VOL_CAPABILITIES_INTERFACES] |= (VOL_CAP_INT_SEARCHFS | VOL_CAP_INT_EXCHANGEDATA); + + + cap->valid[VOL_CAPABILITIES_RESERVED1] = 0; + cap->valid[VOL_CAPABILITIES_RESERVED2] = 0; + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); + } + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + vol_attributes_attr_t *attrp = &fsap->f_attributes; + + attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; +#if CONFIG_PROTECT + attrp->validattr.commonattr |= ATTR_CMN_DATA_PROTECT_FLAGS; +#endif // CONFIG_PROTECT + + attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; + attrp->validattr.dirattr = ATTR_DIR_VALIDMASK; + attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK; + attrp->validattr.forkattr = 0; + + attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; +#if CONFIG_PROTECT + attrp->nativeattr.commonattr |= ATTR_CMN_DATA_PROTECT_FLAGS; +#endif // CONFIG_PROTECT + + attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; + attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK; + attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK; + attrp->nativeattr.forkattr = 0; + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + fsap->f_create_time.tv_sec = hfsmp->hfs_itime; + fsap->f_create_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_create_time); + 
fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod; + fsap->f_modify_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_modify_time); + // We really don't have volume access time, they should check the root node, fake it up + if (VFSATTR_IS_ACTIVE(fsap, f_access_time)) { + struct timeval tv; + + microtime(&tv); + fsap->f_access_time.tv_sec = tv.tv_sec; + fsap->f_access_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_access_time); + } + + fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp; + fsap->f_backup_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_backup_time); + + if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) { + u_int16_t subtype = 0; + + /* + * Subtypes (flavors) for HFS + * 0: Mac OS Extended + * 1: Mac OS Extended (Journaled) + * 2: Mac OS Extended (Case Sensitive) + * 3: Mac OS Extended (Case Sensitive, Journaled) + * 4 - 127: Reserved + * 128: Mac OS Standard + * + */ + if ((hfsmp->hfs_flags & HFS_STANDARD) == 0) { + if (hfsmp->jnl) { + subtype |= HFS_SUBTYPE_JOURNALED; + } + if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE) { + subtype |= HFS_SUBTYPE_CASESENSITIVE; + } + } +#if CONFIG_HFS_STD + else { + subtype = HFS_SUBTYPE_STANDARDHFS; + } +#endif + fsap->f_fssubtype = subtype; + VFSATTR_SET_SUPPORTED(fsap, f_fssubtype); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + strlcpy(fsap->f_vol_name, (char *) hfsmp->vcbVN, MAXPATHLEN); + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + } + if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) { + hfs_getvoluuid(hfsmp, fsap->f_uuid); + VFSATTR_SET_SUPPORTED(fsap, f_uuid); + } + return (0); +} + +/* + * Perform a volume rename. Requires the FS' root vp. + */ +static int +hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) +{ + ExtendedVCB *vcb = VTOVCB(vp); + struct cnode *cp = VTOC(vp); + struct hfsmount *hfsmp = VTOHFS(vp); + struct cat_desc to_desc; + struct cat_desc todir_desc; + struct cat_desc new_desc; + cat_cookie_t cookie; + int lockflags; + int error = 0; + char converted_volname[256]; + size_t volname_length = 0; + size_t conv_volname_length = 0; + + + /* + * Ignore attempts to rename a volume to a zero-length name. + */ + if (name[0] == 0) + return(0); + + bzero(&to_desc, sizeof(to_desc)); + bzero(&todir_desc, sizeof(todir_desc)); + bzero(&new_desc, sizeof(new_desc)); + bzero(&cookie, sizeof(cookie)); + + todir_desc.cd_parentcnid = kHFSRootParentID; + todir_desc.cd_cnid = kHFSRootFolderID; + todir_desc.cd_flags = CD_ISDIR; + + to_desc.cd_nameptr = (const u_int8_t *)name; + to_desc.cd_namelen = strlen(name); + to_desc.cd_parentcnid = kHFSRootParentID; + to_desc.cd_cnid = cp->c_cnid; + to_desc.cd_flags = CD_ISDIR; + + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) == 0) { + if ((error = hfs_start_transaction(hfsmp)) == 0) { + if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc); + + /* + * If successful, update the name in the VCB, ensure it's terminated. 
+				 */
+				if (error == 0) {
+					strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN));
+
+					volname_length = strlen ((const char*)vcb->vcbVN);
+					/* Send the volume name down to CoreStorage if necessary */
+					error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED);
+					if (error == 0) {
+						(void) VNOP_IOCTL (hfsmp->hfs_devvp, _DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current());
+					}
+					error = 0;
+				}
+
+				hfs_systemfile_unlock(hfsmp, lockflags);
+				cat_postflight(hfsmp, &cookie, p);
+
+				if (error)
+					MarkVCBDirty(vcb);
+				(void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
+			}
+			hfs_end_transaction(hfsmp);
+		}
+		if (!error) {
+			/* Release old allocated name buffer */
+			if (cp->c_desc.cd_flags & CD_HASBUF) {
+				const char *tmp_name = (const char *)cp->c_desc.cd_nameptr;
+
+				cp->c_desc.cd_nameptr = 0;
+				cp->c_desc.cd_namelen = 0;
+				cp->c_desc.cd_flags &= ~CD_HASBUF;
+				vfs_removename(tmp_name);
+			}
+			/* Update cnode's catalog descriptor */
+			replace_desc(cp, &new_desc);
+			vcb->volumeNameEncodingHint = new_desc.cd_encoding;
+			cp->c_touch_chgtime = TRUE;
+		}
+
+		hfs_unlock(cp);
+	}
+
+	return(error);
+}
+
+/*
+ * Set file system attributes.
+ */
+static int
+hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, vfs_context_t context)
+{
+	kauth_cred_t cred = vfs_context_ucred(context);
+	int error = 0;
+
+	/*
+	 * Must be superuser or owner of filesystem to change volume attributes
+	 */
+	if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner))
+		return(EACCES);
+
+	if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
+		vnode_t root_vp;
+
+		error = hfs_vfs_root(mp, &root_vp, context);
+		if (error)
+			goto out;
+
+		error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context));
+		(void) vnode_put(root_vp);
+		if (error)
+			goto out;
+
+		VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
+	}
+
+out:
+	return error;
+}
+
+/* If runtime corruption is detected, set the volume inconsistent
+ * bit in the volume attributes.  The volume inconsistent bit is a persistent
+ * bit which indicates that the volume is corrupt and needs repair.
+ * It can be set by the kernel when it detects runtime corruption, or by
+ * file system repair utilities like fsck_hfs when a repair operation fails.
+ * The bit should be cleared only by a file system verify/repair utility
+ * like fsck_hfs when a verify/repair succeeds.
+ */ +void hfs_mark_inconsistent(struct hfsmount *hfsmp, + hfs_inconsistency_reason_t reason) +{ + hfs_lock_mount (hfsmp); + if ((hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) == 0) { + hfsmp->vcbAtrb |= kHFSVolumeInconsistentMask; + MarkVCBDirty(hfsmp); + } + if ((hfsmp->hfs_flags & HFS_READ_ONLY)==0) { + switch (reason) { + case HFS_INCONSISTENCY_DETECTED: + printf("hfs_mark_inconsistent: Runtime corruption detected on %s, fsck will be forced on next mount.\n", + hfsmp->vcbVN); + break; + case HFS_ROLLBACK_FAILED: + printf("hfs_mark_inconsistent: Failed to roll back; volume `%s' might be inconsistent; fsck will be forced on next mount.\n", + hfsmp->vcbVN); + break; + case HFS_OP_INCOMPLETE: + printf("hfs_mark_inconsistent: Failed to complete operation; volume `%s' might be inconsistent; fsck will be forced on next mount.\n", + hfsmp->vcbVN); + break; + case HFS_FSCK_FORCED: + printf("hfs_mark_inconsistent: fsck requested for `%s'; fsck will be forced on next mount.\n", + hfsmp->vcbVN); + break; + } + } + hfs_unlock_mount (hfsmp); +} + +/* Replay the journal on the device node provided. Returns zero if + * journal replay succeeded or no journal was supposed to be replayed. + */ +static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) +{ + int retval = 0; + int error = 0; + + /* Replay allowed only on raw devices */ + if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) + return EINVAL; + + retval = hfs_mountfs(devvp, NULL, NULL, /* journal_replay_only: */ 1, context); + buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay"); + + /* FSYNC the devnode to be sure all data has been flushed */ + error = VNOP_FSYNC(devvp, MNT_WAIT, context); + if (error) { + retval = error; + } + + return retval; +} + + +/* + * Cancel the syncer + */ +static void +hfs_syncer_free(struct hfsmount *hfsmp) +{ + if (hfsmp && ISSET(hfsmp->hfs_flags, HFS_RUN_SYNCER)) { + hfs_syncer_lock(hfsmp); + CLR(hfsmp->hfs_flags, HFS_RUN_SYNCER); + hfs_syncer_unlock(hfsmp); + + // Wait for the syncer thread to finish + if (hfsmp->hfs_syncer_thread) { + hfs_syncer_wakeup(hfsmp); + hfs_syncer_lock(hfsmp); + while (hfsmp->hfs_syncer_thread) + hfs_syncer_wait(hfsmp, NULL); + hfs_syncer_unlock(hfsmp); + } + } +} + +static int hfs_vfs_ioctl(struct mount *mp, u_long command, caddr_t data, + __unused int flags, __unused vfs_context_t context) +{ + switch (command) { +#if CONFIG_PROTECT + case FIODEVICELOCKED: + cp_device_locked_callback(mp, (cp_lock_state_t)data); + return 0; +#endif + } + return ENOTTY; +} + +/* + * hfs vfs operations. + */ +const struct vfsops hfs_vfsops = { + .vfs_mount = hfs_mount, + .vfs_start = hfs_start, + .vfs_unmount = hfs_unmount, + .vfs_root = hfs_vfs_root, + .vfs_quotactl = hfs_quotactl, + .vfs_getattr = hfs_vfs_getattr, + .vfs_sync = hfs_sync, + .vfs_vget = hfs_vfs_vget, + .vfs_fhtovp = hfs_fhtovp, + .vfs_vptofh = hfs_vptofh, + .vfs_init = hfs_init, + .vfs_sysctl = hfs_sysctl, + .vfs_setattr = hfs_vfs_setattr, + .vfs_ioctl = hfs_vfs_ioctl, +}; diff --git a/core/hfs_vfsutils.c b/core/hfs_vfsutils.c new file mode 100644 index 0000000..fa2d856 --- /dev/null +++ b/core/hfs_vfsutils.c @@ -0,0 +1,4462 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* @(#)hfs_vfsutils.c 4.0 +* +* (c) 1997-2002 Apple Inc. All Rights Reserved +* +* hfs_vfsutils.c -- Routines that go between the HFS layer and the VFS. +* +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* for parsing boot-args */ +#include +#include +#include + +#include "hfs_iokit.h" +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_dbg.h" +#include "hfs_mount.h" +#include "hfs_endian.h" +#include "hfs_cnode.h" +#include "hfs_fsctl.h" +#include "hfs_cprotect.h" + +#include "FileMgrInternal.h" +#include "BTreesInternal.h" +#include "HFSUnicodeWrappers.h" + +/* Enable/disable debugging code for live volume resizing, defined in hfs_resize.c */ +extern int hfs_resize_debug; + +static void ReleaseMetaFileVNode(struct vnode *vp); +static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args); + +static u_int32_t hfs_hotfile_freeblocks(struct hfsmount *); +static void hfs_thaw_locked(struct hfsmount *hfsmp); + +#define HFS_MOUNT_DEBUG 1 + + +//******************************************************************************* +// Note: Finder information in the HFS/HFS+ metadata are considered opaque and +// hence are not in the right byte order on little endian machines. It is +// the responsibility of the finder and other clients to swap the data. 
+//******************************************************************************* + +//******************************************************************************* +// Routine: hfs_MountHFSVolume +// +// +//******************************************************************************* +unsigned char hfs_catname[] = "Catalog B-tree"; +unsigned char hfs_extname[] = "Extents B-tree"; +unsigned char hfs_vbmname[] = "Volume Bitmap"; +unsigned char hfs_attrname[] = "Attribute B-tree"; +unsigned char hfs_startupname[] = "Startup File"; + +#if CONFIG_HFS_STD +OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, + __unused struct proc *p) +{ + ExtendedVCB *vcb = HFSTOVCB(hfsmp); + int error; + ByteCount utf8chars; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct cat_fork fork; + int newvnode_flags = 0; + + /* Block size must be a multiple of 512 */ + if (SWAP_BE32(mdb->drAlBlkSiz) == 0 || + (SWAP_BE32(mdb->drAlBlkSiz) & 0x01FF) != 0) + return (EINVAL); + + /* don't mount a writeable volume if its dirty, it must be cleaned by fsck_hfs */ + if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) && + ((SWAP_BE16(mdb->drAtrb) & kHFSVolumeUnmountedMask) == 0)) { + return (EINVAL); + } + hfsmp->hfs_flags |= HFS_STANDARD; + /* + * The MDB seems OK: transfer info from it into VCB + * Note - the VCB starts out clear (all zeros) + * + */ + vcb->vcbSigWord = SWAP_BE16 (mdb->drSigWord); + vcb->hfs_itime = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drCrDate))); + vcb->localCreateDate = SWAP_BE32 (mdb->drCrDate); + vcb->vcbLsMod = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drLsMod))); + vcb->vcbAtrb = SWAP_BE16 (mdb->drAtrb); + vcb->vcbNmFls = SWAP_BE16 (mdb->drNmFls); + vcb->vcbVBMSt = SWAP_BE16 (mdb->drVBMSt); + vcb->nextAllocation = SWAP_BE16 (mdb->drAllocPtr); + vcb->totalBlocks = SWAP_BE16 (mdb->drNmAlBlks); + vcb->allocLimit = vcb->totalBlocks; + vcb->blockSize = SWAP_BE32 (mdb->drAlBlkSiz); + vcb->vcbClpSiz = SWAP_BE32 (mdb->drClpSiz); + vcb->vcbAlBlSt = SWAP_BE16 (mdb->drAlBlSt); + vcb->vcbNxtCNID = SWAP_BE32 (mdb->drNxtCNID); + vcb->freeBlocks = SWAP_BE16 (mdb->drFreeBks); + vcb->vcbVolBkUp = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drVolBkUp))); + vcb->vcbWrCnt = SWAP_BE32 (mdb->drWrCnt); + vcb->vcbNmRtDirs = SWAP_BE16 (mdb->drNmRtDirs); + vcb->vcbFilCnt = SWAP_BE32 (mdb->drFilCnt); + vcb->vcbDirCnt = SWAP_BE32 (mdb->drDirCnt); + bcopy(mdb->drFndrInfo, vcb->vcbFndrInfo, sizeof(vcb->vcbFndrInfo)); + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) + vcb->vcbWrCnt++; /* Compensate for write of MDB on last flush */ + + /* convert hfs encoded name into UTF-8 string */ + error = hfs_to_utf8(vcb, mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); + /* + * When an HFS name cannot be encoded with the current + * volume encoding we use MacRoman as a fallback. + */ + if (error || (utf8chars == 0)) { + error = mac_roman_to_utf8(mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); + /* If we fail to encode to UTF8 from Mac Roman, the name is bad. 
Deny the mount */ + if (error) { + goto MtVolErr; + } + } + + hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); + vcb->vcbVBMIOSize = kHFSBlockSize; + + /* Generate the partition-based AVH location */ + hfsmp->hfs_partition_avh_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, + hfsmp->hfs_logical_block_count); + + /* HFS standard is read-only, so just stuff the FS location in here, too */ + hfsmp->hfs_fs_avh_sector = hfsmp->hfs_partition_avh_sector; + + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_parentcnid = kHFSRootParentID; + cndesc.cd_flags |= CD_ISMETA; + bzero(&cnattr, sizeof(cnattr)); + cnattr.ca_linkcount = 1; + cnattr.ca_mode = S_IFREG; + bzero(&fork, sizeof(fork)); + + /* + * Set up Extents B-tree vnode + */ + cndesc.cd_nameptr = hfs_extname; + cndesc.cd_namelen = strlen((char *)hfs_extname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSExtentsFileID; + fork.cf_size = SWAP_BE32(mdb->drXTFlSize); + fork.cf_blocks = fork.cf_size / vcb->blockSize; + fork.cf_clump = SWAP_BE32(mdb->drXTClpSiz); + fork.cf_vblocks = 0; + fork.cf_extents[0].startBlock = SWAP_BE16(mdb->drXTExtRec[0].startBlock); + fork.cf_extents[0].blockCount = SWAP_BE16(mdb->drXTExtRec[0].blockCount); + fork.cf_extents[1].startBlock = SWAP_BE16(mdb->drXTExtRec[1].startBlock); + fork.cf_extents[1].blockCount = SWAP_BE16(mdb->drXTExtRec[1].blockCount); + fork.cf_extents[2].startBlock = SWAP_BE16(mdb->drXTExtRec[2].startBlock); + fork.cf_extents[2].blockCount = SWAP_BE16(mdb->drXTExtRec[2].blockCount); + cnattr.ca_blocks = fork.cf_blocks; + + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, + &hfsmp->hfs_extents_vp, &newvnode_flags); + if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating Ext Vnode (%d) \n", error); + } + goto MtVolErr; + } + error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), + (KeyCompareProcPtr)CompareExtentKeys)); + if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error opening Ext Vnode (%d) \n", error); + } + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + goto MtVolErr; + } + hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp); + + /* + * Set up Catalog B-tree vnode... 
+ */ + cndesc.cd_nameptr = hfs_catname; + cndesc.cd_namelen = strlen((char *)hfs_catname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSCatalogFileID; + fork.cf_size = SWAP_BE32(mdb->drCTFlSize); + fork.cf_blocks = fork.cf_size / vcb->blockSize; + fork.cf_clump = SWAP_BE32(mdb->drCTClpSiz); + fork.cf_vblocks = 0; + fork.cf_extents[0].startBlock = SWAP_BE16(mdb->drCTExtRec[0].startBlock); + fork.cf_extents[0].blockCount = SWAP_BE16(mdb->drCTExtRec[0].blockCount); + fork.cf_extents[1].startBlock = SWAP_BE16(mdb->drCTExtRec[1].startBlock); + fork.cf_extents[1].blockCount = SWAP_BE16(mdb->drCTExtRec[1].blockCount); + fork.cf_extents[2].startBlock = SWAP_BE16(mdb->drCTExtRec[2].startBlock); + fork.cf_extents[2].blockCount = SWAP_BE16(mdb->drCTExtRec[2].blockCount); + cnattr.ca_blocks = fork.cf_blocks; + + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, + &hfsmp->hfs_catalog_vp, &newvnode_flags); + if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating catalog Vnode (%d) \n", error); + } + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + goto MtVolErr; + } + error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), + (KeyCompareProcPtr)CompareCatalogKeys)); + if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error opening catalog Vnode (%d) \n", error); + } + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + goto MtVolErr; + } + hfsmp->hfs_catalog_cp = VTOC(hfsmp->hfs_catalog_vp); + + /* + * Set up dummy Allocation file vnode (used only for locking bitmap) + */ + cndesc.cd_nameptr = hfs_vbmname; + cndesc.cd_namelen = strlen((char *)hfs_vbmname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSAllocationFileID; + bzero(&fork, sizeof(fork)); + cnattr.ca_blocks = 0; + + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, + &hfsmp->hfs_allocation_vp, &newvnode_flags); + if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating bitmap Vnode (%d) \n", error); + } + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + goto MtVolErr; + } + hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); + + /* mark the volume dirty (clear clean unmount bit) */ + vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; + + if (error == noErr) { + error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, NULL, NULL, NULL); + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error looking up root folder (%d) \n", error); + } + } + + if (error == noErr) { + /* If the disk isn't write protected.. */ + if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask)) { + MarkVCBDirty (vcb); // mark VCB dirty so it will be written + } + } + + /* + * all done with system files so we can unlock now... + */ + hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); + hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); + hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); + + if (error == noErr) { + /* If successful, then we can just return once we've unlocked the cnodes */ + return error; + } + + //-- Release any resources allocated so far before exiting with an error: +MtVolErr: + hfsUnmount(hfsmp, NULL); + + return (error); +} + +#endif + +//******************************************************************************* +// +// Sanity check Volume Header Block: +// Input argument *vhp is a pointer to a HFSPlusVolumeHeader block that has +// not been endian-swapped and represents the on-disk contents of this sector. +// This routine will not change the endianness of vhp block. 
+// +//******************************************************************************* +OSErr hfs_ValidateHFSPlusVolumeHeader(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp) +{ + u_int16_t signature; + u_int16_t hfs_version; + u_int32_t blockSize; + + signature = SWAP_BE16(vhp->signature); + hfs_version = SWAP_BE16(vhp->version); + + if (signature == kHFSPlusSigWord) { + if (hfs_version != kHFSPlusVersion) { + printf("hfs_ValidateHFSPlusVolumeHeader: invalid HFS+ version: %x\n", hfs_version); + return (EINVAL); + } + } else if (signature == kHFSXSigWord) { + if (hfs_version != kHFSXVersion) { + printf("hfs_ValidateHFSPlusVolumeHeader: invalid HFSX version: %x\n", hfs_version); + return (EINVAL); + } + } else { + /* Removed printf for invalid HFS+ signature because it gives + * false error for UFS root volume + */ + if (HFS_MOUNT_DEBUG) { + printf("hfs_ValidateHFSPlusVolumeHeader: unknown Volume Signature : %x\n", signature); + } + return (EINVAL); + } + + /* Block size must be at least 512 and a power of 2 */ + blockSize = SWAP_BE32(vhp->blockSize); + if (blockSize < 512 || !powerof2(blockSize)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_ValidateHFSPlusVolumeHeader: invalid blocksize (%d) \n", blockSize); + } + return (EINVAL); + } + + if (blockSize < hfsmp->hfs_logical_block_size) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_ValidateHFSPlusVolumeHeader: invalid physical blocksize (%d), hfs_logical_blocksize (%d) \n", + blockSize, hfsmp->hfs_logical_block_size); + } + return (EINVAL); + } + return 0; +} + +//******************************************************************************* +// Routine: hfs_MountHFSPlusVolume +// +// +//******************************************************************************* + +OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, + off_t embeddedOffset, u_int64_t disksize, __unused struct proc *p, void *args, kauth_cred_t cred) +{ + register ExtendedVCB *vcb; + struct cat_desc cndesc; + struct cat_attr cnattr; + struct cat_fork cfork; + u_int32_t blockSize; + daddr64_t spare_sectors; + struct BTreeInfoRec btinfo; + u_int16_t signature; + u_int16_t hfs_version; + int newvnode_flags = 0; + int i; + OSErr retval; + char converted_volname[256]; + size_t volname_length = 0; + size_t conv_volname_length = 0; + bool async_bitmap_scan; + + signature = SWAP_BE16(vhp->signature); + hfs_version = SWAP_BE16(vhp->version); + + retval = hfs_ValidateHFSPlusVolumeHeader(hfsmp, vhp); + if (retval) + return retval; + + if (signature == kHFSXSigWord) { + /* The in-memory signature is always 'H+'. */ + signature = kHFSPlusSigWord; + hfsmp->hfs_flags |= HFS_X; + } + + blockSize = SWAP_BE32(vhp->blockSize); + /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */ + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0 && hfsmp->jnl == NULL && + (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: cannot mount dirty non-journaled volumes\n"); + } + return (EINVAL); + } + + /* Make sure we can live with the physical block size. */ + if ((disksize & (hfsmp->hfs_logical_block_size - 1)) || + (embeddedOffset & (hfsmp->hfs_logical_block_size - 1))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_logical_blocksize (%d) \n", + hfsmp->hfs_logical_block_size); + } + return (ENXIO); + } + + /* + * If allocation block size is less than the physical block size, + * same data could be cached in two places and leads to corruption. 
+ * + * HFS Plus reserves one allocation block for the Volume Header. + * If the physical size is larger, then when we read the volume header, + * we will also end up reading in the next allocation block(s). + * If those other allocation block(s) is/are modified, and then the volume + * header is modified, the write of the volume header's buffer will write + * out the old contents of the other allocation blocks. + * + * We assume that the physical block size is same as logical block size. + * The physical block size value is used to round down the offsets for + * reading and writing the primary and alternate volume headers. + * + * The same logic to ensure good hfs_physical_block_size is also in + * hfs_mountfs so that hfs_mountfs, hfs_MountHFSPlusVolume and + * later are doing the I/Os using same block size. + */ + if (blockSize < hfsmp->hfs_physical_block_size) { + hfsmp->hfs_physical_block_size = hfsmp->hfs_logical_block_size; + hfsmp->hfs_log_per_phys = 1; + } + + /* + * The VolumeHeader seems OK: transfer info from it into VCB + * Note - the VCB starts out clear (all zeros) + */ + vcb = HFSTOVCB(hfsmp); + + vcb->vcbSigWord = signature; + vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); + vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); + vcb->vcbAtrb = SWAP_BE32(vhp->attributes); + vcb->vcbClpSiz = SWAP_BE32(vhp->rsrcClumpSize); + vcb->vcbNxtCNID = SWAP_BE32(vhp->nextCatalogID); + vcb->vcbVolBkUp = to_bsd_time(SWAP_BE32(vhp->backupDate)); + vcb->vcbWrCnt = SWAP_BE32(vhp->writeCount); + vcb->vcbFilCnt = SWAP_BE32(vhp->fileCount); + vcb->vcbDirCnt = SWAP_BE32(vhp->folderCount); + + /* copy 32 bytes of Finder info */ + bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo)); + + vcb->vcbAlBlSt = 0; /* hfs+ allocation blocks start at first block of volume */ + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) + vcb->vcbWrCnt++; /* compensate for write of Volume Header on last flush */ + + /* Now fill in the Extended VCB info */ + vcb->nextAllocation = SWAP_BE32(vhp->nextAllocation); + vcb->totalBlocks = SWAP_BE32(vhp->totalBlocks); + vcb->allocLimit = vcb->totalBlocks; + vcb->freeBlocks = SWAP_BE32(vhp->freeBlocks); + vcb->blockSize = blockSize; + vcb->encodingsBitmap = SWAP_BE64(vhp->encodingsBitmap); + vcb->localCreateDate = SWAP_BE32(vhp->createDate); + + vcb->hfsPlusIOPosOffset = embeddedOffset; + + /* Default to no free block reserve */ + vcb->reserveBlocks = 0; + + /* + * Update the logical block size in the mount struct + * (currently set up from the wrapper MDB) using the + * new blocksize value: + */ + hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); + vcb->vcbVBMIOSize = min(vcb->blockSize, MAXPHYSIO); + + /* + * Validate and initialize the location of the alternate volume header. + * + * Note that there may be spare sectors beyond the end of the filesystem that still + * belong to our partition. + */ + + spare_sectors = hfsmp->hfs_logical_block_count - + (((daddr64_t)vcb->totalBlocks * blockSize) / + hfsmp->hfs_logical_block_size); + + /* + * Differentiate between "innocuous" spare sectors and the more unusual + * degenerate case: + * + * *** Innocuous spare sectors exist if: + * + * A) the number of bytes assigned to the partition (by multiplying logical + * block size * logical block count) is greater than the filesystem size + * (by multiplying allocation block count and allocation block size) + * + * and + * + * B) the remainder is less than the size of a full allocation block's worth of bytes. 
+ * + * This handles the normal case where there may be a few extra sectors, but the two + * are fundamentally in sync. + * + * *** Degenerate spare sectors exist if: + * A) The number of bytes assigned to the partition (by multiplying logical + * block size * logical block count) is greater than the filesystem size + * (by multiplying allocation block count and block size). + * + * and + * + * B) the remainder is greater than a full allocation's block worth of bytes. + * In this case, a smaller file system exists in a larger partition. + * This can happen in various ways, including when volume is resized but the + * partition is yet to be resized. Under this condition, we have to assume that + * a partition management software may resize the partition to match + * the file system size in the future. Therefore we should update + * alternate volume header at two locations on the disk, + * a. 1024 bytes before end of the partition + * b. 1024 bytes before end of the file system + */ + + if (spare_sectors > (daddr64_t)(blockSize / hfsmp->hfs_logical_block_size)) { + /* + * Handle the degenerate case above. FS < partition size. + * AVH located at 1024 bytes from the end of the partition + */ + hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); + + /* AVH located at 1024 bytes from the end of the filesystem */ + hfsmp->hfs_fs_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, + (((daddr64_t)vcb->totalBlocks * blockSize) / hfsmp->hfs_logical_block_size)); + } + else { + /* Innocuous spare sectors; Partition & FS notion are in sync */ + hfsmp->hfs_partition_avh_sector = (hfsmp->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) + + HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count); + + hfsmp->hfs_fs_avh_sector = hfsmp->hfs_partition_avh_sector; + } + if (hfs_resize_debug) { + printf ("hfs_MountHFSPlusVolume: partition_avh_sector=%qu, fs_avh_sector=%qu\n", + hfsmp->hfs_partition_avh_sector, hfsmp->hfs_fs_avh_sector); + } + + bzero(&cndesc, sizeof(cndesc)); + cndesc.cd_parentcnid = kHFSRootParentID; + cndesc.cd_flags |= CD_ISMETA; + bzero(&cnattr, sizeof(cnattr)); + cnattr.ca_linkcount = 1; + cnattr.ca_mode = S_IFREG; + + /* + * Set up Extents B-tree vnode + */ + cndesc.cd_nameptr = hfs_extname; + cndesc.cd_namelen = strlen((char *)hfs_extname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSExtentsFileID; + + cfork.cf_size = SWAP_BE64 (vhp->extentsFile.logicalSize); + cfork.cf_new_size= 0; + cfork.cf_clump = SWAP_BE32 (vhp->extentsFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->extentsFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->extentsFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_extents_vp, &newvnode_flags); + if (retval) + { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting extentoverflow BT\n", retval); + } + goto ErrorExit; + } + + hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp); + + retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), + (KeyCompareProcPtr) CompareExtentKeysPlus)); + + hfs_unlock(hfsmp->hfs_extents_cp); 
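/*
 * [Editor's aside — hypothetical worked example, not part of the original
 * change set; the numbers are made up purely for illustration.]
 * For the spare-sector logic above: suppose hfs_logical_block_size = 512,
 * blockSize = 4096, hfs_logical_block_count = 2,000,000 (a ~1 GB partition)
 * but totalBlocks = 200,000 (a ~819 MB filesystem).  Then
 *     spare_sectors = 2,000,000 - (200,000 * 4096) / 512 = 400,000
 * which exceeds blockSize / hfs_logical_block_size = 8, so this is the
 * degenerate case: one AVH copy is tracked 1024 bytes before the end of the
 * partition and a second copy 1024 bytes before the end of the filesystem.
 * Had the partition held, say, 1,600,004 sectors instead, spare_sectors
 * would be 4 (< 8) and the two AVH locations would coincide.
 */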
+ + if (retval) + { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting extentoverflow BT\n", retval); + } + goto ErrorExit; + } + /* + * Set up Catalog B-tree vnode + */ + cndesc.cd_nameptr = hfs_catname; + cndesc.cd_namelen = strlen((char *)hfs_catname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSCatalogFileID; + + cfork.cf_size = SWAP_BE64 (vhp->catalogFile.logicalSize); + cfork.cf_clump = SWAP_BE32 (vhp->catalogFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->catalogFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->catalogFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_catalog_vp, &newvnode_flags); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting catalog BT\n", retval); + } + goto ErrorExit; + } + hfsmp->hfs_catalog_cp = VTOC(hfsmp->hfs_catalog_vp); + + retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), + (KeyCompareProcPtr) CompareExtendedCatalogKeys)); + + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting catalog BT\n", retval); + } + hfs_unlock(hfsmp->hfs_catalog_cp); + goto ErrorExit; + } + if ((hfsmp->hfs_flags & HFS_X) && + BTGetInformation(VTOF(hfsmp->hfs_catalog_vp), 0, &btinfo) == 0) { + if (btinfo.keyCompareType == kHFSBinaryCompare) { + hfsmp->hfs_flags |= HFS_CASE_SENSITIVE; + /* Install a case-sensitive key compare */ + (void) BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), + (KeyCompareProcPtr)cat_binarykeycompare); + } + } + + hfs_unlock(hfsmp->hfs_catalog_cp); + + /* + * Set up Allocation file vnode + */ + cndesc.cd_nameptr = hfs_vbmname; + cndesc.cd_namelen = strlen((char *)hfs_vbmname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSAllocationFileID; + + cfork.cf_size = SWAP_BE64 (vhp->allocationFile.logicalSize); + cfork.cf_clump = SWAP_BE32 (vhp->allocationFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->allocationFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->allocationFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_allocation_vp, &newvnode_flags); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting bitmap\n", retval); + } + goto ErrorExit; + } + hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); + hfs_unlock(hfsmp->hfs_allocation_cp); + + /* + * Set up Attribute B-tree vnode + */ + if (vhp->attributesFile.totalBlocks != 0) { + cndesc.cd_nameptr = hfs_attrname; + cndesc.cd_namelen = strlen((char *)hfs_attrname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSAttributesFileID; + + cfork.cf_size = SWAP_BE64 (vhp->attributesFile.logicalSize); + cfork.cf_clump = SWAP_BE32 (vhp->attributesFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->attributesFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->attributesFile.extents[i].startBlock); + 
cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_attribute_vp, &newvnode_flags); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting EA BT\n", retval); + } + goto ErrorExit; + } + hfsmp->hfs_attribute_cp = VTOC(hfsmp->hfs_attribute_vp); + retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_attribute_vp), + (KeyCompareProcPtr) hfs_attrkeycompare)); + hfs_unlock(hfsmp->hfs_attribute_cp); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting EA BT\n", retval); + } + goto ErrorExit; + } + + /* Initialize vnode for virtual attribute data file that spans the + * entire file system space for performing I/O to attribute btree + * We hold iocount on the attrdata vnode for the entire duration + * of mount (similar to btree vnodes) + */ + retval = init_attrdata_vnode(hfsmp); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: init_attrdata_vnode returned (%d) for virtual EA file\n", retval); + } + goto ErrorExit; + } + } + + /* + * Set up Startup file vnode + */ + if (vhp->startupFile.totalBlocks != 0) { + cndesc.cd_nameptr = hfs_startupname; + cndesc.cd_namelen = strlen((char *)hfs_startupname); + cndesc.cd_cnid = cnattr.ca_fileid = kHFSStartupFileID; + + cfork.cf_size = SWAP_BE64 (vhp->startupFile.logicalSize); + cfork.cf_clump = SWAP_BE32 (vhp->startupFile.clumpSize); + cfork.cf_blocks = SWAP_BE32 (vhp->startupFile.totalBlocks); + cfork.cf_vblocks = 0; + cnattr.ca_blocks = cfork.cf_blocks; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + cfork.cf_extents[i].startBlock = + SWAP_BE32 (vhp->startupFile.extents[i].startBlock); + cfork.cf_extents[i].blockCount = + SWAP_BE32 (vhp->startupFile.extents[i].blockCount); + } + retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, + &hfsmp->hfs_startup_vp, &newvnode_flags); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting startup file\n", retval); + } + goto ErrorExit; + } + hfsmp->hfs_startup_cp = VTOC(hfsmp->hfs_startup_vp); + hfs_unlock(hfsmp->hfs_startup_cp); + } + + /* + * Pick up volume name and create date + * + * Acquiring the volume name should not manipulate the bitmap, only the catalog + * btree and possibly the extents overflow b-tree. + */ + retval = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, &cnattr, NULL); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: cat_idlookup returned (%d) getting rootfolder \n", retval); + } + goto ErrorExit; + } + vcb->hfs_itime = cnattr.ca_itime; + vcb->volumeNameEncodingHint = cndesc.cd_encoding; + bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen)); + volname_length = strlen ((const char*)vcb->vcbVN); + cat_releasedesc(&cndesc); + + /* Send the volume name down to CoreStorage if necessary */ + retval = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); + if (retval == 0) { + (void) VNOP_IOCTL (hfsmp->hfs_devvp, _DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current()); + } + + /* reset retval == 0. we don't care about errors in volname conversion */ + retval = 0; + + /* + * pull in the volume UUID while we are still single-threaded. + * This brings the volume UUID into the cached one dangling off of the HFSMP + * Otherwise it would have to be computed on first access. 
+ */ + uuid_t throwaway; + hfs_getvoluuid (hfsmp, throwaway); + + /* + * We now always initiate a full bitmap scan even if the volume is read-only because this is + * our only shot to do I/Os of dramaticallly different sizes than what the buffer cache ordinarily + * expects. TRIMs will not be delivered to the underlying media if the volume is not + * read-write though. + */ + hfsmp->scan_var = 0; + + /* + * We have to ensure if we can proceed to scan the bitmap allocation + * file asynchronously. If the catalog file is fragmented such that it + * has overflow extents and the volume needs journal transaction we + * cannot scan the bitmap asynchronously. Doing so will cause the mount + * thread to block at journal transaction on bitmap lock, while scan + * thread which hold the bitmap lock exclusively performs disk I/O to + * issue TRIMS to unallocated ranges and build summary table. The + * amount of time the mount thread is blocked depends on the size of + * the volume, type of disk, etc. This blocking can cause the watchdog + * timer to timeout resulting in panic. Thus to ensure we don't timeout + * watchdog in such cases we scan the bitmap synchronously. + * + * Please NOTE: Currently this timeout only seem to happen for non SSD + * drives. Possibly reading a big fragmented allocation file to + * construct the summary table takes enough time to timeout watchdog. + * Thus we check if we need to scan the bitmap synchronously only if + * the disk is not SSD. + */ + async_bitmap_scan = true; + if (!ISSET(hfsmp->hfs_flags, HFS_SSD) && hfsmp->hfs_catalog_cp) { + bool catalog_has_overflow_extents; + bool journal_transaction_needed; + + catalog_has_overflow_extents = false; + if ((hfsmp->hfs_catalog_vp != NULL) && + (overflow_extents(VTOF(hfsmp->hfs_catalog_vp)))) { + catalog_has_overflow_extents = true; + } + + journal_transaction_needed = false; + if (hfsmp->jnl || ((vcb->vcbAtrb & kHFSVolumeJournaledMask) && + (hfsmp->hfs_flags & HFS_READ_ONLY))) { + journal_transaction_needed = true; + } + + if (catalog_has_overflow_extents && journal_transaction_needed) + async_bitmap_scan = false; + } + + if (async_bitmap_scan) { + thread_t allocator_scanner; + + /* Take the HFS mount mutex and wait on scan_var */ + hfs_lock_mount (hfsmp); + + + /* + * Scan the bitmap asynchronously. + */ + kernel_thread_start ((thread_continue_t)hfs_scan_blocks, hfsmp, + &allocator_scanner); + + /* + * Wait until it registers that it's got the appropriate locks + * (or that it is finished). + */ + while ((hfsmp->scan_var & (HFS_ALLOCATOR_SCAN_INFLIGHT| + HFS_ALLOCATOR_SCAN_COMPLETED)) == 0) { + msleep (&hfsmp->scan_var, &hfsmp->hfs_mutex, PINOD, + "hfs_scan_blocks", 0); + } + + hfs_unlock_mount(hfsmp); + + thread_deallocate (allocator_scanner); + } else { + + /* + * Initialize the summary table and then scan the bitmap + * synchronously. Since we are scanning the bitmap + * synchronously we don't need to hold the bitmap lock. + */ + if (hfs_init_summary (hfsmp)) { + printf ("hfs: could not initialize summary table for " + "%s\n", hfsmp->vcbVN); + } + + (void)ScanUnmapBlocks (hfsmp); + + /* + * We need to set that the allocator scan is completed because + * hot file clustering waits for this condition later. 
+ */ + hfsmp->scan_var |= HFS_ALLOCATOR_SCAN_COMPLETED; + buf_invalidateblks (hfsmp->hfs_allocation_vp, 0, 0, 0); + } + + /* mark the volume dirty (clear clean unmount bit) */ + vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; + if (hfsmp->jnl && (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { + hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT); + } + + /* kHFSHasFolderCount is only supported/updated on HFSX volumes */ + if ((hfsmp->hfs_flags & HFS_X) != 0) { + hfsmp->hfs_flags |= HFS_FOLDERCOUNT; + } + + // + // Check if we need to do late journal initialization. This only + // happens if a previous version of MacOS X (or 9) touched the disk. + // In that case hfs_late_journal_init() will go re-locate the journal + // and journal_info_block files and validate that they're still kosher. + // + if ( (vcb->vcbAtrb & kHFSVolumeJournaledMask) + && (SWAP_BE32(vhp->lastMountedVersion) != kHFSJMountVersion) + && (hfsmp->jnl == NULL)) { + + retval = hfs_late_journal_init(hfsmp, vhp, args); + if (retval != 0) { + if (retval == EROFS) { + // EROFS is a special error code that means the volume has an external + // journal which we couldn't find. in that case we do not want to + // rewrite the volume header - we'll just refuse to mount the volume. + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d), maybe an external jnl?\n", retval); + } + retval = EINVAL; + goto ErrorExit; + } + + hfsmp->jnl = NULL; + + // if the journal failed to open, then set the lastMountedVersion + // to be "FSK!" which fsck_hfs will see and force the fsck instead + // of just bailing out because the volume is journaled. + if (!(hfsmp->hfs_flags & HFS_READ_ONLY)) { + HFSPlusVolumeHeader *jvhp; + daddr64_t mdb_offset; + struct buf *bp = NULL; + + hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; + + mdb_offset = (daddr64_t)((embeddedOffset / blockSize) + HFS_PRI_SECTOR(blockSize)); + + bp = NULL; + retval = (int)buf_meta_bread(hfsmp->hfs_devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, cred, &bp); + if (retval == 0) { + jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size)); + + if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) { + printf ("hfs(3): Journal replay fail. Writing lastMountVersion as FSK!\n"); + jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion); + buf_bwrite(bp); + } else { + buf_brelse(bp); + } + bp = NULL; + } else if (bp) { + buf_brelse(bp); + // clear this so the error exit path won't try to use it + bp = NULL; + } + } + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d)\n", retval); + } + retval = EINVAL; + goto ErrorExit; + } else if (hfsmp->jnl) { + vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); + } + } else if (hfsmp->jnl || ((vcb->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) { + struct cat_attr jinfo_attr, jnl_attr; + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + } + + // if we're here we need to fill in the fileid's for the + // journal and journal_info_block. + hfsmp->hfs_jnlinfoblkid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, NULL); + hfsmp->hfs_jnlfileid = GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, NULL); + if (hfsmp->hfs_jnlinfoblkid == 0 || hfsmp->hfs_jnlfileid == 0) { + printf("hfs: danger! 
couldn't find the file-id's for the journal or journal_info_block\n"); + printf("hfs: jnlfileid %d, jnlinfoblkid %d\n", hfsmp->hfs_jnlfileid, hfsmp->hfs_jnlinfoblkid); + } + + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + vcb->vcbAtrb |= kHFSVolumeJournaledMask; + } + + if (hfsmp->jnl == NULL) { + vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); + } + } + + if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected + { + MarkVCBDirty( vcb ); // mark VCB dirty so it will be written + } + + if (hfsmp->hfs_flags & HFS_CS_METADATA_PIN) { + hfs_pin_fs_metadata(hfsmp); + } + /* + * Distinguish 3 potential cases involving content protection: + * 1. mount point bit set; vcbAtrb does not support it. Fail. + * 2. mount point bit set; vcbattrb supports it. we're good. + * 3. mount point bit not set; vcbatrb supports it, turn bit on, then good. + */ + if (vfs_flags(hfsmp->hfs_mp) & MNT_CPROTECT) { + /* Does the mount point support it ? */ + if ((vcb->vcbAtrb & kHFSContentProtectionMask) == 0) { + /* Case 1 above */ + retval = EINVAL; + goto ErrorExit; + } + } + else { + /* not requested in the mount point. Is it in FS? */ + if (vcb->vcbAtrb & kHFSContentProtectionMask) { + /* Case 3 above */ + vfs_setflags (hfsmp->hfs_mp, MNT_CPROTECT); + } + } + + /* At this point, if the mount point flag is set, we can enable it. */ + if (vfs_flags(hfsmp->hfs_mp) & MNT_CPROTECT) { + /* Cases 2+3 above */ +#if CONFIG_PROTECT + /* Get the EAs as needed. */ + int cperr = 0; + struct cp_root_xattr *xattr = NULL; + xattr = hfs_malloc(sizeof(*xattr)); + + /* go get the EA to get the version information */ + cperr = cp_getrootxattr (hfsmp, xattr); + /* + * If there was no EA there, then write one out. + * Assuming EA is not present on the root means + * this is an erase install or a very old FS + */ + + if (cperr == 0) { + /* Have to run a valid CP version. */ + if (!cp_is_supported_version(xattr->major_version)) { + cperr = EINVAL; + } + } + else if (cperr == ENOATTR) { + printf("No root EA set, creating new EA with new version: %d\n", CP_CURRENT_VERS); + bzero(xattr, sizeof(struct cp_root_xattr)); + xattr->major_version = CP_CURRENT_VERS; + xattr->minor_version = CP_MINOR_VERS; + cperr = cp_setrootxattr (hfsmp, xattr); + } + + if (cperr) { + hfs_free(xattr, sizeof(*xattr)); + retval = EPERM; + goto ErrorExit; + } + + /* If we got here, then the CP version is valid. Set it in the mount point */ + hfsmp->hfs_running_cp_major_vers = xattr->major_version; + printf("Running with CP root xattr: %d.%d\n", xattr->major_version, xattr->minor_version); + hfsmp->cproot_flags = xattr->flags; + hfsmp->cp_crypto_generation = ISSET(xattr->flags, CP_ROOT_CRYPTOG1) ? 1 : 0; +#if HFS_CONFIG_KEY_ROLL + hfsmp->hfs_auto_roll_min_key_os_version = xattr->auto_roll_min_version; + hfsmp->hfs_auto_roll_max_key_os_version = xattr->auto_roll_max_version; +#endif + + hfs_free(xattr, sizeof(*xattr)); + + /* + * Acquire the boot-arg for the AKS default key; if invalid, obtain from the device tree. + * Ensure that the boot-arg's value is valid for FILES (not directories), + * since only files are actually protected for now. 
+ */ + + PE_parse_boot_argn("aks_default_class", &hfsmp->default_cp_class, sizeof(hfsmp->default_cp_class)); + + if (cp_is_valid_class(0, hfsmp->default_cp_class) == 0) { + PE_get_default("kern.default_cp_class", &hfsmp->default_cp_class, sizeof(hfsmp->default_cp_class)); + } + +#if HFS_TMPDBG +#if !SECURE_KERNEL + PE_parse_boot_argn("aks_verbose", &hfsmp->hfs_cp_verbose, sizeof(hfsmp->hfs_cp_verbose)); +#endif +#endif + + if (cp_is_valid_class(0, hfsmp->default_cp_class) == 0) { + hfsmp->default_cp_class = PROTECTION_CLASS_C; + } + +#else + /* If CONFIG_PROTECT not built, ignore CP */ + vfs_clearflags(hfsmp->hfs_mp, MNT_CPROTECT); +#endif + } + + /* + * Establish a metadata allocation zone. + */ + hfs_metadatazone_init(hfsmp, false); + + /* + * Make any metadata zone adjustments. + */ + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { + /* Keep the roving allocator out of the metadata zone. */ + if (vcb->nextAllocation >= hfsmp->hfs_metazone_start && + vcb->nextAllocation <= hfsmp->hfs_metazone_end) { + HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1); + } + } else { + if (vcb->nextAllocation <= 1) { + vcb->nextAllocation = hfsmp->hfs_min_alloc_start; + } + } + vcb->sparseAllocation = hfsmp->hfs_min_alloc_start; + + /* Setup private/hidden directories for hardlinks. */ + hfs_privatedir_init(hfsmp, FILE_HARDLINKS); + hfs_privatedir_init(hfsmp, DIR_HARDLINKS); + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) + hfs_remove_orphans(hfsmp); + + /* See if we need to erase unused Catalog nodes due to . */ + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) + { + retval = hfs_erase_unused_nodes(hfsmp); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_erase_unused_nodes returned (%d) for %s \n", retval, hfsmp->vcbVN); + } + + goto ErrorExit; + } + } + + /* + * Allow hot file clustering if conditions allow. + */ + if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && !(hfsmp->hfs_flags & HFS_READ_ONLY) && + ((hfsmp->hfs_flags & HFS_SSD) == 0 || (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN))) { + // + // Wait until the bitmap scan completes before we initializes the + // hotfile area so that we do not run into any issues with the + // bitmap being read while hotfiles is initializing itself. On + // some older/slower machines, without this interlock, the bitmap + // would sometimes get corrupted at boot time. + // + hfs_lock_mount(hfsmp); + while(!(hfsmp->scan_var & HFS_ALLOCATOR_SCAN_COMPLETED)) { + (void) msleep (&hfsmp->scan_var, &hfsmp->hfs_mutex, PINOD, "hfs_hotfile_bitmap_interlock", 0); + } + hfs_unlock_mount(hfsmp); + + /* + * Note: at this point we are not allowed to fail the + * mount operation because the HotFile init code + * in hfs_recording_init() will lookup vnodes with + * VNOP_LOOKUP() which hangs vnodes off the mount + * (and if we were to fail, VFS is not prepared to + * clean that up at this point. Since HotFiles are + * optional, this is not a big deal. + */ + (void) hfs_recording_init(hfsmp); + } + + /* Force ACLs on HFS+ file systems. */ + vfs_setextendedsecurity(HFSTOVFS(hfsmp)); + + /* Enable extent-based extended attributes by default */ + hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; + + return (0); + +ErrorExit: + /* + * A fatal error occurred and the volume cannot be mounted, so + * release any resources that we acquired... 
+ */ + hfsUnmount(hfsmp, NULL); + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: encountered error (%d)\n", retval); + } + return (retval); +} + +static int +_pin_metafile(struct hfsmount *hfsmp, vnode_t vp) +{ + int err; + + err = hfs_lock(VTOC(vp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + if (err == 0) { + err = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT, NULL); + hfs_unlock(VTOC(vp)); + } + + return err; +} + +void +hfs_pin_fs_metadata(struct hfsmount *hfsmp) +{ + ExtendedVCB *vcb; + int err; + + vcb = HFSTOVCB(hfsmp); + + err = _pin_metafile(hfsmp, hfsmp->hfs_extents_vp); + if (err != 0) { + printf("hfs: failed to pin extents overflow file %d\n", err); + } + err = _pin_metafile(hfsmp, hfsmp->hfs_catalog_vp); + if (err != 0) { + printf("hfs: failed to pin catalog file %d\n", err); + } + err = _pin_metafile(hfsmp, hfsmp->hfs_allocation_vp); + if (err != 0) { + printf("hfs: failed to pin bitmap file %d\n", err); + } + err = _pin_metafile(hfsmp, hfsmp->hfs_attribute_vp); + if (err != 0) { + printf("hfs: failed to pin extended attr file %d\n", err); + } + + hfs_pin_block_range(hfsmp, HFS_PIN_IT, 0, 1); + hfs_pin_block_range(hfsmp, HFS_PIN_IT, vcb->totalBlocks-1, 1); + + if (vfs_flags(hfsmp->hfs_mp) & MNT_JOURNALED) { + // and hey, if we've got a journal, let's pin that too! + hfs_pin_block_range(hfsmp, HFS_PIN_IT, hfsmp->jnl_start, howmany(hfsmp->jnl_size, vcb->blockSize)); + } +} + +/* + * ReleaseMetaFileVNode + * + * vp L - - + */ +static void ReleaseMetaFileVNode(struct vnode *vp) +{ + struct filefork *fp; + + if (vp && (fp = VTOF(vp))) { + if (fp->fcbBTCBPtr != NULL) { + (void)hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + (void) BTClosePath(fp); + hfs_unlock(VTOC(vp)); + } + + /* release the node even if BTClosePath fails */ + vnode_recycle(vp); + vnode_put(vp); + } +} + + +/************************************************************* +* +* Unmounts a hfs volume. +* At this point vflush() has been called (to dump all non-metadata files) +* +*************************************************************/ + +int +hfsUnmount( register struct hfsmount *hfsmp, __unused struct proc *p) +{ + /* Get rid of our attribute data vnode (if any). This is done + * after the vflush() during mount, so we don't need to worry + * about any locks. + */ + if (hfsmp->hfs_attrdata_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_attrdata_vp); + hfsmp->hfs_attrdata_vp = NULLVP; + } + + if (hfsmp->hfs_startup_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_startup_vp); + hfsmp->hfs_startup_cp = NULL; + hfsmp->hfs_startup_vp = NULL; + } + + if (hfsmp->hfs_attribute_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp); + hfsmp->hfs_attribute_cp = NULL; + hfsmp->hfs_attribute_vp = NULL; + } + + if (hfsmp->hfs_catalog_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); + hfsmp->hfs_catalog_cp = NULL; + hfsmp->hfs_catalog_vp = NULL; + } + + if (hfsmp->hfs_extents_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); + hfsmp->hfs_extents_cp = NULL; + hfsmp->hfs_extents_vp = NULL; + } + + if (hfsmp->hfs_allocation_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); + hfsmp->hfs_allocation_cp = NULL; + hfsmp->hfs_allocation_vp = NULL; + } + + return (0); +} + + +/* + * Test if fork has overflow extents. 
+ * + * Returns: + * non-zero - overflow extents exist + * zero - overflow extents do not exist + */ +bool overflow_extents(struct filefork *fp) +{ + u_int32_t blocks; + + // + // If the vnode pointer is NULL then we're being called + // from hfs_remove_orphans() with a faked-up filefork + // and therefore it has to be an HFS+ volume. Otherwise + // we check through the volume header to see what type + // of volume we're on. + // + +#if CONFIG_HFS_STD + if (FTOV(fp) && VTOVCB(FTOV(fp))->vcbSigWord == kHFSSigWord) { + if (fp->ff_extents[2].blockCount == 0) + return false; + + blocks = fp->ff_extents[0].blockCount + + fp->ff_extents[1].blockCount + + fp->ff_extents[2].blockCount; + + return fp->ff_blocks > blocks; + } +#endif + + if (fp->ff_extents[7].blockCount == 0) + return false; + + blocks = fp->ff_extents[0].blockCount + + fp->ff_extents[1].blockCount + + fp->ff_extents[2].blockCount + + fp->ff_extents[3].blockCount + + fp->ff_extents[4].blockCount + + fp->ff_extents[5].blockCount + + fp->ff_extents[6].blockCount + + fp->ff_extents[7].blockCount; + + return fp->ff_blocks > blocks; +} + +static __attribute__((pure)) +boolean_t hfs_is_frozen(struct hfsmount *hfsmp) +{ + return (hfsmp->hfs_freeze_state == HFS_FROZEN + || (hfsmp->hfs_freeze_state == HFS_FREEZING + && current_thread() != hfsmp->hfs_freezing_thread)); +} + +/* + * Lock the HFS global journal lock + */ +int +hfs_lock_global (struct hfsmount *hfsmp, enum hfs_locktype locktype) +{ + thread_t thread = current_thread(); + + if (hfsmp->hfs_global_lockowner == thread) { + panic ("hfs_lock_global: locking against myself!"); + } + + /* + * This check isn't really necessary but this stops us taking + * the mount lock in most cases. The essential check is below. + */ + if (hfs_is_frozen(hfsmp)) { + /* + * Unfortunately, there is no easy way of getting a notification + * for when a process is exiting and it's possible for the exiting + * process to get blocked somewhere else. To catch this, we + * periodically monitor the frozen process here and thaw if + * we spot that it's exiting. + */ +frozen: + hfs_lock_mount(hfsmp); + + struct timespec ts = { 0, 500 * NSEC_PER_MSEC }; + + while (hfs_is_frozen(hfsmp)) { + if (hfsmp->hfs_freeze_state == HFS_FROZEN + && proc_exiting(hfsmp->hfs_freezing_proc)) { + hfs_thaw_locked(hfsmp); + break; + } + + msleep(&hfsmp->hfs_freeze_state, &hfsmp->hfs_mutex, + PWAIT, "hfs_lock_global (frozen)", &ts); + } + hfs_unlock_mount(hfsmp); + } + + /* HFS_SHARED_LOCK */ + if (locktype == HFS_SHARED_LOCK) { + lck_rw_lock_shared (&hfsmp->hfs_global_lock); + hfsmp->hfs_global_lockowner = HFS_SHARED_OWNER; + } + /* HFS_EXCLUSIVE_LOCK */ + else { + lck_rw_lock_exclusive (&hfsmp->hfs_global_lock); + hfsmp->hfs_global_lockowner = thread; + } + + /* + * We have to check if we're frozen again because of the time + * between when we checked and when we took the global lock. + */ + if (hfs_is_frozen(hfsmp)) { + hfs_unlock_global(hfsmp); + goto frozen; + } + + return 0; +} + + +/* + * Unlock the HFS global journal lock + */ +void +hfs_unlock_global (struct hfsmount *hfsmp) +{ + thread_t thread = current_thread(); + + /* HFS_LOCK_EXCLUSIVE */ + if (hfsmp->hfs_global_lockowner == thread) { + hfsmp->hfs_global_lockowner = NULL; + lck_rw_unlock_exclusive (&hfsmp->hfs_global_lock); + } + /* HFS_LOCK_SHARED */ + else { + lck_rw_unlock_shared (&hfsmp->hfs_global_lock); + } +} + +/* + * Lock the HFS mount lock + * + * Note: this is a mutex, not a rw lock! 
+ */ +inline +void hfs_lock_mount (struct hfsmount *hfsmp) { + lck_mtx_lock (&(hfsmp->hfs_mutex)); +} + +/* + * Unlock the HFS mount lock + * + * Note: this is a mutex, not a rw lock! + */ +inline +void hfs_unlock_mount (struct hfsmount *hfsmp) { + lck_mtx_unlock (&(hfsmp->hfs_mutex)); +} + +/* + * Lock HFS system file(s). + * + * This function accepts a @flags parameter which indicates which + * system file locks are required. The value it returns should be + * used in a subsequent call to hfs_systemfile_unlock. The caller + * should treat this value as opaque; it may or may not have a + * relation to the @flags field that is passed in. The *only* + * guarantee that we make is that a value of zero means that no locks + * were taken and that there is no need to call hfs_systemfile_unlock + * (although it is harmless to do so). Recursion is supported but + * care must still be taken to ensure correct lock ordering. Note + * that requests for certain locks may cause other locks to also be + * taken, including locks that are not possible to ask for via the + * @flags parameter. + */ +int +hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfs_locktype locktype) +{ + /* + * Locking order is Catalog file, Attributes file, Startup file, Bitmap file, Extents file + */ + if (flags & SFL_CATALOG) { + if (hfsmp->hfs_catalog_cp + && hfsmp->hfs_catalog_cp->c_lockowner != current_thread()) { +#ifdef HFS_CHECK_LOCK_ORDER + if (hfsmp->hfs_attribute_cp && hfsmp->hfs_attribute_cp->c_lockowner == current_thread()) { + panic("hfs_systemfile_lock: bad lock order (Attributes before Catalog)"); + } + if (hfsmp->hfs_startup_cp && hfsmp->hfs_startup_cp->c_lockowner == current_thread()) { + panic("hfs_systemfile_lock: bad lock order (Startup before Catalog)"); + } + if (hfsmp-> hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == current_thread()) { + panic("hfs_systemfile_lock: bad lock order (Extents before Catalog)"); + } +#endif /* HFS_CHECK_LOCK_ORDER */ + + (void) hfs_lock(hfsmp->hfs_catalog_cp, locktype, HFS_LOCK_DEFAULT); + /* + * When the catalog file has overflow extents then + * also acquire the extents b-tree lock if its not + * already requested. + */ + if (((flags & SFL_EXTENTS) == 0) && + (hfsmp->hfs_catalog_vp != NULL) && + (overflow_extents(VTOF(hfsmp->hfs_catalog_vp)))) { + flags |= SFL_EXTENTS; + } + } else { + flags &= ~SFL_CATALOG; + } + } + + if (flags & SFL_ATTRIBUTE) { + if (hfsmp->hfs_attribute_cp + && hfsmp->hfs_attribute_cp->c_lockowner != current_thread()) { +#ifdef HFS_CHECK_LOCK_ORDER + if (hfsmp->hfs_startup_cp && hfsmp->hfs_startup_cp->c_lockowner == current_thread()) { + panic("hfs_systemfile_lock: bad lock order (Startup before Attributes)"); + } + if (hfsmp->hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == current_thread()) { + panic("hfs_systemfile_lock: bad lock order (Extents before Attributes)"); + } +#endif /* HFS_CHECK_LOCK_ORDER */ + + (void) hfs_lock(hfsmp->hfs_attribute_cp, locktype, HFS_LOCK_DEFAULT); + /* + * When the attribute file has overflow extents then + * also acquire the extents b-tree lock if its not + * already requested. 
+ */ + if (((flags & SFL_EXTENTS) == 0) && + (hfsmp->hfs_attribute_vp != NULL) && + (overflow_extents(VTOF(hfsmp->hfs_attribute_vp)))) { + flags |= SFL_EXTENTS; + } + } else { + flags &= ~SFL_ATTRIBUTE; + } + } + + if (flags & SFL_STARTUP) { + if (hfsmp->hfs_startup_cp + && hfsmp->hfs_startup_cp->c_lockowner != current_thread()) { +#ifdef HFS_CHECK_LOCK_ORDER + if (hfsmp-> hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == current_thread()) { + panic("hfs_systemfile_lock: bad lock order (Extents before Startup)"); + } +#endif /* HFS_CHECK_LOCK_ORDER */ + + (void) hfs_lock(hfsmp->hfs_startup_cp, locktype, HFS_LOCK_DEFAULT); + /* + * When the startup file has overflow extents then + * also acquire the extents b-tree lock if its not + * already requested. + */ + if (((flags & SFL_EXTENTS) == 0) && + (hfsmp->hfs_startup_vp != NULL) && + (overflow_extents(VTOF(hfsmp->hfs_startup_vp)))) { + flags |= SFL_EXTENTS; + } + } else { + flags &= ~SFL_STARTUP; + } + } + + /* + * To prevent locks being taken in the wrong order, the extent lock + * gets a bitmap lock as well. + */ + if (flags & (SFL_BITMAP | SFL_EXTENTS)) { + if (hfsmp->hfs_allocation_cp) { + (void) hfs_lock(hfsmp->hfs_allocation_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + /* + * The bitmap lock is also grabbed when only extent lock + * was requested. Set the bitmap lock bit in the lock + * flags which callers will use during unlock. + */ + flags |= SFL_BITMAP; + } else { + flags &= ~SFL_BITMAP; + } + } + + if (flags & SFL_EXTENTS) { + /* + * Since the extents btree lock is recursive we always + * need exclusive access. + */ + if (hfsmp->hfs_extents_cp) { + (void) hfs_lock(hfsmp->hfs_extents_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + if (vfs_isswapmount(hfsmp->hfs_mp)) { + /* + * because we may need this lock on the pageout path (if a swapfile allocation + * spills into the extents overflow tree), we will grant the holder of this + * lock the privilege of dipping into the reserve free pool in order to prevent + * a deadlock from occurring if we need those pageouts to complete before we + * will make any new pages available on the free list... the deadlock can occur + * if this thread needs to allocate memory while this lock is held + */ + if (set_vm_privilege(TRUE) == FALSE) { + /* + * indicate that we need to drop vm_privilege + * when we unlock + */ + flags |= SFL_VM_PRIV; + } + } + } else { + flags &= ~SFL_EXTENTS; + } + } + + return (flags); +} + +/* + * unlock HFS system file(s). 
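+ *
+ * The value returned by hfs_systemfile_lock() above is handed back here
+ * as @flags and treated as opaque.  A sketch of the usual pairing (the
+ * catalog lookup is only an example):
+ *
+ *	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+ *	error = cat_lookup(hfsmp, &desc, 0, 0, NULL, &attr, NULL, NULL);
+ *	hfs_systemfile_unlock(hfsmp, lockflags);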
+ */ +void +hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) +{ + if (!flags) + return; + + struct timeval tv; + u_int32_t lastfsync; + int numOfLockedBuffs; + + if (hfsmp->jnl == NULL) { + microuptime(&tv); + lastfsync = tv.tv_sec; + } + if (flags & SFL_STARTUP && hfsmp->hfs_startup_cp) { + hfs_unlock(hfsmp->hfs_startup_cp); + } + if (flags & SFL_ATTRIBUTE && hfsmp->hfs_attribute_cp) { + if (hfsmp->jnl == NULL) { + BTGetLastSync((FCB*)VTOF(hfsmp->hfs_attribute_vp), &lastfsync); + numOfLockedBuffs = count_lock_queue(); + if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || + ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > + kMaxSecsForFsync))) { + hfs_btsync(hfsmp->hfs_attribute_vp, HFS_SYNCTRANS); + } + } + hfs_unlock(hfsmp->hfs_attribute_cp); + } + if (flags & SFL_CATALOG && hfsmp->hfs_catalog_cp) { + if (hfsmp->jnl == NULL) { + BTGetLastSync((FCB*)VTOF(hfsmp->hfs_catalog_vp), &lastfsync); + numOfLockedBuffs = count_lock_queue(); + if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || + ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > + kMaxSecsForFsync))) { + hfs_btsync(hfsmp->hfs_catalog_vp, HFS_SYNCTRANS); + } + } + hfs_unlock(hfsmp->hfs_catalog_cp); + } + if (flags & SFL_BITMAP && hfsmp->hfs_allocation_cp) { + hfs_unlock(hfsmp->hfs_allocation_cp); + } + if (flags & SFL_EXTENTS && hfsmp->hfs_extents_cp) { + if (hfsmp->jnl == NULL) { + BTGetLastSync((FCB*)VTOF(hfsmp->hfs_extents_vp), &lastfsync); + numOfLockedBuffs = count_lock_queue(); + if ((numOfLockedBuffs > kMaxLockedMetaBuffers) || + ((numOfLockedBuffs > 1) && ((tv.tv_sec - lastfsync) > + kMaxSecsForFsync))) { + hfs_btsync(hfsmp->hfs_extents_vp, HFS_SYNCTRANS); + } + } + hfs_unlock(hfsmp->hfs_extents_cp); + + if (flags & SFL_VM_PRIV) { + /* + * revoke the vm_privilege we granted this thread + * now that we have unlocked the overflow extents + */ + set_vm_privilege(FALSE); + } + } +} + + +/* + * RequireFileLock + * + * Check to see if a vnode is locked in the current context + * This is to be used for debugging purposes only!! + */ +#if DEBUG +void RequireFileLock(FileReference vp, int shareable) +{ + int locked; + + /* The extents btree and allocation bitmap are always exclusive. */ + if (VTOC(vp)->c_fileid == kHFSExtentsFileID || + VTOC(vp)->c_fileid == kHFSAllocationFileID) { + shareable = 0; + } + + locked = VTOC(vp)->c_lockowner == current_thread(); + + if (!locked && !shareable) { + switch (VTOC(vp)->c_fileid) { + case kHFSExtentsFileID: + panic("hfs: extents btree not locked! v: 0x%08X\n #\n", (u_int)vp); + break; + case kHFSCatalogFileID: + panic("hfs: catalog btree not locked! v: 0x%08X\n #\n", (u_int)vp); + break; + case kHFSAllocationFileID: + /* The allocation file can hide behind the jornal lock. */ + if (VTOHFS(vp)->jnl == NULL) + panic("hfs: allocation file not locked! v: 0x%08X\n #\n", (u_int)vp); + break; + case kHFSStartupFileID: + panic("hfs: startup file not locked! v: 0x%08X\n #\n", (u_int)vp); + case kHFSAttributesFileID: + panic("hfs: attributes btree not locked! v: 0x%08X\n #\n", (u_int)vp); + break; + } + } +} +#endif // DEBUG + + +/* + * There are three ways to qualify for ownership rights on an object: + * + * 1. (a) Your UID matches the cnode's UID. + * (b) The object in question is owned by "unknown" + * 2. (a) Permissions on the filesystem are being ignored and + * your UID matches the replacement UID. + * (b) Permissions on the filesystem are being ignored and + * the replacement UID is "unknown". + * 3. You are root. 
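+ *
+ * A call sketch (illustrative; the vfs_context accessors are an assumed
+ * caller environment, not part of this file):
+ *
+ *	if (hfs_owner_rights(hfsmp, cp->c_attr.ca_uid,
+ *	                     vfs_context_ucred(ctx), vfs_context_proc(ctx), 0))
+ *		return (EPERM);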
+ * + */ +int +hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, kauth_cred_t cred, + __unused struct proc *p, int invokesuperuserstatus) +{ + if ((kauth_cred_getuid(cred) == cnode_uid) || /* [1a] */ + (cnode_uid == UNKNOWNUID) || /* [1b] */ + ((((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) && /* [2] */ + ((kauth_cred_getuid(cred) == hfsmp->hfs_uid) || /* [2a] */ + (hfsmp->hfs_uid == UNKNOWNUID))) || /* [2b] */ + (invokesuperuserstatus && (suser(cred, 0) == 0))) { /* [3] */ + return (0); + } else { + return (EPERM); + } +} + + +u_int32_t BestBlockSizeFit(u_int32_t allocationBlockSize, + u_int32_t blockSizeLimit, + u_int32_t baseMultiple) { + /* + Compute the optimal (largest) block size (no larger than allocationBlockSize) that is less than the + specified limit but still an even multiple of the baseMultiple. + */ + int baseBlockCount, blockCount; + u_int32_t trialBlockSize; + + if (allocationBlockSize % baseMultiple != 0) { + /* + Whoops: the allocation blocks aren't even multiples of the specified base: + no amount of dividing them into even parts will be a multiple, either then! + */ + return 512; /* Hope for the best */ + }; + + /* Try the obvious winner first, to prevent 12K allocation blocks, for instance, + from being handled as two 6K logical blocks instead of 3 4K logical blocks. + Even though the former (the result of the loop below) is the larger allocation + block size, the latter is more efficient: */ + if (allocationBlockSize % PAGE_SIZE == 0) return PAGE_SIZE; + + /* No clear winner exists: pick the largest even fraction <= MAXBSIZE: */ + baseBlockCount = allocationBlockSize / baseMultiple; /* Now guaranteed to be an even multiple */ + + for (blockCount = baseBlockCount; blockCount > 0; --blockCount) { + trialBlockSize = blockCount * baseMultiple; + if (allocationBlockSize % trialBlockSize == 0) { /* An even multiple? */ + if ((trialBlockSize <= blockSizeLimit) && + (trialBlockSize % baseMultiple == 0)) { + return trialBlockSize; + }; + }; + }; + + /* Note: we should never get here, since blockCount = 1 should always work, + but this is nice and safe and makes the compiler happy, too ... */ + return 512; +} + + +u_int32_t +GetFileInfo(ExtendedVCB *vcb, __unused u_int32_t dirid, const char *name, + struct cat_attr *fattr, struct cat_fork *forkinfo) +{ + struct hfsmount * hfsmp; + struct cat_desc jdesc; + int lockflags; + int error; + + if (vcb->vcbSigWord != kHFSPlusSigWord) + return (0); + + hfsmp = VCBTOHFS(vcb); + + memset(&jdesc, 0, sizeof(struct cat_desc)); + jdesc.cd_parentcnid = kRootDirID; + jdesc.cd_nameptr = (const u_int8_t *)name; + jdesc.cd_namelen = strlen(name); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_lookup(hfsmp, &jdesc, 0, 0, NULL, fattr, forkinfo, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error == 0) { + return (fattr->ca_fileid); + } else if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (0); + } + + return (0); /* XXX what callers expect on an error */ +} + + +/* + * On HFS Plus Volumes, there can be orphaned files or directories + * These are files or directories that were unlinked while busy. + * If the volume was not cleanly unmounted then some of these may + * have persisted and need to be removed. 
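+ *
+ * Each orphan lives in the FILE_HARDLINKS private directory under a name
+ * of the form "temp<cnid>" with the cnid in decimal -- e.g. a busy file
+ * with cnid 1234 is left behind as "temp1234" -- which is exactly the
+ * name the scan below reconstructs and compares against.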
+ */ +void +hfs_remove_orphans(struct hfsmount * hfsmp) +{ + struct BTreeIterator * iterator = NULL; + struct FSBufferDescriptor btdata; + struct HFSPlusCatalogFile filerec; + struct HFSPlusCatalogKey * keyp; + struct proc *p = current_proc(); + FCB *fcb; + ExtendedVCB *vcb; + char filename[32]; + char tempname[32]; + size_t namelen; + cat_cookie_t cookie; + int catlock = 0; + int catreserve = 0; + bool started_tr = false; + int lockflags; + int result; + int orphaned_files = 0; + int orphaned_dirs = 0; + + bzero(&cookie, sizeof(cookie)); + + if (hfsmp->hfs_flags & HFS_CLEANED_ORPHANS) + return; + + vcb = HFSTOVCB(hfsmp); + fcb = VTOF(hfsmp->hfs_catalog_vp); + + btdata.bufferAddress = &filerec; + btdata.itemSize = sizeof(filerec); + btdata.itemCount = 1; + + iterator = hfs_mallocz(sizeof(*iterator)); + + /* Build a key to "temp" */ + keyp = (HFSPlusCatalogKey*)&iterator->key; + keyp->parentID = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + keyp->nodeName.length = 4; /* "temp" */ + keyp->keyLength = kHFSPlusCatalogKeyMinimumLength + keyp->nodeName.length * 2; + keyp->nodeName.unicode[0] = 't'; + keyp->nodeName.unicode[1] = 'e'; + keyp->nodeName.unicode[2] = 'm'; + keyp->nodeName.unicode[3] = 'p'; + + /* + * Position the iterator just before the first real temp file/dir. + */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + (void) BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + hfs_systemfile_unlock(hfsmp, lockflags); + + /* Visit all the temp files/dirs in the HFS+ private directory. */ + for (;;) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + if (result) + break; + if (keyp->parentID != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) + break; + + (void) utf8_encodestr(keyp->nodeName.unicode, keyp->nodeName.length * 2, + (u_int8_t *)filename, &namelen, sizeof(filename), 0, 0); + + (void) snprintf(tempname, sizeof(tempname), "%s%d", + HFS_DELETE_PREFIX, filerec.fileID); + + /* + * Delete all files (and directories) named "tempxxx", + * where xxx is the file's cnid in decimal. + * + */ + if (bcmp(tempname, filename, namelen + 1) != 0) + continue; + + struct filefork dfork; + struct filefork rfork; + struct cnode cnode; + int mode = 0; + + bzero(&dfork, sizeof(dfork)); + bzero(&rfork, sizeof(rfork)); + bzero(&cnode, sizeof(cnode)); + + if (hfs_start_transaction(hfsmp) != 0) { + printf("hfs_remove_orphans: failed to start transaction\n"); + goto exit; + } + started_tr = true; + + /* + * Reserve some space in the Catalog file. 
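+		 * (cat_preflight() reserves catalog b-tree space up front; it is
+		 * paired with the cat_postflight() call made once the record has
+		 * been deleted further down.)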
+ */ + if (cat_preflight(hfsmp, CAT_DELETE, &cookie, p) != 0) { + printf("hfs_remove_orphans: cat_preflight failed\n"); + goto exit; + } + catreserve = 1; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + catlock = 1; + + /* Build a fake cnode */ + cat_convertattr(hfsmp, (CatalogRecord *)&filerec, &cnode.c_attr, + &dfork.ff_data, &rfork.ff_data); + cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + cnode.c_desc.cd_nameptr = (const u_int8_t *)filename; + cnode.c_desc.cd_namelen = namelen; + cnode.c_desc.cd_cnid = cnode.c_attr.ca_fileid; + cnode.c_blocks = dfork.ff_blocks + rfork.ff_blocks; + + /* Position iterator at previous entry */ + if (BTIterateRecord(fcb, kBTreePrevRecord, iterator, + NULL, NULL) != 0) { + break; + } + + /* Truncate the file to zero (both forks) */ + if (dfork.ff_blocks > 0) { + u_int64_t fsize; + + dfork.ff_cp = &cnode; + cnode.c_datafork = &dfork; + cnode.c_rsrcfork = NULL; + fsize = (u_int64_t)dfork.ff_blocks * (u_int64_t)HFSTOVCB(hfsmp)->blockSize; + while (fsize > 0) { + if (fsize > HFS_BIGFILE_SIZE) { + fsize -= HFS_BIGFILE_SIZE; + } else { + fsize = 0; + } + + if (TruncateFileC(vcb, (FCB*)&dfork, fsize, 1, 0, + cnode.c_attr.ca_fileid, false) != 0) { + printf("hfs: error truncating data fork!\n"); + break; + } + + // + // if we're iteratively truncating this file down, + // then end the transaction and start a new one so + // that no one transaction gets too big. + // + if (fsize > 0) { + /* Drop system file locks before starting + * another transaction to preserve lock order. + */ + hfs_systemfile_unlock(hfsmp, lockflags); + catlock = 0; + hfs_end_transaction(hfsmp); + + if (hfs_start_transaction(hfsmp) != 0) { + started_tr = false; + goto exit; + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + catlock = 1; + } + } + } + + if (rfork.ff_blocks > 0) { + rfork.ff_cp = &cnode; + cnode.c_datafork = NULL; + cnode.c_rsrcfork = &rfork; + if (TruncateFileC(vcb, (FCB*)&rfork, 0, 1, 1, cnode.c_attr.ca_fileid, false) != 0) { + printf("hfs: error truncating rsrc fork!\n"); + break; + } + } + + // Deal with extended attributes + if (ISSET(cnode.c_attr.ca_recflags, kHFSHasAttributesMask)) { + // hfs_removeallattr uses its own transactions + hfs_systemfile_unlock(hfsmp, lockflags); + catlock = false; + hfs_end_transaction(hfsmp); + + hfs_removeallattr(hfsmp, cnode.c_attr.ca_fileid, &started_tr); + + if (!started_tr) { + if (hfs_start_transaction(hfsmp) != 0) { + printf("hfs_remove_orphans: failed to start transaction\n"); + goto exit; + } + started_tr = true; + } + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + catlock = 1; + } + + /* Remove the file or folder record from the Catalog */ + if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) { + printf("hfs_remove_orphans: error deleting cat rec for id %d!\n", cnode.c_desc.cd_cnid); + hfs_systemfile_unlock(hfsmp, lockflags); + catlock = 0; + hfs_volupdate(hfsmp, VOL_UPDATE, 0); + break; + } + + mode = cnode.c_attr.ca_mode & S_IFMT; + + if (mode == S_IFDIR) { + orphaned_dirs++; + } + else { + orphaned_files++; + } + + /* Update parent and volume counts */ + hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; + if (mode == S_IFDIR) { + DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + } + + (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], + 
&hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + + /* Drop locks and end the transaction */ + hfs_systemfile_unlock(hfsmp, lockflags); + cat_postflight(hfsmp, &cookie, p); + catlock = catreserve = 0; + + /* + Now that Catalog is unlocked, update the volume info, making + sure to differentiate between files and directories + */ + if (mode == S_IFDIR) { + hfs_volupdate(hfsmp, VOL_RMDIR, 0); + } + else{ + hfs_volupdate(hfsmp, VOL_RMFILE, 0); + } + + hfs_end_transaction(hfsmp); + started_tr = false; + } /* end for */ + +exit: + + if (orphaned_files > 0 || orphaned_dirs > 0) + printf("hfs: Removed %d orphaned / unlinked files and %d directories \n", orphaned_files, orphaned_dirs); + if (catlock) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (catreserve) { + cat_postflight(hfsmp, &cookie, p); + } + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + hfs_free(iterator, sizeof(*iterator)); + hfsmp->hfs_flags |= HFS_CLEANED_ORPHANS; +} + + +/* + * This will return the correct logical block size for a given vnode. + * For most files, it is the allocation block size, for meta data like + * BTrees, this is kept as part of the BTree private nodeSize + */ +u_int32_t +GetLogicalBlockSize(struct vnode *vp) +{ +u_int32_t logBlockSize; + + hfs_assert(vp != NULL); + + /* start with default */ + logBlockSize = VTOHFS(vp)->hfs_logBlockSize; + + if (vnode_issystem(vp)) { + if (VTOF(vp)->fcbBTCBPtr != NULL) { + BTreeInfoRec bTreeInfo; + + /* + * We do not lock the BTrees, because if we are getting block..then the tree + * should be locked in the first place. + * We just want the nodeSize wich will NEVER change..so even if the world + * is changing..the nodeSize should remain the same. Which argues why lock + * it in the first place?? + */ + + (void) BTGetInformation (VTOF(vp), kBTreeInfoVersion, &bTreeInfo); + + logBlockSize = bTreeInfo.nodeSize; + + } else if (VTOC(vp)->c_fileid == kHFSAllocationFileID) { + logBlockSize = VTOVCB(vp)->vcbVBMIOSize; + } + } + + hfs_assert(logBlockSize > 0); + + return logBlockSize; +} + +#if HFS_SPARSE_DEV +static bool hfs_get_backing_free_blks(hfsmount_t *hfsmp, uint64_t *pfree_blks) +{ + struct vfsstatfs *vfsp; /* 272 bytes */ + uint64_t vfreeblks; + struct timeval now; + + hfs_lock_mount(hfsmp); + + vnode_t backing_vp = hfsmp->hfs_backingvp; + if (!backing_vp) { + hfs_unlock_mount(hfsmp); + return false; + } + + // usecount is not enough; we need iocount + if (vnode_get(backing_vp)) { + hfs_unlock_mount(hfsmp); + *pfree_blks = 0; + return true; + } + + uint32_t loanedblks = hfsmp->loanedBlocks + hfsmp->lockedBlocks; + uint32_t bandblks = hfsmp->hfs_sparsebandblks; + uint64_t maxblks = hfsmp->hfs_backingfs_maxblocks; + + hfs_unlock_mount(hfsmp); + + mount_t backingfs_mp = vnode_mount(backing_vp); + + microtime(&now); + if ((now.tv_sec - hfsmp->hfs_last_backingstatfs) >= 1) { + vfs_update_vfsstat(backingfs_mp, vfs_context_kernel(), VFS_KERNEL_EVENT); + hfsmp->hfs_last_backingstatfs = now.tv_sec; + } + + if (!(vfsp = vfs_statfs(backingfs_mp))) { + vnode_put(backing_vp); + return false; + } + + vfreeblks = vfsp->f_bavail; + /* Normalize block count if needed. */ + if (vfsp->f_bsize != hfsmp->blockSize) + vfreeblks = vfreeblks * vfsp->f_bsize / hfsmp->blockSize; + if (vfreeblks > bandblks) + vfreeblks -= bandblks; + else + vfreeblks = 0; + + /* + * Take into account any delayed allocations. It is not + * certain what the original reason for the "2 *" is. 
Most + * likely it is to allow for additional requirements in the + * host file system and metadata required by disk images. The + * number of loaned blocks is likely to be small and we will + * stop using them as we get close to the limit. + */ + loanedblks = 2 * loanedblks; + if (vfreeblks > loanedblks) + vfreeblks -= loanedblks; + else + vfreeblks = 0; + + if (maxblks) + vfreeblks = MIN(vfreeblks, maxblks); + + vnode_put(backing_vp); + + *pfree_blks = vfreeblks; + + return true; +} +#endif + +u_int32_t +hfs_free_cnids(struct hfsmount * hfsmp) +{ + return HFS_MAX_FILES - hfsmp->hfs_filecount - hfsmp->hfs_dircount; +} + +u_int32_t +hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) +{ + u_int32_t freeblks; + u_int32_t rsrvblks; + u_int32_t loanblks; + + /* + * We don't bother taking the mount lock + * to look at these values since the values + * themselves are each updated atomically + * on aligned addresses. + */ + freeblks = hfsmp->freeBlocks; + rsrvblks = hfsmp->reserveBlocks; + loanblks = hfsmp->loanedBlocks + hfsmp->lockedBlocks; + if (wantreserve) { + if (freeblks > rsrvblks) + freeblks -= rsrvblks; + else + freeblks = 0; + } + if (freeblks > loanblks) + freeblks -= loanblks; + else + freeblks = 0; + +#if HFS_SPARSE_DEV + /* + * When the underlying device is sparse, check the + * available space on the backing store volume. + */ + uint64_t vfreeblks; + if (hfs_get_backing_free_blks(hfsmp, &vfreeblks)) + freeblks = MIN(freeblks, vfreeblks); +#endif /* HFS_SPARSE_DEV */ + + return (freeblks); +} + +/* + * Map HFS Common errors (negative) to BSD error codes (positive). + * Positive errors (ie BSD errors) are passed through unchanged. + */ +short MacToVFSError(OSErr err) +{ + if (err >= 0) + return err; + + /* BSD/VFS internal errnos */ + switch (err) { + case HFS_ERESERVEDNAME: /* -8 */ + return err; + } + + switch (err) { + case dskFulErr: /* -34 */ + case btNoSpaceAvail: /* -32733 */ + return ENOSPC; + case fxOvFlErr: /* -32750 */ + return EOVERFLOW; + + case btBadNode: /* -32731 */ + return EIO; + + case memFullErr: /* -108 */ + return ENOMEM; /* +12 */ + + case cmExists: /* -32718 */ + case btExists: /* -32734 */ + return EEXIST; /* +17 */ + + case cmNotFound: /* -32719 */ + case btNotFound: /* -32735 */ + return ENOENT; /* 28 */ + + case cmNotEmpty: /* -32717 */ + return ENOTEMPTY; /* 66 */ + + case cmFThdDirErr: /* -32714 */ + return EISDIR; /* 21 */ + + case fxRangeErr: /* -32751 */ + return ERANGE; + + case bdNamErr: /* -37 */ + return ENAMETOOLONG; /* 63 */ + + case paramErr: /* -50 */ + case fileBoundsErr: /* -1309 */ + return EINVAL; /* +22 */ + + case fsBTBadNodeSize: + return ENXIO; + + default: + return EIO; /* +5 */ + } +} + + +/* + * Find the current thread's directory hint for a given index. + * + * Requires an exclusive lock on directory cnode. + * + * Use detach if the cnode lock must be dropped while the hint is still active. + */ +directoryhint_t * +hfs_getdirhint(struct cnode *dcp, int index, int detach) +{ + struct timeval tv; + directoryhint_t *hint; + boolean_t need_remove, need_init; + const u_int8_t * name; + + microuptime(&tv); + + /* + * Look for an existing hint first. If not found, create a new one (when + * the list is not full) or recycle the oldest hint. Since new hints are + * always added to the head of the list, the last hint is always the + * oldest. 
+ */ + TAILQ_FOREACH(hint, &dcp->c_hintlist, dh_link) { + if (hint->dh_index == index) + break; + } + if (hint != NULL) { /* found an existing hint */ + need_init = false; + need_remove = true; + } else { /* cannot find an existing hint */ + need_init = true; + if (dcp->c_dirhintcnt < HFS_MAXDIRHINTS) { /* we don't need recycling */ + /* Create a default directory hint */ + hint = hfs_zalloc(HFS_DIRHINT_ZONE); + ++dcp->c_dirhintcnt; + need_remove = false; + } else { /* recycle the last (i.e., the oldest) hint */ + hint = TAILQ_LAST(&dcp->c_hintlist, hfs_hinthead); + if ((hint->dh_desc.cd_flags & CD_HASBUF) && + (name = hint->dh_desc.cd_nameptr)) { + hint->dh_desc.cd_nameptr = NULL; + hint->dh_desc.cd_namelen = 0; + hint->dh_desc.cd_flags &= ~CD_HASBUF; + vfs_removename((const char *)name); + } + need_remove = true; + } + } + + if (need_remove) + TAILQ_REMOVE(&dcp->c_hintlist, hint, dh_link); + + if (detach) + --dcp->c_dirhintcnt; + else + TAILQ_INSERT_HEAD(&dcp->c_hintlist, hint, dh_link); + + if (need_init) { + hint->dh_index = index; + hint->dh_desc.cd_flags = 0; + hint->dh_desc.cd_encoding = 0; + hint->dh_desc.cd_namelen = 0; + hint->dh_desc.cd_nameptr = NULL; + hint->dh_desc.cd_parentcnid = dcp->c_fileid; + hint->dh_desc.cd_hint = dcp->c_childhint; + hint->dh_desc.cd_cnid = 0; + } + hint->dh_time = tv.tv_sec; + return (hint); +} + +/* + * Release a single directory hint. + * + * Requires an exclusive lock on directory cnode. + */ +void +hfs_reldirhint(struct cnode *dcp, directoryhint_t * relhint) +{ + const u_int8_t * name; + directoryhint_t *hint; + + /* Check if item is on list (could be detached) */ + TAILQ_FOREACH(hint, &dcp->c_hintlist, dh_link) { + if (hint == relhint) { + TAILQ_REMOVE(&dcp->c_hintlist, relhint, dh_link); + --dcp->c_dirhintcnt; + break; + } + } + name = relhint->dh_desc.cd_nameptr; + if ((relhint->dh_desc.cd_flags & CD_HASBUF) && (name != NULL)) { + relhint->dh_desc.cd_nameptr = NULL; + relhint->dh_desc.cd_namelen = 0; + relhint->dh_desc.cd_flags &= ~CD_HASBUF; + vfs_removename((const char *)name); + } + hfs_zfree(relhint, HFS_DIRHINT_ZONE); +} + +/* + * Release directory hints for given directory + * + * Requires an exclusive lock on directory cnode. + */ +void +hfs_reldirhints(struct cnode *dcp, int stale_hints_only) +{ + struct timeval tv; + directoryhint_t *hint, *prev; + const u_int8_t * name; + + if (stale_hints_only) + microuptime(&tv); + + /* searching from the oldest to the newest, so we can stop early when releasing stale hints only */ + for (hint = TAILQ_LAST(&dcp->c_hintlist, hfs_hinthead); hint != NULL; hint = prev) { + if (stale_hints_only && (tv.tv_sec - hint->dh_time) < HFS_DIRHINT_TTL) + break; /* stop here if this entry is too new */ + name = hint->dh_desc.cd_nameptr; + if ((hint->dh_desc.cd_flags & CD_HASBUF) && (name != NULL)) { + hint->dh_desc.cd_nameptr = NULL; + hint->dh_desc.cd_namelen = 0; + hint->dh_desc.cd_flags &= ~CD_HASBUF; + vfs_removename((const char *)name); + } + prev = TAILQ_PREV(hint, hfs_hinthead, dh_link); /* must save this pointer before calling FREE_ZONE on this node */ + TAILQ_REMOVE(&dcp->c_hintlist, hint, dh_link); + hfs_zfree(hint, HFS_DIRHINT_ZONE); + --dcp->c_dirhintcnt; + } +} + +/* + * Insert a detached directory hint back into the list of dirhints. + * + * Requires an exclusive lock on directory cnode. 
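+ *
+ * A sketch of the detach/re-insert pattern this supports (illustrative):
+ *
+ *	hint = hfs_getdirhint(dcp, index, 1);		(1 == detach)
+ *	hfs_unlock(dcp);
+ *	... work that may block or drop the cnode lock ...
+ *	hfs_lock(dcp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+ *	hfs_insertdirhint(dcp, hint);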
+ */ +void +hfs_insertdirhint(struct cnode *dcp, directoryhint_t * hint) +{ + directoryhint_t *test; + + TAILQ_FOREACH(test, &dcp->c_hintlist, dh_link) { + if (test == hint) + panic("hfs_insertdirhint: hint %p already on list!", hint); + } + + TAILQ_INSERT_HEAD(&dcp->c_hintlist, hint, dh_link); + ++dcp->c_dirhintcnt; +} + +/* + * Perform a case-insensitive compare of two UTF-8 filenames. + * + * Returns 0 if the strings match. + */ +int +hfs_namecmp(const u_int8_t *str1, size_t len1, const u_int8_t *str2, size_t len2) +{ + u_int16_t *ustr1, *ustr2; + size_t ulen1, ulen2; + size_t maxbytes; + int cmp = -1; + + if (len1 != len2) + return (cmp); + + maxbytes = kHFSPlusMaxFileNameChars << 1; + ustr1 = hfs_malloc(maxbytes << 1); + ustr2 = ustr1 + (maxbytes >> 1); + + if (utf8_decodestr(str1, len1, ustr1, &ulen1, maxbytes, ':', 0) != 0) + goto out; + if (utf8_decodestr(str2, len2, ustr2, &ulen2, maxbytes, ':', 0) != 0) + goto out; + + cmp = FastUnicodeCompare(ustr1, ulen1>>1, ustr2, ulen2>>1); +out: + hfs_free(ustr1, maxbytes << 1); + return (cmp); +} + +typedef struct jopen_cb_info { + mount_t mp; + off_t jsize; + char *desired_uuid; + struct vnode *jvp; + size_t blksize; + int need_clean; + int need_init; +} jopen_cb_info; + +static int +journal_open_cb(const char *bsd_dev_name, const char *uuid_str, void *arg) +{ + jopen_cb_info *ji = (jopen_cb_info *)arg; + char bsd_name[256]; + int error; + + strlcpy(&bsd_name[0], "/dev/", sizeof(bsd_name)); + strlcpy(&bsd_name[5], bsd_dev_name, sizeof(bsd_name)-5); + + if ((error = vnode_lookup(bsd_name, VNODE_LOOKUP_NOFOLLOW, &ji->jvp, + vfs_context_kernel()))) { + printf("hfs: journal open cb: error %d looking up device %s (dev uuid %s)\n", error, bsd_name, uuid_str); + return 1; // keep iterating + } + + struct vnop_open_args oargs = { + .a_vp = ji->jvp, + .a_mode = FREAD | FWRITE, + .a_context = vfs_context_kernel(), + }; + + if (spec_open(&oargs)) { + vnode_put(ji->jvp); + ji->jvp = NULL; + return 1; + } + + // if the journal is dirty and we didn't specify a desired + // journal device uuid, then do not use the journal. but + // if the journal is just invalid (e.g. it hasn't been + // initialized) then just set the need_init flag. 
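+	// (journal_is_clean() is read here as returning 0 for a clean journal,
+	// EBUSY when it still holds transactions, and EINVAL when the area does
+	// not look like an initialized journal at all -- which is why EINVAL
+	// only sets need_init instead of rejecting the device.)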
+ if (ji->need_clean && ji->desired_uuid && ji->desired_uuid[0] == '\0') { + error = journal_is_clean(ji->jvp, 0, ji->jsize, + (void *)1, ji->blksize); + if (error == EBUSY) { + struct vnop_close_args cargs = { + .a_vp = ji->jvp, + .a_fflag = FREAD | FWRITE, + .a_context = vfs_context_kernel() + }; + spec_close(&cargs); + vnode_put(ji->jvp); + ji->jvp = NULL; + return 1; // keep iterating + } else if (error == EINVAL) { + ji->need_init = 1; + } + } + + if (ji->desired_uuid && ji->desired_uuid[0] == '\0') { + strlcpy(ji->desired_uuid, uuid_str, 128); + } + vnode_setmountedon(ji->jvp); + return 0; // stop iterating +} + +static vnode_t +open_journal_dev(mount_t mp, + const char *vol_device, + int need_clean, + char *uuid_str, + char *machine_serial_num, + off_t jsize, + size_t blksize, + int *need_init) +{ + int retry_counter=0; + jopen_cb_info ji; + + ji.mp = mp; + ji.jsize = jsize; + ji.desired_uuid = uuid_str; + ji.jvp = NULL; + ji.blksize = blksize; + ji.need_clean = need_clean; + ji.need_init = 0; + +// if (uuid_str[0] == '\0') { +// printf("hfs: open journal dev: %s: locating any available non-dirty external journal partition\n", vol_device); +// } else { +// printf("hfs: open journal dev: %s: trying to find the external journal partition w/uuid %s\n", vol_device, uuid_str); +// } + while (ji.jvp == NULL && retry_counter++ < 4) { + if (retry_counter > 1) { + if (uuid_str[0]) { + printf("hfs: open_journal_dev: uuid %s not found. waiting 10sec.\n", uuid_str); + } else { + printf("hfs: open_journal_dev: no available external journal partition found. waiting 10sec.\n"); + } + delay_for_interval(10* 1000000, NSEC_PER_USEC); // wait for ten seconds and then try again + } + + hfs_iterate_media_with_content(EXTJNL_CONTENT_TYPE_UUID, + journal_open_cb, &ji); + } + + if (ji.jvp == NULL) { + printf("hfs: volume: %s: did not find jnl device uuid: %s from machine serial number: %s\n", + vol_device, uuid_str, machine_serial_num); + } + + *need_init = ji.need_init; + + return ji.jvp; +} + +void hfs_close_jvp(hfsmount_t *hfsmp) +{ + if (!hfsmp || !hfsmp->jvp || hfsmp->jvp == hfsmp->hfs_devvp) + return; + + vnode_clearmountedon(hfsmp->jvp); + struct vnop_close_args cargs = { + .a_vp = hfsmp->jvp, + .a_fflag = FREAD | FWRITE, + .a_context = vfs_context_kernel() + }; + spec_close(&cargs); + vnode_put(hfsmp->jvp); + hfsmp->jvp = NULL; +} + +int +hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, + void *_args, off_t embeddedOffset, daddr64_t mdb_offset, + HFSMasterDirectoryBlock *mdbp, kauth_cred_t cred) +{ + JournalInfoBlock *jibp; + struct buf *jinfo_bp, *bp; + int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; + int retval, write_jibp = 0; + uint32_t blksize = hfsmp->hfs_logical_block_size; + struct vnode *devvp; + struct hfs_mount_args *args = _args; + u_int32_t jib_flags; + u_int64_t jib_offset; + u_int64_t jib_size; + const char *dev_name; + + devvp = hfsmp->hfs_devvp; + dev_name = vnode_getname_printable(devvp); + + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { + arg_flags = args->journal_flags; + arg_tbufsz = args->journal_tbuffer_size; + } + + sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize; + + jinfo_bp = NULL; + retval = (int)buf_meta_bread(devvp, + (daddr64_t)((embeddedOffset/blksize) + + ((u_int64_t)SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock)), + hfsmp->hfs_physical_block_size, cred, &jinfo_bp); + if (retval) { + if (jinfo_bp) { + buf_brelse(jinfo_bp); + } + goto cleanup_dev_name; + } + + jibp = (JournalInfoBlock 
*)buf_dataptr(jinfo_bp); + jib_flags = SWAP_BE32(jibp->flags); + jib_size = SWAP_BE64(jibp->size); + + if (jib_flags & kJIJournalInFSMask) { + hfsmp->jvp = hfsmp->hfs_devvp; + jib_offset = SWAP_BE64(jibp->offset); + } else { + int need_init=0; + + // if the volume was unmounted cleanly then we'll pick any + // available external journal partition + // + if (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) { + *((char *)&jibp->ext_jnl_uuid[0]) = '\0'; + } + + hfsmp->jvp = open_journal_dev(hfsmp->hfs_mp, + dev_name, + !(jib_flags & kJIJournalNeedInitMask), + (char *)&jibp->ext_jnl_uuid[0], + (char *)&jibp->machine_serial_num[0], + jib_size, + hfsmp->hfs_logical_block_size, + &need_init); + if (hfsmp->jvp == NULL) { + buf_brelse(jinfo_bp); + retval = EROFS; + goto cleanup_dev_name; + } else { + if (hfs_get_platform_serial_number(&jibp->machine_serial_num[0], sizeof(jibp->machine_serial_num)) != KERN_SUCCESS) { + strlcpy(&jibp->machine_serial_num[0], "unknown-machine-uuid", sizeof(jibp->machine_serial_num)); + } + } + + jib_offset = 0; + write_jibp = 1; + if (need_init) { + jib_flags |= kJIJournalNeedInitMask; + } + } + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = jib_offset / SWAP_BE32(vhp->blockSize); + hfsmp->jnl_size = jib_size; + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) && (vfs_flags(hfsmp->hfs_mp) & MNT_ROOTFS) == 0) { + // if the file system is read-only, check if the journal is empty. + // if it is, then we can allow the mount. otherwise we have to + // return failure. + retval = journal_is_clean(hfsmp->jvp, + jib_offset + embeddedOffset, + jib_size, + devvp, + hfsmp->hfs_logical_block_size); + + hfsmp->jnl = NULL; + + buf_brelse(jinfo_bp); + + if (retval) { + const char *name = vnode_getname_printable(devvp); + printf("hfs: early journal init: volume on %s is read-only and journal is dirty. Can not mount volume.\n", + name); + vnode_putname_printable(name); + } + + goto cleanup_dev_name; + } + + if (jib_flags & kJIJournalNeedInitMask) { + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + jib_offset + embeddedOffset, jib_size); + hfsmp->jnl = journal_create(hfsmp->jvp, + jib_offset + embeddedOffset, + jib_size, + devvp, + blksize, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp, + hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); + + // no need to start a transaction here... if this were to fail + // we'd just re-init it on the next mount. + jib_flags &= ~kJIJournalNeedInitMask; + jibp->flags = SWAP_BE32(jib_flags); + buf_bwrite(jinfo_bp); + jinfo_bp = NULL; + jibp = NULL; + } else { + //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", + // jib_offset + embeddedOffset, + // jib_size, SWAP_BE32(vhp->blockSize)); + + hfsmp->jnl = journal_open(hfsmp->jvp, + jib_offset + embeddedOffset, + jib_size, + devvp, + blksize, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp, + hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); + + if (write_jibp) { + buf_bwrite(jinfo_bp); + } else { + buf_brelse(jinfo_bp); + } + jinfo_bp = NULL; + jibp = NULL; + + if (hfsmp->jnl && mdbp) { + // reload the mdb because it could have changed + // if the journal had to be replayed. 
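+			// (HFS_PRI_SECTOR() locates the volume header, which sits 1024
+			// bytes into the embedded volume; with 512-byte logical blocks,
+			// for example, that is logical block 2.)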
+ if (mdb_offset == 0) { + mdb_offset = (daddr64_t)((embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize)); + } + bp = NULL; + retval = (int)buf_meta_bread(devvp, + HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), + hfsmp->hfs_physical_block_size, cred, &bp); + if (retval) { + if (bp) { + buf_brelse(bp); + } + printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n", + retval); + goto cleanup_dev_name; + } + bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size), mdbp, 512); + buf_brelse(bp); + bp = NULL; + } + } + + // if we expected the journal to be there and we couldn't + // create it or open it then we have to bail out. + if (hfsmp->jnl == NULL) { + printf("hfs: early jnl init: failed to open/create the journal (retval %d).\n", retval); + retval = EINVAL; + goto cleanup_dev_name; + } + + retval = 0; + +cleanup_dev_name: + vnode_putname_printable(dev_name); + return retval; +} + + +// +// This function will go and re-locate the .journal_info_block and +// the .journal files in case they moved (which can happen if you +// run Norton SpeedDisk). If we fail to find either file we just +// disable journaling for this volume and return. We turn off the +// journaling bit in the vcb and assume it will get written to disk +// later (if it doesn't on the next mount we'd do the same thing +// again which is harmless). If we disable journaling we don't +// return an error so that the volume is still mountable. +// +// If the info we find for the .journal_info_block and .journal files +// isn't what we had stored, we re-set our cached info and proceed +// with opening the journal normally. +// +static int +hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args) +{ + JournalInfoBlock *jibp; + struct buf *jinfo_bp; + int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; + int retval, write_jibp = 0, recreate_journal = 0; + struct vnode *devvp; + struct cat_attr jib_attr, jattr; + struct cat_fork jib_fork, jfork; + ExtendedVCB *vcb; + u_int32_t fid; + struct hfs_mount_args *args = _args; + u_int32_t jib_flags; + u_int64_t jib_offset; + u_int64_t jib_size; + + devvp = hfsmp->hfs_devvp; + vcb = HFSTOVCB(hfsmp); + + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { + if (args->journal_disable) { + return 0; + } + + arg_flags = args->journal_flags; + arg_tbufsz = args->journal_tbuffer_size; + } + + fid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jib_attr, &jib_fork); + if (fid == 0 || jib_fork.cf_extents[0].startBlock == 0 || jib_fork.cf_size == 0) { + printf("hfs: can't find the .journal_info_block! disabling journaling (start: %d).\n", + fid ? jib_fork.cf_extents[0].startBlock : 0); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + hfsmp->hfs_jnlinfoblkid = fid; + + // make sure the journal_info_block begins where we think it should. + if (SWAP_BE32(vhp->journalInfoBlock) != jib_fork.cf_extents[0].startBlock) { + printf("hfs: The journal_info_block moved (was: %d; is: %d). 
Fixing up\n", + SWAP_BE32(vhp->journalInfoBlock), jib_fork.cf_extents[0].startBlock); + + vcb->vcbJinfoBlock = jib_fork.cf_extents[0].startBlock; + vhp->journalInfoBlock = SWAP_BE32(jib_fork.cf_extents[0].startBlock); + recreate_journal = 1; + } + + + sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_logical_block_size; + jinfo_bp = NULL; + retval = (int)buf_meta_bread(devvp, + (vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size + + ((u_int64_t)SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock)), + hfsmp->hfs_physical_block_size, NOCRED, &jinfo_bp); + if (retval) { + if (jinfo_bp) { + buf_brelse(jinfo_bp); + } + printf("hfs: can't read journal info block. disabling journaling.\n"); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + + jibp = (JournalInfoBlock *)buf_dataptr(jinfo_bp); + jib_flags = SWAP_BE32(jibp->flags); + jib_offset = SWAP_BE64(jibp->offset); + jib_size = SWAP_BE64(jibp->size); + + fid = GetFileInfo(vcb, kRootDirID, ".journal", &jattr, &jfork); + if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) { + printf("hfs: can't find the journal file! disabling journaling (start: %d)\n", + fid ? jfork.cf_extents[0].startBlock : 0); + buf_brelse(jinfo_bp); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + hfsmp->hfs_jnlfileid = fid; + + // make sure the journal file begins where we think it should. + if ((jib_flags & kJIJournalInFSMask) && (jib_offset / (u_int64_t)vcb->blockSize) != jfork.cf_extents[0].startBlock) { + printf("hfs: The journal file moved (was: %lld; is: %d). Fixing up\n", + (jib_offset / (u_int64_t)vcb->blockSize), jfork.cf_extents[0].startBlock); + + jib_offset = (u_int64_t)jfork.cf_extents[0].startBlock * (u_int64_t)vcb->blockSize; + write_jibp = 1; + recreate_journal = 1; + } + + // check the size of the journal file. + if (jib_size != (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize) { + printf("hfs: The journal file changed size! (was %lld; is %lld). 
Fixing up.\n", + jib_size, (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize); + + jib_size = (u_int64_t)jfork.cf_extents[0].blockCount * vcb->blockSize; + write_jibp = 1; + recreate_journal = 1; + } + + if (jib_flags & kJIJournalInFSMask) { + hfsmp->jvp = hfsmp->hfs_devvp; + jib_offset += (off_t)vcb->hfsPlusIOPosOffset; + } else { + const char *dev_name; + int need_init = 0; + + dev_name = vnode_getname_printable(devvp); + + // since the journal is empty, just use any available external journal + *((char *)&jibp->ext_jnl_uuid[0]) = '\0'; + + // this fills in the uuid of the device we actually get + hfsmp->jvp = open_journal_dev(hfsmp->hfs_mp, + dev_name, + !(jib_flags & kJIJournalNeedInitMask), + (char *)&jibp->ext_jnl_uuid[0], + (char *)&jibp->machine_serial_num[0], + jib_size, + hfsmp->hfs_logical_block_size, + &need_init); + if (hfsmp->jvp == NULL) { + buf_brelse(jinfo_bp); + vnode_putname_printable(dev_name); + return EROFS; + } else { + if (hfs_get_platform_serial_number(&jibp->machine_serial_num[0], sizeof(jibp->machine_serial_num)) != KERN_SUCCESS) { + strlcpy(&jibp->machine_serial_num[0], "unknown-machine-serial-num", sizeof(jibp->machine_serial_num)); + } + } + jib_offset = 0; + recreate_journal = 1; + write_jibp = 1; + if (need_init) { + jib_flags |= kJIJournalNeedInitMask; + } + vnode_putname_printable(dev_name); + } + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = jib_offset / SWAP_BE32(vhp->blockSize); + hfsmp->jnl_size = jib_size; + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) && (vfs_flags(hfsmp->hfs_mp) & MNT_ROOTFS) == 0) { + // if the file system is read-only, check if the journal is empty. + // if it is, then we can allow the mount. otherwise we have to + // return failure. + retval = journal_is_clean(hfsmp->jvp, + jib_offset, + jib_size, + devvp, + hfsmp->hfs_logical_block_size); + + hfsmp->jnl = NULL; + + buf_brelse(jinfo_bp); + + if (retval) { + const char *name = vnode_getname_printable(devvp); + printf("hfs: late journal init: volume on %s is read-only and journal is dirty. Can not mount volume.\n", + name); + vnode_putname_printable(name); + } + + return retval; + } + + if ((jib_flags & kJIJournalNeedInitMask) || recreate_journal) { + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + jib_offset, jib_size); + hfsmp->jnl = journal_create(hfsmp->jvp, + jib_offset, + jib_size, + devvp, + hfsmp->hfs_logical_block_size, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp, + hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); + + // no need to start a transaction here... if this were to fail + // we'd just re-init it on the next mount. + jib_flags &= ~kJIJournalNeedInitMask; + write_jibp = 1; + + } else { + // + // if we weren't the last person to mount this volume + // then we need to throw away the journal because it + // is likely that someone else mucked with the disk. + // if the journal is empty this is no big deal. if the + // disk is dirty this prevents us from replaying the + // journal over top of changes that someone else made. 
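+		// (JOURNAL_RESET is taken here to mean "re-initialize rather than
+		// replay": journal_open() below discards whatever the journal
+		// area currently holds instead of applying it to the volume.)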
+ // + arg_flags |= JOURNAL_RESET; + + //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", + // jib_offset, + // jib_size, SWAP_BE32(vhp->blockSize)); + + hfsmp->jnl = journal_open(hfsmp->jvp, + jib_offset, + jib_size, + devvp, + hfsmp->hfs_logical_block_size, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp, + hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); + } + + + if (write_jibp) { + jibp->flags = SWAP_BE32(jib_flags); + jibp->offset = SWAP_BE64(jib_offset); + jibp->size = SWAP_BE64(jib_size); + + buf_bwrite(jinfo_bp); + } else { + buf_brelse(jinfo_bp); + } + jinfo_bp = NULL; + jibp = NULL; + + // if we expected the journal to be there and we couldn't + // create it or open it then we have to bail out. + if (hfsmp->jnl == NULL) { + printf("hfs: late jnl init: failed to open/create the journal (retval %d).\n", retval); + return EINVAL; + } + + return 0; +} + +/* + * Calculate the allocation zone for metadata. + * + * This zone includes the following: + * Allocation Bitmap file + * Overflow Extents file + * Journal file + * Quota files + * Clustered Hot files + * Catalog file + * + * METADATA ALLOCATION ZONE + * ____________________________________________________________________________ + * | | | | | | | + * | BM | JF | OEF | CATALOG |---> | HOT FILES | + * |____|____|_____|_______________|______________________________|___________| + * + * <------------------------------- N * 128 MB -------------------------------> + * + */ +#define GIGABYTE (u_int64_t)(1024*1024*1024) + +#define HOTBAND_MINIMUM_SIZE (10*1024*1024) +#define HOTBAND_MAXIMUM_SIZE (512*1024*1024) + +/* Initialize the metadata zone. + * + * If the size of the volume is less than the minimum size for + * metadata zone, metadata zone is disabled. + * + * If disable is true, disable metadata zone unconditionally. + */ +void +hfs_metadatazone_init(struct hfsmount *hfsmp, int disable) +{ + ExtendedVCB *vcb; + u_int64_t fs_size; + u_int64_t zonesize; + u_int64_t temp; + u_int64_t filesize; + u_int32_t blk; + int items, really_do_it=1; + + vcb = HFSTOVCB(hfsmp); + fs_size = (u_int64_t)vcb->blockSize * (u_int64_t)vcb->allocLimit; + + /* + * For volumes less than 10 GB, don't bother. + */ + if (fs_size < ((u_int64_t)10 * GIGABYTE)) { + really_do_it = 0; + } + + /* + * Skip non-journaled volumes as well. + */ + if (hfsmp->jnl == NULL) { + really_do_it = 0; + } + + /* If caller wants to disable metadata zone, do it */ + if (disable == true) { + really_do_it = 0; + } + + /* + * Start with space for the boot blocks and Volume Header. + * 1536 = byte offset from start of volume to end of volume header: + * 1024 bytes is the offset from the start of the volume to the + * start of the volume header (defined by the volume format) + * + 512 bytes (the size of the volume header). + */ + zonesize = roundup(1536, hfsmp->blockSize); + + /* + * Add the on-disk size of allocation bitmap. + */ + zonesize += hfsmp->hfs_allocation_cp->c_datafork->ff_blocks * hfsmp->blockSize; + + /* + * Add space for the Journal Info Block and Journal (if they're in + * this file system). + */ + if (hfsmp->jnl && hfsmp->jvp == hfsmp->hfs_devvp) { + zonesize += hfsmp->blockSize + hfsmp->jnl_size; + } + + /* + * Add the existing size of the Extents Overflow B-tree. + * (It rarely grows, so don't bother reserving additional room for it.) 
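+ *
+ * (The total computed here is later rounded up to a whole bitmap block's
+ * worth of allocation blocks; if vcbVBMIOSize is 4 KB and allocation
+ * blocks are 4 KB, that granule is 4096 * 8 * 4096 bytes = 128 MB, the
+ * "N * 128 MB" span in the diagram above.)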
+ */ + zonesize += hfs_blk_to_bytes(hfsmp->hfs_extents_cp->c_datafork->ff_blocks, hfsmp->blockSize); + + /* + * If there is an Attributes B-tree, leave room for 11 clumps worth. + * newfs_hfs allocates one clump, and leaves a gap of 10 clumps. + * When installing a full OS install onto a 20GB volume, we use + * 7 to 8 clumps worth of space (depending on packages), so that leaves + * us with another 3 or 4 clumps worth before we need another extent. + */ + if (hfsmp->hfs_attribute_cp) { + zonesize += 11 * hfsmp->hfs_attribute_cp->c_datafork->ff_clumpsize; + } + + /* + * Leave room for 11 clumps of the Catalog B-tree. + * Again, newfs_hfs allocates one clump plus a gap of 10 clumps. + * When installing a full OS install onto a 20GB volume, we use + * 7 to 8 clumps worth of space (depending on packages), so that leaves + * us with another 3 or 4 clumps worth before we need another extent. + */ + zonesize += 11 * hfsmp->hfs_catalog_cp->c_datafork->ff_clumpsize; + + /* + * Add space for hot file region. + * + * ...for now, use 5 MB per 1 GB (0.5 %) + */ + filesize = (fs_size / 1024) * 5; + if (filesize > HOTBAND_MAXIMUM_SIZE) + filesize = HOTBAND_MAXIMUM_SIZE; + else if (filesize < HOTBAND_MINIMUM_SIZE) + filesize = HOTBAND_MINIMUM_SIZE; + /* + * Calculate user quota file requirements. + */ + if (hfsmp->hfs_flags & HFS_QUOTAS) { + items = QF_USERS_PER_GB * (fs_size / GIGABYTE); + if (items < QF_MIN_USERS) + items = QF_MIN_USERS; + else if (items > QF_MAX_USERS) + items = QF_MAX_USERS; + if (!powerof2(items)) { + int x = items; + items = 4; + while (x>>1 != 1) { + x = x >> 1; + items = items << 1; + } + } + filesize += (items + 1) * sizeof(struct dqblk); + /* + * Calculate group quota file requirements. + * + */ + items = QF_GROUPS_PER_GB * (fs_size / GIGABYTE); + if (items < QF_MIN_GROUPS) + items = QF_MIN_GROUPS; + else if (items > QF_MAX_GROUPS) + items = QF_MAX_GROUPS; + if (!powerof2(items)) { + int x = items; + items = 4; + while (x>>1 != 1) { + x = x >> 1; + items = items << 1; + } + } + filesize += (items + 1) * sizeof(struct dqblk); + } + zonesize += filesize; + + /* + * Round up entire zone to a bitmap block's worth. + * The extra space goes to the catalog file and hot file area. + */ + temp = zonesize; + zonesize = roundup(zonesize, (u_int64_t)vcb->vcbVBMIOSize * 8 * vcb->blockSize); + hfsmp->hfs_min_alloc_start = zonesize / vcb->blockSize; + /* + * If doing the round up for hfs_min_alloc_start would push us past + * allocLimit, then just reset it back to 0. Though using a value + * bigger than allocLimit would not cause damage in the block allocator + * code, this value could get stored in the volume header and make it out + * to disk, making the volume header technically corrupt. + */ + if (hfsmp->hfs_min_alloc_start >= hfsmp->allocLimit) { + hfsmp->hfs_min_alloc_start = 0; + } + + if (really_do_it == 0) { + /* If metadata zone needs to be disabled because the + * volume was truncated, clear the bit and zero out + * the values that are no longer needed. 
+ */ + if (hfsmp->hfs_flags & HFS_METADATA_ZONE) { + /* Disable metadata zone */ + hfsmp->hfs_flags &= ~HFS_METADATA_ZONE; + + /* Zero out mount point values that are not required */ + hfsmp->hfs_catalog_maxblks = 0; + hfsmp->hfs_hotfile_maxblks = 0; + hfsmp->hfs_hotfile_start = 0; + hfsmp->hfs_hotfile_end = 0; + hfsmp->hfs_hotfile_freeblks = 0; + hfsmp->hfs_metazone_start = 0; + hfsmp->hfs_metazone_end = 0; + } + + return; + } + + temp = zonesize - temp; /* temp has extra space */ + filesize += temp / 3; + hfsmp->hfs_catalog_maxblks += (temp - (temp / 3)) / vcb->blockSize; + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + hfsmp->hfs_hotfile_maxblks = (uint32_t) (hfsmp->hfs_cs_hotfile_size / HFSTOVCB(hfsmp)->blockSize); + } else { + hfsmp->hfs_hotfile_maxblks = filesize / vcb->blockSize; + } + + /* Convert to allocation blocks. */ + blk = zonesize / vcb->blockSize; + + /* The default metadata zone location is at the start of volume. */ + hfsmp->hfs_metazone_start = 1; + hfsmp->hfs_metazone_end = blk - 1; + + /* The default hotfile area is at the end of the zone. */ + if (vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) { + hfsmp->hfs_hotfile_start = blk - (filesize / vcb->blockSize); + hfsmp->hfs_hotfile_end = hfsmp->hfs_metazone_end; + hfsmp->hfs_hotfile_freeblks = hfs_hotfile_freeblocks(hfsmp); + } + else { + hfsmp->hfs_hotfile_start = 0; + hfsmp->hfs_hotfile_end = 0; + hfsmp->hfs_hotfile_freeblks = 0; + } +#if DEBUG + printf("hfs:%s: metadata zone is %d to %d\n", hfsmp->vcbVN, hfsmp->hfs_metazone_start, hfsmp->hfs_metazone_end); + printf("hfs:%s: hot file band is %d to %d\n", hfsmp->vcbVN, hfsmp->hfs_hotfile_start, hfsmp->hfs_hotfile_end); + printf("hfs:%s: hot file band free blocks = %d\n", hfsmp->vcbVN, hfsmp->hfs_hotfile_freeblks); +#endif + + hfsmp->hfs_flags |= HFS_METADATA_ZONE; +} + + +static u_int32_t +hfs_hotfile_freeblocks(struct hfsmount *hfsmp) +{ + ExtendedVCB *vcb = HFSTOVCB(hfsmp); + int lockflags; + int freeblocks; + + if (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) { + // + // This is only used at initialization time and on an ssd + // we'll get the real info from the hotfile btree user + // info + // + return 0; + } + + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + freeblocks = MetaZoneFreeBlocks(vcb); + hfs_systemfile_unlock(hfsmp, lockflags); + + /* Minus Extents overflow file reserve. */ + if ((uint32_t)hfsmp->hfs_overflow_maxblks >= VTOF(hfsmp->hfs_extents_vp)->ff_blocks) { + freeblocks -= hfsmp->hfs_overflow_maxblks - VTOF(hfsmp->hfs_extents_vp)->ff_blocks; + } + + /* Minus catalog file reserve. */ + if ((uint32_t)hfsmp->hfs_catalog_maxblks >= VTOF(hfsmp->hfs_catalog_vp)->ff_blocks) { + freeblocks -= hfsmp->hfs_catalog_maxblks - VTOF(hfsmp->hfs_catalog_vp)->ff_blocks; + } + + if (freeblocks < 0) + freeblocks = 0; + + // printf("hfs: hotfile_freeblocks: MIN(%d, %d) = %d\n", freeblocks, hfsmp->hfs_hotfile_maxblks, MIN(freeblocks, hfsmp->hfs_hotfile_maxblks)); + return MIN(freeblocks, hfsmp->hfs_hotfile_maxblks); +} + +/* + * Determine if a file is a "virtual" metadata file. + * This includes journal and quota files. 
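+ *
+ * (Only names directly under the root directory qualify: "/.journal" or
+ * "/.hotfiles.btree" return 1 here, while the same names nested deeper
+ * do not, because c_parentcnid must equal kHFSRootFolderID.)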
+ */ +int +hfs_virtualmetafile(struct cnode *cp) +{ + const char * filename; + + + if (cp->c_parentcnid != kHFSRootFolderID) + return (0); + + filename = (const char *)cp->c_desc.cd_nameptr; + if (filename == NULL) + return (0); + + if ((strncmp(filename, ".journal", sizeof(".journal")) == 0) || + (strncmp(filename, ".journal_info_block", sizeof(".journal_info_block")) == 0) || + (strncmp(filename, ".quota.user", sizeof(".quota.user")) == 0) || + (strncmp(filename, ".quota.group", sizeof(".quota.group")) == 0) || + (strncmp(filename, ".hotfiles.btree", sizeof(".hotfiles.btree")) == 0)) + return (1); + + return (0); +} + +void hfs_syncer_lock(struct hfsmount *hfsmp) +{ + hfs_lock_mount(hfsmp); +} + +void hfs_syncer_unlock(struct hfsmount *hfsmp) +{ + hfs_unlock_mount(hfsmp); +} + +void hfs_syncer_wait(struct hfsmount *hfsmp, struct timespec *ts) +{ + msleep(&hfsmp->hfs_syncer_thread, &hfsmp->hfs_mutex, PWAIT, + "hfs_syncer_wait", ts); +} + +void hfs_syncer_wakeup(struct hfsmount *hfsmp) +{ + wakeup(&hfsmp->hfs_syncer_thread); +} + +uint64_t hfs_usecs_to_deadline(uint64_t usecs) +{ + uint64_t deadline; + clock_interval_to_deadline(usecs, NSEC_PER_USEC, &deadline); + return deadline; +} + +// +// Fire off a timed callback to sync the disk if the +// volume is on ejectable media. +// +void hfs_sync_ejectable(struct hfsmount *hfsmp) +{ + // If we don't have a syncer or we get called by the syncer, just return + if (!ISSET(hfsmp->hfs_flags, HFS_RUN_SYNCER) + || current_thread() == hfsmp->hfs_syncer_thread) { + return; + } + + hfs_syncer_lock(hfsmp); + + if (!timerisset(&hfsmp->hfs_sync_req_oldest)) + microuptime(&hfsmp->hfs_sync_req_oldest); + + /* If hfs_unmount is running, it will clear the HFS_RUN_SYNCER + flag. Also, we don't want to queue again if there is a sync + outstanding. */ + if (!ISSET(hfsmp->hfs_flags, HFS_RUN_SYNCER) + || hfsmp->hfs_syncer_thread) { + hfs_syncer_unlock(hfsmp); + return; + } + + hfsmp->hfs_syncer_thread = (void *)1; + + hfs_syncer_unlock(hfsmp); + + kernel_thread_start(hfs_syncer, hfsmp, &hfsmp->hfs_syncer_thread); + thread_deallocate(hfsmp->hfs_syncer_thread); +} + +int +hfs_start_transaction(struct hfsmount *hfsmp) +{ + int ret = 0, unlock_on_err = 0; + thread_t thread = current_thread(); + +#ifdef HFS_CHECK_LOCK_ORDER + /* + * You cannot start a transaction while holding a system + * file lock. (unless the transaction is nested.) + */ + if (hfsmp->jnl && journal_owner(hfsmp->jnl) != thread) { + if (hfsmp->hfs_catalog_cp && hfsmp->hfs_catalog_cp->c_lockowner == thread) { + panic("hfs_start_transaction: bad lock order (cat before jnl)\n"); + } + if (hfsmp->hfs_attribute_cp && hfsmp->hfs_attribute_cp->c_lockowner == thread) { + panic("hfs_start_transaction: bad lock order (attr before jnl)\n"); + } + if (hfsmp->hfs_extents_cp && hfsmp->hfs_extents_cp->c_lockowner == thread) { + panic("hfs_start_transaction: bad lock order (ext before jnl)\n"); + } + } +#endif /* HFS_CHECK_LOCK_ORDER */ + +again: + + if (hfsmp->jnl) { + if (journal_owner(hfsmp->jnl) != thread) { + /* + * The global lock should be held shared if journal is + * active to prevent disabling. If we're not the owner + * of the journal lock, verify that we're not already + * holding the global lock exclusive before moving on. 
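+ *
+ * (For reference, the caller-side pairing this participates in is simply
+ *
+ *	if (hfs_start_transaction(hfsmp) == 0) {
+ *		... journaled metadata changes ...
+ *		hfs_end_transaction(hfsmp);
+ *	}
+ *
+ * as hfs_remove_orphans() does earlier in this file.)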
+ */ + if (hfsmp->hfs_global_lockowner == thread) { + ret = EBUSY; + goto out; + } + + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); + + // Things could have changed + if (!hfsmp->jnl) { + hfs_unlock_global(hfsmp); + goto again; + } + + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); + unlock_on_err = 1; + } + } else { + // No journal + if (hfsmp->hfs_global_lockowner != thread) { + hfs_lock_global(hfsmp, HFS_EXCLUSIVE_LOCK); + + // Things could have changed + if (hfsmp->jnl) { + hfs_unlock_global(hfsmp); + goto again; + } + + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); + unlock_on_err = 1; + } + } + + /* If a downgrade to read-only mount is in progress, no other + * thread than the downgrade thread is allowed to modify + * the file system. + */ + if ((hfsmp->hfs_flags & HFS_RDONLY_DOWNGRADE) && + hfsmp->hfs_downgrading_thread != thread) { + ret = EROFS; + goto out; + } + + if (hfsmp->jnl) { + ret = journal_start_transaction(hfsmp->jnl); + } else { + ret = 0; + } + + if (ret == 0) + ++hfsmp->hfs_transaction_nesting; + +out: + if (ret != 0 && unlock_on_err) { + hfs_unlock_global (hfsmp); + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + } + + return ret; +} + +int +hfs_end_transaction(struct hfsmount *hfsmp) +{ + int ret; + + hfs_assert(!hfsmp->jnl || journal_owner(hfsmp->jnl) == current_thread()); + hfs_assert(hfsmp->hfs_transaction_nesting > 0); + + if (hfsmp->jnl && hfsmp->hfs_transaction_nesting == 1) + hfs_flushvolumeheader(hfsmp, HFS_FVH_FLUSH_IF_DIRTY); + + bool need_unlock = !--hfsmp->hfs_transaction_nesting; + + if (hfsmp->jnl) { + ret = journal_end_transaction(hfsmp->jnl); + } else { + ret = 0; + } + + if (need_unlock) { + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + hfs_unlock_global (hfsmp); + hfs_sync_ejectable(hfsmp); + } + + return ret; +} + + +void +hfs_journal_lock(struct hfsmount *hfsmp) +{ + /* Only peek at hfsmp->jnl while holding the global lock */ + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); + if (hfsmp->jnl) { + journal_lock(hfsmp->jnl); + } + hfs_unlock_global (hfsmp); +} + +void +hfs_journal_unlock(struct hfsmount *hfsmp) +{ + /* Only peek at hfsmp->jnl while holding the global lock */ + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); + if (hfsmp->jnl) { + journal_unlock(hfsmp->jnl); + } + hfs_unlock_global (hfsmp); +} + +/* + * Flush the contents of the journal to the disk. + * + * - HFS_FLUSH_JOURNAL + * Wait to write in-memory journal to the disk consistently. + * This means that the journal still contains uncommitted + * transactions and the file system metadata blocks in + * the journal transactions might be written asynchronously + * to the disk. But there is no guarantee that they are + * written to the disk before returning to the caller. + * Note that this option is sufficient for file system + * data integrity as it guarantees consistent journal + * content on the disk. + * + * - HFS_FLUSH_JOURNAL_META + * Wait to write in-memory journal to the disk + * consistently, and also wait to write all asynchronous + * metadata blocks to its corresponding locations + * consistently on the disk. This is overkill in normal + * scenarios but is useful whenever the metadata blocks + * are required to be consistent on-disk instead of + * just the journalbeing consistent; like before live + * verification and live volume resizing. The update of the + * metadata doesn't include a barrier of track cache flush. 
+ * + * - HFS_FLUSH_FULL + * HFS_FLUSH_JOURNAL + force a track cache flush to media + * + * - HFS_FLUSH_CACHE + * Force a track cache flush to media. + * + * - HFS_FLUSH_BARRIER + * Barrier-only flush to ensure write order + * + */ +errno_t hfs_flush(struct hfsmount *hfsmp, hfs_flush_mode_t mode) +{ + errno_t error = 0; + int options = 0; + dk_synchronize_t sync_req = { .options = DK_SYNCHRONIZE_OPTION_BARRIER }; + + switch (mode) { + case HFS_FLUSH_JOURNAL_META: + // wait for journal, metadata blocks and previous async flush to finish + SET(options, JOURNAL_WAIT_FOR_IO); + + // no break + + case HFS_FLUSH_JOURNAL: + case HFS_FLUSH_JOURNAL_BARRIER: + case HFS_FLUSH_FULL: + + if (mode == HFS_FLUSH_JOURNAL_BARRIER && + !(hfsmp->hfs_flags & HFS_FEATURE_BARRIER)) + mode = HFS_FLUSH_FULL; + + if (mode == HFS_FLUSH_FULL) + SET(options, JOURNAL_FLUSH_FULL); + + /* Only peek at hfsmp->jnl while holding the global lock */ + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); + + if (hfsmp->jnl) + error = journal_flush(hfsmp->jnl, options); + + hfs_unlock_global (hfsmp); + + /* + * This may result in a double barrier as + * journal_flush may have issued a barrier itself + */ + if (mode == HFS_FLUSH_JOURNAL_BARRIER) + error = VNOP_IOCTL(hfsmp->hfs_devvp, + DKIOCSYNCHRONIZE, (caddr_t)&sync_req, + FWRITE, NULL); + + break; + + case HFS_FLUSH_CACHE: + // Do a full sync + sync_req.options = 0; + + // no break + + case HFS_FLUSH_BARRIER: + // If barrier only flush doesn't support, fall back to use full flush. + if (!(hfsmp->hfs_flags & HFS_FEATURE_BARRIER)) + sync_req.options = 0; + + error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZE, (caddr_t)&sync_req, + FWRITE, NULL); + break; + + default: + error = EINVAL; + } + + return error; +} + +/* + * hfs_erase_unused_nodes + * + * Check wheter a volume may suffer from unused Catalog B-tree nodes that + * are not zeroed (due to ). If so, just write + * zeroes to the unused nodes. + * + * How do we detect when a volume needs this repair? We can't always be + * certain. If a volume was created after a certain date, then it may have + * been created with the faulty newfs_hfs. Since newfs_hfs only created one + * clump, we can assume that if a Catalog B-tree is larger than its clump size, + * that means that the entire first clump must have been written to, which means + * there shouldn't be unused and unwritten nodes in that first clump, and this + * repair is not needed. + * + * We have defined a bit in the Volume Header's attributes to indicate when the + * unused nodes have been repaired. A newer newfs_hfs will set this bit. + * As will fsck_hfs when it repairs the unused nodes. + */ +int hfs_erase_unused_nodes(struct hfsmount *hfsmp) +{ + int result; + struct filefork *catalog; + int lockflags; + + if (hfsmp->vcbAtrb & kHFSUnusedNodeFixMask) + { + /* This volume has already been checked and repaired. */ + return 0; + } + + if ((hfsmp->localCreateDate < kHFSUnusedNodesFixDate)) + { + /* This volume is too old to have had the problem. */ + hfsmp->vcbAtrb |= kHFSUnusedNodeFixMask; + return 0; + } + + catalog = hfsmp->hfs_catalog_cp->c_datafork; + if (catalog->ff_size > catalog->ff_clumpsize) + { + /* The entire first clump must have been in use at some point. */ + hfsmp->vcbAtrb |= kHFSUnusedNodeFixMask; + return 0; + } + + /* + * If we get here, we need to zero out those unused nodes. + * + * We start a transaction and lock the catalog since we're going to be + * making on-disk changes. 
But note that BTZeroUnusedNodes doens't actually + * do its writing via the journal, because that would be too much I/O + * to fit in a transaction, and it's a pain to break it up into multiple + * transactions. (It behaves more like growing a B-tree would.) + */ + printf("hfs_erase_unused_nodes: updating volume %s.\n", hfsmp->vcbVN); + result = hfs_start_transaction(hfsmp); + if (result) + goto done; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + result = BTZeroUnusedNodes(catalog); + vnode_waitforwrites(hfsmp->hfs_catalog_vp, 0, 0, 0, "hfs_erase_unused_nodes"); + hfs_systemfile_unlock(hfsmp, lockflags); + hfs_end_transaction(hfsmp); + if (result == 0) + hfsmp->vcbAtrb |= kHFSUnusedNodeFixMask; + printf("hfs_erase_unused_nodes: done updating volume %s.\n", hfsmp->vcbVN); + +done: + return result; +} + + +int +check_for_dataless_file(struct vnode *vp, uint64_t op_type) +{ + int error; + + if (vp == NULL || (VTOC(vp)->c_bsdflags & UF_COMPRESSED) == 0 || VTOCMP(vp) == NULL || decmpfs_cnode_cmp_type(VTOCMP(vp)) != DATALESS_CMPFS_TYPE) { + // there's nothing to do, it's not dataless + return 0; + } + + /* Swap files are special; ignore them */ + if (vnode_isswap(vp)) { + return 0; + } + + // printf("hfs: dataless: encountered a file with the dataless bit set! (vp %p)\n", vp); + error = resolve_nspace_item(vp, op_type | NAMESPACE_HANDLER_NSPACE_EVENT); + if (error == EDEADLK && op_type == NAMESPACE_HANDLER_WRITE_OP) { + error = 0; + } else if (error) { + if (error == EAGAIN) { + printf("hfs: dataless: timed out waiting for namespace handler...\n"); + // XXXdbg - return the fabled ENOTPRESENT (i.e. EJUKEBOX)? + return 0; + } else if (error == EINTR) { + // printf("hfs: dataless: got a signal while waiting for namespace handler...\n"); + return EINTR; + } + } else if (VTOC(vp)->c_bsdflags & UF_COMPRESSED) { + // + // if we're here, the dataless bit is still set on the file + // which means it didn't get handled. we return an error + // but it's presently ignored by all callers of this function. + // + // XXXdbg - EDATANOTPRESENT is what we really need... + // + return EBADF; + } + + return error; +} + + +// +// NOTE: this function takes care of starting a transaction and +// acquiring the systemfile lock so that it can call +// cat_update(). 
+// +// NOTE: do NOT hold and cnode locks while calling this function +// to avoid deadlocks (because we take a lock on the root +// cnode) +// +int +hfs_generate_document_id(struct hfsmount *hfsmp, uint32_t *docid) +{ + struct vnode *rvp; + struct cnode *cp; + int error; + + error = hfs_vfs_root(HFSTOVFS(hfsmp), &rvp, vfs_context_kernel()); + if (error) { + return error; + } + + cp = VTOC(rvp); + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)) != 0) { + return error; + } + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)((void *)((char *)&cp->c_attr.ca_finderinfo + 16)); + + int lockflags; + if ((error = hfs_start_transaction(hfsmp)) != 0) { + return error; + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + if (extinfo->document_id == 0) { + // initialize this to start at 3 (one greater than the root-dir id) + extinfo->document_id = 3; + } + + *docid = extinfo->document_id++; + + // mark the root cnode dirty + cp->c_flag |= C_MODIFIED; + hfs_update(cp->c_vp, 0); + + hfs_systemfile_unlock (hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + + (void) hfs_unlock(cp); + + vnode_put(rvp); + rvp = NULL; + + return 0; +} + + +/* + * Return information about number of file system allocation blocks + * taken by metadata on a volume. + * + * This function populates struct hfsinfo_metadata with allocation blocks + * used by extents overflow btree, catalog btree, bitmap, attribute btree, + * journal file, and sum of all of the above. + */ +int +hfs_getinfo_metadata_blocks(struct hfsmount *hfsmp, struct hfsinfo_metadata *hinfo) +{ + int lockflags = 0; + int ret_lockflags = 0; + + /* Zero out the output buffer */ + bzero(hinfo, sizeof(struct hfsinfo_metadata)); + + /* + * Getting number of allocation blocks for all btrees + * should be a quick operation, so we grab locks for + * all of them at the same time + */ + lockflags = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE; + ret_lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + /* + * Make sure that we were able to acquire all locks requested + * to protect us against conditions like unmount in progress. 
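[Editorial aside] The test that follows is a plain subset check on the lock bitmask; a worked example with hypothetical values showing the failure case it guards against:

    uint32_t requested = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP | SFL_ATTRIBUTE;
    uint32_t granted   = SFL_CATALOG | SFL_EXTENTS | SFL_BITMAP;  /* attribute lock not granted */
    /* (requested & granted) != requested, so the caller releases what it did get and returns EPERM. */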
+ */ + if ((lockflags & ret_lockflags) != lockflags) { + /* Release any locks that were acquired */ + hfs_systemfile_unlock(hfsmp, ret_lockflags); + return EPERM; + } + + /* Get information about all the btrees */ + hinfo->extents = hfsmp->hfs_extents_cp->c_datafork->ff_blocks; + hinfo->catalog = hfsmp->hfs_catalog_cp->c_datafork->ff_blocks; + hinfo->allocation = hfsmp->hfs_allocation_cp->c_datafork->ff_blocks; + hinfo->attribute = hfsmp->hfs_attribute_cp->c_datafork->ff_blocks; + + /* Done with btrees, give up the locks */ + hfs_systemfile_unlock(hfsmp, ret_lockflags); + + /* Get information about journal file */ + hinfo->journal = howmany(hfsmp->jnl_size, hfsmp->blockSize); + + /* Calculate total number of metadata blocks */ + hinfo->total = hinfo->extents + hinfo->catalog + + hinfo->allocation + hinfo->attribute + + hinfo->journal; + + return 0; +} + +static int +hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs) +{ + vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze 8"); + + return 0; +} + +int hfs_freeze(struct hfsmount *hfsmp) +{ + // First make sure some other process isn't freezing + hfs_lock_mount(hfsmp); + while (hfsmp->hfs_freeze_state != HFS_THAWED) { + if (msleep(&hfsmp->hfs_freeze_state, &hfsmp->hfs_mutex, + PWAIT | PCATCH, "hfs freeze 1", NULL) == EINTR) { + hfs_unlock_mount(hfsmp); + return EINTR; + } + } + + // Stop new syncers from starting + hfsmp->hfs_freeze_state = HFS_WANT_TO_FREEZE; + + // Now wait for all syncers to finish + while (hfsmp->hfs_syncers) { + if (msleep(&hfsmp->hfs_freeze_state, &hfsmp->hfs_mutex, + PWAIT | PCATCH, "hfs freeze 2", NULL) == EINTR) { + hfs_thaw_locked(hfsmp); + hfs_unlock_mount(hfsmp); + return EINTR; + } + } + hfs_unlock_mount(hfsmp); + + // flush things before we get started to try and prevent + // dirty data from being paged out while we're frozen. + // note: we can't do this once we're in the freezing state because + // other threads will need to take the global lock + vnode_iterate(hfsmp->hfs_mp, 0, hfs_freezewrite_callback, NULL); + + // Block everything in hfs_lock_global now + hfs_lock_mount(hfsmp); + hfsmp->hfs_freeze_state = HFS_FREEZING; + hfsmp->hfs_freezing_thread = current_thread(); + hfs_unlock_mount(hfsmp); + + /* Take the exclusive lock to flush out anything else that + might have the global lock at the moment and also so we + can flush the journal. */ + hfs_lock_global(hfsmp, HFS_EXCLUSIVE_LOCK); + journal_flush(hfsmp->jnl, JOURNAL_WAIT_FOR_IO); + hfs_unlock_global(hfsmp); + + // don't need to iterate on all vnodes, we just need to + // wait for writes to the system files and the device vnode + // + // Now that journal flush waits for all metadata blocks to + // be written out, waiting for btree writes is probably no + // longer required. 
+ if (HFSTOVCB(hfsmp)->extentsRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze 3"); + if (HFSTOVCB(hfsmp)->catalogRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze 4"); + if (HFSTOVCB(hfsmp)->allocationsRefNum) + vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze 5"); + if (hfsmp->hfs_attribute_vp) + vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze 6"); + vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze 7"); + + // We're done, mark frozen + hfs_lock_mount(hfsmp); + hfsmp->hfs_freeze_state = HFS_FROZEN; + hfsmp->hfs_freezing_proc = current_proc(); + hfs_unlock_mount(hfsmp); + + return 0; +} + +int hfs_thaw(struct hfsmount *hfsmp, const struct proc *process) +{ + hfs_lock_mount(hfsmp); + + if (hfsmp->hfs_freeze_state != HFS_FROZEN) { + hfs_unlock_mount(hfsmp); + return EINVAL; + } + if (process && hfsmp->hfs_freezing_proc != process) { + hfs_unlock_mount(hfsmp); + return EPERM; + } + + hfs_thaw_locked(hfsmp); + + hfs_unlock_mount(hfsmp); + + return 0; +} + +static void hfs_thaw_locked(struct hfsmount *hfsmp) +{ + hfsmp->hfs_freezing_proc = NULL; + hfsmp->hfs_freeze_state = HFS_THAWED; + + wakeup(&hfsmp->hfs_freeze_state); +} + +uintptr_t obfuscate_addr(void *addr) +{ + vm_offset_t new_addr; + vm_kernel_addrperm_external((vm_offset_t)addr, &new_addr); + return new_addr; +} + +#if CONFIG_HFS_STD +/* + * Convert HFS encoded string into UTF-8 + * + * Unicode output is fully decomposed + * '/' chars are converted to ':' + */ +int +hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDstLen, unsigned char* dstStr) +{ + int error; + UniChar uniStr[MAX_HFS_UNICODE_CHARS]; + ItemCount uniCount; + size_t utf8len; + hfs_to_unicode_func_t hfs_get_unicode = VCBTOHFS(vcb)->hfs_get_unicode; + u_int8_t pascal_length = 0; + + /* + * Validate the length of the Pascal-style string before passing it + * down to the decoding engine. 
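[Editorial aside] A Str31 is a Pascal-style string: one length byte followed by at most 31 data bytes. A small illustration, with a hypothetical value, of the layout the check below relies on:

    Str31 name = { 5, 'H', 'e', 'l', 'l', 'o' };  /* name[0] == 5 is the byte count; not NUL-terminated */
    /* A length byte greater than 31 can never describe a valid HFS name, hence the EINVAL below. */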
+ */ + pascal_length = *((const u_int8_t*)(hfs_str)); + if (pascal_length > 31) { + /* invalid string; longer than 31 bytes */ + error = EINVAL; + return error; + } + + error = hfs_get_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount); + + if (uniCount == 0) + error = EINVAL; + + if (error == 0) { + error = utf8_encodestr(uniStr, uniCount * sizeof(UniChar), dstStr, &utf8len, maxDstLen , ':', 0); + if (error == ENAMETOOLONG) + *actualDstLen = utf8_encodelen(uniStr, uniCount * sizeof(UniChar), ':', 0); + else + *actualDstLen = utf8len; + } + + return error; +} + +/* + * Convert UTF-8 string into HFS encoding + * + * ':' chars are converted to '/' + * Assumes input represents fully decomposed Unicode + */ +int +utf8_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, const unsigned char* srcStr, Str31 dstStr/*, int retry*/) +{ + int error; + UniChar uniStr[MAX_HFS_UNICODE_CHARS]; + size_t ucslen; + + error = utf8_decodestr(srcStr, srcLen, uniStr, &ucslen, sizeof(uniStr), ':', 0); + if (error == 0) + error = unicode_to_hfs(vcb, ucslen, uniStr, dstStr, 1); + + return error; +} + +/* + * Convert Unicode string into HFS encoding + * + * ':' chars are converted to '/' + * Assumes input represents fully decomposed Unicode + */ +int +unicode_to_hfs(ExtendedVCB *vcb, ByteCount srcLen, u_int16_t* srcStr, Str31 dstStr, int retry) +{ + int error; + unicode_to_hfs_func_t hfs_get_hfsname = VCBTOHFS(vcb)->hfs_get_hfsname; + + error = hfs_get_hfsname(srcStr, srcLen/sizeof(UniChar), dstStr); + if (error && retry) { + error = unicode_to_mac_roman(srcStr, srcLen/sizeof(UniChar), dstStr); + } + return error; +} + +#endif // CONFIG_HFS_STD + +static uint64_t hfs_allocated __attribute__((aligned(8))); + +#if HFS_MALLOC_DEBUG + +#warning HFS_MALLOC_DEBUG is on + +#include +#include "hfs_alloc_trace.h" + +struct alloc_debug_header { + uint32_t magic; + uint32_t size; + uint64_t sequence; + LIST_ENTRY(alloc_debug_header) chain; + void *backtrace[HFS_ALLOC_BACKTRACE_LEN]; +}; + +enum { + HFS_ALLOC_MAGIC = 0x68667361, // "hfsa" + HFS_ALLOC_DEAD = 0x68667364, // "hfsd" +}; + +static LIST_HEAD(, alloc_debug_header) hfs_alloc_list; +static lck_mtx_t *hfs_alloc_mtx; +static int hfs_alloc_tracing; +static uint64_t hfs_alloc_sequence; + +void hfs_alloc_trace_enable(void) +{ + if (hfs_alloc_tracing) + return; + + // Not thread-safe, but this is debug so who cares + extern lck_grp_t *hfs_mutex_group; + extern lck_attr_t *hfs_lock_attr; + + if (!hfs_alloc_mtx) { + hfs_alloc_mtx = lck_mtx_alloc_init(hfs_mutex_group, hfs_lock_attr); + LIST_INIT(&hfs_alloc_list); + } + + // Using OSCompareAndSwap in lieu of a barrier + OSCompareAndSwap(hfs_alloc_tracing, true, &hfs_alloc_tracing); +} + +void hfs_alloc_trace_disable(void) +{ + if (!hfs_alloc_tracing) + return; + + hfs_alloc_tracing = false; + + lck_mtx_lock_spin(hfs_alloc_mtx); + + struct alloc_debug_header *hdr; + LIST_FOREACH(hdr, &hfs_alloc_list, chain) { + hdr->chain.le_prev = NULL; + } + LIST_INIT(&hfs_alloc_list); + + lck_mtx_unlock(hfs_alloc_mtx); +} + +static int hfs_handle_alloc_tracing SYSCTL_HANDLER_ARGS +{ + int v = hfs_alloc_tracing; + + int err = sysctl_handle_int(oidp, &v, 0, req); + if (err || req->newptr == USER_ADDR_NULL || v == hfs_alloc_tracing) + return err; + + if (v) + hfs_alloc_trace_enable(); + else + hfs_alloc_trace_disable(); + + return 0; +} + +HFS_SYSCTL(PROC, _vfs_generic_hfs, OID_AUTO, alloc_tracing, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, + hfs_handle_alloc_tracing, "I", "Allocation tracing") + +static int hfs_handle_alloc_trace_info 
SYSCTL_HANDLER_ARGS +{ + if (!hfs_alloc_tracing) { + struct hfs_alloc_trace_info info = {}; + return sysctl_handle_opaque(oidp, &info, sizeof(info), req); + } + + const int size = 128 * 1024; + struct hfs_alloc_trace_info *info = kalloc(size); + + const int max_entries = ((size - sizeof(*info)) + / sizeof(struct hfs_alloc_info_entry)); + + info->entry_count = 0; + info->more = false; + + lck_mtx_lock_spin(hfs_alloc_mtx); + + struct alloc_debug_header *hdr; + LIST_FOREACH(hdr, &hfs_alloc_list, chain) { + if (info->entry_count == max_entries) { + info->more = true; + break; + } + vm_offset_t o; + vm_kernel_addrperm_external((vm_offset_t)hdr, &o); + info->entries[info->entry_count].ptr = o; + info->entries[info->entry_count].size = hdr->size; + info->entries[info->entry_count].sequence = hdr->sequence; + for (int i = 0; i < HFS_ALLOC_BACKTRACE_LEN; ++i) { + vm_kernel_unslide_or_perm_external((vm_offset_t)hdr->backtrace[i], &o); + info->entries[info->entry_count].backtrace[i] = o; + } + ++info->entry_count; + } + + lck_mtx_unlock(hfs_alloc_mtx); + + int err = sysctl_handle_opaque(oidp, info, + sizeof(*info) + info->entry_count + * sizeof(struct hfs_alloc_info_entry), + req); + + kfree(info, size); + + return err; +} + +HFS_SYSCTL(PROC, _vfs_generic_hfs, OID_AUTO, alloc_trace_info, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_LOCKED, NULL, 0, + hfs_handle_alloc_trace_info, "-", "Allocation trace info") + +bool hfs_dump_allocations(void) +{ + if (!hfs_allocated) + return false; + + lck_mtx_lock(hfs_alloc_mtx); + + struct alloc_debug_header *hdr; + LIST_FOREACH(hdr, &hfs_alloc_list, chain) { + vm_offset_t o; + vm_kernel_addrperm_external((vm_offset_t)hdr, &o); + printf(" -- 0x%lx:%llu <%u> --\n", o, hdr->sequence, hdr->size); + for (int j = 0; j < HFS_ALLOC_BACKTRACE_LEN && hdr->backtrace[j]; ++j) { + vm_kernel_unslide_or_perm_external((vm_offset_t)hdr->backtrace[j], &o); + printf("0x%lx\n", o); + } + } + + lck_mtx_unlock(hfs_alloc_mtx); + + return true; +} + +#endif + +HFS_SYSCTL(QUAD, _vfs_generic_hfs, OID_AUTO, allocated, + CTLFLAG_RD | CTLFLAG_LOCKED, &hfs_allocated, "Memory allocated") + +void *hfs_malloc(size_t size) +{ +#if HFS_MALLOC_DEBUG + hfs_assert(size <= 0xffffffff); + + struct alloc_debug_header *hdr; + + void *ptr; + ptr = kalloc(size + sizeof(*hdr)); + + hdr = ptr + size; + + hdr->magic = HFS_ALLOC_MAGIC; + hdr->size = size; + + if (hfs_alloc_tracing) { + OSBacktrace(hdr->backtrace, HFS_ALLOC_BACKTRACE_LEN); + lck_mtx_lock_spin(hfs_alloc_mtx); + LIST_INSERT_HEAD(&hfs_alloc_list, hdr, chain); + hdr->sequence = ++hfs_alloc_sequence; + lck_mtx_unlock(hfs_alloc_mtx); + } else + hdr->chain.le_prev = NULL; +#else + void *ptr; + ptr = kalloc(size); +#endif + + OSAddAtomic64(size, &hfs_allocated); + + return ptr; +} + +void hfs_free(void *ptr, size_t size) +{ + if (!ptr) + return; + + OSAddAtomic64(-(int64_t)size, &hfs_allocated); + +#if HFS_MALLOC_DEBUG + struct alloc_debug_header *hdr = ptr + size; + + hfs_assert(hdr->magic == HFS_ALLOC_MAGIC); + hfs_assert(hdr->size == size); + + hdr->magic = HFS_ALLOC_DEAD; + + if (hdr->chain.le_prev) { + lck_mtx_lock_spin(hfs_alloc_mtx); + LIST_REMOVE(hdr, chain); + lck_mtx_unlock(hfs_alloc_mtx); + } + + kfree(ptr, size + sizeof(*hdr)); +#else + kfree(ptr, size); +#endif +} + +void *hfs_mallocz(size_t size) +{ + void *ptr = hfs_malloc(size); + bzero(ptr, size); + return ptr; +} + +// -- Zone allocator-related structures and routines -- + +hfs_zone_entry_t hfs_zone_entries[HFS_NUM_ZONES] = { + { HFS_CNODE_ZONE, sizeof(struct cnode), "HFS node", true }, + 
{ HFS_FILEFORK_ZONE, sizeof(struct filefork), "HFS fork", true }, + { HFS_DIRHINT_ZONE, sizeof(struct directoryhint), "HFS dirhint", true } +}; + +hfs_zone_t hfs_zones[HFS_NUM_ZONES]; + +void hfs_init_zones(void) { + for (int i = 0; i < HFS_NUM_ZONES; i++) { + hfs_zones[i].hz_zone = zinit(hfs_zone_entries[i].hze_elem_size, 1024 * 1024, PAGE_SIZE, hfs_zone_entries[i].hze_name); + hfs_zones[i].hz_elem_size = hfs_zone_entries[i].hze_elem_size; + + zone_change(hfs_zones[i].hz_zone, Z_CALLERACCT, false); + if (hfs_zone_entries[i].hze_noencrypt) + zone_change(hfs_zones[i].hz_zone, Z_NOENCRYPT, true); + } +} + +void *hfs_zalloc(hfs_zone_kind_t zone) +{ + OSAddAtomic64(hfs_zones[zone].hz_elem_size, &hfs_allocated); + + return zalloc(hfs_zones[zone].hz_zone); +} + +void hfs_zfree(void *ptr, hfs_zone_kind_t zone) +{ + OSAddAtomic64(-(int64_t)hfs_zones[zone].hz_elem_size, &hfs_allocated); + + zfree(hfs_zones[zone].hz_zone, ptr); +} + +struct hfs_sysctl_chain *sysctl_list; + +void hfs_sysctl_register(void) +{ + struct hfs_sysctl_chain *e = sysctl_list; + while (e) { + sysctl_register_oid(e->oid); + e = e->next; + } +} + +void hfs_sysctl_unregister(void) +{ + struct hfs_sysctl_chain *e = sysctl_list; + while (e) { + sysctl_unregister_oid(e->oid); + e = e->next; + } +} + +void hfs_assert_fail(const char *file, unsigned line, const char *expr) +{ + Assert(file, line, expr); + __builtin_unreachable(); +} diff --git a/core/hfs_vnops.c b/core/hfs_vnops.c new file mode 100644 index 0000000..db88785 --- /dev/null +++ b/core/hfs_vnops.c @@ -0,0 +1,7622 @@ +/* + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "hfs.h" +#include "hfs_catalog.h" +#include "hfs_cnode.h" +#include "hfs_dbg.h" +#include "hfs_mount.h" +#include "hfs_quota.h" +#include "hfs_endian.h" +#include "hfs_kdebug.h" +#include "hfs_cprotect.h" + +#if HFS_CONFIG_KEY_ROLL +#include "hfs_key_roll.h" +#endif + +#include "BTreesInternal.h" +#include "FileMgrInternal.h" + +/* Global vfs data structures for hfs */ + +/* + * Always F_FULLFSYNC? 1=yes,0=no (default due to "various" reasons is + * 'no'). At some point this might need to move into VFS and we might + * need to provide an API to get at it, but for now, this is only used + * by HFS+. + */ +int always_do_fullfsync = 0; +SYSCTL_DECL(_vfs_generic); +HFS_SYSCTL(INT, _vfs_generic, OID_AUTO, always_do_fullfsync, CTLFLAG_RW | CTLFLAG_LOCKED, &always_do_fullfsync, 0, "always F_FULLFSYNC when fsync is called") + +int hfs_makenode(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct vnode_attr *vap, + vfs_context_t ctx); +int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p); +int hfs_metasync_all(struct hfsmount *hfsmp); + +int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, + int, int); +int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, + int, int, int, struct vnode *, int); + +/* Used here and in cnode teardown -- for symlinks */ +int hfs_removefile_callback(struct buf *bp, void *hfsmp); + +enum { + HFS_MOVE_DATA_INCLUDE_RSRC = 1, +}; +typedef uint32_t hfs_move_data_options_t; + +static int hfs_move_data(cnode_t *from_cp, cnode_t *to_cp, + hfs_move_data_options_t options); +static int hfs_move_fork(filefork_t *srcfork, cnode_t *src, + filefork_t *dstfork, cnode_t *dst); + + +static int hfs_exchangedata_getxattr (struct vnode *vp, uint32_t name_selector, void **buffer, size_t *xattr_size); +static int hfs_exchangedata_setxattr (struct hfsmount *hfsmp, uint32_t fileid, + uint32_t name_selector, void *buffer, size_t xattr_size); + +enum XATTR_NAME_ENTRIES { + quarantine = 0, + MAX_NUM_XATTR_NAMES //must be last +}; + + +/* These are special EAs that follow the content in exchangedata(2). 
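[Editorial aside, a sketch under assumptions] Given the declarations above, the quarantine EA presumably travels with the exchanged content along these lines; from_vp, to_fileid and the surrounding error handling are hypothetical, not taken from the exchange path itself:

    void  *quarantine_buf = NULL;
    size_t quarantine_len = 0;
    if (hfs_exchangedata_getxattr(from_vp, quarantine, &quarantine_buf, &quarantine_len) == 0) {
        /* ... after the fork contents have been exchanged ... */
        (void) hfs_exchangedata_setxattr(hfsmp, to_fileid, quarantine,
                                         quarantine_buf, quarantine_len);
    }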
*/ +const char *XATTR_NAMES [MAX_NUM_XATTR_NAMES] = { "com.apple.quarantine" }; + +#define MAX_EXCHANGE_EA_SIZE 4096 + +#if HFS_COMPRESSION +static int hfs_move_compressed(cnode_t *from_vp, cnode_t *to_vp); +#endif + +decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp); + +#if FIFO +static int hfsfifo_read(struct vnop_read_args *); +static int hfsfifo_write(struct vnop_write_args *); +static int hfsfifo_close(struct vnop_close_args *); + +extern int (**fifo_vnodeop_p)(void *); +#endif /* FIFO */ + +int hfs_vnop_close(struct vnop_close_args*); +int hfs_vnop_exchange(struct vnop_exchange_args*); +int hfs_vnop_fsync(struct vnop_fsync_args*); +int hfs_vnop_mkdir(struct vnop_mkdir_args*); +int hfs_vnop_mknod(struct vnop_mknod_args*); +int hfs_vnop_getattr(struct vnop_getattr_args*); +int hfs_vnop_open(struct vnop_open_args*); +int hfs_vnop_readdir(struct vnop_readdir_args*); +int hfs_vnop_rename(struct vnop_rename_args*); +int hfs_vnop_renamex(struct vnop_renamex_args*); +int hfs_vnop_rmdir(struct vnop_rmdir_args*); +int hfs_vnop_symlink(struct vnop_symlink_args*); +int hfs_vnop_setattr(struct vnop_setattr_args*); +int hfs_vnop_readlink(struct vnop_readlink_args *); +int hfs_vnop_pathconf(struct vnop_pathconf_args *); +int hfs_vnop_mmap(struct vnop_mmap_args *ap); +int hfsspec_read(struct vnop_read_args *); +int hfsspec_write(struct vnop_write_args *); +int hfsspec_close(struct vnop_close_args *); + +/* Options for hfs_removedir and hfs_removefile */ +#define HFSRM_SKIP_RESERVE 0x01 + + + +/***************************************************************************** +* +* Common Operations on vnodes +* +*****************************************************************************/ + +/* + * Is the given cnode either the .journal or .journal_info_block file on + * a volume with an active journal? Many VNOPs use this to deny access + * to those files. + * + * Note: the .journal file on a volume with an external journal still + * returns true here, even though it does not actually hold the contents + * of the volume's journal. + */ +bool +hfs_is_journal_file(struct hfsmount *hfsmp, struct cnode *cp) +{ + if (hfsmp->jnl != NULL && + (cp->c_fileid == hfsmp->hfs_jnlinfoblkid || + cp->c_fileid == hfsmp->hfs_jnlfileid)) { + return true; + } else { + return false; + } +} + +/* + * Create a regular file. + */ +int +hfs_vnop_create(struct vnop_create_args *ap) +{ + /* + * We leave handling of certain race conditions here to the caller + * which will have a better understanding of the semantics it + * requires. For example, if it turns out that the file exists, + * it would be wrong of us to return a reference to the existing + * file because the caller might not want that and it would be + * misleading to suggest the file had been created when it hadn't + * been. Note that our NFS server code does not set the + * VA_EXCLUSIVE flag so you cannot assume that callers don't want + * EEXIST errors if it's not set. The common case, where users + * are calling open with the O_CREAT mode, is handled in VFS; when + * we return EEXIST, it will loop and do the look-up again. + */ + return hfs_makenode(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap, ap->a_context); +} + +/* + * Make device special file. 
+ */ +int +hfs_vnop_mknod(struct vnop_mknod_args *ap) +{ + struct vnode_attr *vap = ap->a_vap; + struct vnode *dvp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + struct cnode *cp; + int error; + + if (VTOVCB(dvp)->vcbSigWord != kHFSPlusSigWord) { + return (ENOTSUP); + } + + /* Create the vnode */ + error = hfs_makenode(dvp, vpp, ap->a_cnp, vap, ap->a_context); + if (error) + return (error); + + cp = VTOC(*vpp); + cp->c_touch_acctime = TRUE; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + + if ((vap->va_rdev != VNOVAL) && + (vap->va_type == VBLK || vap->va_type == VCHR)) + cp->c_rdev = vap->va_rdev; + + return (0); +} + +#if HFS_COMPRESSION +/* + * hfs_ref_data_vp(): returns the data fork vnode for a given cnode. + * In the (hopefully rare) case where the data fork vnode is not + * present, it will use hfs_vget() to create a new vnode for the + * data fork. + * + * NOTE: If successful and a vnode is returned, the caller is responsible + * for releasing the returned vnode with vnode_rele(). + */ +static int +hfs_ref_data_vp(struct cnode *cp, struct vnode **data_vp, int skiplock) +{ + int vref = 0; + + if (!data_vp || !cp) /* sanity check incoming parameters */ + return EINVAL; + + /* maybe we should take the hfs cnode lock here, and if so, use the skiplock parameter to tell us not to */ + + if (!skiplock) hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + struct vnode *c_vp = cp->c_vp; + if (c_vp) { + /* we already have a data vnode */ + *data_vp = c_vp; + vref = vnode_ref(*data_vp); + if (!skiplock) hfs_unlock(cp); + if (vref == 0) { + return 0; + } + return EINVAL; + } + /* no data fork vnode in the cnode, so ask hfs for one. */ + + if (!cp->c_rsrc_vp) { + /* if we don't have either a c_vp or c_rsrc_vp, we can't really do anything useful */ + *data_vp = NULL; + if (!skiplock) hfs_unlock(cp); + return EINVAL; + } + + if (0 == hfs_vget(VTOHFS(cp->c_rsrc_vp), cp->c_cnid, data_vp, 1, 0) && + 0 != data_vp) { + vref = vnode_ref(*data_vp); + vnode_put(*data_vp); + if (!skiplock) hfs_unlock(cp); + if (vref == 0) { + return 0; + } + return EINVAL; + } + /* there was an error getting the vnode */ + *data_vp = NULL; + if (!skiplock) hfs_unlock(cp); + return EINVAL; +} + +/* + * hfs_lazy_init_decmpfs_cnode(): returns the decmpfs_cnode for a cnode, + * allocating it if necessary; returns NULL if there was an allocation error. + * function is non-static so that it can be used from the FCNTL handler. + */ +decmpfs_cnode * +hfs_lazy_init_decmpfs_cnode(struct cnode *cp) +{ + if (!cp->c_decmp) { + decmpfs_cnode *dp = decmpfs_cnode_alloc(); + decmpfs_cnode_init(dp); + if (!OSCompareAndSwapPtr(NULL, dp, (void * volatile *)&cp->c_decmp)) { + /* another thread got here first, so free the decmpfs_cnode we allocated */ + decmpfs_cnode_destroy(dp); + decmpfs_cnode_free(dp); + } + } + + return cp->c_decmp; +} + +/* + * hfs_file_is_compressed(): returns 1 if the file is compressed, and 0 (zero) if not. + * if the file's compressed flag is set, makes sure that the decmpfs_cnode field + * is allocated by calling hfs_lazy_init_decmpfs_cnode(), then makes sure it is populated, + * or else fills it in via the decmpfs_file_is_compressed() function. + */ +int +hfs_file_is_compressed(struct cnode *cp, int skiplock) +{ + int ret = 0; + + /* fast check to see if file is compressed. 
If flag is clear, just answer no */ + if (!(cp->c_bsdflags & UF_COMPRESSED)) { + return 0; + } + + decmpfs_cnode *dp = hfs_lazy_init_decmpfs_cnode(cp); + if (!dp) { + /* error allocating a decmpfs cnode, treat the file as uncompressed */ + return 0; + } + + /* flag was set, see if the decmpfs_cnode state is valid (zero == invalid) */ + uint32_t decmpfs_state = decmpfs_cnode_get_vnode_state(dp); + switch(decmpfs_state) { + case FILE_IS_COMPRESSED: + case FILE_IS_CONVERTING: /* treat decompressing files as if they are compressed */ + return 1; + case FILE_IS_NOT_COMPRESSED: + return 0; + /* otherwise the state is not cached yet */ + } + + /* decmpfs hasn't seen this file yet, so call decmpfs_file_is_compressed() to init the decmpfs_cnode struct */ + struct vnode *data_vp = NULL; + if (0 == hfs_ref_data_vp(cp, &data_vp, skiplock)) { + if (data_vp) { + ret = decmpfs_file_is_compressed(data_vp, VTOCMP(data_vp)); // fill in decmpfs_cnode + vnode_rele(data_vp); + } + } + return ret; +} + +/* hfs_uncompressed_size_of_compressed_file() - get the uncompressed size of the file. + * if the caller has passed a valid vnode (has a ref count > 0), then hfsmp and fid are not required. + * if the caller doesn't have a vnode, pass NULL in vp, and pass valid hfsmp and fid. + * files size is returned in size (required) + * if the indicated file is a directory (or something that doesn't have a data fork), then this call + * will return an error and the caller should fall back to treating the item as an uncompressed file + */ +int +hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *vp, cnid_t fid, off_t *size, int skiplock) +{ + int ret = 0; + int putaway = 0; /* flag to remember if we used hfs_vget() */ + + if (!size) { + return EINVAL; /* no place to put the file size */ + } + + if (NULL == vp) { + if (!hfsmp || !fid) { /* make sure we have the required parameters */ + return EINVAL; + } + if (0 != hfs_vget(hfsmp, fid, &vp, skiplock, 0)) { /* vnode is null, use hfs_vget() to get it */ + vp = NULL; + } else { + putaway = 1; /* note that hfs_vget() was used to aquire the vnode */ + } + } + /* this double check for compression (hfs_file_is_compressed) + * ensures the cached size is present in case decmpfs hasn't + * encountered this node yet. + */ + if (vp) { + if (hfs_file_is_compressed(VTOC(vp), skiplock) ) { + *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ + } else if (VTOCMP(vp)) { + uint32_t cmp_type = decmpfs_cnode_cmp_type(VTOCMP(vp)); + + if (cmp_type == DATALESS_CMPFS_TYPE) { + *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ + ret = 0; + } else if (cmp_type >= CMP_MAX && VTOC(vp)->c_datafork) { + // if we don't recognize this type, just use the real data fork size + *size = VTOC(vp)->c_datafork->ff_size; + ret = 0; + } else + ret = EINVAL; + } else + ret = EINVAL; + } + + if (putaway) { /* did we use hfs_vget() to get this vnode? 
*/ + vnode_put(vp); /* if so, release it and set it to null */ + vp = NULL; + } + return ret; +} + +int +hfs_hides_rsrc(vfs_context_t ctx, struct cnode *cp, int skiplock) +{ + if (ctx == decmpfs_ctx) + return 0; + if (!hfs_file_is_compressed(cp, skiplock)) + return 0; + return decmpfs_hides_rsrc(ctx, cp->c_decmp); +} + +int +hfs_hides_xattr(vfs_context_t ctx, struct cnode *cp, const char *name, int skiplock) +{ + if (ctx == decmpfs_ctx) + return 0; + if (!hfs_file_is_compressed(cp, skiplock)) + return 0; + return decmpfs_hides_xattr(ctx, cp->c_decmp, name); +} +#endif /* HFS_COMPRESSION */ + +/* + * Open a file/directory. + */ +int +hfs_vnop_open(struct vnop_open_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct filefork *fp; + struct timeval tv; + int error; + static int past_bootup = 0; + struct cnode *cp = VTOC(vp); + struct hfsmount *hfsmp = VTOHFS(vp); + +#if CONFIG_PROTECT + error = cp_handle_open(vp, ap->a_mode); + if (error) + return error; +#endif + +#if HFS_COMPRESSION + if (ap->a_mode & FWRITE) { + /* open for write */ + if ( hfs_file_is_compressed(cp, 1) ) { /* 1 == don't take the cnode lock */ + /* opening a compressed file for write, so convert it to decompressed */ + struct vnode *data_vp = NULL; + error = hfs_ref_data_vp(cp, &data_vp, 1); /* 1 == don't take the cnode lock */ + if (0 == error) { + if (data_vp) { + error = decmpfs_decompress_file(data_vp, VTOCMP(data_vp), -1, 1, 0); + vnode_rele(data_vp); + } else { + error = EINVAL; + } + } + if (error != 0) + return error; + } + } else { + /* open for read */ + if (hfs_file_is_compressed(cp, 1) ) { /* 1 == don't take the cnode lock */ + if (VNODE_IS_RSRC(vp)) { + /* opening the resource fork of a compressed file, so nothing to do */ + } else { + /* opening a compressed file for read, make sure it validates */ + error = decmpfs_validate_compressed_file(vp, VTOCMP(vp)); + if (error != 0) + return error; + } + } + } +#endif + + /* + * Files marked append-only must be opened for appending. + */ + if ((cp->c_bsdflags & APPEND) && !vnode_isdir(vp) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + + if (vnode_issystem(vp)) + return (EBUSY); /* file is in use by the kernel */ + + /* Don't allow journal to be opened externally. 
*/ + if (hfs_is_journal_file(hfsmp, cp)) + return (EPERM); + + bool have_lock = false; + +#if CONFIG_PROTECT + if (ISSET(ap->a_mode, FENCRYPTED) && cp->c_cpentry && vnode_isreg(vp)) { + bool have_trunc_lock = false; + +#if HFS_CONFIG_KEY_ROLL + again: +#endif + + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + if (have_trunc_lock) + hfs_unlock_truncate(cp, 0); + return error; + } + + have_lock = true; + + if (cp->c_cpentry->cp_raw_open_count + 1 + < cp->c_cpentry->cp_raw_open_count) { + // Overflow; too many raw opens on this file + hfs_unlock(cp); + if (have_trunc_lock) + hfs_unlock_truncate(cp, 0); + return ENFILE; + } + +#if HFS_CONFIG_KEY_ROLL + if (cp_should_auto_roll(hfsmp, cp->c_cpentry)) { + if (!have_trunc_lock) { + hfs_unlock(cp); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, 0); + have_trunc_lock = true; + goto again; + } + + error = hfs_key_roll_start(cp); + if (error) { + hfs_unlock(cp); + hfs_unlock_truncate(cp, 0); + return error; + } + } +#endif + + if (have_trunc_lock) + hfs_unlock_truncate(cp, 0); + + ++cp->c_cpentry->cp_raw_open_count; + } +#endif + + if (ISSET(hfsmp->hfs_flags, HFS_READ_ONLY) + || !vnode_isreg(vp) +#if NAMEDSTREAMS + || vnode_isnamedstream(vp) +#endif + || !hfsmp->jnl || vnode_isinuse(vp, 0)) { + +#if CONFIG_PROTECT + if (have_lock) + hfs_unlock(cp); +#endif + + return (0); + } + + if (!have_lock && (error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) + return (error); + +#if QUOTA + /* If we're going to write to the file, initialize quotas. */ + if ((ap->a_mode & FWRITE) && (hfsmp->hfs_flags & HFS_QUOTAS)) + (void)hfs_getinoquota(cp); +#endif /* QUOTA */ + + /* + * On the first (non-busy) open of a fragmented + * file attempt to de-frag it, if it's less than hfs_defrag_max bytes. + * That field is initially set to 20MB. + */ + fp = VTOF(vp); + if (fp->ff_blocks && + fp->ff_extents[7].blockCount != 0 && + fp->ff_size <= hfsmp->hfs_defrag_max) { + + int no_mods = 0; + struct timeval now; + /* + * Wait until system bootup is done (3 min). + * And don't relocate a file that's been modified + * within the past minute -- this can lead to + * system thrashing. + */ + + if (hfsmp->hfs_defrag_nowait) { + /* If this is toggled, then issue the defrag if appropriate */ + past_bootup = 1; + no_mods = 1; + } + + if (!past_bootup) { + microuptime(&tv); + if (tv.tv_sec > (60*3)) { + past_bootup = 1; + } + } + + microtime(&now); + if ((now.tv_sec - cp->c_mtime) > 60) { + no_mods = 1; + } + + if (past_bootup && no_mods) { + (void) hfs_relocate(vp, hfsmp->nextAllocation + 4096, + vfs_context_ucred(ap->a_context), + vfs_context_proc(ap->a_context)); + } + } + + hfs_unlock(cp); + + return (0); +} + + +/* + * Close a file/directory. + */ +int +hfs_vnop_close(struct vnop_close_args *ap) +{ + register struct vnode *vp = ap->a_vp; + register struct cnode *cp; + struct proc *p = vfs_context_proc(ap->a_context); + struct hfsmount *hfsmp; + int busy; + int tooktrunclock = 0; + int knownrefs = 0; + + if ( hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) + return (0); + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + +#if CONFIG_PROTECT + if (cp->c_cpentry && ISSET(ap->a_fflag, FENCRYPTED) && vnode_isreg(vp)) { + hfs_assert(cp->c_cpentry->cp_raw_open_count > 0); + --cp->c_cpentry->cp_raw_open_count; + } +#endif + + /* + * If the rsrc fork is a named stream, it can cause the data fork to + * stay around, preventing de-allocation of these blocks. + * Do checks for truncation on close. Purge extra extents if they exist. 
+ * Make sure the vp is not a directory, and that it has a resource fork, + * and that resource fork is also a named stream. + */ + + if ((vnode_vtype(vp) == VREG) && (cp->c_rsrc_vp) + && (vnode_isnamedstream(cp->c_rsrc_vp))) { + uint32_t blks; + + blks = howmany(VTOF(vp)->ff_size, VTOVCB(vp)->blockSize); + /* + * If there are extra blocks and there are only 2 refs on + * this vp (ourselves + rsrc fork holding ref on us), go ahead + * and try to truncate. + */ + if ((blks < VTOF(vp)->ff_blocks) && (!vnode_isinuse(vp, 2))) { + // release cnode lock; must acquire truncate lock BEFORE cnode lock + hfs_unlock(cp); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + tooktrunclock = 1; + + if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) != 0) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + // bail out if we can't re-acquire cnode lock + return 0; + } + // now re-test to make sure it's still valid + if (cp->c_rsrc_vp) { + knownrefs = 1 + vnode_isnamedstream(cp->c_rsrc_vp); + if (!vnode_isinuse(vp, knownrefs)){ + // now we can truncate the file, if necessary + blks = howmany(VTOF(vp)->ff_size, VTOVCB(vp)->blockSize); + if (blks < VTOF(vp)->ff_blocks){ + (void) hfs_truncate(vp, VTOF(vp)->ff_size, IO_NDELAY, + 0, ap->a_context); + } + } + } + } + } + + + // if we froze the fs and we're exiting, then "thaw" the fs + if (hfsmp->hfs_freeze_state == HFS_FROZEN + && hfsmp->hfs_freezing_proc == p && proc_exiting(p)) { + hfs_thaw(hfsmp, p); + } + + busy = vnode_isinuse(vp, 1); + + if (busy) { + hfs_touchtimes(VTOHFS(vp), cp); + } + if (vnode_isdir(vp)) { + hfs_reldirhints(cp, busy); + } else if (vnode_issystem(vp) && !busy) { + vnode_recycle(vp); + } + + if (tooktrunclock){ + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + hfs_unlock(cp); + + if (ap->a_fflag & FWASWRITTEN) { + hfs_sync_ejectable(hfsmp); + } + + return (0); +} + +static bool hfs_should_generate_document_id(hfsmount_t *hfsmp, cnode_t *cp) +{ + return (!ISSET(hfsmp->hfs_flags, HFS_READ_ONLY) + && ISSET(cp->c_bsdflags, UF_TRACKED) + && cp->c_desc.cd_cnid != kHFSRootFolderID + && (S_ISDIR(cp->c_mode) || S_ISREG(cp->c_mode) || S_ISLNK(cp->c_mode))); +} + +/* + * Get basic attributes. 
+ */ +int +hfs_vnop_getattr(struct vnop_getattr_args *ap) +{ +#define VNODE_ATTR_TIMES \ + (VNODE_ATTR_va_access_time|VNODE_ATTR_va_change_time|VNODE_ATTR_va_modify_time) +#define VNODE_ATTR_AUTH \ + (VNODE_ATTR_va_mode | VNODE_ATTR_va_uid | VNODE_ATTR_va_gid | \ + VNODE_ATTR_va_flags | VNODE_ATTR_va_acl) + + struct vnode *vp = ap->a_vp; + struct vnode_attr *vap = ap->a_vap; + struct vnode *rvp = NULLVP; + struct hfsmount *hfsmp; + struct cnode *cp; + uint64_t data_size; + enum vtype v_type; + int error = 0; + cp = VTOC(vp); + +#if HFS_COMPRESSION + /* we need to inspect the decmpfs state of the file before we take the hfs cnode lock */ + int compressed = 0; + int hide_size = 0; + off_t uncompressed_size = -1; + if (VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_alloc) || VATTR_IS_ACTIVE(vap, va_data_alloc) || VATTR_IS_ACTIVE(vap, va_total_size)) { + /* we only care about whether the file is compressed if asked for the uncompressed size */ + if (VNODE_IS_RSRC(vp)) { + /* if it's a resource fork, decmpfs may want us to hide the size */ + hide_size = hfs_hides_rsrc(ap->a_context, cp, 0); + } else { + /* if it's a data fork, we need to know if it was compressed so we can report the uncompressed size */ + compressed = hfs_file_is_compressed(cp, 0); + } + if ((VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_size))) { + // if it's compressed + if (compressed || (!VNODE_IS_RSRC(vp) && cp->c_decmp && decmpfs_cnode_cmp_type(cp->c_decmp) >= CMP_MAX)) { + if (0 != hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0)) { + /* failed to get the uncompressed size, we'll check for this later */ + uncompressed_size = -1; + } else { + // fake that it's compressed + compressed = 1; + } + } + } + } +#endif + + /* + * Shortcut for vnode_authorize path. Each of the attributes + * in this set is updated atomically so we don't need to take + * the cnode lock to access them. + */ + if ((vap->va_active & ~VNODE_ATTR_AUTH) == 0) { + /* Make sure file still exists. */ + if (cp->c_flag & C_NOEXISTS) + return (ENOENT); + + vap->va_uid = cp->c_uid; + vap->va_gid = cp->c_gid; + vap->va_mode = cp->c_mode; + vap->va_flags = cp->c_bsdflags; + vap->va_supported |= VNODE_ATTR_AUTH & ~VNODE_ATTR_va_acl; + + if ((cp->c_attr.ca_recflags & kHFSHasSecurityMask) == 0) { + vap->va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; + VATTR_SET_SUPPORTED(vap, va_acl); + } + + return (0); + } + + hfsmp = VTOHFS(vp); + v_type = vnode_vtype(vp); + + if (VATTR_IS_ACTIVE(vap, va_document_id)) { + uint32_t document_id; + + if (cp->c_desc.cd_cnid == kHFSRootFolderID) + document_id = kHFSRootFolderID; + else { + /* + * This is safe without a lock because we're just reading + * a 32 bit aligned integer which should be atomic on all + * platforms we support. 
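[Editorial aside] The code that follows is a check / generate / re-check pattern; a condensed sketch of its shape, using the same names as the body below and with error handling elided:

    uint32_t document_id = hfs_get_document_id(cp);             /* unlocked, possibly racy read */
    if (!document_id && hfs_should_generate_document_id(hfsmp, cp)) {
        uint32_t new_id;
        hfs_generate_document_id(hfsmp, &new_id);                /* may block; no cnode lock held */
        hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
        if (!hfs_get_document_id(cp))                            /* another thread may have raced us */
            cp->c_attr.ca_finderextendeddirinfo.document_id = document_id = new_id;
        hfs_unlock(cp);
    }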
+ */ + document_id = hfs_get_document_id(cp); + + if (!document_id && hfs_should_generate_document_id(hfsmp, cp)) { + uint32_t new_document_id; + + error = hfs_generate_document_id(hfsmp, &new_document_id); + if (error) + return error; + + error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) + return error; + + bool want_docid_fsevent = false; + + // Need to check again now that we have the lock + document_id = hfs_get_document_id(cp); + if (!document_id && hfs_should_generate_document_id(hfsmp, cp)) { + cp->c_attr.ca_finderextendeddirinfo.document_id = document_id = new_document_id; + want_docid_fsevent = true; + SET(cp->c_flag, C_MODIFIED); + } + + hfs_unlock(cp); + + if (want_docid_fsevent) { + add_fsevent(FSE_DOCID_CHANGED, ap->a_context, + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)0, // src inode # + FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # + FSE_ARG_INT32, document_id, + FSE_ARG_DONE); + + if (need_fsevent(FSE_STAT_CHANGED, vp)) { + add_fsevent(FSE_STAT_CHANGED, ap->a_context, + FSE_ARG_VNODE, vp, FSE_ARG_DONE); + } + } + } + } + + vap->va_document_id = document_id; + VATTR_SET_SUPPORTED(vap, va_document_id); + } + + /* + * If time attributes are requested and we have cnode times + * that require updating, then acquire an exclusive lock on + * the cnode before updating the times. Otherwise we can + * just acquire a shared lock. + */ + if ((vap->va_active & VNODE_ATTR_TIMES) && + (cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime)) { + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) + return (error); + hfs_touchtimes(hfsmp, cp); + + // downgrade to a shared lock since that's all we need from here on out + cp->c_lockowner = HFS_SHARED_OWNER; + lck_rw_lock_exclusive_to_shared(&cp->c_rwlock); + + } else if ((error = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { + return (error); + } + + if (v_type == VDIR) { + data_size = (cp->c_entries + 2) * AVERAGE_HFSDIRENTRY_SIZE; + + if (VATTR_IS_ACTIVE(vap, va_nlink)) { + int nlink; + + /* + * For directories, the va_nlink is esentially a count + * of the ".." references to a directory plus the "." + * reference and the directory itself. So for HFS+ this + * becomes the sub-directory count plus two. + * + * In the absence of a sub-directory count we use the + * directory's item count. This will be too high in + * most cases since it also includes files. + */ + if ((hfsmp->hfs_flags & HFS_FOLDERCOUNT) && + (cp->c_attr.ca_recflags & kHFSHasFolderCountMask)) + nlink = cp->c_attr.ca_dircount; /* implied ".." entries */ + else + nlink = cp->c_entries; + + /* Account for ourself and our "." entry */ + nlink += 2; + /* Hide our private directories. */ + if (cp->c_cnid == kHFSRootFolderID) { + if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) { + --nlink; + } + if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) { + --nlink; + } + } + VATTR_RETURN(vap, va_nlink, (u_int64_t)nlink); + } + if (VATTR_IS_ACTIVE(vap, va_nchildren)) { + int entries; + + entries = cp->c_entries; + /* Hide our private files and directories. 
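[Editorial aside] A worked example, with made-up counts, of the directory link-count math above and the child-count adjustment that follows:

    int nlink = 10;   /* ca_dircount: real sub-directories of the root folder */
    nlink += 2;       /* account for "." and the implied ".." references */
    nlink -= 2;       /* both private hard-link directories exist, so hide them */
    /* getattr reports va_nlink == 10; va_nchildren is reduced the same way, and by two more
       entries for the journal files when the volume is journaled. */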
*/ + if (cp->c_cnid == kHFSRootFolderID) { + if (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid != 0) + --entries; + if (hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid != 0) + --entries; + if (hfsmp->jnl || ((hfsmp->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) + entries -= 2; /* hide the journal files */ + } + VATTR_RETURN(vap, va_nchildren, entries); + } + /* + * The va_dirlinkcount is the count of real directory hard links. + * (i.e. its not the sum of the implied "." and ".." references) + */ + if (VATTR_IS_ACTIVE(vap, va_dirlinkcount)) { + VATTR_RETURN(vap, va_dirlinkcount, (uint32_t)cp->c_linkcount); + } + } else /* !VDIR */ { + data_size = VCTOF(vp, cp)->ff_size; + + VATTR_RETURN(vap, va_nlink, (u_int64_t)cp->c_linkcount); + if (VATTR_IS_ACTIVE(vap, va_data_alloc)) { + u_int64_t blocks; + +#if HFS_COMPRESSION + if (hide_size) { + VATTR_RETURN(vap, va_data_alloc, 0); + } else if (compressed) { + /* for compressed files, we report all allocated blocks as belonging to the data fork */ + blocks = cp->c_blocks; + VATTR_RETURN(vap, va_data_alloc, blocks * (u_int64_t)hfsmp->blockSize); + } + else +#endif + { + blocks = VCTOF(vp, cp)->ff_blocks; + VATTR_RETURN(vap, va_data_alloc, blocks * (u_int64_t)hfsmp->blockSize); + } + } + } + + /* conditional because 64-bit arithmetic can be expensive */ + if (VATTR_IS_ACTIVE(vap, va_total_size)) { + if (v_type == VDIR) { + VATTR_RETURN(vap, va_total_size, (cp->c_entries + 2) * AVERAGE_HFSDIRENTRY_SIZE); + } else { + u_int64_t total_size = ~0ULL; + struct cnode *rcp; +#if HFS_COMPRESSION + if (hide_size) { + /* we're hiding the size of this file, so just return 0 */ + total_size = 0; + } else if (compressed) { + if (uncompressed_size == -1) { + /* + * We failed to get the uncompressed size above, + * so we'll fall back to the standard path below + * since total_size is still -1 + */ + } else { + /* use the uncompressed size we fetched above */ + total_size = uncompressed_size; + } + } +#endif + if (total_size == ~0ULL) { + if (cp->c_datafork) { + total_size = cp->c_datafork->ff_size; + } + + if (cp->c_blocks - VTOF(vp)->ff_blocks) { + /* We deal with rsrc fork vnode iocount at the end of the function */ + error = hfs_vgetrsrc(hfsmp, vp, &rvp); + if (error) { + /* + * Note that we call hfs_vgetrsrc with error_on_unlinked + * set to FALSE. This is because we may be invoked via + * fstat() on an open-unlinked file descriptor and we must + * continue to support access to the rsrc fork until it disappears. + * The code at the end of this function will be + * responsible for releasing the iocount generated by + * hfs_vgetrsrc. This is because we can't drop the iocount + * without unlocking the cnode first. + */ + goto out; + } + + rcp = VTOC(rvp); + if (rcp && rcp->c_rsrcfork) { + total_size += rcp->c_rsrcfork->ff_size; + } + } + } + + VATTR_RETURN(vap, va_total_size, total_size); + } + } + if (VATTR_IS_ACTIVE(vap, va_total_alloc)) { + if (v_type == VDIR) { + VATTR_RETURN(vap, va_total_alloc, 0); + } else { + VATTR_RETURN(vap, va_total_alloc, (u_int64_t)cp->c_blocks * (u_int64_t)hfsmp->blockSize); + } + } + + /* + * If the VFS wants extended security data, and we know that we + * don't have any (because it never told us it was setting any) + * then we can return the supported bit and no data. If we do + * have extended security, we can just leave the bit alone and + * the VFS will use the fallback path to fetch it. 
+ */ + if (VATTR_IS_ACTIVE(vap, va_acl)) { + if ((cp->c_attr.ca_recflags & kHFSHasSecurityMask) == 0) { + vap->va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; + VATTR_SET_SUPPORTED(vap, va_acl); + } + } + + vap->va_access_time.tv_sec = cp->c_atime; + vap->va_access_time.tv_nsec = 0; + vap->va_create_time.tv_sec = cp->c_itime; + vap->va_create_time.tv_nsec = 0; + vap->va_modify_time.tv_sec = cp->c_mtime; + vap->va_modify_time.tv_nsec = 0; + vap->va_change_time.tv_sec = cp->c_ctime; + vap->va_change_time.tv_nsec = 0; + vap->va_backup_time.tv_sec = cp->c_btime; + vap->va_backup_time.tv_nsec = 0; + + /* See if we need to emit the date added field to the user */ + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + u_int32_t dateadded = hfs_get_dateadded (cp); + if (dateadded) { + vap->va_addedtime.tv_sec = dateadded; + vap->va_addedtime.tv_nsec = 0; + VATTR_SET_SUPPORTED (vap, va_addedtime); + } + } + + /* XXX is this really a good 'optimal I/O size'? */ + vap->va_iosize = hfsmp->hfs_logBlockSize; + vap->va_uid = cp->c_uid; + vap->va_gid = cp->c_gid; + vap->va_mode = cp->c_mode; + vap->va_flags = cp->c_bsdflags; + + /* + * Exporting file IDs from HFS Plus: + * + * For "normal" files the c_fileid is the same value as the + * c_cnid. But for hard link files, they are different - the + * c_cnid belongs to the active directory entry (ie the link) + * and the c_fileid is for the actual inode (ie the data file). + * + * The stat call (getattr) uses va_fileid and the Carbon APIs, + * which are hardlink-ignorant, will ask for va_linkid. + */ + vap->va_fileid = (u_int64_t)cp->c_fileid; + /* + * We need to use the origin cache for both hardlinked files + * and directories. Hardlinked directories have multiple cnids + * and parents (one per link). Hardlinked files also have their + * own parents and link IDs separate from the indirect inode number. + * If we don't use the cache, we could end up vending the wrong ID + * because the cnode will only reflect the link that was looked up most recently. + */ + if (cp->c_flag & C_HARDLINK) { + vap->va_linkid = (u_int64_t)hfs_currentcnid(cp); + vap->va_parentid = (u_int64_t)hfs_currentparent(cp, /* have_lock: */ true); + } else { + vap->va_linkid = (u_int64_t)cp->c_cnid; + vap->va_parentid = (u_int64_t)cp->c_parentcnid; + } + + vap->va_fsid = hfsmp->hfs_raw_dev; + if (VATTR_IS_ACTIVE(vap, va_devid)) { + VATTR_RETURN(vap, va_devid, hfsmp->hfs_raw_dev); + } + vap->va_filerev = 0; + vap->va_encoding = cp->c_encoding; + vap->va_rdev = (v_type == VBLK || v_type == VCHR) ? cp->c_rdev : 0; +#if HFS_COMPRESSION + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + if (hide_size) + vap->va_data_size = 0; + else if (compressed) { + if (uncompressed_size == -1) { + /* failed to get the uncompressed size above, so just return data_size */ + vap->va_data_size = data_size; + } else { + /* use the uncompressed size we fetched above */ + vap->va_data_size = uncompressed_size; + } + } else + vap->va_data_size = data_size; + VATTR_SET_SUPPORTED(vap, va_data_size); + } +#else + vap->va_data_size = data_size; + vap->va_supported |= VNODE_ATTR_va_data_size; +#endif + +#if CONFIG_PROTECT + if (VATTR_IS_ACTIVE(vap, va_dataprotect_class)) { + vap->va_dataprotect_class = cp->c_cpentry ? CP_CLASS(cp->c_cpentry->cp_pclass) : 0; + VATTR_SET_SUPPORTED(vap, va_dataprotect_class); + } +#endif + if (VATTR_IS_ACTIVE(vap, va_write_gencount)) { + if (ubc_is_mapped_writable(vp)) { + /* + * Return 0 to the caller to indicate the file may be + * changing. 
There is no need for us to increment the + * generation counter here because it gets done as part of + * page-out and also when the file is unmapped (to account + * for changes we might not have seen). + */ + vap->va_write_gencount = 0; + } else { + vap->va_write_gencount = hfs_get_gencount(cp); + } + + VATTR_SET_SUPPORTED(vap, va_write_gencount); + } + + /* Mark them all at once instead of individual VATTR_SET_SUPPORTED calls. */ + vap->va_supported |= VNODE_ATTR_va_access_time | + VNODE_ATTR_va_create_time | VNODE_ATTR_va_modify_time | + VNODE_ATTR_va_change_time| VNODE_ATTR_va_backup_time | + VNODE_ATTR_va_iosize | VNODE_ATTR_va_uid | + VNODE_ATTR_va_gid | VNODE_ATTR_va_mode | + VNODE_ATTR_va_flags |VNODE_ATTR_va_fileid | + VNODE_ATTR_va_linkid | VNODE_ATTR_va_parentid | + VNODE_ATTR_va_fsid | VNODE_ATTR_va_filerev | + VNODE_ATTR_va_encoding | VNODE_ATTR_va_rdev; + + /* If this is the root, let VFS to find out the mount name, which + * may be different from the real name. Otherwise, we need to take care + * for hardlinked files, which need to be looked up, if necessary + */ + if (VATTR_IS_ACTIVE(vap, va_name) && (cp->c_cnid != kHFSRootFolderID)) { + struct cat_desc linkdesc; + int lockflags; + int uselinkdesc = 0; + cnid_t nextlinkid = 0; + cnid_t prevlinkid = 0; + + /* Get the name for ATTR_CMN_NAME. We need to take special care for hardlinks + * here because the info. for the link ID requested by getattrlist may be + * different than what's currently in the cnode. This is because the cnode + * will be filled in with the information for the most recent link ID that went + * through namei/lookup(). If there are competing lookups for hardlinks that point + * to the same inode, one (or more) getattrlists could be vended incorrect name information. + * Also, we need to beware of open-unlinked files which could have a namelen of 0. + */ + + if ((cp->c_flag & C_HARDLINK) && + ((cp->c_desc.cd_namelen == 0) || (vap->va_linkid != cp->c_cnid))) { + /* + * If we have no name and our link ID is the raw inode number, then we may + * have an open-unlinked file. Go to the next link in this case. + */ + if ((cp->c_desc.cd_namelen == 0) && (vap->va_linkid == cp->c_fileid)) { + if ((error = hfs_lookup_siblinglinks(hfsmp, vap->va_linkid, &prevlinkid, &nextlinkid))){ + goto out; + } + } + else { + /* just use link obtained from vap above */ + nextlinkid = vap->va_linkid; + } + + /* We need to probe the catalog for the descriptor corresponding to the link ID + * stored in nextlinkid. Note that we don't know if we have the exclusive lock + * for the cnode here, so we can't just update the descriptor. Instead, + * we should just store the descriptor's value locally and then use it to pass + * out the name value as needed below. + */ + if (nextlinkid){ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + error = cat_findname(hfsmp, nextlinkid, &linkdesc); + hfs_systemfile_unlock(hfsmp, lockflags); + if (error == 0) { + uselinkdesc = 1; + } + } + } + + /* By this point, we've either patched up the name above and the c_desc + * points to the correct data, or it already did, in which case we just proceed + * by copying the name into the vap. Note that we will never set va_name to + * supported if nextlinkid is never initialized. This could happen in the degenerate + * case above involving the raw inode number, where it has no nextlinkid. In this case + * we will simply not mark the name bit as supported. 
+ */ + if (uselinkdesc) { + strlcpy(vap->va_name, (const char*) linkdesc.cd_nameptr, MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + cat_releasedesc(&linkdesc); + } + else if (cp->c_desc.cd_namelen) { + strlcpy(vap->va_name, (const char*) cp->c_desc.cd_nameptr, MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + } + } + +out: + hfs_unlock(cp); + /* + * We need to vnode_put the rsrc fork vnode only *after* we've released + * the cnode lock, since vnode_put can trigger an inactive call, which + * will go back into HFS and try to acquire a cnode lock. + */ + if (rvp) { + vnode_put (rvp); + } + + return (error); +} + +int +hfs_set_bsd_flags(struct hfsmount *hfsmp, struct cnode *cp, + u_int32_t new_bsd_flags, u_int32_t document_id, + vfs_context_t ctx, int *compression_changedp) +{ + u_int16_t *fdFlags; + + if ((new_bsd_flags & UF_TRACKED) && !(cp->c_bsdflags & UF_TRACKED)) { + struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); + + // + // we're marking this item UF_TRACKED. if the document_id is + // not set, get a new one and put it on the file. + // + if (fip->document_id == 0) { + if (document_id != 0) { + // printf("SETATTR: assigning doc-id %d to %s (ino %d)\n", document_id, vp->v_name, cp->c_desc.cd_cnid); + fip->document_id = (uint32_t)document_id; + add_fsevent(FSE_DOCID_CHANGED, ctx, + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)0, // src inode # + FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # + FSE_ARG_INT32, document_id, + FSE_ARG_DONE); + } else { + // printf("hfs: could not acquire a new document_id for %s (ino %d)\n", vp->v_name, cp->c_desc.cd_cnid); + } + } + + } else if (!(new_bsd_flags & UF_TRACKED) && (cp->c_bsdflags & UF_TRACKED)) { + // + // UF_TRACKED is being cleared so clear the document_id + // + struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); + if (fip->document_id) { + // printf("SETATTR: clearing doc-id %d from %s (ino %d)\n", fip->document_id, vp->v_name, cp->c_desc.cd_cnid); + add_fsevent(FSE_DOCID_CHANGED, ctx, + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # + FSE_ARG_INO, (ino64_t)0, // dst inode # + FSE_ARG_INT32, fip->document_id, // document id + FSE_ARG_DONE); + fip->document_id = 0; + cp->c_bsdflags &= ~UF_TRACKED; + } + } + +#if HFS_COMPRESSION + if ((cp->c_bsdflags ^ new_bsd_flags) & UF_COMPRESSED) { + /* + * the UF_COMPRESSED was toggled, so reset our cached compressed state + * but we don't want to actually do the update until we've released the cnode lock down below + * NOTE: turning the flag off doesn't actually decompress the file, so that we can + * turn off the flag and look at the "raw" file for debugging purposes + */ + *compression_changedp = 1; + } +#endif + + cp->c_bsdflags = new_bsd_flags; + cp->c_flag |= C_MODIFIED; + cp->c_touch_chgtime = TRUE; + + /* + * Mirror the UF_HIDDEN flag to the invisible bit of the Finder Info. + * + * The fdFlags for files and frFlags for folders are both 8 bytes + * into the userInfo (the first 16 bytes of the Finder Info). They + * are both 16-bit fields. 
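
Because UF_HIDDEN is mirrored into the Finder invisible flag just below, either interface observes a change made through the other. A small userspace sketch of the BSD side follows, assuming the Finder invisible bit is the 0x4000 flag in the big-endian fdFlags/frFlags word described above; it is an illustration, not part of the patch.

    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char *argv[])
    {
        struct stat st;
        const char *path = argc > 1 ? argv[1] : "/tmp/example";

        if (stat(path, &st) != 0) {
            perror("stat");
            return 1;
        }
        /* Setting UF_HIDDEN makes the setattr path OR the big-endian
         * invisible bit into byte offset 8 of the Finder info, so
         * chflags(2) and Finder stay in agreement. */
        if (chflags(path, st.st_flags | UF_HIDDEN) != 0) {
            perror("chflags");
            return 1;
        }
        return 0;
    }
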
+ */ + fdFlags = (u_int16_t *) &cp->c_finderinfo[8]; + if (new_bsd_flags & UF_HIDDEN) + *fdFlags |= OSSwapHostToBigConstInt16(kFinderInvisibleMask); + else + *fdFlags &= ~OSSwapHostToBigConstInt16(kFinderInvisibleMask); + + return 0; +} + +int +hfs_vnop_setattr(struct vnop_setattr_args *ap) +{ + struct vnode_attr *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + struct cnode *cp = NULL; + struct hfsmount *hfsmp; + kauth_cred_t cred = vfs_context_ucred(ap->a_context); + struct proc *p = vfs_context_proc(ap->a_context); + int error = 0; + uid_t nuid; + gid_t ngid; + time_t orig_ctime; + + orig_ctime = VTOC(vp)->c_ctime; + +#if HFS_COMPRESSION + int decmpfs_reset_state = 0; + /* + we call decmpfs_update_attributes even if the file is not compressed + because we want to update the incoming flags if the xattrs are invalid + */ + error = decmpfs_update_attributes(vp, vap); + if (error) + return error; +#endif + // + // if this is not a size-changing setattr and it is not just + // an atime update, then check for a snapshot. + // + if (!VATTR_IS_ACTIVE(vap, va_data_size) && !(vap->va_active == VNODE_ATTR_va_access_time)) { + nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_MOD, NSPACE_REARM_NO_ARG); + } + +#if CONFIG_PROTECT + /* + * All metadata changes should be allowed except a size-changing setattr, which + * has effects on file content and requires calling into cp_handle_vnop + * to have content protection check. + */ + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) { + return (error); + } + } +#endif /* CONFIG_PROTECT */ + + hfsmp = VTOHFS(vp); + + /* Don't allow modification of the journal. */ + if (hfs_is_journal_file(hfsmp, VTOC(vp))) { + return (EPERM); + } + + // + // Check if we'll need a document_id and if so, get it before we lock the + // the cnode to avoid any possible deadlock with the root vnode which has + // to get locked to get the document id + // + u_int32_t document_id=0; + if (VATTR_IS_ACTIVE(vap, va_flags) && (vap->va_flags & UF_TRACKED) && !(VTOC(vp)->c_bsdflags & UF_TRACKED)) { + struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&(VTOC(vp)->c_attr.ca_finderinfo) + 16); + // + // If the document_id is not set, get a new one. It will be set + // on the file down below once we hold the cnode lock. + // + if (fip->document_id == 0) { + if (hfs_generate_document_id(hfsmp, &document_id) != 0) { + document_id = 0; + } + } + } + + + /* + * File size change request. + * We are guaranteed that this is not a directory, and that + * the filesystem object is writeable. + * + * NOTE: HFS COMPRESSION depends on the data_size being set *before* the bsd flags are updated + */ + VATTR_SET_SUPPORTED(vap, va_data_size); + if (VATTR_IS_ACTIVE(vap, va_data_size)) { + if (!vnode_isreg(vp)) { + if (vnode_isdir(vp)) { + return EISDIR; + } + //otherwise return EINVAL + return EINVAL; + } + +#if HFS_COMPRESSION + /* keep the compressed state locked until we're done truncating the file */ + decmpfs_cnode *dp = VTOCMP(vp); + if (!dp) { + /* + * call hfs_lazy_init_decmpfs_cnode() to make sure that the decmpfs_cnode + * is filled in; we need a decmpfs_cnode to lock out decmpfs state changes + * on this file while it's truncating + */ + dp = hfs_lazy_init_decmpfs_cnode(VTOC(vp)); + if (!dp) { + /* failed to allocate a decmpfs_cnode */ + return ENOMEM; /* what should this be? */ + } + } + + nspace_snapshot_event(vp, orig_ctime, vap->va_data_size == 0 ? 
NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); + + decmpfs_lock_compressed_data(dp, 1); + if (hfs_file_is_compressed(VTOC(vp), 1)) { + error = decmpfs_decompress_file(vp, dp, -1/*vap->va_data_size*/, 0, 1); + if (error != 0) { + decmpfs_unlock_compressed_data(dp, 1); + return error; + } + } +#endif + + // Take truncate lock + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + // hfs_truncate will deal with the cnode lock + error = hfs_truncate(vp, vap->va_data_size, vap->va_vaflags & 0xffff, + 0, ap->a_context); + + hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT); +#if HFS_COMPRESSION + decmpfs_unlock_compressed_data(dp, 1); +#endif + if (error) + return error; + } + if (cp == NULL) { + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) + return (error); + cp = VTOC(vp); + } + + /* + * If it is just an access time update request by itself + * we know the request is from kernel level code, and we + * can delay it without being as worried about consistency. + * This change speeds up mmaps, in the rare case that they + * get caught behind a sync. + */ + + if (vap->va_active == VNODE_ATTR_va_access_time) { + cp->c_touch_acctime=TRUE; + goto out; + } + + + + /* + * Owner/group change request. + * We are guaranteed that the new owner/group is valid and legal. + */ + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + nuid = VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : (uid_t)VNOVAL; + ngid = VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : (gid_t)VNOVAL; + if (((nuid != (uid_t)VNOVAL) || (ngid != (gid_t)VNOVAL)) && + ((error = hfs_chown(vp, nuid, ngid, cred, p)) != 0)) + goto out; + + /* + * Mode change request. + * We are guaranteed that the mode value is valid and that in + * conjunction with the owner and group, this change is legal. + */ + VATTR_SET_SUPPORTED(vap, va_mode); + if (VATTR_IS_ACTIVE(vap, va_mode) && + ((error = hfs_chmod(vp, (int)vap->va_mode, cred, p)) != 0)) + goto out; + + /* + * File flags change. + * We are guaranteed that only flags allowed to change given the + * current securelevel are being changed. + */ + VATTR_SET_SUPPORTED(vap, va_flags); + if (VATTR_IS_ACTIVE(vap, va_flags)) { + if ((error = hfs_set_bsd_flags(hfsmp, cp, vap->va_flags, document_id, + ap->a_context, + &decmpfs_reset_state)) != 0) { + goto out; + } + } + + /* + * Timestamp updates. + */ + VATTR_SET_SUPPORTED(vap, va_create_time); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_modify_time); + VATTR_SET_SUPPORTED(vap, va_backup_time); + VATTR_SET_SUPPORTED(vap, va_change_time); + if (VATTR_IS_ACTIVE(vap, va_create_time) || + VATTR_IS_ACTIVE(vap, va_access_time) || + VATTR_IS_ACTIVE(vap, va_modify_time) || + VATTR_IS_ACTIVE(vap, va_backup_time)) { + if (VATTR_IS_ACTIVE(vap, va_create_time)) + cp->c_itime = vap->va_create_time.tv_sec; + if (VATTR_IS_ACTIVE(vap, va_access_time)) { + cp->c_atime = vap->va_access_time.tv_sec; + cp->c_touch_acctime = FALSE; + } + if (VATTR_IS_ACTIVE(vap, va_modify_time)) { + cp->c_mtime = vap->va_modify_time.tv_sec; + cp->c_touch_modtime = FALSE; + cp->c_touch_chgtime = TRUE; + + hfs_clear_might_be_dirty_flag(cp); + + /* + * The utimes system call can reset the modification + * time but it doesn't know about HFS create times. + * So we need to ensure that the creation time is + * always at least as old as the modification time. 
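
One visible consequence of the clamp described above: setting a modification time earlier than the creation time quietly pulls the creation time back as well. A small sketch using utimes(2); the path and the epoch timestamps are arbitrary examples.

    #include <stdio.h>
    #include <sys/time.h>

    int main(void)
    {
        /* Set both atime and mtime to the UNIX epoch.  On HFS+ the setattr
         * path notices c_mtime < c_itime and lowers the creation time to
         * match, so a file never appears to have been created after it was
         * last modified. */
        struct timeval times[2] = {
            { .tv_sec = 0, .tv_usec = 0 },   /* access time */
            { .tv_sec = 0, .tv_usec = 0 },   /* modification time */
        };

        if (utimes("/tmp/example-file", times) != 0) {
            perror("utimes");
            return 1;
        }
        return 0;
    }
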
+ */ + if ((VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) && + (cp->c_cnid != kHFSRootFolderID) && + !VATTR_IS_ACTIVE(vap, va_create_time) && + (cp->c_mtime < cp->c_itime)) { + cp->c_itime = cp->c_mtime; + } + } + if (VATTR_IS_ACTIVE(vap, va_backup_time)) + cp->c_btime = vap->va_backup_time.tv_sec; + cp->c_flag |= C_MINOR_MOD; + } + + // Set the date added time + VATTR_SET_SUPPORTED(vap, va_addedtime); + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + hfs_write_dateadded(&cp->c_attr, vap->va_addedtime.tv_sec); + cp->c_flag &= ~C_NEEDS_DATEADDED; + cp->c_touch_chgtime = true; + } + + /* + * Set name encoding. + */ + VATTR_SET_SUPPORTED(vap, va_encoding); + if (VATTR_IS_ACTIVE(vap, va_encoding)) { + cp->c_encoding = vap->va_encoding; + cp->c_flag |= C_MODIFIED; + hfs_setencodingbits(hfsmp, cp->c_encoding); + } + + if ((error = hfs_update(vp, 0)) != 0) + goto out; + +out: + if (cp) { + /* Purge origin cache for cnode, since caller now has correct link ID for it + * We purge it here since it was acquired for us during lookup, and we no longer need it. + */ + if ((cp->c_flag & C_HARDLINK) && (vnode_vtype(vp) != VDIR)){ + hfs_relorigin(cp, 0); + } + + hfs_unlock(cp); +#if HFS_COMPRESSION + if (decmpfs_reset_state) { + /* + * we've changed the UF_COMPRESSED flag, so reset the decmpfs state for this cnode + * but don't do it while holding the hfs cnode lock + */ + decmpfs_cnode *dp = VTOCMP(vp); + if (!dp) { + /* + * call hfs_lazy_init_decmpfs_cnode() to make sure that the decmpfs_cnode + * is filled in; we need a decmpfs_cnode to prevent decmpfs state changes + * on this file if it's locked + */ + dp = hfs_lazy_init_decmpfs_cnode(VTOC(vp)); + if (!dp) { + /* failed to allocate a decmpfs_cnode */ + return ENOMEM; /* what should this be? */ + } + } + decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0); + } +#endif + } + +#if CONFIG_PROTECT + VATTR_SET_SUPPORTED(vap, va_dataprotect_class); + if (!error && VATTR_IS_ACTIVE(vap, va_dataprotect_class)) + error = cp_vnode_setclass(vp, vap->va_dataprotect_class); +#endif + + return (error); +} + + +/* + * Change the mode on a file. + * cnode must be locked before calling. + */ +int +hfs_chmod(struct vnode *vp, int mode, __unused kauth_cred_t cred, __unused struct proc *p) +{ + register struct cnode *cp = VTOC(vp); + + if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) + return (0); + + // Don't allow modification of the journal or journal_info_block + if (hfs_is_journal_file(VTOHFS(vp), cp)) { + return EPERM; + } + +#if OVERRIDE_UNKNOWN_PERMISSIONS + if (((unsigned int)vfs_flags(VTOVFS(vp))) & MNT_UNKNOWNPERMISSIONS) { + return (0); + }; +#endif + + mode_t new_mode = (cp->c_mode & ~ALLPERMS) | (mode & ALLPERMS); + if (new_mode != cp->c_mode) { + cp->c_mode = new_mode; + cp->c_flag |= C_MINOR_MOD; + } + cp->c_touch_chgtime = TRUE; + return (0); +} + + +int +hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags) +{ + struct cnode *cp = VTOC(vp); + int retval = 0; + int is_member; + + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket, fifo, or a block or + * character device resident on the file system. + */ + switch (vnode_vtype(vp)) { + case VDIR: + case VLNK: + case VREG: + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) + return (EROFS); + break; + default: + break; + } + + /* If immutable bit set, nobody gets to write it. */ + if (considerFlags && (cp->c_bsdflags & IMMUTABLE)) + return (EPERM); + + /* Otherwise, user id 0 always gets access. 
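
The write-access check in this function is the classic Unix cascade: superuser first, then the owner bit, then the group bit, then "other", with only the first matching class consulted. A condensed userspace sketch of that decision order follows (supplementary-group membership and the immutable/read-only checks are omitted for brevity); it is an illustration, not the kernel code.

    #include <stdbool.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Same ordering as the checks in this function; the first matching
     * class is the only one consulted. */
    static bool may_write(const struct stat *st, uid_t uid, gid_t gid)
    {
        if (uid == 0)
            return true;                          /* user id 0 always gets access */
        if (uid == st->st_uid)
            return (st->st_mode & S_IWUSR) != 0;  /* owner class */
        if (gid == st->st_gid)
            return (st->st_mode & S_IWGRP) != 0;  /* group class */
        return (st->st_mode & S_IWOTH) != 0;      /* everyone else */
    }

    int main(int argc, char *argv[])
    {
        struct stat st;

        if (argc < 2 || stat(argv[1], &st) != 0) {
            perror("stat");
            return 1;
        }
        printf("writable by this process: %s\n",
               may_write(&st, getuid(), getgid()) ? "yes" : "no");
        return 0;
    }
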
*/ + if (!suser(cred, NULL)) + return (0); + + /* Otherwise, check the owner. */ + if ((retval = hfs_owner_rights(VTOHFS(vp), cp->c_uid, cred, p, false)) == 0) + return ((cp->c_mode & S_IWUSR) == S_IWUSR ? 0 : EACCES); + + /* Otherwise, check the groups. */ + if (kauth_cred_ismember_gid(cred, cp->c_gid, &is_member) == 0 && is_member) { + return ((cp->c_mode & S_IWGRP) == S_IWGRP ? 0 : EACCES); + } + + /* Otherwise, check everyone else. */ + return ((cp->c_mode & S_IWOTH) == S_IWOTH ? 0 : EACCES); +} + + +/* + * Perform chown operation on cnode cp; + * code must be locked prior to call. + */ +int +#if !QUOTA +hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, __unused kauth_cred_t cred, + __unused struct proc *p) +#else +hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, + __unused struct proc *p) +#endif +{ + register struct cnode *cp = VTOC(vp); + uid_t ouid; + gid_t ogid; +#if QUOTA + int error = 0; + register int i; + int64_t change; +#endif /* QUOTA */ + + if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) + return (ENOTSUP); + + if (((unsigned int)vfs_flags(VTOVFS(vp))) & MNT_UNKNOWNPERMISSIONS) + return (0); + + if (uid == (uid_t)VNOVAL) + uid = cp->c_uid; + if (gid == (gid_t)VNOVAL) + gid = cp->c_gid; + +#if 0 /* we are guaranteed that this is already the case */ + /* + * If we don't own the file, are trying to change the owner + * of the file, or are not a member of the target group, + * the caller must be superuser or the call fails. + */ + if ((kauth_cred_getuid(cred) != cp->c_uid || uid != cp->c_uid || + (gid != cp->c_gid && + (kauth_cred_ismember_gid(cred, gid, &is_member) || !is_member))) && + (error = suser(cred, 0))) + return (error); +#endif + + ogid = cp->c_gid; + ouid = cp->c_uid; + + if (ouid == uid && ogid == gid) { + // No change, just set change time + cp->c_touch_chgtime = TRUE; + return 0; + } + +#if QUOTA + if ((error = hfs_getinoquota(cp))) + return (error); + if (ouid == uid) { + dqrele(cp->c_dquot[USRQUOTA]); + cp->c_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(cp->c_dquot[GRPQUOTA]); + cp->c_dquot[GRPQUOTA] = NODQUOT; + } + + /* + * Eventually need to account for (fake) a block per directory + * if (vnode_isdir(vp)) + * change = VTOHFS(vp)->blockSize; + * else + */ + + change = (int64_t)(cp->c_blocks) * (int64_t)VTOVCB(vp)->blockSize; + (void) hfs_chkdq(cp, -change, cred, CHOWN); + (void) hfs_chkiq(cp, -1, cred, CHOWN); + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(cp->c_dquot[i]); + cp->c_dquot[i] = NODQUOT; + } +#endif /* QUOTA */ + cp->c_gid = gid; + cp->c_uid = uid; +#if QUOTA + if ((error = hfs_getinoquota(cp)) == 0) { + if (ouid == uid) { + dqrele(cp->c_dquot[USRQUOTA]); + cp->c_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(cp->c_dquot[GRPQUOTA]); + cp->c_dquot[GRPQUOTA] = NODQUOT; + } + if ((error = hfs_chkdq(cp, change, cred, CHOWN)) == 0) { + if ((error = hfs_chkiq(cp, 1, cred, CHOWN)) == 0) + goto good; + else + (void) hfs_chkdq(cp, -change, cred, CHOWN|FORCE); + } + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(cp->c_dquot[i]); + cp->c_dquot[i] = NODQUOT; + } + } + cp->c_gid = ogid; + cp->c_uid = ouid; + if (hfs_getinoquota(cp) == 0) { + if (ouid == uid) { + dqrele(cp->c_dquot[USRQUOTA]); + cp->c_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(cp->c_dquot[GRPQUOTA]); + cp->c_dquot[GRPQUOTA] = NODQUOT; + } + (void) hfs_chkdq(cp, change, cred, FORCE|CHOWN); + (void) hfs_chkiq(cp, 1, cred, FORCE|CHOWN); + (void) hfs_getinoquota(cp); + } + return (error); +good: + if (hfs_getinoquota(cp)) + 
panic("hfs_chown: lost quota"); +#endif /* QUOTA */ + + /* + * Without quotas, we could probably make this a minor + * modification. + */ + cp->c_flag |= C_MODIFIED; + + /* + According to the SUSv3 Standard, chown() shall mark + for update the st_ctime field of the file. + (No exceptions mentioned) + */ + cp->c_touch_chgtime = TRUE; + return (0); +} + +#if HFS_COMPRESSION +/* + * Flush the resource fork if it exists. vp is the data fork and has + * an iocount. + */ +static int hfs_flush_rsrc(vnode_t vp, vfs_context_t ctx) +{ + cnode_t *cp = VTOC(vp); + + hfs_lock(cp, HFS_SHARED_LOCK, 0); + + vnode_t rvp = cp->c_rsrc_vp; + + if (!rvp) { + hfs_unlock(cp); + return 0; + } + + int vid = vnode_vid(rvp); + + hfs_unlock(cp); + + int error = vnode_getwithvid(rvp, vid); + + if (error) + return error == ENOENT ? 0 : error; + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, 0); + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + hfs_filedone(rvp, ctx, HFS_FILE_DONE_NO_SYNC); + hfs_unlock(cp); + hfs_unlock_truncate(cp, 0); + + error = ubc_msync(rvp, 0, ubc_getsize(rvp), NULL, + UBC_PUSHALL | UBC_SYNC); + + vnode_put(rvp); + + return error; +} +#endif // HFS_COMPRESSION + + +/* Helper Functions for exchangedata(2) */ + +/* + * hfs_exchangedata_getxattr + * arguments: + * vp: vnode to extract the EA for + * name_selector: the index into the array of EA name entries. + * buffer: address for output buffer to store the output EA + * NOTE: This function will allocate the buffer, it is the caller's responsibility to free it. + * xattr_size: output argument; will return the size of the EA, to correspond with the buffer. + * + * Return: 0 on success. + * errno on error. If we return any error, the buffer is guaranteed to be NULL. + * + * Assumes CNODE lock held on cnode for 'vp' + */ +static +int hfs_exchangedata_getxattr (struct vnode *vp, uint32_t name_selector, void **buffer, size_t *xattr_size) { + void *xattr_rawdata = NULL; + void *extracted_xattr = NULL; + uio_t uio; + size_t memsize = MAX_EXCHANGE_EA_SIZE; + size_t attrsize; + int error = 0; + struct hfsmount *hfsmp = NULL; + + /* Sanity check inputs */ + if (name_selector > MAX_NUM_XATTR_NAMES) { + return EINVAL; + } + + if (buffer == NULL || xattr_size == NULL) { + return EINVAL; + } + + hfsmp = VTOHFS(vp); + + //allocate 4k memory to hold the EA. We don't use this for "large" EAs, and the default + //EA B-tree size should produce inline attributes of size < 4K + xattr_rawdata = hfs_malloc (MAX_EXCHANGE_EA_SIZE); + if (!xattr_rawdata) { + return ENOMEM; + } + + //now create the UIO + uio = uio_create (1, 0, UIO_SYSSPACE, UIO_READ); + if (!uio) { + hfs_free (xattr_rawdata, memsize); + return ENOMEM; + } + uio_addiov(uio, CAST_USER_ADDR_T(xattr_rawdata), memsize); + attrsize = memsize; + + struct vnop_getxattr_args vga = { + .a_uio = uio, + .a_name = XATTR_NAMES[name_selector], + .a_size = &attrsize + }; + + // this takes care of grabbing the systemfile locks for us. + error = hfs_getxattr_internal (VTOC(vp), &vga, hfsmp, 0); + + if (error) { + /* + * We could have gotten a variety of errors back from the XATTR tree: + * is it too big? (bigger than 4k?) == ERANGE + * was the EA not found? 
== ENOATTR + */ + uio_free(uio); + hfs_free (xattr_rawdata, memsize); + return error; + } + + //free the UIO + uio_free(uio); + + //upon success, a_size/attrsize now contains the actua/exported EA size + extracted_xattr = hfs_malloc (attrsize); + memcpy (extracted_xattr, xattr_rawdata, attrsize); + hfs_free (xattr_rawdata, memsize); + + *xattr_size = attrsize; + *buffer = extracted_xattr; + + return error; +} + + +/* + * hfs_exchangedata_setxattr + * + * Note: This function takes fileIDs in as inputs, because exchangedata does + * swizzly things with the two cnodes (See big block comment in hfs_vnop_exchange) + * so we operate with FileIDs more or less directly on the XATTR b-tree. + * + * arguments: + * hfsmp: the mount we're working on + * fileid: the fileID of the EA to store into the tree. + * name_selector: selector into the EA name array. + * buffer: pointer to the memory of the EA to write. + * xattr_size: size of the EA to write. + * + * Returns 0 on success + * errno on failure + * + * Assumes that a transaction has already begun when this is called + */ + +static +int hfs_exchangedata_setxattr (struct hfsmount *hfsmp, uint32_t fileid, + uint32_t name_selector, void *buffer, size_t xattr_size) { + + int error = 0; + + + /* Sanity check arguments */ + if (name_selector > MAX_NUM_XATTR_NAMES) { + return EINVAL; + } + + if (buffer == NULL || xattr_size == 0 || fileid < kHFSFirstUserCatalogNodeID ) { + return EINVAL; + } + + // is the size too big? + if (xattr_size > hfsmp->hfs_max_inline_attrsize) { + return EINVAL; + } + + /* setup the arguments to setxattr*/ + struct vnop_setxattr_args vsa = { + .a_desc = NULL, + .a_vp = NULL, + .a_name = XATTR_NAMES[name_selector], + .a_uio = NULL, // we use the data_ptr argument to setxattr_internal instead + .a_options = 0, + .a_context = NULL // no context needed, only done from within exchangedata + }; + + /* + * Since we must be in a transaction to guard the exchangedata operation, this will start + * a nested transaction within the exchangedata one. + */ + error = hfs_setxattr_internal (NULL, (caddr_t) buffer, xattr_size, &vsa, hfsmp, fileid); + + return error; + +} + +/* + * hfs_vnop_exchange: + * + * Inputs: + * 'from' vnode/cnode + * 'to' vnode/cnode + * options flag bits + * vfs_context + * + * Discussion: + * hfs_vnop_exchange is used to service the exchangedata(2) system call. + * Per the requirements of that system call, this function "swaps" some + * of the information that lives in one catalog record for some that + * lives in another. Note that not everything is swapped; in particular, + * the extent information stored in each cnode is kept local to that + * cnode. This allows existing file descriptor references to continue + * to operate on the same content, regardless of the location in the + * namespace that the file may have moved to. See inline comments + * in the function for more information. 
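
Since this vnop services the exchangedata(2) system call, the behaviour described above is reachable directly from userspace. A minimal sketch of a caller follows; the paths are placeholders and, on newer systems, the call itself is deprecated, but it is the interface implemented here.

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/attr.h>   /* FSOPT_NOFOLLOW */

    int main(int argc, char *argv[])
    {
        if (argc != 3) {
            fprintf(stderr, "usage: %s file1 file2\n", argv[0]);
            return 2;
        }
        /* Swap the catalog information of the two files while existing
         * file descriptors keep the content they already had open,
         * exactly as the comment above describes. */
        if (exchangedata(argv[1], argv[2], FSOPT_NOFOLLOW) != 0) {
            perror("exchangedata");
            return 1;
        }
        return 0;
    }
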
+ */ +int +hfs_vnop_exchange(struct vnop_exchange_args *ap) +{ + struct vnode *from_vp = ap->a_fvp; + struct vnode *to_vp = ap->a_tvp; + struct cnode *from_cp; + struct cnode *to_cp; + struct hfsmount *hfsmp; + struct cat_desc tempdesc; + struct cat_attr tempattr; + const unsigned char *from_nameptr; + const unsigned char *to_nameptr; + char from_iname[32]; + char to_iname[32]; + uint32_t to_flag_special; + uint32_t from_flag_special; + + uint16_t to_recflags_special; + uint16_t from_recflags_special; + + cnid_t from_parid; + cnid_t to_parid; + int lockflags; + int error = 0, started_tr = 0, got_cookie = 0; + cat_cookie_t cookie; + time_t orig_from_ctime, orig_to_ctime; + bool have_cnode_locks = false, have_from_trunc_lock = false, have_to_trunc_lock = false; + + /* For the quarantine EA */ + void *from_xattr = NULL; + void *to_xattr = NULL; + size_t from_attrsize = 0; + size_t to_attrsize = 0; + + + /* + * VFS does the following checks: + * 1. Validate that both are files. + * 2. Validate that both are on the same mount. + * 3. Validate that they're not the same vnode. + */ + + from_cp = VTOC(from_vp); + to_cp = VTOC(to_vp); + hfsmp = VTOHFS(from_vp); + + orig_from_ctime = from_cp->c_ctime; + orig_to_ctime = to_cp->c_ctime; + +#if CONFIG_PROTECT + /* + * Do not allow exchangedata/F_MOVEDATAEXTENTS on data-protected filesystems + * because the EAs will not be swapped. As a result, the persistent keys would not + * match and the files will be garbage. + */ + if (cp_fs_protected (vnode_mount(from_vp))) { + return EINVAL; + } +#endif + +#if HFS_COMPRESSION + if (!ISSET(ap->a_options, FSOPT_EXCHANGE_DATA_ONLY)) { + if ( hfs_file_is_compressed(from_cp, 0) ) { + if ( 0 != ( error = decmpfs_decompress_file(from_vp, VTOCMP(from_vp), -1, 0, 1) ) ) { + return error; + } + } + + if ( hfs_file_is_compressed(to_cp, 0) ) { + if ( 0 != ( error = decmpfs_decompress_file(to_vp, VTOCMP(to_vp), -1, 0, 1) ) ) { + return error; + } + } + } +#endif // HFS_COMPRESSION + + // Resource forks cannot be exchanged. + if (VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp)) + return EINVAL; + + /* + * Normally, we want to notify the user handlers about the event, + * except if it's a handler driving the event. + */ + if ((ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) == 0) { + nspace_snapshot_event(from_vp, orig_from_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + nspace_snapshot_event(to_vp, orig_to_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + } else { + /* + * This is currently used by mtmd so we should tidy up the + * file now because the data won't be used again in the + * destination file. + */ + hfs_lock_truncate(from_cp, HFS_EXCLUSIVE_LOCK, 0); + hfs_lock_always(from_cp, HFS_EXCLUSIVE_LOCK); + hfs_filedone(from_vp, ap->a_context, HFS_FILE_DONE_NO_SYNC); + hfs_unlock(from_cp); + hfs_unlock_truncate(from_cp, 0); + + // Flush all the data from the source file + error = ubc_msync(from_vp, 0, ubc_getsize(from_vp), NULL, + UBC_PUSHALL | UBC_SYNC); + if (error) + goto exit; + +#if HFS_COMPRESSION + /* + * If this is a compressed file, we need to do the same for + * the resource fork. + */ + if (ISSET(from_cp->c_bsdflags, UF_COMPRESSED)) { + error = hfs_flush_rsrc(from_vp, ap->a_context); + if (error) + goto exit; + } +#endif + + /* + * We're doing a data-swap so we need to take the truncate + * lock exclusively. We need an exclusive lock because we + * will be completely truncating the source file and we must + * make sure nobody else sneaks in and trys to issue I/O + * whilst we don't have the cnode lock. 
+ * + * After taking the truncate lock we do a quick check to + * verify there are no other references (including mmap + * references), but we must remember that this does not stop + * anybody coming in later and taking a reference. We will + * have the truncate lock exclusively so that will prevent + * them from issuing any I/O. + */ + + if (to_cp < from_cp) { + hfs_lock_truncate(to_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + have_to_trunc_lock = true; + } + + hfs_lock_truncate(from_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + have_from_trunc_lock = true; + + /* + * Do an early check to verify the source is not in use by + * anyone. We should be called from an FD opened as F_EVTONLY + * so that doesn't count as a reference. + */ + if (vnode_isinuse(from_vp, 0)) { + error = EBUSY; + goto exit; + } + + if (to_cp >= from_cp) { + hfs_lock_truncate(to_cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + have_to_trunc_lock = true; + } + } + + if ((error = hfs_lockpair(from_cp, to_cp, HFS_EXCLUSIVE_LOCK))) + goto exit; + have_cnode_locks = true; + + // Don't allow modification of the journal or journal_info_block + if (hfs_is_journal_file(hfsmp, from_cp) || + hfs_is_journal_file(hfsmp, to_cp)) { + error = EPERM; + goto exit; + } + + /* + * If doing a data move, then call the underlying function. + */ + if (ISSET(ap->a_options, FSOPT_EXCHANGE_DATA_ONLY)) { +#if HFS_COMPRESSION + if (ISSET(from_cp->c_bsdflags, UF_COMPRESSED)) { + error = hfs_move_compressed(from_cp, to_cp); + goto exit; + } +#endif + + error = hfs_move_data(from_cp, to_cp, 0); + goto exit; + } + + /* + * If we're doing a normal exchangedata, then get the source/dst quarantine + * EAs as needed. We do it here before we start the transaction. + */ + + //get the EA for the 'from' vnode if it exists. + error = hfs_exchangedata_getxattr (from_vp, quarantine, &from_xattr, &from_attrsize); + if (error) { + if (error == ENOATTR) { + //it's OK for the quarantine EA to not exist + error = 0; + } + else { + goto exit; + } + } + + + //get the EA from the 'to' vnode if it exists + error = hfs_exchangedata_getxattr (to_vp, quarantine, &to_xattr, &to_attrsize); + if (error) { + if (error == ENOATTR) { + //it's OK for the quarantine EA to not exist + error = 0; + } + else { + goto exit; + } + } + + + /* Start a transaction; we have to do all of this atomically */ + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto exit; + } + started_tr = 1; + + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_EXCHANGE, &cookie, vfs_context_proc(ap->a_context)))) { + goto exit; + } + got_cookie = 1; + + /* The backend code always tries to delete the virtual + * extent id for exchanging files so we need to lock + * the extents b-tree. + */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + /* Account for the location of the catalog objects. 
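
The to_cp < from_cp comparison earlier in this function is the usual address-ordered locking idiom: when two locks of the same class must both be held, always acquire the lower-addressed object first so that two racing exchanges cannot deadlock against each other. A generic sketch of the pattern follows, with pthread mutexes standing in for the cnode truncate locks; it is not taken from the patch.

    #include <pthread.h>
    #include <stdio.h>

    /* Acquire two peer locks in a globally consistent order: the
     * lower-addressed object first.  Assumes a and b are distinct. */
    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if (a < b) {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        } else {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    int main(void)
    {
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        /* Regardless of argument order, the locks are taken in the same
         * global (address) order, so callers cannot deadlock each other. */
        lock_pair(&m1, &m2);
        pthread_mutex_unlock(&m2);
        pthread_mutex_unlock(&m1);

        lock_pair(&m2, &m1);
        pthread_mutex_unlock(&m2);
        pthread_mutex_unlock(&m1);

        printf("done\n");
        return 0;
    }
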
*/ + if (from_cp->c_flag & C_HARDLINK) { + MAKE_INODE_NAME(from_iname, sizeof(from_iname), + from_cp->c_attr.ca_linkref); + from_nameptr = (unsigned char *)from_iname; + from_parid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + from_cp->c_hint = 0; + } else { + from_nameptr = from_cp->c_desc.cd_nameptr; + from_parid = from_cp->c_parentcnid; + } + if (to_cp->c_flag & C_HARDLINK) { + MAKE_INODE_NAME(to_iname, sizeof(to_iname), + to_cp->c_attr.ca_linkref); + to_nameptr = (unsigned char *)to_iname; + to_parid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + to_cp->c_hint = 0; + } else { + to_nameptr = to_cp->c_desc.cd_nameptr; + to_parid = to_cp->c_parentcnid; + } + + /* + * ExchangeFileIDs swaps the on-disk, or in-BTree extent information + * attached to two different file IDs. It also swaps the extent + * information that may live in the extents-overflow B-Tree. + * + * We do this in a transaction as this may require a lot of B-Tree nodes + * to do completely, particularly if one of the files in question + * has a lot of extents. + * + * For example, assume "file1" has fileID 50, and "file2" has fileID 52. + * For the on-disk records, which are assumed to be synced, we will + * first swap the resident inline-8 extents as part of the catalog records. + * Then we will swap any extents overflow records for each file. + * + * When ExchangeFileIDs returns successfully, "file1" will have fileID 52, + * and "file2" will have fileID 50. However, note that this is only + * approximately half of the work that exchangedata(2) will need to + * accomplish. In other words, we swap "too much" of the information + * because if we only called ExchangeFileIDs, both the fileID and extent + * information would be the invariants of this operation. We don't + * actually want that; we want to conclude with "file1" having + * file ID 50, and "file2" having fileID 52. + * + * The remainder of hfs_vnop_exchange will swap the file ID and other cnode + * data back to the proper ownership, while still allowing the cnode to remain + * pointing at the same set of extents that it did originally. + */ + error = ExchangeFileIDs(hfsmp, from_nameptr, to_nameptr, from_parid, + to_parid, from_cp->c_hint, to_cp->c_hint); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error != E_NONE) { + error = MacToVFSError(error); + goto exit; + } + + /* + * Now, we have to swap the quarantine EA. + * + * Ordinarily, we would not have to swap/exchange any extended attributes, + * since they are keyed by the file ID, and this function is supposed + * to manipulate the main data stream/fork only. + * + * However, we want the quarantine EA to follow the file content. + */ + + int from_xattr_status = 0; + if (from_xattr) { + /* + * Caution! + * We've crossed a point of no return here, because if we + * have successfully swapped the file content above, we need to continue here + * to swap the rest of the cnode content, which is not subject to failure. + * Failing the whole function because the xattr swap will result in perceived + * data loss to the caller, so we swallow the error case here. + */ + from_xattr_status = hfs_removexattr_by_id (hfsmp, from_cp->c_fileid, XATTR_NAMES[quarantine]); + if (from_xattr_status == 0) { + int xattr_lockflags; + int remaining_eas; + /* + * Check to see if we need to remove the xattr bit from the catalog record flags while + * 'from_cp' still tracks with its original file ID. 
Once the cnodes' contents are swapped + * and they are ready to be re-hashed, we will OR in the bit if we know that we moved the + * EA to the counterpart. + */ + xattr_lockflags = hfs_systemfile_lock (hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + remaining_eas = file_attribute_exist (hfsmp, from_cp->c_fileid); + if (remaining_eas == 0) { + from_cp->c_attr.ca_recflags &= ~kHFSHasAttributesMask; + //the cnode will be pushed out to disk LATER on. + } + hfs_systemfile_unlock (hfsmp, xattr_lockflags); + + } + } + + //and the same for to_xattr + if (to_xattr) { + int xattr_status = hfs_removexattr_by_id (hfsmp, to_cp->c_fileid, XATTR_NAMES[quarantine]); + + if (xattr_status == 0) { + int xattr_lockflags; + int remaining_eas; + /* + * Check to see if we need to remove the xattr bit from the catalog record flags while + * 'to_cp' still tracks with its original file ID. Once the cnodes' contents are swapped + * and they are ready to be re-hashed, we will OR in the bit if we know that we moved the + * EA to the counterpart. + */ + xattr_lockflags = hfs_systemfile_lock (hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + remaining_eas = file_attribute_exist (hfsmp, from_cp->c_fileid); + if (remaining_eas == 0) { + to_cp->c_attr.ca_recflags &= ~kHFSHasAttributesMask; + //the cnode will be pushed out to disk LATER on. + } + hfs_systemfile_unlock (hfsmp, xattr_lockflags); + + /* Now move the EA to the counterparty fileID. We piggyback on the larger transaction here */ + hfs_exchangedata_setxattr (hfsmp, from_cp->c_fileid, quarantine, to_xattr, to_attrsize); + } + } + + if (from_xattr && from_xattr_status == 0) { + /* + * if the from EA got removed properly, then attach it to the 'to' file. We do it at this point + * to ensure that it got removed properly above before re-setting it again. + */ + hfs_exchangedata_setxattr (hfsmp, to_cp->c_fileid, quarantine, from_xattr, from_attrsize); + } + + + /* Purge the vnodes from the name cache */ + if (from_vp) + cache_purge(from_vp); + if (to_vp) + cache_purge(to_vp); + + /* Bump both source and destination write counts before any swaps. */ + { + hfs_incr_gencount (from_cp); + hfs_incr_gencount (to_cp); + } + + /* Save a copy of "from" attributes before swapping. */ + bcopy(&from_cp->c_desc, &tempdesc, sizeof(struct cat_desc)); + bcopy(&from_cp->c_attr, &tempattr, sizeof(struct cat_attr)); + + /* Save whether or not each cnode is a hardlink or has EAs */ + from_flag_special = from_cp->c_flag & (C_HARDLINK | C_HASXATTRS); + from_recflags_special = (from_cp->c_attr.ca_recflags & kHFSHasAttributesMask); + + to_flag_special = to_cp->c_flag & (C_HARDLINK | C_HASXATTRS); + to_recflags_special = (to_cp->c_attr.ca_recflags & kHFSHasAttributesMask); + + /* Drop the special bits from each cnode */ + from_cp->c_flag &= ~(C_HARDLINK | C_HASXATTRS); + to_cp->c_flag &= ~(C_HARDLINK | C_HASXATTRS); + from_cp->c_attr.ca_recflags &= ~(kHFSHasAttributesMask); + to_cp->c_attr.ca_recflags &= ~(kHFSHasAttributesMask); + + /* + * Now complete the in-memory portion of the copy. + * + * ExchangeFileIDs swaps the on-disk records involved. We complete the + * operation by swapping the in-memory contents of the two files here. + * We swap the cnode descriptors, which contain name, BSD attributes, + * timestamps, etc, about the file. + * + * NOTE: We do *NOT* swap the fileforks of the two cnodes. We have + * already swapped the on-disk extent information. 
As long as we swap the + * IDs, the in-line resident 8 extents that live in the filefork data + * structure will point to the right data for the new file ID if we leave + * them alone. + * + * As a result, any file descriptor that points to a particular + * vnode (even though it should change names), will continue + * to point to the same content. + */ + + /* Copy the "to" -> "from" cnode */ + bcopy(&to_cp->c_desc, &from_cp->c_desc, sizeof(struct cat_desc)); + + from_cp->c_hint = 0; + /* + * If 'to' was a hardlink, then we copied over its link ID/CNID/(namespace ID) + * when we bcopy'd the descriptor above. However, the cnode attributes + * are not bcopied. As a result, make sure to swap the file IDs of each item. + * + * Further, other hardlink attributes must be moved along in this swap: + * the linkcount, the linkref, and the firstlink all need to move + * along with the file IDs. See note below regarding the flags and + * what moves vs. what does not. + * + * For Reference: + * linkcount == total # of hardlinks. + * linkref == the indirect inode pointer. + * firstlink == the first hardlink in the chain (written to the raw inode). + * These three are tied to the fileID and must move along with the rest of the data. + */ + from_cp->c_fileid = to_cp->c_attr.ca_fileid; + + from_cp->c_itime = to_cp->c_itime; + from_cp->c_btime = to_cp->c_btime; + from_cp->c_atime = to_cp->c_atime; + from_cp->c_ctime = to_cp->c_ctime; + from_cp->c_gid = to_cp->c_gid; + from_cp->c_uid = to_cp->c_uid; + from_cp->c_bsdflags = to_cp->c_bsdflags; + from_cp->c_mode = to_cp->c_mode; + from_cp->c_linkcount = to_cp->c_linkcount; + from_cp->c_attr.ca_linkref = to_cp->c_attr.ca_linkref; + from_cp->c_attr.ca_firstlink = to_cp->c_attr.ca_firstlink; + + /* + * The cnode flags need to stay with the cnode and not get transferred + * over along with everything else because they describe the content; they are + * not attributes that reflect changes specific to the file ID. In general, + * fields that are tied to the file ID are the ones that will move. + * + * This reflects the fact that the file may have borrowed blocks, dirty metadata, + * or other extents, which may not yet have been written to the catalog. If + * they were, they would have been transferred above in the ExchangeFileIDs call above... + * + * The flags that are special are: + * C_HARDLINK, C_HASXATTRS + * + * and the c_attr recflag: + * kHFSHasAttributesMask + * + * These flags move with the item and file ID in the namespace since their + * state is tied to that of the file ID. + * + * So to transfer the flags, we have to take the following steps + * 1) Store in a localvar whether or not the special bits are set. + * 2) Drop the special bits from the current flags + * 3) swap the special flag bits to their destination + */ + from_cp->c_flag |= to_flag_special | C_MODIFIED; + from_cp->c_attr.ca_recflags = to_cp->c_attr.ca_recflags; + from_cp->c_attr.ca_recflags |= to_recflags_special; + if (from_xattr) { + /* + * NOTE: + * This is counter-intuitive and part of the complexity of exchangedata. + * if 'from_cp' originally had a quarantine EA, then ensure that the cnode + * pointed to by 'from_cp' CONTINUES to keep the "has EAs" bit. This is because + * the cnode is about to be re-hashed with a new ID, but the file CONTENT + * (i.e. the file fork) stayed put. And we want the quarantine EA to follow + * the content. The check above is correct. 
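
The quarantine attribute being carried across the swap is an ordinary extended attribute, so its movement can be observed from userspace after the exchange. A small sketch using getxattr(2) follows; the attribute name shown is the usual quarantine key, but treat it as an example rather than a statement of what XATTR_NAMES[quarantine] actually contains.

    #include <stdio.h>
    #include <sys/xattr.h>

    int main(int argc, char *argv[])
    {
        char buf[4096];   /* the EA is expected to be small (inline, < 4 KiB) */
        ssize_t len;

        /* After exchangedata(2), the quarantine EA should be found on the
         * file whose *content* it originally described, not on the file
         * that merely kept the old name. */
        len = getxattr(argc > 1 ? argv[1] : "/tmp/example",
                       "com.apple.quarantine", buf, sizeof(buf), 0, 0);
        if (len < 0) {
            perror("getxattr");
            return 1;
        }
        printf("quarantine EA is %zd bytes\n", len);
        return 0;
    }
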
+ */ + from_cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + } + + bcopy(to_cp->c_finderinfo, from_cp->c_finderinfo, 32); + + + /* Copy the "from" -> "to" cnode */ + bcopy(&tempdesc, &to_cp->c_desc, sizeof(struct cat_desc)); + to_cp->c_hint = 0; + /* + * Pull the file ID from the tempattr we copied above. We can't assume + * it is the same as the CNID. + */ + to_cp->c_fileid = tempattr.ca_fileid; + to_cp->c_itime = tempattr.ca_itime; + to_cp->c_btime = tempattr.ca_btime; + to_cp->c_atime = tempattr.ca_atime; + to_cp->c_ctime = tempattr.ca_ctime; + to_cp->c_gid = tempattr.ca_gid; + to_cp->c_uid = tempattr.ca_uid; + to_cp->c_bsdflags = tempattr.ca_flags; + to_cp->c_mode = tempattr.ca_mode; + to_cp->c_linkcount = tempattr.ca_linkcount; + to_cp->c_attr.ca_linkref = tempattr.ca_linkref; + to_cp->c_attr.ca_firstlink = tempattr.ca_firstlink; + + /* + * Only OR in the "from" flags into our cnode flags below. + * Leave the rest of the flags alone. + */ + to_cp->c_flag |= from_flag_special | C_MODIFIED; + to_cp->c_attr.ca_recflags = tempattr.ca_recflags; + to_cp->c_attr.ca_recflags |= from_recflags_special; + + if (to_xattr) { + /* + * NOTE: + * This is counter-intuitive and part of the complexity of exchangedata. + * if 'to_cp' originally had a quarantine EA, then ensure that the cnode + * pointed to by 'to_cp' CONTINUES to keep the "has EAs" bit. This is because + * the cnode is about to be re-hashed with a new ID, but the file CONTENT + * (i.e. the file fork) stayed put. And we want the quarantine EA to follow + * the content. The check above is correct. + */ + to_cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + } + + bcopy(tempattr.ca_finderinfo, to_cp->c_finderinfo, 32); + + + /* Rehash the cnodes using their new file IDs */ + hfs_chash_rehash(hfsmp, from_cp, to_cp); + + /* + * When a file moves out of "Cleanup At Startup" + * we can drop its NODUMP status. + */ + if ((from_cp->c_bsdflags & UF_NODUMP) && + (from_cp->c_parentcnid != to_cp->c_parentcnid)) { + from_cp->c_bsdflags &= ~UF_NODUMP; + from_cp->c_touch_chgtime = TRUE; + } + if ((to_cp->c_bsdflags & UF_NODUMP) && + (to_cp->c_parentcnid != from_cp->c_parentcnid)) { + to_cp->c_bsdflags &= ~UF_NODUMP; + to_cp->c_touch_chgtime = TRUE; + } + +exit: + if (got_cookie) { + cat_postflight(hfsmp, &cookie, vfs_context_proc(ap->a_context)); + } + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + if (have_cnode_locks) + hfs_unlockpair(from_cp, to_cp); + + if (have_from_trunc_lock) + hfs_unlock_truncate(from_cp, 0); + + if (have_to_trunc_lock) + hfs_unlock_truncate(to_cp, 0); + + /* Free the memory used by the EAs */ + if (from_xattr) { + hfs_free (from_xattr, from_attrsize); + from_xattr = NULL; + } + + if (to_xattr) { + hfs_free (to_xattr, to_attrsize); + to_xattr = NULL; + } + + return (error); +} + +#if HFS_COMPRESSION +/* + * This function is used specifically for the case when a namespace + * handler is trying to steal data before it's deleted. Note that we + * don't bother deleting the xattr from the source because it will get + * deleted a short time later anyway. + * + * cnodes must be locked + */ +static int hfs_move_compressed(cnode_t *from_cp, cnode_t *to_cp) +{ + int ret; + void *data = NULL; + + CLR(from_cp->c_bsdflags, UF_COMPRESSED); + SET(from_cp->c_flag, C_MODIFIED); + + ret = hfs_move_data(from_cp, to_cp, HFS_MOVE_DATA_INCLUDE_RSRC); + if (ret) + goto exit; + + /* + * Transfer the xattr that decmpfs uses. 
Ideally, this code + * should be with the other decmpfs code but it's file system + * agnostic and this path is currently, and likely to remain, HFS+ + * specific. It's easier and more performant if we implement it + * here. + */ + + size_t size; + data = hfs_malloc(size = MAX_DECMPFS_XATTR_SIZE); + + ret = hfs_xattr_read(from_cp->c_vp, DECMPFS_XATTR_NAME, data, &size); + if (ret) + goto exit; + + ret = hfs_xattr_write(to_cp->c_vp, DECMPFS_XATTR_NAME, data, size); + if (ret) + goto exit; + + SET(to_cp->c_bsdflags, UF_COMPRESSED); + SET(to_cp->c_flag, C_MODIFIED); + +exit: + hfs_free(data, MAX_DECMPFS_XATTR_SIZE); + + return ret; +} +#endif // HFS_COMPRESSION + +int +hfs_vnop_mmap(struct vnop_mmap_args *ap) +{ + struct vnode *vp = ap->a_vp; + cnode_t *cp = VTOC(vp); + int error; + + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ + } else { + int compressed = hfs_file_is_compressed(cp, 1); /* 1 == don't take the cnode lock */ + time_t orig_ctime = cp->c_ctime; + + if (!compressed && (cp->c_bsdflags & UF_COMPRESSED)) { + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error != 0) { + return error; + } + } + + if (ap->a_fflags & PROT_WRITE) { + nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + } + } + +#if CONFIG_PROTECT + error = cp_handle_vnop(vp, (ap->a_fflags & PROT_WRITE + ? CP_WRITE_ACCESS : 0) | CP_READ_ACCESS, 0); + if (error) + return error; +#endif + + // + // NOTE: we return ENOTSUP because we want the cluster layer + // to actually do all the real work. + // + return (ENOTSUP); +} + +static errno_t hfs_vnop_mnomap(struct vnop_mnomap_args *ap) +{ + vnode_t vp = ap->a_vp; + + /* + * Whilst the file was mapped, there may not have been any + * page-outs so we need to increment the generation counter now. + * Unfortunately this may lead to a change in the generation + * counter when no actual change has been made, but there is + * little we can do about that with our current architecture. + */ + if (ubc_is_mapped_writable(vp)) { + cnode_t *cp = VTOC(vp); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + hfs_incr_gencount(cp); + + /* + * We don't want to set the modification time here since a + * change to that is not acceptable if no changes were made. + * Instead we set a flag so that if we get any page-outs we + * know to update the modification time. It's possible that + * they weren't actually because of changes made whilst the + * file was mapped but that's not easy to fix now. + */ + SET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING); + + hfs_unlock(cp); + } + + return 0; +} + +/* + * Mark the resource fork as needing a ubc_setsize when we drop the + * cnode lock later. + */ +static void hfs_rsrc_setsize(cnode_t *cp) +{ + /* + * We need to take an iocount if we don't have one. vnode_get + * will return ENOENT if the vnode is terminating which is what we + * want as it's not safe to call ubc_setsize in that case. + */ + if (cp->c_rsrc_vp && !vnode_get(cp->c_rsrc_vp)) { + // Shouldn't happen, but better safe... + if (ISSET(cp->c_flag, C_NEED_RVNODE_PUT)) + vnode_put(cp->c_rsrc_vp); + SET(cp->c_flag, C_NEED_RVNODE_PUT | C_NEED_RSRC_SETSIZE); + } +} + +/* + * hfs_move_data + * + * This is a non-symmetric variant of exchangedata. In this function, + * the contents of the data fork (and optionally the resource fork) + * are moved from from_cp to to_cp. + * + * The cnodes must be locked. + * + * The cnode pointed to by 'to_cp' *must* be empty prior to invoking + * this function. 
We impose this restriction because we may not be + * able to fully delete the entire file's contents in a single + * transaction, particularly if it has a lot of extents. In the + * normal file deletion codepath, the file is screened for two + * conditions: 1) bigger than 400MB, and 2) more than 8 extents. If + * so, the file is relocated to the hidden directory and the deletion + * is broken up into multiple truncates. We can't do that here + * because both files need to exist in the namespace. The main reason + * this is imposed is that we may have to touch a whole lot of bitmap + * blocks if there are many extents. + * + * Any data written to 'from_cp' after this call completes is not + * guaranteed to be moved. + * + * Arguments: + * cnode_t *from_cp : source file + * cnode_t *to_cp : destination file; must be empty + * + * Returns: + * + * EBUSY - File has been deleted or is in use + * EFBIG - Destination file was not empty + * EIO - An I/O error + * 0 - success + * other - Other errors that can be returned from called functions + */ +int hfs_move_data(cnode_t *from_cp, cnode_t *to_cp, + hfs_move_data_options_t options) +{ + hfsmount_t *hfsmp = VTOHFS(from_cp->c_vp); + int error = 0; + int lockflags = 0; + bool return_EIO_on_error = false; + const bool include_rsrc = ISSET(options, HFS_MOVE_DATA_INCLUDE_RSRC); + + /* Verify that neither source/dest file is open-unlinked */ + if (ISSET(from_cp->c_flag, C_DELETED | C_NOEXISTS) + || ISSET(to_cp->c_flag, C_DELETED | C_NOEXISTS)) { + return EBUSY; + } + + /* + * Verify the source file is not in use by anyone besides us. + * + * This function is typically invoked by a namespace handler + * process responding to a temporarily stalled system call. + * The FD that it is working off of is opened O_EVTONLY, so + * it really has no active usecounts (the kusecount from O_EVTONLY + * is subtracted from the total usecounts). + * + * As a result, we shouldn't have any active usecounts against + * this vnode when we go to check it below. + */ + if (vnode_isinuse(from_cp->c_vp, 0)) + return EBUSY; + + if (include_rsrc && from_cp->c_rsrc_vp) { + if (vnode_isinuse(from_cp->c_rsrc_vp, 0)) + return EBUSY; + + /* + * In the code below, if the destination file doesn't have a + * c_rsrcfork then we don't create it which means we we cannot + * transfer the ff_invalidranges and cf_vblocks fields. These + * shouldn't be set because we flush the resource fork before + * calling this function but there is a tiny window when we + * did not have any locks... + */ + if (!to_cp->c_rsrcfork + && (!TAILQ_EMPTY(&from_cp->c_rsrcfork->ff_invalidranges) + || from_cp->c_rsrcfork->ff_unallocblocks)) { + /* + * The file isn't really busy now but something did slip + * in and tinker with the file while we didn't have any + * locks, so this is the most meaningful return code for + * the caller. + */ + return EBUSY; + } + } + + // Check the destination file is empty + if (to_cp->c_datafork->ff_blocks + || to_cp->c_datafork->ff_size + || (include_rsrc + && (to_cp->c_blocks + || (to_cp->c_rsrcfork && to_cp->c_rsrcfork->ff_size)))) { + return EFBIG; + } + + if ((error = hfs_start_transaction (hfsmp))) + return error; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, + HFS_EXCLUSIVE_LOCK); + + // filefork_t is 128 bytes which should be OK + filefork_t rfork_buf, *from_rfork = NULL; + + if (include_rsrc) { + from_rfork = from_cp->c_rsrcfork; + + /* + * Creating resource fork vnodes is expensive, so just get get + * the fork data if we need it. 
+ */ + if (!from_rfork && hfs_has_rsrc(from_cp)) { + from_rfork = &rfork_buf; + + from_rfork->ff_cp = from_cp; + TAILQ_INIT(&from_rfork->ff_invalidranges); + + error = cat_idlookup(hfsmp, from_cp->c_fileid, 0, 1, NULL, NULL, + &from_rfork->ff_data); + + if (error) + goto exit; + } + } + + /* + * From here on, any failures mean that we might be leaving things + * in a weird or inconsistent state. Ideally, we should back out + * all the changes, but to do that properly we need to fix + * MoveData. We'll save fixing that for another time. For now, + * just return EIO in all cases to the caller so that they know. + */ + return_EIO_on_error = true; + + bool data_overflow_extents = overflow_extents(from_cp->c_datafork); + + // Move the data fork + if ((error = hfs_move_fork (from_cp->c_datafork, from_cp, + to_cp->c_datafork, to_cp))) { + goto exit; + } + + SET(from_cp->c_flag, C_NEED_DATA_SETSIZE); + SET(to_cp->c_flag, C_NEED_DATA_SETSIZE); + + // We move the resource fork later + + /* + * Note that because all we're doing is moving the extents around, + * we can probably do this in a single transaction: Each extent + * record (group of 8) is 64 bytes. A extent overflow B-Tree node + * is typically 4k. This means each node can hold roughly ~60 + * extent records == (480 extents). + * + * If a file was massively fragmented and had 20k extents, this + * means we'd roughly touch 20k/480 == 41 to 42 nodes, plus the + * index nodes, for half of the operation. (inserting or + * deleting). So if we're manipulating 80-100 nodes, this is + * basically 320k of data to write to the journal in a bad case. + */ + if (data_overflow_extents) { + if ((error = MoveData(hfsmp, from_cp->c_cnid, to_cp->c_cnid, 0))) + goto exit; + } + + if (from_rfork && overflow_extents(from_rfork)) { + if ((error = MoveData(hfsmp, from_cp->c_cnid, to_cp->c_cnid, 1))) + goto exit; + } + + // Touch times + from_cp->c_touch_acctime = TRUE; + from_cp->c_touch_chgtime = TRUE; + from_cp->c_touch_modtime = TRUE; + hfs_touchtimes(hfsmp, from_cp); + + to_cp->c_touch_acctime = TRUE; + to_cp->c_touch_chgtime = TRUE; + to_cp->c_touch_modtime = TRUE; + hfs_touchtimes(hfsmp, to_cp); + + struct cat_fork dfork_buf; + const struct cat_fork *dfork, *rfork; + + dfork = hfs_prepare_fork_for_update(to_cp->c_datafork, NULL, + &dfork_buf, hfsmp->blockSize); + rfork = hfs_prepare_fork_for_update(from_rfork, NULL, + &rfork_buf.ff_data, hfsmp->blockSize); + + // Update the catalog nodes, to_cp first + if ((error = cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, + dfork, rfork))) { + goto exit; + } + + CLR(to_cp->c_flag, C_MODIFIED | C_MINOR_MOD); + + // Update in-memory resource fork data here + if (from_rfork) { + // Update c_blocks + uint32_t moving = from_rfork->ff_blocks + from_rfork->ff_unallocblocks; + + from_cp->c_blocks -= moving; + to_cp->c_blocks += moving; + + // Update to_cp's resource data if it has it + filefork_t *to_rfork = to_cp->c_rsrcfork; + if (to_rfork) { + TAILQ_SWAP(&to_rfork->ff_invalidranges, + &from_rfork->ff_invalidranges, rl_entry, rl_link); + to_rfork->ff_data = from_rfork->ff_data; + + // Deal with ubc_setsize + hfs_rsrc_setsize(to_cp); + } + + // Wipe out the resource fork in from_cp + rl_init(&from_rfork->ff_invalidranges); + bzero(&from_rfork->ff_data, sizeof(from_rfork->ff_data)); + + // Deal with ubc_setsize + hfs_rsrc_setsize(from_cp); + } + + // Currently unnecessary, but might be useful in future... 
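
A quick back-of-envelope check of the transaction-size estimate in the single-transaction comment above, using the comment's own per-node figures (roughly 60 extent records of 8 extents each per 4 KiB overflow node); index-node traffic is ignored, so the totals are approximate.

    #include <stdio.h>

    int main(void)
    {
        int extents_per_node = 60 * 8;                 /* ~480, as quoted above */
        int frag_extents     = 20000;                  /* the "massively fragmented" example */
        int leaf_nodes       = (frag_extents + extents_per_node - 1) / extents_per_node;
        int touched_nodes    = 2 * leaf_nodes;         /* delete from one file, insert into the other */

        /* Prints roughly 42 leaf nodes per side, ~84 touched overall,
         * ~336 KiB of node data, matching the 80-100 node / ~320k estimate. */
        printf("~%d leaf nodes per side, ~%d touched overall, ~%d KiB of node data\n",
               leaf_nodes, touched_nodes, touched_nodes * 4);
        return 0;
    }
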
+ dfork = hfs_prepare_fork_for_update(from_cp->c_datafork, NULL, &dfork_buf, + hfsmp->blockSize); + rfork = hfs_prepare_fork_for_update(from_rfork, NULL, &rfork_buf.ff_data, + hfsmp->blockSize); + + // Update from_cp + if ((error = cat_update(hfsmp, &from_cp->c_desc, &from_cp->c_attr, + dfork, rfork))) { + goto exit; + } + + CLR(from_cp->c_flag, C_MODIFIED | C_MINOR_MOD); + +exit: + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + hfs_end_transaction(hfsmp); + } + + if (error && error != EIO && return_EIO_on_error) { + printf("hfs_move_data: encountered error %d\n", error); + error = EIO; + } + + return error; +} + +/* + * Move all of the catalog and runtime data in srcfork to dstfork. + * + * This allows us to maintain the invalid ranges across the move data + * operation so we don't need to force all of the pending IO right + * now. In addition, we move all non overflow-extent extents into the + * destination here. + * + * The destination fork must be empty and should have been checked + * prior to calling this. + */ +static int hfs_move_fork(filefork_t *srcfork, cnode_t *src_cp, + filefork_t *dstfork, cnode_t *dst_cp) +{ + // Move the invalid ranges + TAILQ_SWAP(&dstfork->ff_invalidranges, &srcfork->ff_invalidranges, + rl_entry, rl_link); + rl_remove_all(&srcfork->ff_invalidranges); + + // Move the fork data (copy whole structure) + dstfork->ff_data = srcfork->ff_data; + bzero(&srcfork->ff_data, sizeof(srcfork->ff_data)); + + // Update c_blocks + src_cp->c_blocks -= dstfork->ff_blocks + dstfork->ff_unallocblocks; + dst_cp->c_blocks += dstfork->ff_blocks + dstfork->ff_unallocblocks; + + return 0; +} + +/* + * cnode must be locked + */ +int +hfs_fsync(struct vnode *vp, int waitfor, hfs_fsync_mode_t fsyncmode, struct proc *p) +{ + struct cnode *cp = VTOC(vp); + struct filefork *fp = NULL; + int retval = 0; + struct hfsmount *hfsmp = VTOHFS(vp); + struct timeval tv; + int waitdata; /* attributes necessary for data retrieval */ + int wait; /* all other attributes (e.g. atime, etc.) */ + int took_trunc_lock = 0; + int fsync_default = 1; + + /* + * Applications which only care about data integrity rather than full + * file integrity may opt out of (delay) expensive metadata update + * operations as a performance optimization. + */ + wait = (waitfor == MNT_WAIT); + waitdata = (waitfor == MNT_DWAIT) | wait; + + if (always_do_fullfsync) + fsyncmode = HFS_FSYNC_FULL; + if (fsyncmode != HFS_FSYNC) + fsync_default = 0; + + /* HFS directories don't have any data blocks. */ + if (vnode_isdir(vp)) + goto metasync; + fp = VTOF(vp); + + /* + * For system files flush the B-tree header and + * for regular files write out any clusters + */ + if (vnode_issystem(vp)) { + if (VTOF(vp)->fcbBTCBPtr != NULL) { + // XXXdbg + if (hfsmp->jnl == NULL) { + BTFlushPath(VTOF(vp)); + } + } + } else { + hfs_unlock(cp); + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + took_trunc_lock = 1; + + if (fp->ff_unallocblocks != 0) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + } + + /* Don't hold cnode lock when calling into cluster layer. */ + (void) cluster_push(vp, waitdata ? IO_SYNC : 0); + + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + } + /* + * When MNT_WAIT is requested and the zero fill timeout + * has expired then we must explicitly zero out any areas + * that are currently marked invalid (holes). + * + * Files with NODUMP can bypass zero filling here. 
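+	 * (Roughly: a waited-for fsync on a regular, non-system file whose
+	 * zero-fill timeout has passed, or any file marked C_ALWAYS_ZEROFILL with
+	 * pending invalid ranges, gets those ranges zeroed and pushed below.)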
+ */ + if (fp && (((cp->c_flag & C_ALWAYS_ZEROFILL) && !TAILQ_EMPTY(&fp->ff_invalidranges)) || + ((wait || (cp->c_flag & C_ZFWANTSYNC)) && + ((cp->c_bsdflags & UF_NODUMP) == 0) && + (vnode_issystem(vp) ==0) && + cp->c_zftimeout != 0))) { + + microuptime(&tv); + if ((cp->c_flag & C_ALWAYS_ZEROFILL) == 0 && fsync_default && tv.tv_sec < (long)cp->c_zftimeout) { + /* Remember that a force sync was requested. */ + cp->c_flag |= C_ZFWANTSYNC; + goto datasync; + } + if (!TAILQ_EMPTY(&fp->ff_invalidranges)) { + if (!took_trunc_lock || (cp->c_truncatelockowner == HFS_SHARED_OWNER)) { + hfs_unlock(cp); + if (took_trunc_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + took_trunc_lock = 1; + } + hfs_flush_invalid_ranges(vp); + hfs_unlock(cp); + (void) cluster_push(vp, waitdata ? IO_SYNC : 0); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + } + } +datasync: + if (took_trunc_lock) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + took_trunc_lock = 0; + } + + if (!hfsmp->jnl) + buf_flushdirtyblks(vp, waitdata, 0, "hfs_fsync"); + else if (fsync_default && vnode_islnk(vp) + && vnode_hasdirtyblks(vp) && vnode_isrecycled(vp)) { + /* + * If it's a symlink that's dirty and is about to be recycled, + * we need to flush the journal. + */ + fsync_default = 0; + } + +metasync: + if (vnode_isreg(vp) && vnode_issystem(vp)) { + if (VTOF(vp)->fcbBTCBPtr != NULL) { + microuptime(&tv); + BTSetLastSync(VTOF(vp), tv.tv_sec); + } + cp->c_touch_acctime = FALSE; + cp->c_touch_chgtime = FALSE; + cp->c_touch_modtime = FALSE; + } else if (!vnode_isswap(vp)) { + retval = hfs_update(vp, HFS_UPDATE_FORCE); + + /* + * When MNT_WAIT is requested push out the catalog record for + * this file. If they asked for a full fsync, we can skip this + * because the journal_flush or hfs_metasync_all will push out + * all of the metadata changes. + */ + if ((retval == 0) && wait && fsync_default && cp->c_hint && + !ISSET(cp->c_flag, C_DELETED | C_NOEXISTS)) { + hfs_metasync(VTOHFS(vp), (daddr64_t)cp->c_hint, p); + } + + /* + * If this was a full fsync, make sure all metadata + * changes get to stable storage. + */ + if (!fsync_default) { + if (hfsmp->jnl) { + if (fsyncmode == HFS_FSYNC_FULL) + hfs_flush(hfsmp, HFS_FLUSH_FULL); + else + hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_BARRIER); + } else { + retval = hfs_metasync_all(hfsmp); + /* XXX need to pass context! */ + hfs_flush(hfsmp, HFS_FLUSH_CACHE); + } + } + } + + if (!hfs_is_dirty(cp) && !ISSET(cp->c_flag, C_DELETED)) + vnode_cleardirty(vp); + + return (retval); +} + + +/* Sync an hfs catalog b-tree node */ +int +hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p) +{ + vnode_t vp; + buf_t bp; + int lockflags; + + vp = HFSTOVCB(hfsmp)->catalogRefNum; + + // XXXdbg - don't need to do this on a journaled volume + if (hfsmp->jnl) { + return 0; + } + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + /* + * Look for a matching node that has been delayed + * but is not part of a set (B_LOCKED). + * + * BLK_ONLYVALID causes buf_getblk to return a + * buf_t for the daddr64_t specified only if it's + * currently resident in the cache... 
the size + * parameter to buf_getblk is ignored when this flag + * is set + */ + bp = buf_getblk(vp, node, 0, 0, 0, BLK_META | BLK_ONLYVALID); + + if (bp) { + if ((buf_flags(bp) & (B_LOCKED | B_DELWRI)) == B_DELWRI) + (void) VNOP_BWRITE(bp); + else + buf_brelse(bp); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + return (0); +} + + +/* + * Sync all hfs B-trees. Use this instead of journal_flush for a volume + * without a journal. Note that the volume bitmap does not get written; + * we rely on fsck_hfs to fix that up (which it can do without any loss + * of data). + */ +int +hfs_metasync_all(struct hfsmount *hfsmp) +{ + int lockflags; + + /* Lock all of the B-trees so we get a mutually consistent state */ + lockflags = hfs_systemfile_lock(hfsmp, + SFL_CATALOG|SFL_EXTENTS|SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + /* Sync each of the B-trees */ + if (hfsmp->hfs_catalog_vp) + hfs_btsync(hfsmp->hfs_catalog_vp, 0); + if (hfsmp->hfs_extents_vp) + hfs_btsync(hfsmp->hfs_extents_vp, 0); + if (hfsmp->hfs_attribute_vp) + hfs_btsync(hfsmp->hfs_attribute_vp, 0); + + /* Wait for all of the writes to complete */ + if (hfsmp->hfs_catalog_vp) + vnode_waitforwrites(hfsmp->hfs_catalog_vp, 0, 0, 0, "hfs_metasync_all"); + if (hfsmp->hfs_extents_vp) + vnode_waitforwrites(hfsmp->hfs_extents_vp, 0, 0, 0, "hfs_metasync_all"); + if (hfsmp->hfs_attribute_vp) + vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs_metasync_all"); + + hfs_systemfile_unlock(hfsmp, lockflags); + + return 0; +} + + +/*ARGSUSED 1*/ +static int +hfs_btsync_callback(struct buf *bp, __unused void *dummy) +{ + buf_clearflags(bp, B_LOCKED); + (void) buf_bawrite(bp); + + return(BUF_CLAIMED); +} + + +int +hfs_btsync(struct vnode *vp, int sync_transaction) +{ + struct cnode *cp = VTOC(vp); + struct timeval tv; + int flags = 0; + + if (sync_transaction) + flags |= BUF_SKIP_NONLOCKED; + /* + * Flush all dirty buffers associated with b-tree. + */ + buf_iterate(vp, hfs_btsync_callback, flags, 0); + + microuptime(&tv); + if (vnode_issystem(vp) && (VTOF(vp)->fcbBTCBPtr != NULL)) + (void) BTSetLastSync(VTOF(vp), tv.tv_sec); + cp->c_touch_acctime = FALSE; + cp->c_touch_chgtime = FALSE; + cp->c_touch_modtime = FALSE; + + return 0; +} + +/* + * Remove a directory. 
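+ * (VNOP entry point: it validates vp and dvp, takes both cnode locks,
+ * assigns a document_id to tracked directories that lack one, and then
+ * hands the real work to hfs_removedir().)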
+ */ +int +hfs_vnop_rmdir(struct vnop_rmdir_args *ap) +{ + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + struct cnode *dcp = VTOC(dvp); + struct cnode *cp = VTOC(vp); + int error; + time_t orig_ctime; + + orig_ctime = VTOC(vp)->c_ctime; + + if (!S_ISDIR(cp->c_mode)) { + return (ENOTDIR); + } + if (dvp == vp) { + return (EINVAL); + } + + nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + cp = VTOC(vp); + + if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { + return (error); + } + + /* Check for a race with rmdir on the parent directory */ + if (dcp->c_flag & (C_DELETED | C_NOEXISTS)) { + hfs_unlockpair (dcp, cp); + return ENOENT; + } + + // + // if the item is tracked but doesn't have a document_id, assign one and generate an fsevent for it + // + if ((cp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id == 0) { + uint32_t newid; + + hfs_unlockpair(dcp, cp); + + if (hfs_generate_document_id(VTOHFS(vp), &newid) == 0) { + hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); + ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id = newid; + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, VTOHFS(vp)->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)0, // src inode # + FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # + FSE_ARG_INT32, newid, + FSE_ARG_DONE); + } else { + // XXXdbg - couldn't get a new docid... what to do? can't really fail the rm... + hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); + } + } + + error = hfs_removedir(dvp, vp, ap->a_cnp, 0, 0); + + hfs_unlockpair(dcp, cp); + + return (error); +} + +/* + * Remove a directory + * + * Both dvp and vp cnodes are locked + */ +int +hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, + int skip_reserve, int only_unlink) +{ + struct cnode *cp; + struct cnode *dcp; + struct hfsmount * hfsmp; + struct cat_desc desc; + int lockflags; + int error = 0, started_tr = 0; + + cp = VTOC(vp); + dcp = VTOC(dvp); + hfsmp = VTOHFS(vp); + + if (dcp == cp) { + return (EINVAL); /* cannot remove "." */ + } + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + return (0); + } + if (cp->c_entries != 0) { + return (ENOTEMPTY); + } + + /* + * If the directory is open or in use (e.g. opendir() or current working + * directory for some process); wait for inactive/reclaim to actually + * remove cnode from the catalog. Both inactive and reclaim codepaths are capable + * of removing open-unlinked directories from the catalog, as well as getting rid + * of EAs still on the element. So change only_unlink to true, so that it will get + * cleaned up below. + * + * Otherwise, we can get into a weird old mess where the directory has C_DELETED, + * but it really means C_NOEXISTS because the item was actually removed from the + * catalog. Then when we try to remove the entry from the catalog later on, it won't + * really be there anymore. + */ + if (vnode_isinuse(vp, 0)) { + only_unlink = 1; + } + + /* Deal with directory hardlinks */ + if (cp->c_flag & C_HARDLINK) { + /* + * Note that if we have a directory which was a hardlink at any point, + * its actual directory data is stored in the directory inode in the hidden + * directory rather than the leaf element(s) present in the namespace. + * + * If there are still other hardlinks to this directory, + * then we'll just eliminate this particular link and the vnode will still exist. 
+ * If this is the last link to an empty directory, then we'll open-unlink the + * directory and it will be only tagged with C_DELETED (as opposed to C_NOEXISTS). + * + * We could also return EBUSY here. + */ + + return hfs_unlink(hfsmp, dvp, vp, cnp, skip_reserve); + } + + /* + * In a few cases, we may want to allow the directory to persist in an + * open-unlinked state. If the directory is being open-unlinked (still has usecount + * references), or if it has EAs, or if it was being deleted as part of a rename, + * then we go ahead and move it to the hidden directory. + * + * If the directory is being open-unlinked, then we want to keep the catalog entry + * alive so that future EA calls and fchmod/fstat etc. do not cause issues later. + * + * If the directory had EAs, then we want to use the open-unlink trick so that the + * EA removal is not done in one giant transaction. Otherwise, it could cause a panic + * due to overflowing the journal. + * + * Finally, if it was deleted as part of a rename, we move it to the hidden directory + * in order to maintain rename atomicity. + * + * Note that the allow_dirs argument to hfs_removefile specifies that it is + * supposed to handle directories for this case. + */ + + if (((hfsmp->hfs_attribute_vp != NULL) && + ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0)) || + (only_unlink != 0)) { + + int ret = hfs_removefile(dvp, vp, cnp, 0, 0, 1, NULL, only_unlink); + /* + * Even though hfs_vnop_rename calls vnode_recycle for us on tvp we call + * it here just in case we were invoked by rmdir() on a directory that had + * EAs. To ensure that we start reclaiming the space as soon as possible, + * we call vnode_recycle on the directory. + */ + vnode_recycle(vp); + + return ret; + + } + + dcp->c_flag |= C_DIR_MODIFICATION; + +#if QUOTA + if (hfsmp->hfs_flags & HFS_QUOTAS) + (void)hfs_getinoquota(cp); +#endif + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; + } + started_tr = 1; + + /* + * Verify the directory is empty (and valid). + * (Rmdir ".." won't be valid since + * ".." will contain a reference to + * the current directory and thus be + * non-empty.) + */ + if ((dcp->c_bsdflags & APPEND) || (cp->c_bsdflags & (IMMUTABLE | APPEND))) { + error = EPERM; + goto out; + } + + /* Remove the entry from the namei cache: */ + cache_purge(vp); + + /* + * Protect against a race with rename by using the component + * name passed in and parent id from dvp (instead of using + * the cp->c_desc which may have changed). + */ + desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + desc.cd_namelen = cnp->cn_namelen; + desc.cd_parentcnid = dcp->c_fileid; + desc.cd_cnid = cp->c_cnid; + desc.cd_flags = CD_ISDIR; + desc.cd_encoding = cp->c_encoding; + desc.cd_hint = 0; + + if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { + error = 0; + goto out; + } + + /* Remove entry from catalog */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + if (!skip_reserve) { + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_DELETE, NULL, 0))) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; + } + } + + error = cat_delete(hfsmp, &desc, &cp->c_attr); + + if (!error) { + // + // if skip_reserve == 1 then we're being called from hfs_vnop_rename() and thus + // we don't need to touch the document_id as it's handled by the rename code. 
+ // otherwise it's a normal remove and we need to save the document id in the + // per thread struct and clear it from the cnode. + // + struct doc_tombstone *ut; + ut = doc_tombstone_get(); + if (!skip_reserve && (cp->c_bsdflags & UF_TRACKED) + && doc_tombstone_should_save(ut, vp, cnp)) { + + uint32_t doc_id = hfs_get_document_id(cp); + + // this event is more of a "pending-delete" + if (ut->t_lastop_document_id) { + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # + FSE_ARG_INO, (ino64_t)0, // dst inode # + FSE_ARG_INT32, doc_id, + FSE_ARG_DONE); + } + + doc_tombstone_save(dvp, vp, cnp, doc_id, cp->c_fileid); + + struct FndrExtendedFileInfo *fip = (struct FndrExtendedFileInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); + + // clear this so it's never returned again + fip->document_id = 0; + cp->c_bsdflags &= ~UF_TRACKED; + } + + /* The parent lost a child */ + if (dcp->c_entries > 0) + dcp->c_entries--; + DEC_FOLDERCOUNT(hfsmp, dcp->c_attr); + dcp->c_dirchangecnt++; + hfs_incr_gencount(dcp); + + dcp->c_touch_chgtime = TRUE; + dcp->c_touch_modtime = TRUE; + dcp->c_flag |= C_MODIFIED; + + hfs_update(dcp->c_vp, 0); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) + goto out; + +#if QUOTA + if (hfsmp->hfs_flags & HFS_QUOTAS) + (void)hfs_chkiq(cp, -1, NOCRED, 0); +#endif /* QUOTA */ + + hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); + + /* Mark C_NOEXISTS since the catalog entry is now gone */ + cp->c_flag |= C_NOEXISTS; + +out: + dcp->c_flag &= ~C_DIR_MODIFICATION; + wakeup((caddr_t)&dcp->c_flag); + + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + return (error); +} + + +/* + * Remove a file or link. + */ +int +hfs_vnop_remove(struct vnop_remove_args *ap) +{ + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = ap->a_vp; + struct cnode *dcp = VTOC(dvp); + struct cnode *cp; + struct vnode *rvp = NULL; + int error=0, recycle_rsrc=0; + int recycle_vnode = 0; + uint32_t rsrc_vid = 0; + time_t orig_ctime; + + if (dvp == vp) { + return (EINVAL); + } + + orig_ctime = VTOC(vp)->c_ctime; + if (!vnode_isnamedstream(vp) && ((ap->a_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) == 0)) { + error = nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + if (error) { + // XXXdbg - decide on a policy for handling namespace handler failures! + // for now we just let them proceed. + } + } + error = 0; + + cp = VTOC(vp); + +relock: + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + if (rvp) { + vnode_put (rvp); + } + return (error); + } + // + // if the item is tracked but doesn't have a document_id, assign one and generate an fsevent for it + // + if ((cp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id == 0) { + uint32_t newid; + + hfs_unlockpair(dcp, cp); + + if (hfs_generate_document_id(VTOHFS(vp), &newid) == 0) { + hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); + ((struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16))->document_id = newid; + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, VTOHFS(vp)->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)0, // src inode # + FSE_ARG_INO, (ino64_t)cp->c_fileid, // dst inode # + FSE_ARG_INT32, newid, + FSE_ARG_DONE); + } else { + // XXXdbg - couldn't get a new docid... what to do? 
can't really fail the rm... + hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK); + } + } + + /* + * Lazily respond to determining if there is a valid resource fork + * vnode attached to 'cp' if it is a regular file or symlink. + * If the vnode does not exist, then we may proceed without having to + * create it. + * + * If, however, it does exist, then we need to acquire an iocount on the + * vnode after acquiring its vid. This ensures that if we have to do I/O + * against it, it can't get recycled from underneath us in the middle + * of this call. + * + * Note: this function may be invoked for directory hardlinks, so just skip these + * steps if 'vp' is a directory. + */ + + enum vtype vtype = vnode_vtype(vp); + if ((vtype == VLNK) || (vtype == VREG)) { + if ((cp->c_rsrc_vp) && (rvp == NULL)) { + /* We need to acquire the rsrc vnode */ + rvp = cp->c_rsrc_vp; + rsrc_vid = vnode_vid (rvp); + + /* Unlock everything to acquire iocount on the rsrc vnode */ + hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); + hfs_unlockpair (dcp, cp); + /* Use the vid to maintain identity on rvp */ + if (vnode_getwithvid(rvp, rsrc_vid)) { + /* + * If this fails, then it was recycled or + * reclaimed in the interim. Reset fields and + * start over. + */ + rvp = NULL; + rsrc_vid = 0; + } + goto relock; + } + } + + /* + * Check to see if we raced rmdir for the parent directory + * hfs_removefile already checks for a race on vp/cp + */ + if (dcp->c_flag & (C_DELETED | C_NOEXISTS)) { + error = ENOENT; + goto rm_done; + } + + error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, NULL, 0); + + /* + * If the remove succeeded in deleting the file, then we may need to mark + * the resource fork for recycle so that it is reclaimed as quickly + * as possible. If it were not recycled quickly, then this resource fork + * vnode could keep a v_parent reference on the data fork, which prevents it + * from going through reclaim (by giving it extra usecounts), except in the force- + * unmount case. + * + * However, a caveat: we need to continue to supply resource fork + * access to open-unlinked files even if the resource fork is not open. This is + * a requirement for the compressed files work. Luckily, hfs_vgetrsrc will handle + * this already if the data fork has been re-parented to the hidden directory. + * + * As a result, all we really need to do here is mark the resource fork vnode + * for recycle. If it goes out of core, it can be brought in again if needed. + * If the cnode was instead marked C_NOEXISTS, then there wouldn't be any + * more work. + */ + if (error == 0) { + hfs_hotfile_deleted(vp); + + if (rvp) { + recycle_rsrc = 1; + } + /* + * If the target was actually removed from the catalog schedule it for + * full reclamation/inactivation. 
We hold an iocount on it so it should just + * get marked with MARKTERM + */ + if (cp->c_flag & C_NOEXISTS) { + recycle_vnode = 1; + } + } + + + /* + * Drop the truncate lock before unlocking the cnode + * (which can potentially perform a vnode_put and + * recycle the vnode which in turn might require the + * truncate lock) + */ +rm_done: + hfs_unlockpair(dcp, cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + + if (recycle_rsrc) { + /* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */ + vnode_recycle(rvp); + } + if (recycle_vnode) { + vnode_recycle (vp); + } + + if (rvp) { + /* drop iocount on rsrc fork, was obtained at beginning of fxn */ + vnode_put(rvp); + } + + return (error); +} + + +int +hfs_removefile_callback(struct buf *bp, void *hfsmp) { + + if ( !(buf_flags(bp) & B_META)) + panic("hfs: symlink bp @ %p is not marked meta-data!\n", bp); + /* + * it's part of the current transaction, kill it. + */ + journal_kill_block(((struct hfsmount *)hfsmp)->jnl, bp); + + return (BUF_CLAIMED); +} + +/* + * hfs_removefile + * + * Similar to hfs_vnop_remove except there are additional options. + * This function may be used to remove directories if they have + * lots of EA's -- note the 'allow_dirs' argument. + * + * This function is able to delete blocks & fork data for the resource + * fork even if it does not exist in core (and have a backing vnode). + * It should infer the correct behavior based on the number of blocks + * in the cnode and whether or not the resource fork pointer exists or + * not. As a result, one only need pass in the 'vp' corresponding to the + * data fork of this file (or main vnode in the case of a directory). + * Passing in a resource fork will result in an error. + * + * Because we do not create any vnodes in this function, we are not at + * risk of deadlocking against ourselves by double-locking. + * + * Requires cnode and truncate locks to be held. + */ +int +hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, + int flags, int skip_reserve, int allow_dirs, + __unused struct vnode *rvp, int only_unlink) +{ + struct cnode *cp; + struct cnode *dcp; + struct vnode *rsrc_vp = NULL; + struct hfsmount *hfsmp; + struct cat_desc desc; + struct timeval tv; + int dataforkbusy = 0; + int rsrcforkbusy = 0; + int lockflags; + int error = 0; + int started_tr = 0; + int isbigfile = 0, defer_remove=0, isdir=0; + int update_vh = 0; + + cp = VTOC(vp); + dcp = VTOC(dvp); + hfsmp = VTOHFS(vp); + + /* Check if we lost a race post lookup. */ + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + return (0); + } + + if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { + return 0; + } + + /* Make sure a remove is permitted */ + if (VNODE_IS_RSRC(vp)) { + return (EPERM); + } + else { + /* + * We know it's a data fork. + * Probe the cnode to see if we have a valid resource fork + * in hand or not. + */ + rsrc_vp = cp->c_rsrc_vp; + } + + /* Don't allow deleting the journal or journal_info_block. */ + if (hfs_is_journal_file(hfsmp, cp)) { + return (EPERM); + } + + /* + * Hard links require special handling. + */ + if (cp->c_flag & C_HARDLINK) { + if ((flags & VNODE_REMOVE_NODELETEBUSY) && vnode_isinuse(vp, 0)) { + return (EBUSY); + } else { + /* A directory hard link with a link count of one is + * treated as a regular directory. Therefore it should + * only be removed using rmdir(). 
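+			 * (That is what the EPERM just below enforces when allow_dirs
+			 * is not set.)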
+ */ + if ((vnode_isdir(vp) == 1) && (cp->c_linkcount == 1) && + (allow_dirs == 0)) { + return (EPERM); + } + return hfs_unlink(hfsmp, dvp, vp, cnp, skip_reserve); + } + } + + /* Directories should call hfs_rmdir! (unless they have a lot of attributes) */ + if (vnode_isdir(vp)) { + if (allow_dirs == 0) + return (EPERM); /* POSIX */ + isdir = 1; + } + /* Sanity check the parent ids. */ + if ((cp->c_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + (cp->c_parentcnid != dcp->c_fileid)) { + return (EINVAL); + } + + dcp->c_flag |= C_DIR_MODIFICATION; + + // this guy is going away so mark him as such + cp->c_flag |= C_DELETED; + + + /* Remove our entry from the namei cache. */ + cache_purge(vp); + + /* + * If the caller was operating on a file (as opposed to a + * directory with EAs), then we need to figure out + * whether or not it has a valid resource fork vnode. + * + * If there was a valid resource fork vnode, then we need + * to use hfs_truncate to eliminate its data. If there is + * no vnode, then we hold the cnode lock which would + * prevent it from being created. As a result, + * we can use the data deletion functions which do not + * require that a cnode/vnode pair exist. + */ + + /* Check if this file is being used. */ + if (isdir == 0) { + dataforkbusy = vnode_isinuse(vp, 0); + /* + * At this point, we know that 'vp' points to the + * a data fork because we checked it up front. And if + * there is no rsrc fork, rsrc_vp will be NULL. + */ + if (rsrc_vp && (cp->c_blocks - VTOF(vp)->ff_blocks)) { + rsrcforkbusy = vnode_isinuse(rsrc_vp, 0); + } + } + + /* Check if we have to break the deletion into multiple pieces. */ + if (isdir == 0) + isbigfile = cp->c_datafork->ff_size >= HFS_BIGFILE_SIZE; + + /* Check if the file has xattrs. If it does we'll have to delete them in + individual transactions in case there are too many */ + if ((hfsmp->hfs_attribute_vp != NULL) && + (cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0) { + defer_remove = 1; + } + + /* If we are explicitly told to only unlink item and move to hidden dir, then do it */ + if (only_unlink) { + defer_remove = 1; + } + + /* + * Carbon semantics prohibit deleting busy files. + * (enforced when VNODE_REMOVE_NODELETEBUSY is requested) + */ + if (dataforkbusy || rsrcforkbusy) { + if ((flags & VNODE_REMOVE_NODELETEBUSY) || + (hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid == 0)) { + error = EBUSY; + goto out; + } + } + +#if QUOTA + if (hfsmp->hfs_flags & HFS_QUOTAS) + (void)hfs_getinoquota(cp); +#endif /* QUOTA */ + + /* + * Do a ubc_setsize to indicate we need to wipe contents if: + * 1) item is a regular file. + * 2) Neither fork is busy AND we are not told to unlink this. + * + * We need to check for the defer_remove since it can be set without + * having a busy data or rsrc fork + */ + if (isdir == 0 && (!dataforkbusy || !rsrcforkbusy) && (defer_remove == 0)) { + /* + * A ubc_setsize can cause a pagein so defer it + * until after the cnode lock is dropped. The + * cnode lock cannot be dropped/reacquired here + * since we might already hold the journal lock. 
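+		 * (The C_NEED_*_SETSIZE flags set here record that the ubc_setsize is
+		 * still owed; it is performed later, once the cnode lock is dropped.)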
+ */ + if (!dataforkbusy && cp->c_datafork->ff_blocks && !isbigfile) { + cp->c_flag |= C_NEED_DATA_SETSIZE; + } + if (!rsrcforkbusy && rsrc_vp) { + cp->c_flag |= C_NEED_RSRC_SETSIZE; + } + } + + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; + } + started_tr = 1; + + // XXXdbg - if we're journaled, kill any dirty symlink buffers + if (hfsmp->jnl && vnode_islnk(vp) && (defer_remove == 0)) { + buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); + } + + /* + * Prepare to truncate any non-busy forks. Busy forks will + * get truncated when their vnode goes inactive. + * Note that we will only enter this region if we + * can avoid creating an open-unlinked file. If + * either region is busy, we will have to create an open + * unlinked file. + * + * Since we are deleting the file, we need to stagger the runtime + * modifications to do things in such a way that a crash won't + * result in us getting overlapped extents or any other + * bad inconsistencies. As such, we call prepare_release_storage + * which updates the UBC, updates quota information, and releases + * any loaned blocks that belong to this file. No actual + * truncation or bitmap manipulation is done until *AFTER* + * the catalog record is removed. + */ + if (isdir == 0 && (!dataforkbusy && !rsrcforkbusy) && (only_unlink == 0)) { + + if (!dataforkbusy && !isbigfile && cp->c_datafork->ff_blocks != 0) { + + error = hfs_prepare_release_storage (hfsmp, vp); + if (error) { + goto out; + } + update_vh = 1; + } + + /* + * If the resource fork vnode does not exist, we can skip this step. + */ + if (!rsrcforkbusy && rsrc_vp) { + error = hfs_prepare_release_storage (hfsmp, rsrc_vp); + if (error) { + goto out; + } + update_vh = 1; + } + } + + /* + * Protect against a race with rename by using the component + * name passed in and parent id from dvp (instead of using + * the cp->c_desc which may have changed). Also, be aware that + * because we allow directories to be passed in, we need to special case + * this temporary descriptor in case we were handed a directory. + */ + if (isdir) { + desc.cd_flags = CD_ISDIR; + } + else { + desc.cd_flags = 0; + } + desc.cd_encoding = cp->c_desc.cd_encoding; + desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + desc.cd_namelen = cnp->cn_namelen; + desc.cd_parentcnid = dcp->c_fileid; + desc.cd_hint = cp->c_desc.cd_hint; + desc.cd_cnid = cp->c_cnid; + microtime(&tv); + + /* + * There are two cases to consider: + * 1. File/Dir is busy/big/defer_remove ==> move/rename the file/dir + * 2. File is not in use ==> remove the file + * + * We can get a directory in case 1 because it may have had lots of attributes, + * which need to get removed here. + */ + if (dataforkbusy || rsrcforkbusy || isbigfile || defer_remove) { + char delname[32]; + struct cat_desc to_desc; + struct cat_desc todir_desc; + + /* + * Orphan this file or directory (move to hidden directory). + * Again, we need to take care that we treat directories as directories, + * and files as files. Because directories with attributes can be passed in + * check to make sure that we have a directory or a file before filling in the + * temporary descriptor's flags. We keep orphaned directories AND files in + * the FILE_HARDLINKS private directory since we're generalizing over all + * orphaned filesystem objects. 
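+		 * (Concretely: cat_rename() below moves the entry to a
+		 * MAKE_DELETED_NAME temporary name under the FILE_HARDLINKS private
+		 * directory, and the entry counts on that directory and on dcp are
+		 * then adjusted.)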
+ */ + bzero(&todir_desc, sizeof(todir_desc)); + todir_desc.cd_parentcnid = 2; + + MAKE_DELETED_NAME(delname, sizeof(delname), cp->c_fileid); + bzero(&to_desc, sizeof(to_desc)); + to_desc.cd_nameptr = (const u_int8_t *)delname; + to_desc.cd_namelen = strlen(delname); + to_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + if (isdir) { + to_desc.cd_flags = CD_ISDIR; + } + else { + to_desc.cd_flags = 0; + } + to_desc.cd_cnid = cp->c_cnid; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + if (!skip_reserve) { + if ((error = cat_preflight(hfsmp, CAT_RENAME, NULL, 0))) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; + } + } + + error = cat_rename(hfsmp, &desc, &todir_desc, + &to_desc, (struct cat_desc *)NULL); + + if (error == 0) { + hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries++; + if (isdir == 1) { + INC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + } + (void) cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], + &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + + /* Update the parent directory */ + if (dcp->c_entries > 0) + dcp->c_entries--; + if (isdir == 1) { + DEC_FOLDERCOUNT(hfsmp, dcp->c_attr); + } + dcp->c_dirchangecnt++; + hfs_incr_gencount(dcp); + + dcp->c_ctime = tv.tv_sec; + dcp->c_mtime = tv.tv_sec; + (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + + /* Update the file or directory's state */ + cp->c_flag |= C_DELETED; + cp->c_ctime = tv.tv_sec; + --cp->c_linkcount; + (void) cat_update(hfsmp, &to_desc, &cp->c_attr, NULL, NULL); + } + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) + goto out; + + } + else { + /* + * Nobody is using this item; we can safely remove everything. + */ + struct filefork *temp_rsrc_fork = NULL; +#if QUOTA + off_t savedbytes; + int blksize = hfsmp->blockSize; +#endif + u_int32_t fileid = cp->c_fileid; + + /* + * Figure out if we need to read the resource fork data into + * core before wiping out the catalog record. + * + * 1) Must not be a directory + * 2) cnode's c_rsrcfork ptr must be NULL. + * 3) rsrc fork must have actual blocks + */ + if ((isdir == 0) && (cp->c_rsrcfork == NULL) && + (cp->c_blocks - VTOF(vp)->ff_blocks)) { + /* + * The resource fork vnode & filefork did not exist. + * Create a temporary one for use in this function only. 
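+			 * (It is allocated from HFS_FILEFORK_ZONE, filled in via
+			 * cat_lookup, passed to hfs_release_storage(), and freed again
+			 * before this function returns.)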
+ */ + temp_rsrc_fork = hfs_zalloc(HFS_FILEFORK_ZONE); + bzero(temp_rsrc_fork, sizeof(struct filefork)); + temp_rsrc_fork->ff_cp = cp; + rl_init(&temp_rsrc_fork->ff_invalidranges); + } + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* Look up the resource fork first, if necessary */ + if (temp_rsrc_fork) { + error = cat_lookup (hfsmp, &desc, 1, 0, (struct cat_desc*) NULL, + (struct cat_attr*) NULL, &temp_rsrc_fork->ff_data, NULL); + if (error) { + hfs_zfree(temp_rsrc_fork, HFS_FILEFORK_ZONE); + hfs_systemfile_unlock (hfsmp, lockflags); + goto out; + } + } + + if (!skip_reserve) { + if ((error = cat_preflight(hfsmp, CAT_DELETE, NULL, 0))) { + if (temp_rsrc_fork) { + hfs_zfree(temp_rsrc_fork, HFS_FILEFORK_ZONE); + } + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; + } + } + + error = cat_delete(hfsmp, &desc, &cp->c_attr); + + if (error && error != ENXIO && error != ENOENT) { + printf("hfs_removefile: deleting file %s (id=%d) vol=%s err=%d\n", + cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, hfsmp->vcbVN, error); + } + + if (error == 0) { + /* Update the parent directory */ + if (dcp->c_entries > 0) + dcp->c_entries--; + dcp->c_dirchangecnt++; + hfs_incr_gencount(dcp); + + dcp->c_ctime = tv.tv_sec; + dcp->c_mtime = tv.tv_sec; + (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + } + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { + if (temp_rsrc_fork) { + hfs_zfree(temp_rsrc_fork, HFS_FILEFORK_ZONE); + } + goto out; + } + + /* + * Now that we've wiped out the catalog record, the file effectively doesn't + * exist anymore. So update the quota records to reflect the loss of the + * data fork and the resource fork. + */ +#if QUOTA + if (cp->c_datafork->ff_blocks > 0) { + savedbytes = ((off_t)cp->c_datafork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + + /* + * We may have just deleted the catalog record for a resource fork even + * though it did not exist in core as a vnode. However, just because there + * was a resource fork pointer in the cnode does not mean that it had any blocks. + */ + if (temp_rsrc_fork || cp->c_rsrcfork) { + if (cp->c_rsrcfork) { + if (cp->c_rsrcfork->ff_blocks > 0) { + savedbytes = ((off_t)cp->c_rsrcfork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + } + else { + /* we must have used a temporary fork */ + savedbytes = ((off_t)temp_rsrc_fork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + } + + if (hfsmp->hfs_flags & HFS_QUOTAS) { + (void)hfs_chkiq(cp, -1, NOCRED, 0); + } +#endif + + if (vnode_islnk(vp) && cp->c_datafork->ff_symlinkptr) { + hfs_free(cp->c_datafork->ff_symlinkptr, cp->c_datafork->ff_size); + cp->c_datafork->ff_symlinkptr = NULL; + } + + /* + * If we didn't get any errors deleting the catalog entry, then go ahead + * and release the backing store now. The filefork pointers are still valid. + */ + if (temp_rsrc_fork) { + error = hfs_release_storage (hfsmp, cp->c_datafork, temp_rsrc_fork, fileid); + } + else { + /* if cp->c_rsrcfork == NULL, hfs_release_storage will skip over it. */ + error = hfs_release_storage (hfsmp, cp->c_datafork, cp->c_rsrcfork, fileid); + } + if (error) { + /* + * If we encountered an error updating the extents and bitmap, + * mark the volume inconsistent. At this point, the catalog record has + * already been deleted, so we can't recover it at this point. 
We need + * to proceed and update the volume header and mark the cnode C_NOEXISTS. + * The subsequent fsck should be able to recover the free space for us. + */ + hfs_mark_inconsistent(hfsmp, HFS_OP_INCOMPLETE); + } + else { + /* reset update_vh to 0, since hfs_release_storage should have done it for us */ + update_vh = 0; + } + + /* Get rid of the temporary rsrc fork */ + if (temp_rsrc_fork) { + hfs_zfree(temp_rsrc_fork, HFS_FILEFORK_ZONE); + } + + cp->c_flag |= C_NOEXISTS; + cp->c_flag &= ~C_DELETED; + + cp->c_touch_chgtime = TRUE; + --cp->c_linkcount; + + /* + * We must never get a directory if we're in this else block. We could + * accidentally drop the number of files in the volume header if we did. + */ + hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); + + } + + // + // if skip_reserve == 1 then we're being called from hfs_vnop_rename() and thus + // we don't need to touch the document_id as it's handled by the rename code. + // otherwise it's a normal remove and we need to save the document id in the + // per thread struct and clear it from the cnode. + // + if (!error && !skip_reserve && (cp->c_bsdflags & UF_TRACKED) + && cp->c_linkcount <= 1) { + struct doc_tombstone *ut; + ut = doc_tombstone_get(); + if (doc_tombstone_should_save(ut, vp, cnp)) { + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode # + FSE_ARG_INO, (ino64_t)0, // dst inode # + FSE_ARG_INT32, hfs_get_document_id(cp), // document id + FSE_ARG_DONE); + + doc_tombstone_save(dvp, vp, cnp, hfs_get_document_id(cp), + cp->c_fileid); + + struct FndrExtendedFileInfo *fip = (struct FndrExtendedFileInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); + + fip->document_id = 0; + cp->c_bsdflags &= ~UF_TRACKED; + } + } + + /* + * All done with this cnode's descriptor... + * + * Note: all future catalog calls for this cnode must be by + * fileid only. This is OK for HFS (which doesn't have file + * thread records) since HFS doesn't support the removal of + * busy files. + */ + cat_releasedesc(&cp->c_desc); + +out: + if (error) { + cp->c_flag &= ~C_DELETED; + } + + if (update_vh) { + /* + * If we bailed out earlier, we may need to update the volume header + * to deal with the borrowed blocks accounting. + */ + hfs_volupdate (hfsmp, VOL_UPDATE, 0); + } + + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + dcp->c_flag &= ~C_DIR_MODIFICATION; + wakeup((caddr_t)&dcp->c_flag); + + return (error); +} + + +void +replace_desc(struct cnode *cp, struct cat_desc *cdp) +{ + // fixes 4348457 and 4463138 + if (&cp->c_desc == cdp) { + return; + } + + /* First release allocated name buffer */ + if (cp->c_desc.cd_flags & CD_HASBUF && cp->c_desc.cd_nameptr != 0) { + const u_int8_t *name = cp->c_desc.cd_nameptr; + + cp->c_desc.cd_nameptr = 0; + cp->c_desc.cd_namelen = 0; + cp->c_desc.cd_flags &= ~CD_HASBUF; + vfs_removename((const char *)name); + } + bcopy(cdp, &cp->c_desc, sizeof(cp->c_desc)); + + /* Cnode now owns the name buffer */ + cdp->cd_nameptr = 0; + cdp->cd_namelen = 0; + cdp->cd_flags &= ~CD_HASBUF; +} + +/* + * hfs_vnop_rename + * + * Just forwards the arguments from VNOP_RENAME into those of + * VNOP_RENAMEX but zeros out the flags word. + */ +int hfs_vnop_rename (struct vnop_rename_args *args) { + struct vnop_renamex_args vrx; + + vrx.a_desc = args->a_desc; // we aren't using it to switch into the vnop array, so fine as is. 
+ vrx.a_fdvp = args->a_fdvp; + vrx.a_fvp = args->a_fvp; + vrx.a_fcnp = args->a_fcnp; + vrx.a_tdvp = args->a_tdvp; + vrx.a_tvp = args->a_tvp; + vrx.a_tcnp = args->a_tcnp; + vrx.a_vap = NULL; // not used + vrx.a_flags = 0; //zero out the flags. + vrx.a_context = args->a_context; + + return hfs_vnop_renamex (&vrx); +} + + + +/* + * Rename a cnode. + * + * The VFS layer guarantees that: + * - source and destination will either both be directories, or + * both not be directories. + * - all the vnodes are from the same file system + * + * When the target is a directory, HFS must ensure that its empty. + * + * Note that this function requires up to 6 vnodes in order to work properly + * if it is operating on files (and not on directories). This is because only + * files can have resource forks, and we now require iocounts to be held on the + * vnodes corresponding to the resource forks (if applicable) as well as + * the files or directories undergoing rename. The problem with not holding + * iocounts on the resource fork vnodes is that it can lead to a deadlock + * situation: The rsrc fork of the source file may be recycled and reclaimed + * in order to provide a vnode for the destination file's rsrc fork. Since + * data and rsrc forks share the same cnode, we'd eventually try to lock the + * source file's cnode in order to sync its rsrc fork to disk, but it's already + * been locked. By taking the rsrc fork vnodes up front we ensure that they + * cannot be recycled, and that the situation mentioned above cannot happen. + */ +int +hfs_vnop_renamex(struct vnop_renamex_args *ap) +{ + struct vnode *tvp = ap->a_tvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *fdvp = ap->a_fdvp; + /* + * Note that we only need locals for the target/destination's + * resource fork vnode (and only if necessary). We don't care if the + * source has a resource fork vnode or not. + */ + struct vnode *tvp_rsrc = NULLVP; + uint32_t tvp_rsrc_vid = 0; + struct componentname *tcnp = ap->a_tcnp; + struct componentname *fcnp = ap->a_fcnp; + struct proc *p = vfs_context_proc(ap->a_context); + struct cnode *fcp; + struct cnode *fdcp; + struct cnode *tdcp; + struct cnode *tcp; + struct cnode *error_cnode; + struct cat_desc from_desc; + struct cat_desc to_desc; + struct cat_desc out_desc; + struct hfsmount *hfsmp; + cat_cookie_t cookie; + int tvp_deleted = 0; + int started_tr = 0, got_cookie = 0; + int took_trunc_lock = 0; + int lockflags; + int error; + time_t orig_from_ctime, orig_to_ctime; + int emit_rename = 1; + int emit_delete = 1; + int is_tracked = 0; + int unlocked; + vnode_t old_doc_vp = NULL; + int rename_exclusive = 0; + + orig_from_ctime = VTOC(fvp)->c_ctime; + if (tvp && VTOC(tvp)) { + orig_to_ctime = VTOC(tvp)->c_ctime; + } else { + orig_to_ctime = ~0; + } + + hfsmp = VTOHFS(tdvp); + + /* Check the flags first, so we can avoid grabbing locks if necessary */ + if (ap->a_flags) { + /* These are the only flags we support for now */ + if ((ap->a_flags & (VFS_RENAME_EXCL)) == 0) { + return ENOTSUP; + } + + /* The rename flags are mutually exclusive for HFS+ */ + switch (ap->a_flags & VFS_RENAME_FLAGS_MASK) { + case VFS_RENAME_EXCL: + rename_exclusive = true; + break; + default: + return ENOTSUP; + } + } + + /* + * Do special case checks here. If fvp == tvp then we need to check the + * cnode with locks held. + */ + if (fvp == tvp) { + int is_hardlink = 0; + /* + * In this case, we do *NOT* ever emit a DELETE event. 
+ * We may not necessarily emit a RENAME event + */ + emit_delete = 0; + if ((error = hfs_lock(VTOC(fvp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { + return error; + } + /* Check to see if the item is a hardlink or not */ + is_hardlink = (VTOC(fvp)->c_flag & C_HARDLINK); + hfs_unlock (VTOC(fvp)); + + /* + * If the item is not a hardlink, then case sensitivity must be off, otherwise + * two names should not resolve to the same cnode unless they were case variants. + */ + if (is_hardlink) { + emit_rename = 0; + /* + * Hardlinks are a little trickier. We only want to emit a rename event + * if the item is a hardlink, the parent directories are the same, case sensitivity + * is off, and the case folded names are the same. See the fvp == tvp case below for more + * info. + */ + + if ((fdvp == tdvp) && ((hfsmp->hfs_flags & HFS_CASE_SENSITIVE) == 0)) { + if (hfs_namecmp((const u_int8_t *)fcnp->cn_nameptr, fcnp->cn_namelen, + (const u_int8_t *)tcnp->cn_nameptr, tcnp->cn_namelen) == 0) { + /* Then in this case only it is ok to emit a rename */ + emit_rename = 1; + } + } + } + } + if (emit_rename) { + /* c_bsdflags should only be assessed while holding the cnode lock. + * This is not done consistently throughout the code and can result + * in race. This will be fixed via rdar://12181064 + */ + if (VTOC(fvp)->c_bsdflags & UF_TRACKED) { + is_tracked = 1; + } + nspace_snapshot_event(fvp, orig_from_ctime, NAMESPACE_HANDLER_RENAME_OP, NULL); + } + + if (tvp && VTOC(tvp)) { + if (emit_delete) { + nspace_snapshot_event(tvp, orig_to_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + } + } + +retry: + /* When tvp exists, take the truncate lock for hfs_removefile(). */ + if (tvp && (vnode_isreg(tvp) || vnode_islnk(tvp))) { + hfs_lock_truncate(VTOC(tvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + took_trunc_lock = 1; + } + +relock: + error = hfs_lockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? VTOC(tvp) : NULL, + HFS_EXCLUSIVE_LOCK, &error_cnode); + if (error) { + if (took_trunc_lock) { + hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT); + took_trunc_lock = 0; + } + + /* + * We hit an error path. If we were trying to re-acquire the locks + * after coming through here once, we might have already obtained + * an iocount on tvp's resource fork vnode. Drop that before dealing + * with the failure. Note this is safe -- since we are in an + * error handling path, we can't be holding the cnode locks. + */ + if (tvp_rsrc) { + vnode_put (tvp_rsrc); + tvp_rsrc_vid = 0; + tvp_rsrc = NULL; + } + + /* + * tvp might no longer exist. If the cause of the lock failure + * was tvp, then we can try again with tvp/tcp set to NULL. + * This is ok because the vfs syscall will vnode_put the vnodes + * after we return from hfs_vnop_rename. + */ + if ((error == ENOENT) && (tvp != NULL) && (error_cnode == VTOC(tvp))) { + tcp = NULL; + tvp = NULL; + goto retry; + } + + /* If we want to reintroduce notifications for failed renames, this + is the place to do it. */ + + return (error); + } + + fdcp = VTOC(fdvp); + fcp = VTOC(fvp); + tdcp = VTOC(tdvp); + tcp = tvp ? VTOC(tvp) : NULL; + + + /* + * If caller requested an exclusive rename (VFS_RENAME_EXCL) and 'tcp' exists + * then we must fail the operation. 
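+	 * (The failure surfaces as the EEXIST return just below.)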
+ */ + if (tcp && rename_exclusive) { + error = EEXIST; + goto out; + } + + // + // if the item is tracked but doesn't have a document_id, assign one and generate an fsevent for it + // + unlocked = 0; + if ((fcp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16))->document_id == 0) { + uint32_t newid; + + hfs_unlockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? VTOC(tvp) : NULL); + unlocked = 1; + + if (hfs_generate_document_id(hfsmp, &newid) == 0) { + hfs_lock(fcp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + ((struct FndrExtendedDirInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16))->document_id = newid; + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)0, // src inode # + FSE_ARG_INO, (ino64_t)fcp->c_fileid, // dst inode # + FSE_ARG_INT32, newid, + FSE_ARG_DONE); + hfs_unlock(fcp); + } else { + // XXXdbg - couldn't get a new docid... what to do? can't really fail the rename... + } + + // + // check if we're going to need to fix tcp as well. if we aren't, go back relock + // everything. otherwise continue on and fix up tcp as well before relocking. + // + if (tcp == NULL || !(tcp->c_bsdflags & UF_TRACKED) || ((struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16))->document_id != 0) { + goto relock; + } + } + + // + // same thing for tcp if it's set + // + if (tcp && (tcp->c_bsdflags & UF_TRACKED) && ((struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16))->document_id == 0) { + uint32_t newid; + + if (!unlocked) { + hfs_unlockfour(VTOC(fdvp), VTOC(fvp), VTOC(tdvp), tvp ? VTOC(tvp) : NULL); + unlocked = 1; + } + + if (hfs_generate_document_id(hfsmp, &newid) == 0) { + hfs_lock(tcp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + ((struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16))->document_id = newid; + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)0, // src inode # + FSE_ARG_INO, (ino64_t)tcp->c_fileid, // dst inode # + FSE_ARG_INT32, newid, + FSE_ARG_DONE); + hfs_unlock(tcp); + } else { + // XXXdbg - couldn't get a new docid... what to do? can't really fail the rename... + } + + // go back up and relock everything. next time through the if statement won't be true + // and we'll skip over this block of code. + goto relock; + } + + + + /* + * Acquire iocounts on the destination's resource fork vnode + * if necessary. If dst/src are files and the dst has a resource + * fork vnode, then we need to try and acquire an iocount on the rsrc vnode. + * If it does not exist, then we don't care and can skip it. + */ + if ((vnode_isreg(fvp)) || (vnode_islnk(fvp))) { + if ((tvp) && (tcp->c_rsrc_vp) && (tvp_rsrc == NULL)) { + tvp_rsrc = tcp->c_rsrc_vp; + /* + * We can look at the vid here because we're holding the + * cnode lock on the underlying cnode for this rsrc vnode. + */ + tvp_rsrc_vid = vnode_vid (tvp_rsrc); + + /* Unlock everything to acquire iocount on this rsrc vnode */ + if (took_trunc_lock) { + hfs_unlock_truncate (VTOC(tvp), HFS_LOCK_DEFAULT); + took_trunc_lock = 0; + } + hfs_unlockfour(fdcp, fcp, tdcp, tcp); + + if (vnode_getwithvid (tvp_rsrc, tvp_rsrc_vid)) { + /* iocount acquisition failed. Reset fields and start over.. */ + tvp_rsrc_vid = 0; + tvp_rsrc = NULL; + } + goto retry; + } + } + + + + /* Ensure we didn't race src or dst parent directories with rmdir. 
*/ + if (fdcp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + + if (tdcp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + + + /* Check for a race against unlink. The hfs_valid_cnode checks validate + * the parent/child relationship with fdcp and tdcp, as well as the + * component name of the target cnodes. + */ + if ((fcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, fdvp, fcnp, fcp->c_fileid, NULL, &error)) { + error = ENOENT; + goto out; + } + + if (tcp && ((tcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, tdvp, tcnp, tcp->c_fileid, NULL, &error))) { + // + // hmm, the destination vnode isn't valid any more. + // in this case we can just drop him and pretend he + // never existed in the first place. + // + if (took_trunc_lock) { + hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT); + took_trunc_lock = 0; + } + error = 0; + + hfs_unlockfour(fdcp, fcp, tdcp, tcp); + + tcp = NULL; + tvp = NULL; + + // retry the locking with tvp null'ed out + goto retry; + } + + fdcp->c_flag |= C_DIR_MODIFICATION; + if (fdvp != tdvp) { + tdcp->c_flag |= C_DIR_MODIFICATION; + } + + /* + * Disallow renaming of a directory hard link if the source and + * destination parent directories are different, or a directory whose + * descendant is a directory hard link and the one of the ancestors + * of the destination directory is a directory hard link. + */ + if (vnode_isdir(fvp) && (fdvp != tdvp)) { + if (fcp->c_flag & C_HARDLINK) { + error = EPERM; + goto out; + } + if (fcp->c_attr.ca_recflags & kHFSHasChildLinkMask) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + if (cat_check_link_ancestry(hfsmp, tdcp->c_fileid, 0)) { + error = EPERM; + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; + } + hfs_systemfile_unlock(hfsmp, lockflags); + } + } + + /* + * The following edge case is caught here: + * (to cannot be a descendent of from) + * + * o fdvp + * / + * / + * o fvp + * \ + * \ + * o tdvp + * / + * / + * o tvp + */ + if (tdcp->c_parentcnid == fcp->c_fileid) { + error = EINVAL; + goto out; + } + + /* + * The following two edge cases are caught here: + * (note tvp is not empty) + * + * o tdvp o tdvp + * / / + * / / + * o tvp tvp o fdvp + * \ \ + * \ \ + * o fdvp o fvp + * / + * / + * o fvp + */ + if (tvp && vnode_isdir(tvp) && (tcp->c_entries != 0) && fvp != tvp) { + error = ENOTEMPTY; + goto out; + } + + /* + * The following edge case is caught here: + * (the from child and parent are the same) + * + * o tdvp + * / + * / + * fdvp o fvp + */ + if (fdvp == fvp) { + error = EINVAL; + goto out; + } + + /* + * Make sure "from" vnode and its parent are changeable. + */ + if ((fcp->c_bsdflags & (IMMUTABLE | APPEND)) || (fdcp->c_bsdflags & APPEND)) { + error = EPERM; + goto out; + } + + /* + * If the destination parent directory is "sticky", then the + * user must own the parent directory, or the destination of + * the rename, otherwise the destination may not be changed + * (except by root). This implements append-only directories. + * + * Note that checks for immutable and write access are done + * by the call to hfs_removefile. 
+ */ + if (tvp && (tdcp->c_mode & S_ISTXT) && + (suser(vfs_context_ucred(ap->a_context), NULL)) && + (kauth_cred_getuid(vfs_context_ucred(ap->a_context)) != tdcp->c_uid) && + (hfs_owner_rights(hfsmp, tcp->c_uid, vfs_context_ucred(ap->a_context), p, false)) ) { + error = EPERM; + goto out; + } + + /* Don't allow modification of the journal or journal_info_block */ + if (hfs_is_journal_file(hfsmp, fcp) || + (tcp && hfs_is_journal_file(hfsmp, tcp))) { + error = EPERM; + goto out; + } + +#if QUOTA + if (tvp) + (void)hfs_getinoquota(tcp); +#endif + /* Preflighting done, take fvp out of the name space. */ + cache_purge(fvp); + +#if CONFIG_SECLUDED_RENAME + /* + * Check for "secure" rename that imposes additional restrictions on the + * source vnode. We wait until here to check in order to prevent a race + * with other threads that manage to look up fvp, but their open or link + * is blocked by our locks. At this point, with fvp out of the name cache, + * and holding the lock on fdvp, no other thread can find fvp. + * + * TODO: Do we need to limit these checks to regular files only? + */ + if (fcnp->cn_flags & CN_SECLUDE_RENAME) { + if (vnode_isdir(fvp)) { + error = EISDIR; + goto out; + } + + /* + * Neither fork of source may be open or memory mapped. + * We also don't want it in use by any other system call. + * The file must not have hard links. + * + * We can't simply use vnode_isinuse() because that does not + * count opens with O_EVTONLY. We don't want a malicious + * process using O_EVTONLY to subvert a secluded rename. + */ + if (fcp->c_linkcount != 1) { + error = EMLINK; + goto out; + } + + if (fcp->c_rsrc_vp && (vnode_usecount(fcp->c_rsrc_vp) > 0 || + vnode_iocount(fcp->c_rsrc_vp) > 0)) { + /* Resource fork is in use (including O_EVTONLY) */ + error = EBUSY; + goto out; + } + if (fcp->c_vp && (vnode_usecount(fcp->c_vp) > (fcp->c_rsrc_vp ? 1 : 0) || + vnode_iocount(fcp->c_vp) > 1)) { + /* + * Data fork is in use, including O_EVTONLY, but not + * including a reference from the resource fork. + */ + error = EBUSY; + goto out; + } + } +#endif + + bzero(&from_desc, sizeof(from_desc)); + from_desc.cd_nameptr = (const u_int8_t *)fcnp->cn_nameptr; + from_desc.cd_namelen = fcnp->cn_namelen; + from_desc.cd_parentcnid = fdcp->c_fileid; + from_desc.cd_flags = fcp->c_desc.cd_flags & ~(CD_HASBUF | CD_DECOMPOSED); + from_desc.cd_cnid = fcp->c_cnid; + + bzero(&to_desc, sizeof(to_desc)); + to_desc.cd_nameptr = (const u_int8_t *)tcnp->cn_nameptr; + to_desc.cd_namelen = tcnp->cn_namelen; + to_desc.cd_parentcnid = tdcp->c_fileid; + to_desc.cd_flags = fcp->c_desc.cd_flags & ~(CD_HASBUF | CD_DECOMPOSED); + to_desc.cd_cnid = fcp->c_cnid; + + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; + } + started_tr = 1; + + /* hfs_vnop_link() and hfs_vnop_rename() set kHFSHasChildLinkMask + * inside a journal transaction and without holding a cnode lock. + * As setting of this bit depends on being in journal transaction for + * concurrency, check this bit again after we start journal transaction for rename + * to ensure that this directory does not have any descendant that + * is a directory hard link. 
+ */ + if (vnode_isdir(fvp) && (fdvp != tdvp)) { + if (fcp->c_attr.ca_recflags & kHFSHasChildLinkMask) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + if (cat_check_link_ancestry(hfsmp, tdcp->c_fileid, 0)) { + error = EPERM; + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; + } + hfs_systemfile_unlock(hfsmp, lockflags); + } + } + + // if it's a hardlink then re-lookup the name so + // that we get the correct cnid in from_desc (see + // the comment in hfs_removefile for more details) + // + if (fcp->c_flag & C_HARDLINK) { + struct cat_desc tmpdesc; + cnid_t real_cnid; + + tmpdesc.cd_nameptr = (const u_int8_t *)fcnp->cn_nameptr; + tmpdesc.cd_namelen = fcnp->cn_namelen; + tmpdesc.cd_parentcnid = fdcp->c_fileid; + tmpdesc.cd_hint = fdcp->c_childhint; + tmpdesc.cd_flags = fcp->c_desc.cd_flags & CD_ISDIR; + tmpdesc.cd_encoding = 0; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + if (cat_lookup(hfsmp, &tmpdesc, 0, 0, NULL, NULL, NULL, &real_cnid) != 0) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto out; + } + + // use the real cnid instead of whatever happened to be there + from_desc.cd_cnid = real_cnid; + hfs_systemfile_unlock(hfsmp, lockflags); + } + + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_RENAME + CAT_DELETE, &cookie, p))) { + goto out; + } + got_cookie = 1; + + /* + * If the destination exists then it may need to be removed. + * + * Due to HFS's locking system, we should always move the + * existing 'tvp' element to the hidden directory in hfs_vnop_rename. + * Because the VNOP_LOOKUP call enters and exits the filesystem independently + * of the actual vnop that it was trying to do (stat, link, readlink), + * we must release the cnode lock of that element during the interim to + * do MAC checking, vnode authorization, and other calls. In that time, + * the item can be deleted (or renamed over). However, only in the rename + * case is it inappropriate to return ENOENT from any of those calls. Either + * the call should return information about the old element (stale), or get + * information about the newer element that we are about to write in its place. + * + * HFS lookup has been modified to detect a rename and re-drive its + * lookup internally. For other calls that have already succeeded in + * their lookup call and are waiting to acquire the cnode lock in order + * to proceed, that cnode lock will not fail due to the cnode being marked + * C_NOEXISTS, because it won't have been marked as such. It will only + * have C_DELETED. Thus, they will simply act on the stale open-unlinked + * element. All future callers will get the new element. + * + * To implement this behavior, we pass the "only_unlink" argument to + * hfs_removefile and hfs_removedir. This will result in the vnode acting + * as though it is open-unlinked. Additionally, when we are done moving the + * element to the hidden directory, we vnode_recycle the target so that it is + * reclaimed as soon as possible. Reclaim and inactive are both + * capable of clearing out unused blocks for an open-unlinked file or dir. 
+ */ + if (tvp) { + // + // if the destination has a document id, we need to preserve it + // + if (fvp != tvp) { + uint32_t document_id; + struct FndrExtendedDirInfo *ffip = (struct FndrExtendedDirInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16); + struct FndrExtendedDirInfo *tfip = (struct FndrExtendedDirInfo *)((char *)&tcp->c_attr.ca_finderinfo + 16); + + if (ffip->document_id && tfip->document_id) { + // both documents are tracked. only save a tombstone from tcp and do nothing else. + doc_tombstone_save(tdvp, tvp, tcnp, hfs_get_document_id(tcp), + tcp->c_fileid); + } else { + struct doc_tombstone *ut; + ut = doc_tombstone_get(); + + document_id = tfip->document_id; + tfip->document_id = 0; + + if (document_id != 0) { + // clear UF_TRACKED as well since tcp is now no longer tracked + tcp->c_bsdflags &= ~UF_TRACKED; + (void) cat_update(hfsmp, &tcp->c_desc, &tcp->c_attr, NULL, NULL); + } + + if (ffip->document_id == 0 && document_id != 0) { + // printf("RENAME: preserving doc-id %d onto %s (from ino %d, to ino %d)\n", document_id, tcp->c_desc.cd_nameptr, tcp->c_desc.cd_cnid, fcp->c_desc.cd_cnid); + fcp->c_bsdflags |= UF_TRACKED; + ffip->document_id = document_id; + + (void) cat_update(hfsmp, &fcp->c_desc, &fcp->c_attr, NULL, NULL); + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, (ino64_t)tcp->c_fileid, // src inode # + FSE_ARG_INO, (ino64_t)fcp->c_fileid, // dst inode # + FSE_ARG_INT32, (uint32_t)ffip->document_id, + FSE_ARG_DONE); + } + else if ((fcp->c_bsdflags & UF_TRACKED) && doc_tombstone_should_save(ut, fvp, fcnp)) { + + if (ut->t_lastop_document_id) { + doc_tombstone_clear(ut, NULL); + } + doc_tombstone_save(fdvp, fvp, fcnp, + hfs_get_document_id(fcp), fcp->c_fileid); + + //printf("RENAME: (dest-exists): saving tombstone doc-id %lld @ %s (ino %d)\n", + // ut->t_lastop_document_id, ut->t_lastop_filename, fcp->c_desc.cd_cnid); + } + } + } + + /* + * When fvp matches tvp they could be case variants + * or matching hard links. + */ + if (fvp == tvp) { + if (!(fcp->c_flag & C_HARDLINK)) { + /* + * If they're not hardlinks, then fvp == tvp must mean we + * are using case-insensitive HFS because case-sensitive would + * not use the same vnode for both. In this case we just update + * the catalog for: a -> A + */ + goto skip_rm; /* simple case variant */ + + } + /* For all cases below, we must be using hardlinks */ + else if ((fdvp != tdvp) || + (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)) { + /* + * If the parent directories are not the same, AND the two items + * are hardlinks, posix says to do nothing: + * dir1/fred <-> dir2/bob and the op was mv dir1/fred -> dir2/bob + * We just return 0 in this case. + * + * If case sensitivity is on, and we are using hardlinks + * then renaming is supposed to do nothing. + * dir1/fred <-> dir2/FRED, and op == mv dir1/fred -> dir2/FRED + */ + goto out; /* matching hardlinks, nothing to do */ + + } else if (hfs_namecmp((const u_int8_t *)fcnp->cn_nameptr, fcnp->cn_namelen, + (const u_int8_t *)tcnp->cn_nameptr, tcnp->cn_namelen) == 0) { + /* + * If we get here, then the following must be true: + * a) We are running case-insensitive HFS+. + * b) Both paths 'fvp' and 'tvp' are in the same parent directory. + * c) the two names are case-variants of each other. + * + * In this case, we are really only dealing with a single catalog record + * whose name is being updated. 
+ * + * op is dir1/fred -> dir1/FRED + * + * We need to special case the name matching, because if + * dir1/fred <-> dir1/bob were the two links, and the + * op was dir1/fred -> dir1/bob + * That would fail/do nothing. + */ + goto skip_rm; /* case-variant hardlink in the same dir */ + } else { + goto out; /* matching hardlink, nothing to do */ + } + } + + + if (vnode_isdir(tvp)) { + /* + * hfs_removedir will eventually call hfs_removefile on the directory + * we're working on, because only hfs_removefile does the renaming of the + * item to the hidden directory. The directory will stay around in the + * hidden directory with C_DELETED until it gets an inactive or a reclaim. + * That way, we can destroy all of the EAs as needed and allow new ones to be + * written. + */ + error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE, 1); + } + else { + error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, NULL, 1); + + /* + * If the destination file had a resource fork vnode, then we need to get rid of + * its blocks when there are no more references to it. Because the call to + * hfs_removefile above always open-unlinks things, we need to force an inactive/reclaim + * on the resource fork vnode, in order to prevent block leaks. Otherwise, + * the resource fork vnode could prevent the data fork vnode from going out of scope + * because it holds a v_parent reference on it. So we mark it for termination + * with a call to vnode_recycle. hfs_vnop_reclaim has been modified so that it + * can clean up the blocks of open-unlinked files and resource forks. + * + * We can safely call vnode_recycle on the resource fork because we took an iocount + * reference on it at the beginning of the function. + */ + + if ((error == 0) && (tcp->c_flag & C_DELETED) && (tvp_rsrc)) { + vnode_recycle(tvp_rsrc); + } + } + + if (error) { + goto out; + } + + tvp_deleted = 1; + + /* Mark 'tcp' as being deleted due to a rename */ + tcp->c_flag |= C_RENAMED; + + /* + * Aggressively mark tvp/tcp for termination to ensure that we recover all blocks + * as quickly as possible. + */ + vnode_recycle(tvp); + } else { + struct doc_tombstone *ut; + ut = doc_tombstone_get(); + + // + // There is nothing at the destination. If the file being renamed is + // tracked, save a "tombstone" of the document_id. If the file is + // not a tracked file, then see if it needs to inherit a tombstone. 
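+		//
+		// (A "tombstone" here records the document_id of a tracked (UF_TRACKED)
+		// file across the unlink/rename steps of an atomic "safe save", so the
+		// replacement file can inherit the same document identity; see the
+		// doc_tombstone_* helpers used below.)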
+ // + // NOTE: we do not save a tombstone if the file being renamed begins + // with "atmp" which is done to work-around AutoCad's bizarre + // 5-step un-safe save behavior + // + if (fcp->c_bsdflags & UF_TRACKED) { + if (doc_tombstone_should_save(ut, fvp, fcnp)) { + doc_tombstone_save(fdvp, fvp, fcnp, hfs_get_document_id(fcp), + fcp->c_fileid); + + //printf("RENAME: (no dest): saving tombstone doc-id %lld @ %s (ino %d)\n", + // ut->t_lastop_document_id, ut->t_lastop_filename, fcp->c_desc.cd_cnid); + } else { + // intentionally do nothing + } + } else if ( ut->t_lastop_document_id != 0 + && tdvp == ut->t_lastop_parent + && vnode_vid(tdvp) == ut->t_lastop_parent_vid + && strcmp((char *)ut->t_lastop_filename, (char *)tcnp->cn_nameptr) == 0) { + + //printf("RENAME: %s (ino %d) inheriting doc-id %lld\n", tcnp->cn_nameptr, fcp->c_desc.cd_cnid, ut->t_lastop_document_id); + struct FndrExtendedFileInfo *fip = (struct FndrExtendedFileInfo *)((char *)&fcp->c_attr.ca_finderinfo + 16); + fcp->c_bsdflags |= UF_TRACKED; + fip->document_id = ut->t_lastop_document_id; + cat_update(hfsmp, &fcp->c_desc, &fcp->c_attr, NULL, NULL); + + doc_tombstone_clear(ut, &old_doc_vp); + } else if (ut->t_lastop_document_id && doc_tombstone_should_save(ut, fvp, fcnp) && doc_tombstone_should_save(ut, tvp, tcnp)) { + // no match, clear the tombstone + //printf("RENAME: clearing the tombstone %lld @ %s\n", ut->t_lastop_document_id, ut->t_lastop_filename); + doc_tombstone_clear(ut, NULL); + } + + } +skip_rm: + /* + * All done with tvp and fvp. + * + * We also jump to this point if there was no destination observed during lookup and namei. + * However, because only iocounts are held at the VFS layer, there is nothing preventing a + * competing thread from racing us and creating a file or dir at the destination of this rename + * operation. If this occurs, it may cause us to get a spurious EEXIST out of the cat_rename + * call below. To preserve rename's atomicity, we need to signal VFS to re-drive the + * namei/lookup and restart the rename operation. EEXIST is an allowable errno to be bubbled + * out of the rename syscall, but not for this reason, since it is a synonym errno for ENOTEMPTY. + * To signal VFS, we return ERECYCLE (which is also used for lookup restarts). This errno + * will be swallowed and it will restart the operation. + */ + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + error = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, &to_desc, &out_desc); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { + if (error == EEXIST) { + error = ERECYCLE; + } + goto out; + } + + /* Invalidate negative cache entries in the destination directory */ + if (tdcp->c_flag & C_NEG_ENTRIES) { + cache_purge_negatives(tdvp); + tdcp->c_flag &= ~C_NEG_ENTRIES; + } + + /* Update cnode's catalog descriptor */ + replace_desc(fcp, &out_desc); + fcp->c_parentcnid = tdcp->c_fileid; + fcp->c_hint = 0; + + /* + * Now indicate this cnode needs to have date-added written to the + * finderinfo, but only if moving to a different directory, or if + * it doesn't already have it. + */ + if (fdvp != tdvp || !ISSET(fcp->c_attr.ca_recflags, kHFSHasDateAddedMask)) + fcp->c_flag |= C_NEEDS_DATEADDED; + + (void) hfs_update (fvp, 0); + + hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_RMDIR : VOL_RMFILE, + (fdcp->c_cnid == kHFSRootFolderID)); + hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_MKDIR : VOL_MKFILE, + (tdcp->c_cnid == kHFSRootFolderID)); + + /* Update both parent directories. 
*/ + if (fdvp != tdvp) { + if (vnode_isdir(fvp)) { + /* If the source directory has directory hard link + * descendants, set the kHFSHasChildLinkBit in the + * destination parent hierarchy + */ + if ((fcp->c_attr.ca_recflags & kHFSHasChildLinkMask) && + !(tdcp->c_attr.ca_recflags & kHFSHasChildLinkMask)) { + + tdcp->c_attr.ca_recflags |= kHFSHasChildLinkMask; + + error = cat_set_childlinkbit(hfsmp, tdcp->c_parentcnid); + if (error) { + printf ("hfs_vnop_rename: error updating parent chain for %u\n", tdcp->c_cnid); + error = 0; + } + } + INC_FOLDERCOUNT(hfsmp, tdcp->c_attr); + DEC_FOLDERCOUNT(hfsmp, fdcp->c_attr); + } + tdcp->c_entries++; + tdcp->c_dirchangecnt++; + tdcp->c_flag |= C_MODIFIED; + hfs_incr_gencount(tdcp); + + if (fdcp->c_entries > 0) + fdcp->c_entries--; + fdcp->c_dirchangecnt++; + fdcp->c_flag |= C_MODIFIED; + fdcp->c_touch_chgtime = TRUE; + fdcp->c_touch_modtime = TRUE; + + if (ISSET(fcp->c_flag, C_HARDLINK)) { + hfs_relorigin(fcp, fdcp->c_fileid); + if (fdcp->c_fileid != fdcp->c_cnid) + hfs_relorigin(fcp, fdcp->c_cnid); + } + + (void) hfs_update(fdvp, 0); + } + hfs_incr_gencount(fdcp); + + tdcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ + tdcp->c_touch_chgtime = TRUE; + tdcp->c_touch_modtime = TRUE; + + (void) hfs_update(tdvp, 0); + + /* Update the vnode's name now that the rename has completed. */ + vnode_update_identity(fvp, tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, + tcnp->cn_hash, (VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME)); + + /* + * At this point, we may have a resource fork vnode attached to the + * 'from' vnode. If it exists, we will want to update its name, because + * it contains the old name + _PATH_RSRCFORKSPEC. ("/..namedfork/rsrc"). + * + * Note that the only thing we need to update here is the name attached to + * the vnode, since a resource fork vnode does not have a separate resource + * cnode -- it's still 'fcp'. + */ + if (fcp->c_rsrc_vp) { + char* rsrc_path = NULL; + int len; + + /* Create a new temporary buffer that's going to hold the new name */ + rsrc_path = hfs_malloc(MAXPATHLEN); + len = snprintf (rsrc_path, MAXPATHLEN, "%s%s", tcnp->cn_nameptr, _PATH_RSRCFORKSPEC); + len = MIN(len, MAXPATHLEN); + + /* + * vnode_update_identity will do the following for us: + * 1) release reference on the existing rsrc vnode's name. + * 2) copy/insert new name into the name cache + * 3) attach the new name to the resource vnode + * 4) update the vnode's vid + */ + vnode_update_identity (fcp->c_rsrc_vp, fvp, rsrc_path, len, 0, (VNODE_UPDATE_NAME | VNODE_UPDATE_CACHE)); + + /* Free the memory associated with the resource fork's name */ + hfs_free(rsrc_path, MAXPATHLEN); + } +out: + if (got_cookie) { + cat_postflight(hfsmp, &cookie, p); + } + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + fdcp->c_flag &= ~C_DIR_MODIFICATION; + wakeup((caddr_t)&fdcp->c_flag); + if (fdvp != tdvp) { + tdcp->c_flag &= ~C_DIR_MODIFICATION; + wakeup((caddr_t)&tdcp->c_flag); + } + + const ino64_t file_id = fcp->c_fileid; + + hfs_unlockfour(fdcp, fcp, tdcp, tcp); + + if (took_trunc_lock) { + hfs_unlock_truncate(VTOC(tvp), HFS_LOCK_DEFAULT); + } + + /* Now vnode_put the resource forks vnodes if necessary */ + if (tvp_rsrc) { + vnode_put(tvp_rsrc); + tvp_rsrc = NULL; + } + + /* After tvp is removed the only acceptable error is EIO */ + if (error && tvp_deleted) + error = EIO; + + /* If we want to reintroduce notifications for renames, this is the + place to do it. 
*/ + + if (old_doc_vp) { + cnode_t *ocp = VTOC(old_doc_vp); + hfs_lock_always(ocp, HFS_EXCLUSIVE_LOCK); + struct FndrExtendedFileInfo *ofip = (struct FndrExtendedFileInfo *)((char *)&ocp->c_attr.ca_finderinfo + 16); + + const uint32_t doc_id = ofip->document_id; + const ino64_t old_file_id = ocp->c_fileid; + + // printf("clearing doc-id from ino %d\n", ocp->c_desc.cd_cnid); + ofip->document_id = 0; + ocp->c_bsdflags &= ~UF_TRACKED; + ocp->c_flag |= C_MODIFIED; + + hfs_unlock(ocp); + vnode_put(old_doc_vp); + + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, old_file_id, // src inode # + FSE_ARG_INO, file_id, // dst inode # + FSE_ARG_INT32, doc_id, + FSE_ARG_DONE); + } + + return (error); +} + + +/* + * Make a directory. + */ +int +hfs_vnop_mkdir(struct vnop_mkdir_args *ap) +{ + /***** HACK ALERT ********/ + ap->a_cnp->cn_flags |= MAKEENTRY; + return hfs_makenode(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap, ap->a_context); +} + + +/* + * Create a symbolic link. + */ +int +hfs_vnop_symlink(struct vnop_symlink_args *ap) +{ + struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + struct vnode *vp = NULL; + struct cnode *cp = NULL; + struct hfsmount *hfsmp; + struct filefork *fp; + struct buf *bp = NULL; + char *datap; + int started_tr = 0; + u_int32_t len; + int error; + + /* HFS standard disks don't support symbolic links */ + if (VTOVCB(dvp)->vcbSigWord != kHFSPlusSigWord) + return (ENOTSUP); + + /* Check for empty target name */ + if (ap->a_target[0] == 0) + return (EINVAL); + + hfsmp = VTOHFS(dvp); + + len = strlen(ap->a_target); + if (len > MAXPATHLEN) + return (ENAMETOOLONG); + + /* Check for free space */ + if (((u_int64_t)hfs_freeblks(hfsmp, 0) * (u_int64_t)hfsmp->blockSize) < len) { + return (ENOSPC); + } + + /* Create the vnode */ + ap->a_vap->va_mode |= S_IFLNK; + if ((error = hfs_makenode(dvp, vpp, ap->a_cnp, ap->a_vap, ap->a_context))) { + goto out; + } + vp = *vpp; + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + goto out; + } + cp = VTOC(vp); + fp = VTOF(vp); + + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + goto out; + } + +#if QUOTA + (void)hfs_getinoquota(cp); +#endif /* QUOTA */ + + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto out; + } + started_tr = 1; + + /* + * Allocate space for the link. + * + * Since we're already inside a transaction, + * + * Don't need truncate lock since a symlink is treated as a system file. + */ + error = hfs_truncate(vp, len, IO_NOZEROFILL, 0, ap->a_context); + + /* On errors, remove the symlink file */ + if (error) { + /* + * End the transaction so we don't re-take the cnode lock + * below while inside a transaction (lock order violation). 
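+		 *
+		 * (The code below re-establishes the proper order: end the transaction,
+		 * drop the cnode lock, take the truncate lock, re-take the cnode lock,
+		 * and only then start a new transaction before calling hfs_removefile().)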
+ */ + hfs_end_transaction(hfsmp); + + /* hfs_removefile() requires holding the truncate lock */ + hfs_unlock(cp); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); + + if (hfs_start_transaction(hfsmp) != 0) { + started_tr = 0; + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + goto out; + } + + (void) hfs_removefile(dvp, vp, ap->a_cnp, 0, 0, 0, NULL, 0); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + goto out; + } + + /* Write the link to disk */ + bp = buf_getblk(vp, (daddr64_t)0, roundup((int)fp->ff_size, hfsmp->hfs_physical_block_size), + 0, 0, BLK_META); + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp); + } + datap = (char *)buf_dataptr(bp); + bzero(datap, buf_size(bp)); + bcopy(ap->a_target, datap, len); + + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL); + } else { + buf_bawrite(bp); + } +out: + if (started_tr) + hfs_end_transaction(hfsmp); + if ((cp != NULL) && (vp != NULL)) { + hfs_unlock(cp); + } + if (error) { + if (vp) { + vnode_put(vp); + } + *vpp = NULL; + } + return (error); +} + + +/* structures to hold a "." or ".." directory entry */ +struct hfs_stddotentry { + u_int32_t d_fileno; /* unique file number */ + u_int16_t d_reclen; /* length of this structure */ + u_int8_t d_type; /* dirent file type */ + u_int8_t d_namlen; /* len of filename */ + char d_name[4]; /* "." or ".." */ +}; + +struct hfs_extdotentry { + u_int64_t d_fileno; /* unique file number */ + u_int64_t d_seekoff; /* seek offset (optional, used by servers) */ + u_int16_t d_reclen; /* length of this structure */ + u_int16_t d_namlen; /* len of filename */ + u_int8_t d_type; /* dirent file type */ + u_char d_name[3]; /* "." or ".." */ +}; + +typedef union { + struct hfs_stddotentry std; + struct hfs_extdotentry ext; +} hfs_dotentry_t; + +/* + * hfs_vnop_readdir reads directory entries into the buffer pointed + * to by uio, in a filesystem independent format. Up to uio_resid + * bytes of data can be transferred. The data in the buffer is a + * series of packed dirent structures where each one contains the + * following entries: + * + * u_int32_t d_fileno; // file number of entry + * u_int16_t d_reclen; // length of this record + * u_int8_t d_type; // file type + * u_int8_t d_namlen; // length of string in d_name + * char d_name[MAXNAMELEN+1]; // null terminated file name + * + * The current position (uio_offset) refers to the next block of + * entries. The offset can only be set to a value previously + * returned by hfs_vnop_readdir or zero. This offset does not have + * to match the number of bytes returned (in uio_resid). + * + * In fact, the offset used by HFS is essentially an index (26 bits) + * with a tag (6 bits). The tag is for associating the next request + * with the current request. This enables us to have multiple threads + * reading the directory while the directory is also being modified. + * + * Each tag/index pair is tied to a unique directory hint. The hint + * contains information (filename) needed to build the catalog b-tree + * key for finding the next set of entries. + * + * If the directory is marked as deleted-but-in-use (cp->c_flag & C_DELETED), + * do NOT synthesize entries for "." and "..". 
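+ *
+ * Sketch of the offset layout used below (HFS_INDEX_MASK covers the low
+ * index bits, 26 per the note above; the high bits carry the hint tag):
+ *
+ *     index = (offset & HFS_INDEX_MASK) - 2;   -- slots 0 and 1 are "." and ".."
+ *     tag   =  offset & ~HFS_INDEX_MASK;       -- ties the request to a directory hint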
+ */ +int +hfs_vnop_readdir(struct vnop_readdir_args *ap) +{ + struct vnode *vp = ap->a_vp; + uio_t uio = ap->a_uio; + struct cnode *cp = VTOC(vp); + struct hfsmount *hfsmp = VTOHFS(vp); + directoryhint_t *dirhint = NULL; + directoryhint_t localhint; + off_t offset; + off_t startoffset; + int error = 0; + int eofflag = 0; + user_addr_t user_start = 0; + user_size_t user_len = 0; + user_size_t user_original_resid = 0; + int index; + unsigned int tag; + int items; + int lockflags; + int extended; + int nfs_cookies; + cnid_t cnid_hint = 0; + int bump_valence = 0; + + items = 0; + startoffset = offset = uio_offset(uio); + extended = (ap->a_flags & VNODE_READDIR_EXTENDED); + nfs_cookies = extended && (ap->a_flags & VNODE_READDIR_REQSEEKOFF); + + /* Sanity check the uio data. */ + if (uio_iovcnt(uio) > 1) + return (EINVAL); + + if (VTOC(vp)->c_bsdflags & UF_COMPRESSED) { + int compressed = hfs_file_is_compressed(VTOC(vp), 0); /* 0 == take the cnode lock */ + if (VTOCMP(vp) != NULL && !compressed) { + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + } + } + + // + // We have to lock the user's buffer here so that we won't + // fault on it after we've acquired a shared lock on the + // catalog file. The issue is that you can get a 3-way + // deadlock if someone else starts a transaction and then + // tries to lock the catalog file but can't because we're + // here and we can't service our page fault because VM is + // blocked trying to start a transaction as a result of + // trying to free up pages for our page fault. It's messy + // but it does happen on dual-processors that are paging + // heavily (see radar 3082639 for more info). By locking + // the buffer up-front we prevent ourselves from faulting + // while holding the shared catalog file lock. + // + // Fortunately this and hfs_search() are the only two places + // currently (10/30/02) that can fault on user data with a + // shared lock on the catalog file. + // + if (hfsmp->jnl && uio_isuserspace(uio)) { + user_start = uio_curriovbase(uio); + user_len = uio_curriovlen(uio); + + /* Bounds check the user buffer */ + if (user_len > (256 * 1024)) { + /* only allow the user to wire down at most 256k */ + user_len = (256 * 1024); + user_original_resid = uio_resid(uio); + uio_setresid (uio, (user_ssize_t)(256 * 1024)); + } + + if ((error = vslock(user_start, user_len)) != 0) { + if (user_original_resid > 0) { + uio_setresid(uio, user_original_resid); + user_original_resid = 0; + } + return error; + } + } + + /* Note that the dirhint calls require an exclusive lock. */ + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + if (user_start) { + if (user_original_resid > 0) { + uio_setresid(uio, user_original_resid); + user_original_resid = 0; + } + vsunlock(user_start, user_len, TRUE); + } + return error; + } + + /* Pick up cnid hint (if any). */ + if (nfs_cookies) { + cnid_hint = (cnid_t)(uio_offset(uio) >> 32); + uio_setoffset(uio, uio_offset(uio) & 0x00000000ffffffffLL); + if (cnid_hint == INT_MAX) { /* searching pass the last item */ + eofflag = 1; + goto out; + } + } + /* + * Synthesize entries for "." and "..", unless the directory has + * been deleted, but not closed yet (lazy delete in progress). + */ + if (offset == 0 && !(cp->c_flag & C_DELETED)) { + + size_t uiosize; + + /* + * We could use a union of the two types of dot entries (HFS / HFS+) + * but it makes static analysis of this code difficult. 
The problem is that + * the HFS standard dot entry is smaller than the HFS+ one, and we also ideally + * want the uiomove to operate on a two-element adjacent array. If we used the + * array of unions, we would have to do two separate uiomoves because the memory + * for the hfs standard dot entries would not be adjacent to one another. + * So just allocate the entries on the stack in separate cases. + */ + + if (extended) { + hfs_dotentry_t dotentry[2]; + + /* HFS Plus */ + struct hfs_extdotentry *entry = &dotentry[0].ext; + + entry->d_fileno = cp->c_cnid; + entry->d_reclen = sizeof(struct hfs_extdotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 1; + entry->d_name[0] = '.'; + entry->d_name[1] = '\0'; + entry->d_name[2] = '\0'; + entry->d_seekoff = 1; + + ++entry; + entry->d_fileno = cp->c_parentcnid; + entry->d_reclen = sizeof(struct hfs_extdotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 2; + entry->d_name[0] = '.'; + entry->d_name[1] = '.'; + entry->d_name[2] = '\0'; + entry->d_seekoff = 2; + uiosize = 2 * sizeof(struct hfs_extdotentry); + + if ((error = uiomove((caddr_t)dotentry, uiosize, uio))) { + goto out; + } + + } else { + struct hfs_stddotentry hfs_std_dotentries[2]; + + /* HFS Standard */ + struct hfs_stddotentry *entry = &hfs_std_dotentries[0]; + + entry->d_fileno = cp->c_cnid; + entry->d_reclen = sizeof(struct hfs_stddotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 1; + *(int *)&entry->d_name[0] = 0; + entry->d_name[0] = '.'; + + ++entry; + entry->d_fileno = cp->c_parentcnid; + entry->d_reclen = sizeof(struct hfs_stddotentry); + entry->d_type = DT_DIR; + entry->d_namlen = 2; + *(int *)&entry->d_name[0] = 0; + entry->d_name[0] = '.'; + entry->d_name[1] = '.'; + uiosize = 2 * sizeof(struct hfs_stddotentry); + + if ((error = uiomove((caddr_t)hfs_std_dotentries, uiosize, uio))) { + goto out; + } + } + + offset += 2; + } + + /* + * Intentionally avoid checking the valence here. If we + * have FS corruption that reports the valence is 0, even though it + * has contents, we might artificially skip over iterating + * this directory. + */ + + /* Convert offset into a catalog directory index. */ + index = (offset & HFS_INDEX_MASK) - 2; + tag = offset & ~HFS_INDEX_MASK; + + /* Lock catalog during cat_findname and cat_getdirentries. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* When called from NFS, try and resolve a cnid hint. */ + if (nfs_cookies && cnid_hint != 0) { + if (cat_findname(hfsmp, cnid_hint, &localhint.dh_desc) == 0) { + if ( localhint.dh_desc.cd_parentcnid == cp->c_fileid) { + localhint.dh_index = index - 1; + localhint.dh_time = 0; + bzero(&localhint.dh_link, sizeof(localhint.dh_link)); + dirhint = &localhint; /* don't forget to release the descriptor */ + } else { + cat_releasedesc(&localhint.dh_desc); + } + } + } + + /* Get a directory hint (cnode must be locked exclusive) */ + if (dirhint == NULL) { + dirhint = hfs_getdirhint(cp, ((index - 1) & HFS_INDEX_MASK) | tag, 0); + + /* Hide tag from catalog layer. */ + dirhint->dh_index &= HFS_INDEX_MASK; + if (dirhint->dh_index == HFS_INDEX_MASK) { + dirhint->dh_index = -1; + } + } + + if (index == 0) { + dirhint->dh_threadhint = cp->c_dirthreadhint; + } + else { + /* + * If we have a non-zero index, there is a possibility that during the last + * call to hfs_vnop_readdir we hit EOF for this directory. If that is the case + * then we don't want to return any new entries for the caller. Just return 0 + * items, mark the eofflag, and bail out. 
Because we won't have done any work, the + * code at the end of the function will release the dirhint for us. + * + * Don't forget to unlock the catalog lock on the way out, too. + */ + if (dirhint->dh_desc.cd_flags & CD_EOF) { + error = 0; + eofflag = 1; + uio_setoffset(uio, startoffset); + if (user_original_resid > 0) { + uio_setresid(uio, user_original_resid); + user_original_resid = 0; + } + hfs_systemfile_unlock (hfsmp, lockflags); + + goto seekoffcalc; + } + } + + /* Pack the buffer with dirent entries. */ + error = cat_getdirentries(hfsmp, cp->c_entries, dirhint, uio, ap->a_flags, &items, &eofflag); + + if (user_original_resid > 0) { + user_original_resid = user_original_resid - ((user_ssize_t)256*1024 - uio_resid(uio)); + uio_setresid(uio, user_original_resid); + user_original_resid = 0; + } + + if (index == 0 && error == 0) { + cp->c_dirthreadhint = dirhint->dh_threadhint; + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error != 0) { + goto out; + } + + /* Get index to the next item */ + index += items; + + if (items >= (int)cp->c_entries) { + eofflag = 1; + } + + /* + * Detect valence FS corruption. + * + * We are holding the cnode lock exclusive, so there should not be + * anybody modifying the valence field of this cnode. If we enter + * this block, that means we observed filesystem corruption, because + * this directory reported a valence of 0, yet we found at least one + * item. In this case, we need to minimally self-heal this + * directory to prevent userland from tripping over a directory + * that appears empty (getattr of valence reports 0), but actually + * has contents. + * + * We'll force the cnode update at the end of the function after + * completing all of the normal getdirentries steps. + */ + if ((cp->c_entries == 0) && (items > 0)) { + /* disk corruption */ + cp->c_entries++; + /* Mark the cnode as dirty. */ + cp->c_flag |= C_MODIFIED; + printf("hfs_vnop_readdir: repairing valence to non-zero! \n"); + bump_valence++; + } + + + /* Convert catalog directory index back into an offset. */ + while (tag == 0) + tag = (++cp->c_dirhinttag) << HFS_INDEX_BITS; + uio_setoffset(uio, (index + 2) | tag); + dirhint->dh_index |= tag; + +seekoffcalc: + cp->c_touch_acctime = TRUE; + + if (ap->a_numdirent) { + if (startoffset == 0) + items += 2; + *ap->a_numdirent = items; + } + +out: + if (user_start) { + if (user_original_resid > 0) { + uio_setresid(uio, user_original_resid); + user_original_resid = 0; + } + vsunlock(user_start, user_len, TRUE); + } + /* If we didn't do anything then go ahead and dump the hint. */ + if ((dirhint != NULL) && + (dirhint != &localhint) && + (uio_offset(uio) == startoffset)) { + hfs_reldirhint(cp, dirhint); + eofflag = 1; + } + if (ap->a_eofflag) { + *ap->a_eofflag = eofflag; + } + if (dirhint == &localhint) { + cat_releasedesc(&localhint.dh_desc); + } + + if (bump_valence) { + /* force the update before dropping the cnode lock*/ + hfs_update(vp, 0); + } + + hfs_unlock(cp); + + return (error); +} + + +/* + * Read contents of a symbolic link. 
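+ *
+ * (The link target is read from disk once, cached in fp->ff_symlinkptr, and
+ * served from that cache on subsequent reads; see the body below.)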
+ */
+int
+hfs_vnop_readlink(struct vnop_readlink_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct cnode *cp;
+	struct filefork *fp;
+	int error;
+
+	if (!vnode_islnk(vp))
+		return (EINVAL);
+
+	if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)))
+		return (error);
+	cp = VTOC(vp);
+	fp = VTOF(vp);
+
+	/* Zero length sym links are not allowed */
+	if (fp->ff_size == 0 || fp->ff_size > MAXPATHLEN) {
+		error = EINVAL;
+		goto exit;
+	}
+
+	/* Cache the path so we don't waste buffer cache resources */
+	if (fp->ff_symlinkptr == NULL) {
+		struct buf *bp = NULL;
+
+		fp->ff_symlinkptr = hfs_malloc(fp->ff_size);
+		error = (int)buf_meta_bread(vp, (daddr64_t)0,
+		                            roundup((int)fp->ff_size, VTOHFS(vp)->hfs_physical_block_size),
+		                            vfs_context_ucred(ap->a_context), &bp);
+		if (error) {
+			if (bp)
+				buf_brelse(bp);
+			if (fp->ff_symlinkptr) {
+				hfs_free(fp->ff_symlinkptr, fp->ff_size);
+				fp->ff_symlinkptr = NULL;
+			}
+			goto exit;
+		}
+		bcopy((char *)buf_dataptr(bp), fp->ff_symlinkptr, (size_t)fp->ff_size);
+
+		if (VTOHFS(vp)->jnl && (buf_flags(bp) & B_LOCKED) == 0) {
+			buf_markinvalid(bp);	/* data no longer needed */
+		}
+		buf_brelse(bp);
+	}
+	error = uiomove((caddr_t)fp->ff_symlinkptr, (int)fp->ff_size, ap->a_uio);
+
+	/*
+	 * Keep track of blocks read.
+	 */
+	if ((VTOHFS(vp)->hfc_stage == HFC_RECORDING) && (error == 0)) {
+
+		/*
+		 * If this file hasn't been seen since the start of
+		 * the current sampling period then start over.
+		 */
+		if (cp->c_atime < VTOHFS(vp)->hfc_timebase)
+			VTOF(vp)->ff_bytesread = fp->ff_size;
+		else
+			VTOF(vp)->ff_bytesread += fp->ff_size;
+
+		// if (VTOF(vp)->ff_bytesread > fp->ff_size)
+		//	cp->c_touch_acctime = TRUE;
+	}
+
+exit:
+	hfs_unlock(cp);
+	return (error);
+}
+
+
+/*
+ * Get configurable pathname variables.
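+ *
+ * Illustrative userland usage (hypothetical mount point, not part of this
+ * file):
+ *
+ *     long name_max = pathconf("/Volumes/MyHFS", _PC_NAME_MAX);   -- 255 on HFS+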
+ */ +int +hfs_vnop_pathconf(struct vnop_pathconf_args *ap) +{ + +#if CONFIG_HFS_STD + int std_hfs = (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD); +#endif + + switch (ap->a_name) { + case _PC_LINK_MAX: +#if CONFIG_HFS_STD + if (std_hfs) { + *ap->a_retval = 1; + } else +#endif + { + *ap->a_retval = HFS_LINK_MAX; + } + break; + case _PC_NAME_MAX: +#if CONFIG_HFS_STD + if (std_hfs) { + *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ + } else +#endif + { + *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ + } + break; + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; /* 1024 */ + break; + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + break; + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */ + break; + case _PC_NO_TRUNC: + *ap->a_retval = 200112; /* _POSIX_NO_TRUNC */ + break; + case _PC_NAME_CHARS_MAX: +#if CONFIG_HFS_STD + if (std_hfs) { + *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ + } else +#endif + { + *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ + } + break; + case _PC_CASE_SENSITIVE: + if (VTOHFS(ap->a_vp)->hfs_flags & HFS_CASE_SENSITIVE) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + break; + case _PC_CASE_PRESERVING: + *ap->a_retval = 1; + break; + case _PC_FILESIZEBITS: + /* number of bits to store max file size */ +#if CONFIG_HFS_STD + if (std_hfs) { + *ap->a_retval = 32; + } else +#endif + { + *ap->a_retval = 64; + } + break; + case _PC_XATTR_SIZE_BITS: + /* Number of bits to store maximum extended attribute size */ + *ap->a_retval = HFS_XATTR_SIZE_BITS; + break; + default: + return (EINVAL); + } + + return (0); +} + +/* + * Prepares a fork for cat_update by making sure ff_size and ff_blocks + * are no bigger than the valid data on disk thus reducing the chance + * of exposing uninitialised data in the event of a non clean unmount. + * fork_buf is where to put the temporary copy if required. (It can + * be inside pfork.) + */ +const struct cat_fork * +hfs_prepare_fork_for_update(filefork_t *ff, + const struct cat_fork *cf, + struct cat_fork *cf_buf, + uint32_t block_size) +{ + if (!ff) + return NULL; + + if (!cf) + cf = &ff->ff_data; + if (!cf_buf) + cf_buf = &ff->ff_data; + + off_t max_size = ff->ff_size; + + // Check first invalid range + if (!TAILQ_EMPTY(&ff->ff_invalidranges)) + max_size = TAILQ_FIRST(&ff->ff_invalidranges)->rl_start; + + if (!ff->ff_unallocblocks && ff->ff_size <= max_size) + return cf; // Nothing to do + + if (ff->ff_blocks < ff->ff_unallocblocks) { + panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", + ff->ff_blocks, ff->ff_unallocblocks); + } + + struct cat_fork *out = cf_buf; + + if (out != cf) + bcopy(cf, out, sizeof(*cf)); + + // Adjust cf_blocks for cf_vblocks + out->cf_blocks -= out->cf_vblocks; + + /* + * Here we trim the size with the updated cf_blocks. This is + * probably unnecessary now because the invalid ranges should + * catch this (but that wasn't always the case). + */ + off_t alloc_bytes = hfs_blk_to_bytes(out->cf_blocks, block_size); + if (out->cf_size > alloc_bytes) + out->cf_size = alloc_bytes; + + // Trim cf_size to first invalid range + if (out->cf_size > max_size) + out->cf_size = max_size; + + return out; +} + +/* + * Update a cnode's on-disk metadata. + * + * The cnode must be locked exclusive. See declaration for possible + * options. 
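+ *
+ * (Summary of the logic below: "minor" modifications (C_MINOR_MOD, e.g.
+ * timestamp-only changes) are only written out here if they can piggy-back
+ * on the journal transaction that already covers this cnode (c_update_txn),
+ * or if HFS_UPDATE_FORCE is passed; otherwise they are skipped and left for
+ * a later update.)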
+ */ +int +hfs_update(struct vnode *vp, int options) +{ + struct cnode *cp = VTOC(vp); + struct proc *p; + const struct cat_fork *dataforkp = NULL; + const struct cat_fork *rsrcforkp = NULL; + struct cat_fork datafork; + struct cat_fork rsrcfork; + struct hfsmount *hfsmp; + int lockflags; + int error; + uint32_t tstate = 0; + + if (ISSET(cp->c_flag, C_NOEXISTS)) + return 0; + + p = current_proc(); + hfsmp = VTOHFS(vp); + + if (((vnode_issystem(vp) && (cp->c_cnid < kHFSFirstUserCatalogNodeID))) || + hfsmp->hfs_catalog_vp == NULL){ + return (0); + } + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (cp->c_mode == 0)) { + CLR(cp->c_flag, C_MODIFIED | C_MINOR_MOD | C_NEEDS_DATEADDED); + cp->c_touch_acctime = 0; + cp->c_touch_chgtime = 0; + cp->c_touch_modtime = 0; + return (0); + } + if (kdebug_enable) { + if (cp->c_touch_acctime || cp->c_atime != cp->c_attr.ca_atimeondisk) + tstate |= DBG_HFS_UPDATE_ACCTIME; + if (cp->c_touch_modtime) + tstate |= DBG_HFS_UPDATE_MODTIME; + if (cp->c_touch_chgtime) + tstate |= DBG_HFS_UPDATE_CHGTIME; + + if (cp->c_flag & C_MODIFIED) + tstate |= DBG_HFS_UPDATE_MODIFIED; + if (ISSET(options, HFS_UPDATE_FORCE)) + tstate |= DBG_HFS_UPDATE_FORCE; + if (cp->c_flag & C_NEEDS_DATEADDED) + tstate |= DBG_HFS_UPDATE_DATEADDED; + if (cp->c_flag & C_MINOR_MOD) + tstate |= DBG_HFS_UPDATE_MINOR; + } + hfs_touchtimes(hfsmp, cp); + + if (!ISSET(cp->c_flag, C_MODIFIED | C_MINOR_MOD) + && !hfs_should_save_atime(cp)) { + // Nothing to update + return 0; + } + + KDBG(HFSDBG_UPDATE | DBG_FUNC_START, kdebug_vnode(vp), tstate); + + bool check_txn = false; + + if (!ISSET(options, HFS_UPDATE_FORCE) && !ISSET(cp->c_flag, C_MODIFIED)) { + /* + * This must be a minor modification. If the current + * transaction already has an update for this node, then we + * bundle in the modification. + */ + if (hfsmp->jnl + && journal_current_txn(hfsmp->jnl) == cp->c_update_txn) { + check_txn = true; + } else { + tstate |= DBG_HFS_UPDATE_SKIPPED; + error = 0; + goto exit; + } + } + + if ((error = hfs_start_transaction(hfsmp)) != 0) + goto exit; + + if (check_txn + && journal_current_txn(hfsmp->jnl) != cp->c_update_txn) { + hfs_end_transaction(hfsmp); + tstate |= DBG_HFS_UPDATE_SKIPPED; + error = 0; + goto exit; + } + + if (cp->c_datafork) + dataforkp = &cp->c_datafork->ff_data; + if (cp->c_rsrcfork) + rsrcforkp = &cp->c_rsrcfork->ff_data; + + /* + * Modify the values passed to cat_update based on whether or not + * the file has invalid ranges or borrowed blocks. + */ + dataforkp = hfs_prepare_fork_for_update(cp->c_datafork, NULL, &datafork, hfsmp->blockSize); + rsrcforkp = hfs_prepare_fork_for_update(cp->c_rsrcfork, NULL, &rsrcfork, hfsmp->blockSize); + + if (__builtin_expect(kdebug_enable & KDEBUG_TRACE, 0)) { + long dbg_parms[NUMPARMS]; + int dbg_namelen; + + dbg_namelen = NUMPARMS * sizeof(long); + vn_getpath(vp, (char *)dbg_parms, &dbg_namelen); + + if (dbg_namelen < (int)sizeof(dbg_parms)) + memset((char *)dbg_parms + dbg_namelen, 0, sizeof(dbg_parms) - dbg_namelen); + + kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE); + } + + /* + * Lock the Catalog b-tree file. 
+ */
+	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
+
+	error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp);
+
+	if (hfsmp->jnl)
+		cp->c_update_txn = journal_current_txn(hfsmp->jnl);
+
+	hfs_systemfile_unlock(hfsmp, lockflags);
+
+	CLR(cp->c_flag, C_MODIFIED | C_MINOR_MOD);
+
+	hfs_end_transaction(hfsmp);
+
+exit:
+
+	KDBG(HFSDBG_UPDATE | DBG_FUNC_END, kdebug_vnode(vp), tstate, error);
+
+	return error;
+}
+
+/*
+ * Allocate a new node
+ */
+int
+hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
+             struct vnode_attr *vap, vfs_context_t ctx)
+{
+	struct cnode *cp = NULL;
+	struct cnode *dcp = NULL;
+	struct vnode *tvp;
+	struct hfsmount *hfsmp;
+	struct cat_desc in_desc, out_desc;
+	struct cat_attr attr;
+	struct timeval tv;
+	int lockflags;
+	int error, started_tr = 0;
+	enum vtype vnodetype;
+	int mode;
+	int newvnode_flags = 0;
+	u_int32_t gnv_flags = 0;
+	int protectable_target = 0;
+	int nocache = 0;
+	vnode_t old_doc_vp = NULL;
+
+#if CONFIG_PROTECT
+	struct cprotect *entry = NULL;
+	int32_t cp_class = -1;
+
+	/*
+	 * By default, it's OK for AKS to override our target class preferences.
+	 */
+	uint32_t keywrap_flags = CP_KEYWRAP_DIFFCLASS;
+
+	if (VATTR_IS_ACTIVE(vap, va_dataprotect_class)) {
+		cp_class = (int32_t)vap->va_dataprotect_class;
+		/*
+		 * Since the user specifically requested this target class be used,
+		 * we want to fail this creation operation if we cannot wrap to their
+		 * target class. The CP_KEYWRAP_DIFFCLASS bit says that it is OK to
+		 * use a different class than the one specified, so we turn that off
+		 * now.
+		 */
+		keywrap_flags &= ~CP_KEYWRAP_DIFFCLASS;
+	}
+	int protected_mount = 0;
+#endif
+
+
+	if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT)))
+		return (error);
+
+	/* set the cnode pointer only after successfully acquiring lock */
+	dcp = VTOC(dvp);
+
+	/* Don't allow creation of new entries in open-unlinked directories */
+	if ((error = hfs_checkdeleted(dcp))) {
+		hfs_unlock(dcp);
+		return error;
+	}
+
+	dcp->c_flag |= C_DIR_MODIFICATION;
+
+	hfsmp = VTOHFS(dvp);
+
+	*vpp = NULL;
+	tvp = NULL;
+	out_desc.cd_flags = 0;
+	out_desc.cd_nameptr = NULL;
+
+	vnodetype = vap->va_type;
+	if (vnodetype == VNON)
+		vnodetype = VREG;
+	mode = MAKEIMODE(vnodetype, vap->va_mode);
+
+	if (S_ISDIR (mode) || S_ISREG (mode)) {
+		protectable_target = 1;
+	}
+
+
+	/* Check if we're out of usable disk space.
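+	 * (Per the check below, this preflight is skipped for the super-user:
+	 * vfs_context_suser() returns 0 for root, so root may still create
+	 * entries when hfs_freeblks() reports no usable blocks.)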
*/ + if ((hfs_freeblks(hfsmp, 1) == 0) && (vfs_context_suser(ctx) != 0)) { + error = ENOSPC; + goto exit; + } + + microtime(&tv); + + /* Setup the default attributes */ + bzero(&attr, sizeof(attr)); + attr.ca_mode = mode; + attr.ca_linkcount = 1; + if (VATTR_IS_ACTIVE(vap, va_rdev)) { + attr.ca_rdev = vap->va_rdev; + } + if (VATTR_IS_ACTIVE(vap, va_create_time)) { + VATTR_SET_SUPPORTED(vap, va_create_time); + attr.ca_itime = vap->va_create_time.tv_sec; + } else { + attr.ca_itime = tv.tv_sec; + } +#if CONFIG_HFS_STD + if ((hfsmp->hfs_flags & HFS_STANDARD) && gTimeZone.tz_dsttime) { + attr.ca_itime += 3600; /* Same as what hfs_update does */ + } +#endif + attr.ca_atime = attr.ca_ctime = attr.ca_mtime = attr.ca_itime; + attr.ca_atimeondisk = attr.ca_atime; + if (VATTR_IS_ACTIVE(vap, va_flags)) { + VATTR_SET_SUPPORTED(vap, va_flags); + attr.ca_flags = vap->va_flags; + } + + /* + * HFS+ only: all files get ThreadExists + * HFSX only: dirs get HasFolderCount + */ +#if CONFIG_HFS_STD + if (!(hfsmp->hfs_flags & HFS_STANDARD)) +#endif + { + if (vnodetype == VDIR) { + if (hfsmp->hfs_flags & HFS_FOLDERCOUNT) + attr.ca_recflags = kHFSHasFolderCountMask; + } else { + attr.ca_recflags = kHFSThreadExistsMask; + } + } + +#if CONFIG_PROTECT + if (cp_fs_protected(hfsmp->hfs_mp)) { + protected_mount = 1; + } + /* + * On a content-protected HFS+/HFSX filesystem, files and directories + * cannot be created without atomically setting/creating the EA that + * contains the protection class metadata and keys at the same time, in + * the same transaction. As a result, pre-set the "EAs exist" flag + * on the cat_attr for protectable catalog record creations. This will + * cause the cnode creation routine in hfs_getnewvnode to mark the cnode + * as having EAs. + */ + if ((protected_mount) && (protectable_target)) { + attr.ca_recflags |= kHFSHasAttributesMask; + /* delay entering in the namecache */ + nocache = 1; + } +#endif + + + /* + * Add the date added to the item. See above, as + * all of the dates are set to the itime. + */ + hfs_write_dateadded (&attr, attr.ca_atime); + + /* Initialize the gen counter to 1 */ + hfs_write_gencount(&attr, (uint32_t)1); + + attr.ca_uid = vap->va_uid; + attr.ca_gid = vap->va_gid; + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + +#if QUOTA + /* check to see if this node's creation would cause us to go over + * quota. If so, abort this operation. + */ + if (hfsmp->hfs_flags & HFS_QUOTAS) { + if ((error = hfs_quotacheck(hfsmp, 1, attr.ca_uid, attr.ca_gid, + vfs_context_ucred(ctx)))) { + goto exit; + } + } +#endif + + + /* Tag symlinks with a type and creator. */ + if (vnodetype == VLNK) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&attr.ca_finderinfo; + fip->fdType = SWAP_BE32(kSymLinkFileType); + fip->fdCreator = SWAP_BE32(kSymLinkCreator); + } + + /* Setup the descriptor */ + in_desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; + in_desc.cd_namelen = cnp->cn_namelen; + in_desc.cd_parentcnid = dcp->c_fileid; + in_desc.cd_flags = S_ISDIR(mode) ? CD_ISDIR : 0; + in_desc.cd_hint = dcp->c_childhint; + in_desc.cd_encoding = 0; + +#if CONFIG_PROTECT + /* + * To preserve file creation atomicity with regards to the content protection EA, + * we must create the file in the catalog and then write out its EA in the same + * transaction. 
+ * + * We only denote the target class in this EA; key generation is not completed + * until the file has been inserted into the catalog and will be done + * in a separate transaction. + */ + if ((protected_mount) && (protectable_target)) { + error = cp_setup_newentry(hfsmp, dcp, cp_class, attr.ca_mode, &entry); + if (error) { + goto exit; + } + } +#endif + + if ((error = hfs_start_transaction(hfsmp)) != 0) { + goto exit; + } + started_tr = 1; + + // have to also lock the attribute file because cat_create() needs + // to check that any fileID it wants to use does not have orphaned + // attributes in it. + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + cnid_t new_id; + + /* Reserve some space in the Catalog file. */ + if ((error = cat_preflight(hfsmp, CAT_CREATE, NULL, 0))) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto exit; + } + + if ((error = cat_acquire_cnid(hfsmp, &new_id))) { + hfs_systemfile_unlock (hfsmp, lockflags); + goto exit; + } + + error = cat_create(hfsmp, new_id, &in_desc, &attr, &out_desc); + if (error == 0) { + /* Update the parent directory */ + dcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ + dcp->c_entries++; + + if (vnodetype == VDIR) { + INC_FOLDERCOUNT(hfsmp, dcp->c_attr); + } + dcp->c_dirchangecnt++; + hfs_incr_gencount(dcp); + + dcp->c_touch_chgtime = dcp->c_touch_modtime = true; + dcp->c_flag |= C_MODIFIED; + + hfs_update(dcp->c_vp, 0); + +#if CONFIG_PROTECT + /* + * If we are creating a content protected file, now is when + * we create the EA. We must create it in the same transaction + * that creates the file. We can also guarantee that the file + * MUST exist because we are still holding the catalog lock + * at this point. + */ + if ((attr.ca_fileid != 0) && (protected_mount) && (protectable_target)) { + error = cp_setxattr (NULL, entry, hfsmp, attr.ca_fileid, XATTR_CREATE); + + if (error) { + int delete_err; + /* + * If we fail the EA creation, then we need to delete the file. + * Luckily, we are still holding all of the right locks. + */ + delete_err = cat_delete (hfsmp, &out_desc, &attr); + if (delete_err == 0) { + /* Update the parent directory */ + if (dcp->c_entries > 0) + dcp->c_entries--; + dcp->c_dirchangecnt++; + dcp->c_ctime = tv.tv_sec; + dcp->c_mtime = tv.tv_sec; + (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); + } + + /* Emit EINVAL if we fail to create EA*/ + error = EINVAL; + } + } +#endif + } + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) + goto exit; + + uint32_t txn = hfsmp->jnl ? journal_current_txn(hfsmp->jnl) : 0; + + /* Invalidate negative cache entries in the directory */ + if (dcp->c_flag & C_NEG_ENTRIES) { + cache_purge_negatives(dvp); + dcp->c_flag &= ~C_NEG_ENTRIES; + } + + hfs_volupdate(hfsmp, vnodetype == VDIR ? VOL_MKDIR : VOL_MKFILE, + (dcp->c_cnid == kHFSRootFolderID)); + + // XXXdbg + // have to end the transaction here before we call hfs_getnewvnode() + // because that can cause us to try and reclaim a vnode on a different + // file system which could cause us to start a transaction which can + // deadlock with someone on that other file system (since we could be + // holding two transaction locks as well as various vnodes and we did + // not obtain the locks on them in the proper order). + // + // NOTE: this means that if the quota check fails or we have to update + // the change time on a block-special device that those changes + // will happen as part of independent transactions. 
+ // + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; + } + +#if CONFIG_PROTECT + /* + * At this point, we must have encountered success with writing the EA. + * Destroy our temporary cprotect (which had no keys). + */ + + if ((attr.ca_fileid != 0) && (protected_mount) && (protectable_target)) { + cp_entry_destroy (hfsmp, entry); + entry = NULL; + } +#endif + gnv_flags |= GNV_CREATE; + if (nocache) { + gnv_flags |= GNV_NOCACHE; + } + + /* + * Create a vnode for the object just created. + * + * NOTE: Maintaining the cnode lock on the parent directory is important, + * as it prevents race conditions where other threads want to look up entries + * in the directory and/or add things as we are in the process of creating + * the vnode below. However, this has the potential for causing a + * double lock panic when dealing with shadow files on a HFS boot partition. + * The panic could occur if we are not cleaning up after ourselves properly + * when done with a shadow file or in the error cases. The error would occur if we + * try to create a new vnode, and then end up reclaiming another shadow vnode to + * create the new one. However, if everything is working properly, this should + * be a non-issue as we would never enter that reclaim codepath. + * + * The cnode is locked on successful return. + */ + error = hfs_getnewvnode(hfsmp, dvp, cnp, &out_desc, gnv_flags, &attr, + NULL, &tvp, &newvnode_flags); + if (error) + goto exit; + + cp = VTOC(tvp); + + cp->c_update_txn = txn; + + struct doc_tombstone *ut; + ut = doc_tombstone_get(); + if ( ut->t_lastop_document_id != 0 + && ut->t_lastop_parent == dvp + && ut->t_lastop_parent_vid == vnode_vid(dvp) + && strcmp((char *)ut->t_lastop_filename, (const char *)cp->c_desc.cd_nameptr) == 0) { + struct FndrExtendedDirInfo *fip = (struct FndrExtendedDirInfo *)((char *)&cp->c_attr.ca_finderinfo + 16); + + //printf("CREATE: preserving doc-id %lld on %s\n", ut->t_lastop_document_id, ut->t_lastop_filename); + fip->document_id = (uint32_t)(ut->t_lastop_document_id & 0xffffffff); + + cp->c_bsdflags |= UF_TRACKED; + cp->c_flag |= C_MODIFIED; + + if ((error = hfs_start_transaction(hfsmp)) == 0) { + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK); + + (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL); + + hfs_systemfile_unlock (hfsmp, lockflags); + (void) hfs_end_transaction(hfsmp); + } + + doc_tombstone_clear(ut, &old_doc_vp); + } else if (ut->t_lastop_document_id != 0) { + int len = cnp->cn_namelen; + if (len == 0) { + len = strlen(cnp->cn_nameptr); + } + + if (doc_tombstone_should_ignore_name(cnp->cn_nameptr, cnp->cn_namelen)) { + // printf("CREATE: not clearing tombstone because %s is a temp name.\n", cnp->cn_nameptr); + } else { + // Clear the tombstone because the thread is not recreating the same path + // printf("CREATE: clearing tombstone because %s is NOT a temp name.\n", cnp->cn_nameptr); + doc_tombstone_clear(ut, NULL); + } + } + + if ((hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN) && (vnode_isfastdevicecandidate(dvp) && !vnode_isautocandidate(dvp))) { + + //printf("hfs: flagging %s (fileid: %d) as VFASTDEVCANDIDATE (dvp name: %s)\n", + // cnp->cn_nameptr ? cnp->cn_nameptr : "", + // cp->c_fileid, + // dvp->v_name ? dvp->v_name : "no-dir-name"); + + // + // On new files we set the FastDevCandidate flag so that + // any new blocks allocated to it will be pinned. 
+ // + cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask; + vnode_setfastdevicecandidate(tvp); + + // + // properly inherit auto-cached flags + // + if (vnode_isautocandidate(dvp)) { + cp->c_attr.ca_recflags |= kHFSAutoCandidateMask; + vnode_setautocandidate(tvp); + } + + + // + // We also want to add it to the hotfile adoption list so + // that it will eventually land in the hotfile btree + // + (void) hfs_addhotfile(tvp); + } + + *vpp = tvp; + +#if CONFIG_PROTECT + /* + * Now that we have a vnode-in-hand, generate keys for this namespace item. + * If we fail to create the keys, then attempt to delete the item from the + * namespace. If we can't delete the item, that's not desirable but also not fatal.. + * All of the places which deal with restoring/unwrapping keys must also be + * prepared to encounter an entry that does not have keys. + */ + if ((protectable_target) && (protected_mount)) { + struct cprotect *keyed_entry = NULL; + + if (cp->c_cpentry == NULL) { + panic ("hfs_makenode: no cpentry for cnode (%p)", cp); + } + + error = cp_generate_keys (hfsmp, cp, CP_CLASS(cp->c_cpentry->cp_pclass), keywrap_flags, &keyed_entry); + if (error == 0) { + /* + * Upon success, the keys were generated and written out. + * Update the cp pointer in the cnode. + */ + cp_replace_entry (hfsmp, cp, keyed_entry); + if (nocache) { + cache_enter (dvp, tvp, cnp); + } + } + else { + /* If key creation OR the setxattr failed, emit EPERM to userland */ + error = EPERM; + + /* + * Beware! This slightly violates the lock ordering for the + * cnode/vnode 'tvp'. Ordinarily, you must acquire the truncate lock + * which guards file size changes before acquiring the normal cnode lock + * and calling hfs_removefile on an item. + * + * However, in this case, we are still holding the directory lock so + * 'tvp' is not lookup-able and it was a newly created vnode so it + * cannot have any content yet. The only reason we are initiating + * the removefile is because we could not generate content protection keys + * for this namespace item. Note also that we pass a '1' in the allow_dirs + * argument for hfs_removefile because we may be creating a directory here. + * + * All this to say that while it is technically a violation it is + * impossible to race with another thread for this cnode so it is safe. + */ + int err = hfs_removefile (dvp, tvp, cnp, 0, 0, 1, NULL, 0); + if (err) { + printf("hfs_makenode: removefile failed (%d) for CP entry %p\n", err, tvp); + } + + /* Release the cnode lock and mark the vnode for termination */ + hfs_unlock (cp); + err = vnode_recycle (tvp); + if (err) { + printf("hfs_makenode: vnode_recycle failed (%d) for CP entry %p\n", err, tvp); + } + + /* Drop the iocount on the new vnode to force reclamation/recycling */ + vnode_put (tvp); + cp = NULL; + *vpp = NULL; + } + } +#endif + +#if QUOTA + /* + * Once we create this vnode, we need to initialize its quota data + * structures, if necessary. We know that it is OK to just go ahead and + * initialize because we've already validated earlier (through the hfs_quotacheck + * function) to see if creating this cnode/vnode would cause us to go over quota. + */ + if (hfsmp->hfs_flags & HFS_QUOTAS) { + if (cp) { + /* cp could have been zeroed earlier */ + (void) hfs_getinoquota(cp); + } + } +#endif + +exit: + cat_releasedesc(&out_desc); + +#if CONFIG_PROTECT + /* + * We may have jumped here in error-handling various situations above. + * If we haven't already dumped the temporary CP used to initialize + * the file atomically, then free it now. 
cp_entry_destroy should null + * out the pointer if it was called already. + */ + if (entry) { + cp_entry_destroy (hfsmp, entry); + entry = NULL; + } +#endif + + /* + * Make sure we release cnode lock on dcp. + */ + if (dcp) { + dcp->c_flag &= ~C_DIR_MODIFICATION; + wakeup((caddr_t)&dcp->c_flag); + + hfs_unlock(dcp); + } + ino64_t file_id = 0; + if (error == 0 && cp != NULL) { + file_id = cp->c_fileid; + hfs_unlock(cp); + } + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; + } + + if (old_doc_vp) { + cnode_t *ocp = VTOC(old_doc_vp); + hfs_lock_always(ocp, HFS_EXCLUSIVE_LOCK); + struct FndrExtendedFileInfo *ofip = (struct FndrExtendedFileInfo *)((char *)&ocp->c_attr.ca_finderinfo + 16); + + const uint32_t doc_id = ofip->document_id; + const ino64_t old_file_id = ocp->c_fileid; + + // printf("clearing doc-id from ino %d\n", ocp->c_desc.cd_cnid); + ofip->document_id = 0; + ocp->c_bsdflags &= ~UF_TRACKED; + ocp->c_flag |= C_MODIFIED; + + hfs_unlock(ocp); + vnode_put(old_doc_vp); + + add_fsevent(FSE_DOCID_CHANGED, vfs_context_current(), + FSE_ARG_DEV, hfsmp->hfs_raw_dev, + FSE_ARG_INO, old_file_id, // src inode # + FSE_ARG_INO, file_id, // dst inode # + FSE_ARG_INT32, doc_id, + FSE_ARG_DONE); + } + + return (error); +} + + +/* + * hfs_vgetrsrc acquires a resource fork vnode corresponding to the + * cnode that is found in 'vp'. The cnode should be locked upon entry + * and will be returned locked, but it may be dropped temporarily. + * + * If the resource fork vnode does not exist, HFS will attempt to acquire an + * empty (uninitialized) vnode from VFS so as to avoid deadlocks with + * jetsam. If we let the normal getnewvnode code produce the vnode for us + * we would be doing so while holding the cnode lock of our cnode. + * + * On success, *rvpp wlll hold the resource fork vnode with an + * iocount. *Don't* forget the vnode_put. + */ +int +hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp) +{ + struct vnode *rvp = NULLVP; + struct vnode *empty_rvp = NULLVP; + struct vnode *dvp = NULLVP; + struct cnode *cp = VTOC(vp); + int error; + int vid; + + if (vnode_vtype(vp) == VDIR) { + return EINVAL; + } + +restart: + /* Attempt to use existing vnode */ + if ((rvp = cp->c_rsrc_vp)) { + vid = vnode_vid(rvp); + + // vnode_getwithvid can block so we need to drop the cnode lock + hfs_unlock(cp); + + error = vnode_getwithvid(rvp, vid); + + hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK); + + /* + * When our lock was relinquished, the resource fork + * could have been recycled. Check for this and try + * again. + */ + if (error == ENOENT) + goto restart; + + if (error) { + const char * name = (const char *)VTOC(vp)->c_desc.cd_nameptr; + + if (name) + printf("hfs_vgetrsrc: couldn't get resource" + " fork for %s, vol=%s, err=%d\n", name, hfsmp->vcbVN, error); + return (error); + } + } else { + struct cat_fork rsrcfork; + struct componentname cn; + struct cat_desc *descptr = NULL; + struct cat_desc to_desc; + char delname[32]; + int lockflags; + int newvnode_flags = 0; + + /* + * In this case, we don't currently see a resource fork vnode attached + * to this cnode. In most cases, we were called from a read-only VNOP + * like getattr, so it should be safe to drop the cnode lock and then + * re-acquire it. + * + * Here, we drop the lock so that we can acquire an empty/husk + * vnode so that we don't deadlock against jetsam. + * + * It does not currently appear possible to hold the truncate lock via + * FS re-entrancy when we get to this point. 
(8/2014) + */ + hfs_unlock (cp); + + error = vnode_create_empty (&empty_rvp); + + hfs_lock_always (cp, HFS_EXCLUSIVE_LOCK); + + if (error) { + /* If acquiring the 'empty' vnode failed, then nothing to clean up */ + return error; + } + + /* + * We could have raced with another thread here while we dropped our cnode + * lock. See if the cnode now has a resource fork vnode and restart if appropriate. + * + * Note: We just released the cnode lock, so there is a possibility that the + * cnode that we just acquired has been deleted or even removed from disk + * completely, though this is unlikely. If the file is open-unlinked, the + * check below will resolve it for us. If it has been completely + * removed (even from the catalog!), then when we examine the catalog + * directly, below, while holding the catalog lock, we will not find the + * item and we can fail out properly. + */ + if (cp->c_rsrc_vp) { + /* Drop the empty vnode before restarting */ + vnode_put (empty_rvp); + empty_rvp = NULL; + rvp = NULL; + goto restart; + } + + /* + * hfs_vgetsrc may be invoked for a cnode that has already been marked + * C_DELETED. This is because we need to continue to provide rsrc + * fork access to open-unlinked files. In this case, build a fake descriptor + * like in hfs_removefile. If we don't do this, buildkey will fail in + * cat_lookup because this cnode has no name in its descriptor. + */ + if ((cp->c_flag & C_DELETED ) && (cp->c_desc.cd_namelen == 0)) { + bzero (&to_desc, sizeof(to_desc)); + bzero (delname, 32); + MAKE_DELETED_NAME(delname, sizeof(delname), cp->c_fileid); + to_desc.cd_nameptr = (const u_int8_t*) delname; + to_desc.cd_namelen = strlen(delname); + to_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid; + to_desc.cd_flags = 0; + to_desc.cd_cnid = cp->c_cnid; + + descptr = &to_desc; + } + else { + descptr = &cp->c_desc; + } + + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); + + /* + * We call cat_idlookup (instead of cat_lookup) below because we can't + * trust the descriptor in the provided cnode for lookups at this point. + * Between the time of the original lookup of this vnode and now, the + * descriptor could have gotten swapped or replaced. If this occurred, + * the parent/name combo originally desired may not necessarily be provided + * if we use the descriptor. Even worse, if the vnode represents + * a hardlink, we could have removed one of the links from the namespace + * but left the descriptor alone, since hfs_unlink does not invalidate + * the descriptor in the cnode if other links still point to the inode. + * + * Consider the following (slightly contrived) scenario: + * /tmp/a <--> /tmp/b (hardlinks). + * 1. Thread A: open rsrc fork on /tmp/b. + * 1a. Thread A: does lookup, goes out to lunch right before calling getnamedstream. + * 2. Thread B does 'mv /foo/b /tmp/b' + * 2. Thread B succeeds. + * 3. Thread A comes back and wants rsrc fork info for /tmp/b. + * + * Even though the hardlink backing /tmp/b is now eliminated, the descriptor + * is not removed/updated during the unlink process. So, if you were to + * do a lookup on /tmp/b, you'd acquire an entirely different record's resource + * fork. + * + * As a result, we use the fileid, which should be invariant for the lifetime + * of the cnode (possibly barring calls to exchangedata). + * + * Addendum: We can't do the above for HFS standard since we aren't guaranteed to + * have thread records for files. They were only required for directories. 
So + * we need to do the lookup with the catalog name. This is OK since hardlinks were + * never allowed on HFS standard. + */ + + /* Get resource fork data */ +#if CONFIG_HFS_STD + if (ISSET(hfsmp->hfs_flags, HFS_STANDARD)) { + /* + * HFS standard only: + * + * Get the resource fork for this item with a cat_lookup call, but do not + * force a case lookup since HFS standard is case-insensitive only. We + * don't want the descriptor; just the fork data here. If we tried to + * do a ID lookup (via thread record -> catalog record), then we might fail + * prematurely since, as noted above, thread records were not strictly required + * on files in HFS. + */ + error = cat_lookup (hfsmp, descptr, 1, 0, (struct cat_desc*)NULL, + (struct cat_attr*)NULL, &rsrcfork, NULL); + } else +#endif + { + error = cat_idlookup (hfsmp, cp->c_fileid, 0, 1, NULL, NULL, &rsrcfork); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + /* Drop our 'empty' vnode ! */ + vnode_put (empty_rvp); + return (error); + } + /* + * Supply hfs_getnewvnode with a component name. + */ + cn.cn_pnbuf = NULL; + if (descptr->cd_nameptr) { + void *buf = hfs_malloc(MAXPATHLEN); + + cn = (struct componentname){ + .cn_nameiop = LOOKUP, + .cn_flags = ISLASTCN, + .cn_pnlen = MAXPATHLEN, + .cn_pnbuf = buf, + .cn_nameptr = buf, + .cn_namelen = snprintf(buf, MAXPATHLEN, + "%s%s", descptr->cd_nameptr, + _PATH_RSRCFORKSPEC) + }; + + // Should never happen because cn.cn_nameptr won't ever be long... + if (cn.cn_namelen >= MAXPATHLEN) { + hfs_free(buf, MAXPATHLEN); + /* Drop our 'empty' vnode ! */ + vnode_put (empty_rvp); + return ENAMETOOLONG; + + } + } + dvp = vnode_getparent(vp); + + /* + * We are about to call hfs_getnewvnode and pass in the vnode that we acquired + * earlier when we were not holding any locks. The semantics of GNV_USE_VP require that + * either hfs_getnewvnode consume the vnode and vend it back to us, properly initialized, + * or it will consume/dispose of it properly if it errors out. + */ + rvp = empty_rvp; + + error = hfs_getnewvnode(hfsmp, dvp, cn.cn_pnbuf ? &cn : NULL, + descptr, (GNV_WANTRSRC | GNV_SKIPLOCK | GNV_USE_VP), + &cp->c_attr, &rsrcfork, &rvp, &newvnode_flags); + + if (dvp) + vnode_put(dvp); + hfs_free(cn.cn_pnbuf, MAXPATHLEN); + if (error) + return (error); + } /* End 'else' for rsrc fork not existing */ + + *rvpp = rvp; + return (0); +} + +/* + * Wrapper for special device reads + */ +int +hfsspec_read(struct vnop_read_args *ap) +{ + /* + * Set access flag. + */ + cnode_t *cp = VTOC(ap->a_vp); + + if (cp) + cp->c_touch_acctime = TRUE; + + return spec_read(ap); +} + +/* + * Wrapper for special device writes + */ +int +hfsspec_write(struct vnop_write_args *ap) +{ + /* + * Set update and change flags. + */ + cnode_t *cp = VTOC(ap->a_vp); + + if (cp) { + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + } + + return spec_write(ap); +} + +/* + * Wrapper for special device close + * + * Update the times on the cnode then do device close. + */ +int +hfsspec_close(struct vnop_close_args *ap) +{ + struct vnode *vp = ap->a_vp; + cnode_t *cp = VTOC(vp); + + if (cp && vnode_isinuse(ap->a_vp, 0)) { + if (hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) == 0) { + hfs_touchtimes(VTOHFS(vp), cp); + hfs_unlock(cp); + } + } + return spec_close(ap); +} + +#if FIFO +/* + * Wrapper for fifo reads + */ +static int +hfsfifo_read(struct vnop_read_args *ap) +{ + /* + * Set access flag. 
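+ * Only the atime touch flag on the cnode is recorded here; the
+ * read itself is handed off to fifo_read() below.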
+ */ + VTOC(ap->a_vp)->c_touch_acctime = TRUE; + return fifo_read(ap); +} + +/* + * Wrapper for fifo writes + */ +static int +hfsfifo_write(struct vnop_write_args *ap) +{ + /* + * Set update and change flags. + */ + VTOC(ap->a_vp)->c_touch_chgtime = TRUE; + VTOC(ap->a_vp)->c_touch_modtime = TRUE; + return fifo_write(ap); +} + +/* + * Wrapper for fifo close + * + * Update the times on the cnode then do device close. + */ +static int +hfsfifo_close(struct vnop_close_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + + if (vnode_isinuse(ap->a_vp, 1)) { + if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT) == 0) { + cp = VTOC(vp); + hfs_touchtimes(VTOHFS(vp), cp); + hfs_unlock(cp); + } + } + return fifo_close(ap); +} + + +#endif /* FIFO */ + +/* + * Getter for the document_id + * the document_id is stored in FndrExtendedFileInfo/FndrExtendedDirInfo + */ +static u_int32_t +hfs_get_document_id_internal(const uint8_t *finderinfo, mode_t mode) +{ + const uint8_t *finfo = NULL; + u_int32_t doc_id = 0; + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = finderinfo + 16; + + if (S_ISDIR(mode) || S_ISREG(mode)) { + const struct FndrExtendedFileInfo *extinfo = (const struct FndrExtendedFileInfo *)finfo; + doc_id = extinfo->document_id; + } + + return doc_id; +} + + +/* getter(s) for document id */ +u_int32_t +hfs_get_document_id(struct cnode *cp) +{ + return (hfs_get_document_id_internal((u_int8_t*)cp->c_finderinfo, + cp->c_attr.ca_mode)); +} + +/* If you have finderinfo and mode, you can use this */ +u_int32_t +hfs_get_document_id_from_blob(const uint8_t *finderinfo, mode_t mode) +{ + return (hfs_get_document_id_internal(finderinfo, mode)); +} + +/* + * Synchronize a file's in-core state with that on disk. + */ +int +hfs_vnop_fsync(struct vnop_fsync_args *ap) +{ + struct vnode* vp = ap->a_vp; + int error; + + /* Note: We check hfs flags instead of vfs mount flag because during + * read-write update, hfs marks itself read-write much earlier than + * the vfs, and hence won't result in skipping of certain writes like + * zero'ing out of unused nodes, creation of hotfiles btree, etc. + */ + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) { + return 0; + } + + /* + * No need to call cp_handle_vnop to resolve fsync(). Any dirty data + * should have caused the keys to be unwrapped at the time the data was + * put into the UBC, either at mmap/pagein/read-write. If we did manage + * to let this by, then strategy will auto-resolve for us. + * + * We also need to allow ENOENT lock errors since unlink + * system call can call VNOP_FSYNC during vclean. + */ + error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (error) + return (0); + + error = hfs_fsync(vp, ap->a_waitfor, 0, vfs_context_proc(ap->a_context)); + + hfs_unlock(VTOC(vp)); + return (error); +} + +int (**hfs_vnodeop_p)(void *); + +#define VOPFUNC int (*)(void *) + + +#if CONFIG_HFS_STD +int (**hfs_std_vnodeop_p) (void *); +static int hfs_readonly_op (__unused void* ap) { return (EROFS); } + +/* + * In 10.6 and forward, HFS Standard is read-only and deprecated. 
The vnop table below + * is for use with HFS standard to block out operations that would modify the file system + */ + +const struct vnodeopv_entry_desc hfs_standard_vnodeop_entries[] = { + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)hfs_vnop_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)hfs_readonly_op }, /* create (READONLY) */ + { &vnop_mknod_desc, (VOPFUNC)hfs_readonly_op }, /* mknod (READONLY) */ + { &vnop_open_desc, (VOPFUNC)hfs_vnop_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)hfs_vnop_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)hfs_readonly_op }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)hfs_vnop_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)hfs_readonly_op }, /* write (READONLY) */ + { &vnop_ioctl_desc, (VOPFUNC)hfs_vnop_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)hfs_vnop_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ + { &vnop_exchange_desc, (VOPFUNC)hfs_readonly_op }, /* exchange (READONLY)*/ + { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)hfs_readonly_op}, /* fsync (READONLY) */ + { &vnop_remove_desc, (VOPFUNC)hfs_readonly_op }, /* remove (READONLY) */ + { &vnop_link_desc, (VOPFUNC)hfs_readonly_op }, /* link ( READONLLY) */ + { &vnop_rename_desc, (VOPFUNC)hfs_readonly_op }, /* rename (READONLY)*/ + { &vnop_mkdir_desc, (VOPFUNC)hfs_readonly_op }, /* mkdir (READONLY) */ + { &vnop_rmdir_desc, (VOPFUNC)hfs_readonly_op }, /* rmdir (READONLY) */ + { &vnop_symlink_desc, (VOPFUNC)hfs_readonly_op }, /* symlink (READONLY) */ + { &vnop_readdir_desc, (VOPFUNC)hfs_vnop_readdir }, /* readdir */ + { &vnop_readdirattr_desc, (VOPFUNC)hfs_vnop_readdirattr }, /* readdirattr */ + { &vnop_readlink_desc, (VOPFUNC)hfs_vnop_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)hfs_vnop_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)hfs_vnop_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_allocate_desc, (VOPFUNC)hfs_readonly_op }, /* allocate (READONLY) */ +#if CONFIG_SEARCHFS + { &vnop_searchfs_desc, (VOPFUNC)hfs_vnop_search }, /* search fs */ +#else + { &vnop_searchfs_desc, (VOPFUNC)err_searchfs }, /* search fs */ +#endif + { &vnop_bwrite_desc, (VOPFUNC)hfs_readonly_op }, /* bwrite (READONLY) */ + { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* pagein */ + { &vnop_pageout_desc,(VOPFUNC) hfs_readonly_op }, /* pageout (READONLY) */ + { &vnop_copyfile_desc, (VOPFUNC)hfs_readonly_op }, /* copyfile (READONLY)*/ + { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, + { &vnop_setxattr_desc, (VOPFUNC)hfs_readonly_op}, /* set xattr (READONLY) */ + { &vnop_removexattr_desc, (VOPFUNC)hfs_readonly_op}, /* remove xattr (READONLY) */ + { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (VOPFUNC)hfs_vnop_getnamedstream }, + { &vnop_makenamedstream_desc, (VOPFUNC)hfs_readonly_op }, + { &vnop_removenamedstream_desc, (VOPFUNC)hfs_readonly_op }, +#endif + { &vnop_getattrlistbulk_desc, (VOPFUNC)hfs_vnop_getattrlistbulk }, /* getattrlistbulk 
*/ + { NULL, (VOPFUNC)NULL } +}; + +const struct vnodeopv_desc hfs_std_vnodeop_opv_desc = +{ &hfs_std_vnodeop_p, hfs_standard_vnodeop_entries }; +#endif + +/* VNOP table for HFS+ */ +const struct vnodeopv_entry_desc hfs_vnodeop_entries[] = { + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)hfs_vnop_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)hfs_vnop_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)hfs_vnop_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)hfs_vnop_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)hfs_vnop_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)hfs_vnop_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)hfs_vnop_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)hfs_vnop_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)hfs_vnop_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ + { &vnop_exchange_desc, (VOPFUNC)hfs_vnop_exchange }, /* exchange */ + { &vnop_mmap_desc, (VOPFUNC)hfs_vnop_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)hfs_vnop_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)hfs_vnop_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)hfs_vnop_rename }, /* rename */ + { &vnop_renamex_desc, (VOPFUNC)hfs_vnop_renamex }, /* renamex (with flags) */ + { &vnop_mkdir_desc, (VOPFUNC)hfs_vnop_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)hfs_vnop_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)hfs_vnop_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)hfs_vnop_readdir }, /* readdir */ + { &vnop_readdirattr_desc, (VOPFUNC)hfs_vnop_readdirattr }, /* readdirattr */ + { &vnop_readlink_desc, (VOPFUNC)hfs_vnop_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)hfs_vnop_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)hfs_vnop_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_allocate_desc, (VOPFUNC)hfs_vnop_allocate }, /* allocate */ +#if CONFIG_SEARCHFS + { &vnop_searchfs_desc, (VOPFUNC)hfs_vnop_search }, /* search fs */ +#else + { &vnop_searchfs_desc, (VOPFUNC)err_searchfs }, /* search fs */ +#endif + { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, /* bwrite */ + { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* pagein */ + { &vnop_pageout_desc,(VOPFUNC) hfs_vnop_pageout }, /* pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, + { &vnop_setxattr_desc, (VOPFUNC)hfs_vnop_setxattr}, + { &vnop_removexattr_desc, (VOPFUNC)hfs_vnop_removexattr}, + { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (VOPFUNC)hfs_vnop_getnamedstream }, + { &vnop_makenamedstream_desc, (VOPFUNC)hfs_vnop_makenamedstream }, + { &vnop_removenamedstream_desc, (VOPFUNC)hfs_vnop_removenamedstream }, +#endif + { &vnop_getattrlistbulk_desc, (VOPFUNC)hfs_vnop_getattrlistbulk }, /* getattrlistbulk */ + { &vnop_mnomap_desc, (VOPFUNC)hfs_vnop_mnomap }, + { 
NULL, (VOPFUNC)NULL } +}; + +const struct vnodeopv_desc hfs_vnodeop_opv_desc = +{ &hfs_vnodeop_p, hfs_vnodeop_entries }; + + +/* Spec Op vnop table for HFS+ */ +int (**hfs_specop_p)(void *); +const struct vnodeopv_entry_desc hfs_specop_entries[] = { + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)spec_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)spec_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)spec_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)spec_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)hfsspec_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)hfsspec_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)hfsspec_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)spec_select }, /* select */ + { &vnop_revoke_desc, (VOPFUNC)spec_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)spec_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)spec_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)spec_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)spec_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)spec_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)spec_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)spec_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)spec_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)spec_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)spec_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)hfs_vnop_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ + { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, + { &vnop_setxattr_desc, (VOPFUNC)hfs_vnop_setxattr}, + { &vnop_removexattr_desc, (VOPFUNC)hfs_vnop_removexattr}, + { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, + { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } +}; +const struct vnodeopv_desc hfs_specop_opv_desc = + { &hfs_specop_p, hfs_specop_entries }; + +#if FIFO +/* HFS+ FIFO VNOP table */ +int (**hfs_fifoop_p)(void *); +const struct vnodeopv_entry_desc hfs_fifoop_entries[] = { + { &vnop_default_desc, (VOPFUNC)vn_default_error }, + { &vnop_lookup_desc, (VOPFUNC)fifo_lookup }, /* lookup */ + { &vnop_create_desc, (VOPFUNC)fifo_create }, /* create */ + { &vnop_mknod_desc, (VOPFUNC)fifo_mknod }, /* mknod */ + { &vnop_open_desc, (VOPFUNC)fifo_open }, /* open */ + { &vnop_close_desc, (VOPFUNC)hfsfifo_close }, /* close */ + { &vnop_getattr_desc, (VOPFUNC)hfs_vnop_getattr }, /* getattr */ + { &vnop_setattr_desc, (VOPFUNC)hfs_vnop_setattr }, /* setattr */ + { &vnop_read_desc, (VOPFUNC)hfsfifo_read }, /* read */ + { &vnop_write_desc, (VOPFUNC)hfsfifo_write }, /* write */ + { &vnop_ioctl_desc, (VOPFUNC)fifo_ioctl }, /* ioctl */ + { &vnop_select_desc, (VOPFUNC)fifo_select }, /* select */ + { 
&vnop_revoke_desc, (VOPFUNC)fifo_revoke }, /* revoke */ + { &vnop_mmap_desc, (VOPFUNC)fifo_mmap }, /* mmap */ + { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ + { &vnop_remove_desc, (VOPFUNC)fifo_remove }, /* remove */ + { &vnop_link_desc, (VOPFUNC)fifo_link }, /* link */ + { &vnop_rename_desc, (VOPFUNC)fifo_rename }, /* rename */ + { &vnop_mkdir_desc, (VOPFUNC)fifo_mkdir }, /* mkdir */ + { &vnop_rmdir_desc, (VOPFUNC)fifo_rmdir }, /* rmdir */ + { &vnop_symlink_desc, (VOPFUNC)fifo_symlink }, /* symlink */ + { &vnop_readdir_desc, (VOPFUNC)fifo_readdir }, /* readdir */ + { &vnop_readlink_desc, (VOPFUNC)fifo_readlink }, /* readlink */ + { &vnop_inactive_desc, (VOPFUNC)hfs_vnop_inactive }, /* inactive */ + { &vnop_reclaim_desc, (VOPFUNC)hfs_vnop_reclaim }, /* reclaim */ + { &vnop_strategy_desc, (VOPFUNC)fifo_strategy }, /* strategy */ + { &vnop_pathconf_desc, (VOPFUNC)fifo_pathconf }, /* pathconf */ + { &vnop_advlock_desc, (VOPFUNC)err_advlock }, /* advlock */ + { &vnop_bwrite_desc, (VOPFUNC)hfs_vnop_bwrite }, + { &vnop_pagein_desc, (VOPFUNC)hfs_vnop_pagein }, /* Pagein */ + { &vnop_pageout_desc, (VOPFUNC)hfs_vnop_pageout }, /* Pageout */ + { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* copyfile */ + { &vnop_blktooff_desc, (VOPFUNC)hfs_vnop_blktooff }, /* blktooff */ + { &vnop_offtoblk_desc, (VOPFUNC)hfs_vnop_offtoblk }, /* offtoblk */ + { &vnop_blockmap_desc, (VOPFUNC)hfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (VOPFUNC)hfs_vnop_getxattr}, + { &vnop_setxattr_desc, (VOPFUNC)hfs_vnop_setxattr}, + { &vnop_removexattr_desc, (VOPFUNC)hfs_vnop_removexattr}, + { &vnop_listxattr_desc, (VOPFUNC)hfs_vnop_listxattr}, + { (struct vnodeop_desc*)NULL, (VOPFUNC)NULL } +}; +const struct vnodeopv_desc hfs_fifoop_opv_desc = + { &hfs_fifoop_p, hfs_fifoop_entries }; +#endif /* FIFO */ diff --git a/core/hfs_xattr.c b/core/hfs_xattr.c new file mode 100644 index 0000000..0d3dcd3 --- /dev/null +++ b/core/hfs_xattr.c @@ -0,0 +1,2633 @@ +/* + * Copyright (c) 2004-2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfs.h" +#include "hfs_cnode.h" +#include "hfs_mount.h" +#include "hfs_format.h" +#include "hfs_endian.h" +#include "hfs_btreeio.h" +#include "hfs_fsctl.h" +#include "hfs_cprotect.h" + +#include "BTreesInternal.h" + +#define HFS_XATTR_VERBOSE 0 + +#define ATTRIBUTE_FILE_NODE_SIZE 8192 + + +/* State information for the listattr_callback callback function. */ +struct listattr_callback_state { + u_int32_t fileID; + int result; + uio_t uio; + size_t size; +#if HFS_COMPRESSION + int showcompressed; + vfs_context_t ctx; + vnode_t vp; +#endif /* HFS_COMPRESSION */ +}; + + +/* HFS Internal Names */ +#define XATTR_XATTREXTENTS_NAME "system.xattrextents" + +static u_int32_t emptyfinfo[8] = {0}; + +static int hfs_zero_hidden_fields (struct cnode *cp, u_int8_t *finderinfo); + +const char hfs_attrdatafilename[] = "Attribute Data"; + +static int listattr_callback(const HFSPlusAttrKey *key, const HFSPlusAttrData *data, + struct listattr_callback_state *state); + +static int remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator); + +static int getnodecount(struct hfsmount *hfsmp, size_t nodesize); + +static size_t getmaxinlineattrsize(struct vnode * attrvp); + +static int read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents); + +static int write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents); + +static int alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, HFSPlusExtentDescriptor *extents, int *blocks); + +static void free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *extents); + +static int has_overflow_extents(HFSPlusForkData *forkdata); + +static int count_extent_blocks(int maxblks, HFSPlusExtentRecord extents); + +#if NAMEDSTREAMS +/* + * Obtain the vnode for a stream. + */ +int +hfs_vnop_getnamedstream(struct vnop_getnamedstream_args* ap) +{ + vnode_t vp = ap->a_vp; + vnode_t *svpp = ap->a_svpp; + struct cnode *cp; + int error = 0; + + *svpp = NULL; + + /* + * We only support the "com.apple.ResourceFork" stream. + */ + if (strcmp(ap->a_name, XATTR_RESOURCEFORK_NAME) != 0) { + return (ENOATTR); + } + cp = VTOC(vp); + if ( !S_ISREG(cp->c_mode) ) { + return (EPERM); + } +#if HFS_COMPRESSION + int hide_rsrc = hfs_hides_rsrc(ap->a_context, VTOC(vp), 1); +#endif /* HFS_COMPRESSION */ + if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (error); + } + if ((!hfs_has_rsrc(cp) +#if HFS_COMPRESSION + || hide_rsrc +#endif /* HFS_COMPRESSION */ + ) && (ap->a_operation != NS_OPEN)) { + hfs_unlock(cp); + return (ENOATTR); + } + error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp); + hfs_unlock(cp); + + return (error); +} + +/* + * Create a stream. + */ +int +hfs_vnop_makenamedstream(struct vnop_makenamedstream_args* ap) +{ + vnode_t vp = ap->a_vp; + vnode_t *svpp = ap->a_svpp; + struct cnode *cp; + int error = 0; + + *svpp = NULL; + + /* + * We only support the "com.apple.ResourceFork" stream. 
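+ * Any other stream name is rejected with ENOATTR below; the resource
+ * fork vnode itself is obtained through hfs_vgetrsrc().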
+ */ + if (strcmp(ap->a_name, XATTR_RESOURCEFORK_NAME) != 0) { + return (ENOATTR); + } + cp = VTOC(vp); + if ( !S_ISREG(cp->c_mode) ) { + return (EPERM); + } +#if HFS_COMPRESSION + if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { + if (VNODE_IS_RSRC(vp)) { + return EINVAL; + } else { + error = decmpfs_decompress_file(vp, VTOCMP(vp), -1, 1, 0); + if (error != 0) + return error; + } + } +#endif /* HFS_COMPRESSION */ + if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (error); + } + error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp); + hfs_unlock(cp); + + return (error); +} + +/* + * Remove a stream. + */ +int +hfs_vnop_removenamedstream(struct vnop_removenamedstream_args* ap) +{ + vnode_t svp = ap->a_svp; + cnode_t *scp = VTOC(svp); + int error = 0; + + /* + * We only support the "com.apple.ResourceFork" stream. + */ + if (strcmp(ap->a_name, XATTR_RESOURCEFORK_NAME) != 0) { + return (ENOATTR); + } +#if HFS_COMPRESSION + if (hfs_hides_rsrc(ap->a_context, scp, 1)) { + /* do nothing */ + return 0; + } +#endif /* HFS_COMPRESSION */ + + hfs_lock_truncate(scp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (VTOF(svp)->ff_size) { + // hfs_truncate will deal with the cnode lock + error = hfs_truncate(svp, 0, IO_NDELAY, 0, ap->a_context); + } + hfs_unlock_truncate(scp, HFS_LOCK_DEFAULT); + + return error; +} +#endif + + +/* Zero out the date added field for the specified cnode */ +static int hfs_zero_hidden_fields (struct cnode *cp, u_int8_t *finderinfo) +{ + u_int8_t *finfo = finderinfo; + + /* Advance finfo by 16 bytes to the 2nd half of the finderinfo */ + finfo = finfo + 16; + + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->document_id = 0; + extinfo->date_added = 0; + extinfo->write_gen_counter = 0; + } else { + /* Return an error */ + return -1; + } + return 0; + +} + +/* + * Retrieve the data of an extended attribute. + */ +int +hfs_vnop_getxattr(struct vnop_getxattr_args *ap) +/* + struct vnop_getxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct hfsmount *hfsmp; + uio_t uio = ap->a_uio; + size_t bufsize; + int result; + + cp = VTOC(vp); + if (vp == cp->c_vp) { +#if HFS_COMPRESSION + int decmpfs_hide = hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1); /* 1 == don't take the cnode lock */ + if (decmpfs_hide && !(ap->a_options & XATTR_SHOWCOMPRESSION)) + return ENOATTR; +#endif /* HFS_COMPRESSION */ + + /* Get the Finder Info. */ + if (strcmp(ap->a_name, XATTR_FINDERINFO_NAME) == 0) { + u_int8_t finderinfo[32]; + bufsize = 32; + + if ((result = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { + return (result); + } + /* Make a copy since we may not export all of it. */ + bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); + hfs_unlock(cp); + + /* Zero out the date added field in the local copy */ + hfs_zero_hidden_fields (cp, finderinfo); + + /* Don't expose a symlink's private type/creator. 
*/ + if (vnode_islnk(vp)) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + /* If Finder Info is empty then it doesn't exist. */ + if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { + return (ENOATTR); + } + if (uio == NULL) { + *ap->a_size = bufsize; + return (0); + } + if ((user_size_t)uio_resid(uio) < bufsize) + return (ERANGE); + + result = uiomove((caddr_t)&finderinfo , bufsize, uio); + + return (result); + } + /* Read the Resource Fork. */ + if (strcmp(ap->a_name, XATTR_RESOURCEFORK_NAME) == 0) { + struct vnode *rvp = NULL; + int openunlinked = 0; + int namelen = 0; + + if ( !S_ISREG(cp->c_mode) ) { + return (EPERM); + } + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (result); + } + namelen = cp->c_desc.cd_namelen; + + if (!hfs_has_rsrc(cp)) { + hfs_unlock(cp); + return (ENOATTR); + } + hfsmp = VTOHFS(vp); + if ((cp->c_flag & C_DELETED) && (namelen == 0)) { + openunlinked = 1; + } + + result = hfs_vgetrsrc(hfsmp, vp, &rvp); + hfs_unlock(cp); + if (result) { + return (result); + } + if (uio == NULL) { + *ap->a_size = (size_t)VTOF(rvp)->ff_size; + } else { +#if HFS_COMPRESSION + user_ssize_t uio_size = 0; + if (decmpfs_hide) + uio_size = uio_resid(uio); +#endif /* HFS_COMPRESSION */ + result = VNOP_READ(rvp, uio, 0, ap->a_context); +#if HFS_COMPRESSION + if (decmpfs_hide && + (result == 0) && + (uio_resid(uio) == uio_size)) { + /* + * We intentionally make the above call to VNOP_READ so that + * it can return an authorization/permission/etc. Error + * based on ap->a_context and thus deny this operation; + * in that case, result != 0 and we won't proceed. + * + * However, if result == 0, it will have returned no data + * because hfs_vnop_read hid the resource fork + * (hence uio_resid(uio) == uio_size, i.e. the uio is untouched) + * + * In that case, we try again with the decmpfs_ctx context + * to get the actual data + */ + result = VNOP_READ(rvp, uio, 0, decmpfs_ctx); + } +#endif /* HFS_COMPRESSION */ + } + /* force the rsrc fork vnode to recycle right away */ + if (openunlinked) { + int vref; + vref = vnode_ref (rvp); + if (vref == 0) { + vnode_rele (rvp); + } + vnode_recycle(rvp); + } + vnode_put(rvp); + return (result); + } + } + hfsmp = VTOHFS(vp); +#if CONFIG_HFS_STD + /* + * Standard HFS only supports native FinderInfo and Resource Forks. + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (EPERM); + } +#endif + + if ((result = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { + return (result); + } + + /* Check for non-rsrc, non-finderinfo EAs */ + result = hfs_getxattr_internal (cp, ap, VTOHFS(cp->c_vp), 0); + + hfs_unlock(cp); + + return MacToVFSError(result); +} + +// Has same limitations as hfs_getxattr_internal below +int hfs_xattr_read(vnode_t vp, const char *name, void *data, size_t *size) +{ + uio_t uio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); + + uio_addiov(uio, CAST_USER_ADDR_T(data), *size); + + struct vnop_getxattr_args args = { + .a_uio = uio, + .a_name = name, + .a_size = size + }; + + int ret = hfs_getxattr_internal(VTOC(vp), &args, VTOHFS(vp), 0); + + uio_free(uio); + + return ret; +} + +/* + * getxattr_internal + * + * We break out this internal function which searches the attributes B-Tree and the + * overflow extents file to find non-resource, non-finderinfo EAs. 
There may be cases + * where we need to get EAs in contexts where we are already holding the cnode lock, + * and to re-enter hfs_vnop_getxattr would cause us to double-lock the cnode. Instead, + * we can just directly call this function. + * + * We pass the hfsmp argument directly here because we may not necessarily have a cnode to + * operate on. Under normal conditions, we have a file or directory to query, but if we + * are operating on the root directory (id 1), then we may not have a cnode. In this case, if hte + * 'cp' argument is NULL, then we need to use the 'fileid' argument as the entry to manipulate + * + * NOTE: This function assumes the cnode lock for 'cp' is held exclusive or shared. + */ +int hfs_getxattr_internal (struct cnode *cp, struct vnop_getxattr_args *ap, + struct hfsmount *hfsmp, u_int32_t fileid) +{ + + struct filefork *btfile; + struct BTreeIterator * iterator = NULL; + size_t bufsize = 0; + HFSPlusAttrRecord *recp = NULL; + size_t recp_size = 0; + FSBufferDescriptor btdata; + int lockflags = 0; + int result = 0; + u_int16_t datasize = 0; + uio_t uio = ap->a_uio; + u_int32_t target_id = 0; + + if (cp) { + target_id = cp->c_fileid; + } else { + target_id = fileid; + } + + + /* Bail if we don't have an EA B-Tree. */ + if ((hfsmp->hfs_attribute_vp == NULL) || + ((cp) && (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0)) { + result = ENOATTR; + goto exit; + } + + /* Initialize the B-Tree iterator for searching for the proper EA */ + btfile = VTOF(hfsmp->hfs_attribute_vp); + + iterator = hfs_mallocz(sizeof(*iterator)); + + /* Allocate memory for reading in the attribute record. This buffer is + * big enough to read in all types of attribute records. It is not big + * enough to read inline attribute data which is read in later. + */ + recp = hfs_malloc(recp_size = sizeof(HFSPlusAttrRecord)); + btdata.bufferAddress = recp; + btdata.itemSize = sizeof(HFSPlusAttrRecord); + btdata.itemCount = 1; + + result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto exit; + } + + /* Lookup the attribute in the Attribute B-Tree */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + + if (result) { + if (result == btNotFound) { + result = ENOATTR; + } + goto exit; + } + + /* + * Operate differently if we have inline EAs that can fit in the attribute B-Tree or if + * we have extent based EAs. + */ + switch (recp->recordType) { + + /* Attribute fits in the Attribute B-Tree */ + case kHFSPlusAttrInlineData: { + /* + * Sanity check record size. It's not required to have any + * user data, so the minimum size is 2 bytes less that the + * size of HFSPlusAttrData (since HFSPlusAttrData struct + * has 2 bytes set aside for attribute data). + */ + if (datasize < (sizeof(HFSPlusAttrData) - 2)) { + printf("hfs_getxattr: vol=%s %d,%s invalid record size %d (expecting %lu)\n", + hfsmp->vcbVN, target_id, ap->a_name, datasize, sizeof(HFSPlusAttrData)); + result = ENOATTR; + break; + } + *ap->a_size = recp->attrData.attrSize; + if (uio && recp->attrData.attrSize != 0) { + if (*ap->a_size > (user_size_t)uio_resid(uio)) { + /* User provided buffer is not large enough for the xattr data */ + result = ERANGE; + } else { + /* Previous BTreeSearchRecord() read in only the attribute record, + * and not the attribute data. 
Now allocate enough memory for + * both attribute record and data, and read the attribute record again. + */ + bufsize = sizeof(HFSPlusAttrData) - 2 + recp->attrData.attrSize; + hfs_free(recp, recp_size); + recp = hfs_malloc(recp_size = bufsize); + + btdata.bufferAddress = recp; + btdata.itemSize = bufsize; + btdata.itemCount = 1; + + bzero(iterator, sizeof(*iterator)); + result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto exit; + } + + /* Lookup the attribute record and inline data */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + hfs_systemfile_unlock(hfsmp, lockflags); + if (result) { + if (result == btNotFound) { + result = ENOATTR; + } + goto exit; + } + + /* Copy-out the attribute data to the user buffer */ + *ap->a_size = recp->attrData.attrSize; + result = uiomove((caddr_t) &recp->attrData.attrData , recp->attrData.attrSize, uio); + } + } + break; + } + + /* Extent-Based EAs */ + case kHFSPlusAttrForkData: { + if (datasize < sizeof(HFSPlusAttrForkData)) { + printf("hfs_getxattr: vol=%s %d,%s invalid record size %d (expecting %lu)\n", + hfsmp->vcbVN, target_id, ap->a_name, datasize, sizeof(HFSPlusAttrForkData)); + result = ENOATTR; + break; + } + *ap->a_size = recp->forkData.theFork.logicalSize; + if (uio == NULL) { + break; + } + if (*ap->a_size > (user_size_t)uio_resid(uio)) { + result = ERANGE; + break; + } + /* Process overflow extents if necessary. */ + if (has_overflow_extents(&recp->forkData.theFork)) { + HFSPlusExtentDescriptor *extentbuf; + HFSPlusExtentDescriptor *extentptr; + size_t extentbufsize; + u_int32_t totalblocks; + u_int32_t blkcnt; + u_int32_t attrlen; + + totalblocks = recp->forkData.theFork.totalBlocks; + /* Ignore bogus block counts. */ + if (totalblocks > howmany(HFS_XATTR_MAXSIZE, hfsmp->blockSize)) { + result = ERANGE; + break; + } + attrlen = recp->forkData.theFork.logicalSize; + + /* Get a buffer to hold the worst case amount of extents. */ + extentbufsize = totalblocks * sizeof(HFSPlusExtentDescriptor); + extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); + extentbuf = hfs_mallocz(extentbufsize); + extentptr = extentbuf; + + /* Grab the first 8 extents. */ + bcopy(&recp->forkData.theFork.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); + extentptr += kHFSPlusExtentDensity; + blkcnt = count_extent_blocks(totalblocks, recp->forkData.theFork.extents); + + /* Now lookup the overflow extents. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + while (blkcnt < totalblocks) { + ((HFSPlusAttrKey *)&iterator->key)->startBlock = blkcnt; + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + if (result || + (recp->recordType != kHFSPlusAttrExtents) || + (datasize < sizeof(HFSPlusAttrExtents))) { + printf("hfs_getxattr: %s missing extents, only %d blks of %d found\n", + ap->a_name, blkcnt, totalblocks); + result = ENOATTR; + break; /* break from while */ + } + /* Grab the next 8 extents. 
*/ + bcopy(&recp->overflowExtents.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); + extentptr += kHFSPlusExtentDensity; + blkcnt += count_extent_blocks(totalblocks, recp->overflowExtents.extents); + } + + /* Release Attr B-Tree lock */ + hfs_systemfile_unlock(hfsmp, lockflags); + + if (blkcnt < totalblocks) { + result = ENOATTR; + } else { + result = read_attr_data(hfsmp, uio, attrlen, extentbuf); + } + hfs_free(extentbuf, extentbufsize); + + } else { /* No overflow extents. */ + result = read_attr_data(hfsmp, uio, recp->forkData.theFork.logicalSize, recp->forkData.theFork.extents); + } + break; + } + + default: + /* We only support Extent or inline EAs. Default to ENOATTR for anything else */ + result = ENOATTR; + break; + } + +exit: + hfs_free(iterator, sizeof(*iterator)); + hfs_free(recp, recp_size); + + return result; + +} + + +/* + * Set the data of an extended attribute. + */ +int +hfs_vnop_setxattr(struct vnop_setxattr_args *ap) +/* + struct vnop_setxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + uio_t a_uio; + int a_options; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp = NULL; + struct hfsmount *hfsmp; + uio_t uio = ap->a_uio; + size_t attrsize; + void * user_data_ptr = NULL; + int result; + time_t orig_ctime=VTOC(vp)->c_ctime; + + if (ap->a_name == NULL || ap->a_name[0] == '\0') { + return (EINVAL); /* invalid name */ + } + hfsmp = VTOHFS(vp); + if (VNODE_IS_RSRC(vp)) { + return (EPERM); + } + +#if HFS_COMPRESSION + if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) ) { /* 1 == don't take the cnode lock */ + result = decmpfs_decompress_file(vp, VTOCMP(vp), -1, 1, 0); + if (result != 0) + return result; + } +#endif /* HFS_COMPRESSION */ + + nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_WRITE_OP, NSPACE_REARM_NO_ARG); + + /* Set the Finder Info. */ + if (strcmp(ap->a_name, XATTR_FINDERINFO_NAME) == 0) { + union { + uint8_t data[32]; + char cdata[32]; + struct FndrFileInfo info; + } fi; + void * finderinfo_start; + u_int8_t *finfo = NULL; + u_int16_t fdFlags; + u_int32_t dateadded = 0; + u_int32_t write_gen_counter = 0; + u_int32_t document_id = 0; + + attrsize = sizeof(VTOC(vp)->c_finderinfo); + + if ((user_size_t)uio_resid(uio) != attrsize) { + return (ERANGE); + } + /* Grab the new Finder Info data. */ + if ((result = uiomove(fi.cdata, attrsize, uio))) { + return (result); + } + + if ((result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (result); + } + cp = VTOC(vp); + + /* Symlink's don't have an external type/creator. */ + if (vnode_islnk(vp)) { + /* Skip over type/creator fields. */ + finderinfo_start = &cp->c_finderinfo[8]; + attrsize -= 8; + } else { + finderinfo_start = &cp->c_finderinfo[0]; + /* + * Don't allow the external setting of + * file type to kHardLinkFileType. + */ + if (fi.info.fdType == SWAP_BE32(kHardLinkFileType)) { + hfs_unlock(cp); + return (EPERM); + } + } + + /* Grab the current date added from the cnode */ + dateadded = hfs_get_dateadded (cp); + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16); + /* + * Grab generation counter directly from the cnode + * instead of calling hfs_get_gencount(), because + * for zero generation count values hfs_get_gencount() + * lies and bumps it up to one. 
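+ * The raw value (along with the document id) is saved here so that it
+ * round-trips unchanged when the new Finder Info is copied back into
+ * the cnode below.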
+ */ + write_gen_counter = extinfo->write_gen_counter; + document_id = extinfo->document_id; + } else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)((u_int8_t*)cp->c_finderinfo + 16); + write_gen_counter = extinfo->write_gen_counter; + document_id = extinfo->document_id; + } + + /* + * Zero out the finder info's reserved fields like date added, + * generation counter, and document id to ignore user's attempts + * to set it + */ + hfs_zero_hidden_fields(cp, fi.data); + + if (bcmp(finderinfo_start, emptyfinfo, attrsize)) { + /* attr exists and "create" was specified. */ + if (ap->a_options & XATTR_CREATE) { + hfs_unlock(cp); + return (EEXIST); + } + } else { /* empty */ + /* attr doesn't exists and "replace" was specified. */ + if (ap->a_options & XATTR_REPLACE) { + hfs_unlock(cp); + return (ENOATTR); + } + } + + /* + * Now restore the date added and other reserved fields to the finderinfo to + * be written out. Advance to the 2nd half of the finderinfo to write them + * out into the buffer. + * + * Make sure to endian swap the date added back into big endian. When we used + * hfs_get_dateadded above to retrieve it, it swapped into local endianness + * for us. But now that we're writing it out, put it back into big endian. + */ + finfo = &fi.data[16]; + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + extinfo->write_gen_counter = write_gen_counter; + extinfo->document_id = document_id; + } else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + extinfo->write_gen_counter = write_gen_counter; + extinfo->document_id = document_id; + } + + /* Set the cnode's Finder Info. */ + if (attrsize == sizeof(cp->c_finderinfo)) { + bcopy(&fi.data[0], finderinfo_start, attrsize); + } else { + bcopy(&fi.data[8], finderinfo_start, attrsize); + } + + /* Updating finderInfo updates change time and modified time */ + cp->c_touch_chgtime = TRUE; + cp->c_flag |= C_MODIFIED; + + /* + * Mirror the invisible bit to the UF_HIDDEN flag. + * + * The fdFlags for files and frFlags for folders are both 8 bytes + * into the userInfo (the first 16 bytes of the Finder Info). They + * are both 16-bit fields. + */ + fdFlags = *((u_int16_t *) &cp->c_finderinfo[8]); + if (fdFlags & OSSwapHostToBigConstInt16(kFinderInvisibleMask)) { + cp->c_bsdflags |= UF_HIDDEN; + } else { + cp->c_bsdflags &= ~UF_HIDDEN; + } + + result = hfs_update(vp, 0); + + hfs_unlock(cp); + return (result); + } + /* Write the Resource Fork. */ + if (strcmp(ap->a_name, XATTR_RESOURCEFORK_NAME) == 0) { + struct vnode *rvp = NULL; + int namelen = 0; + int openunlinked = 0; + + if (!vnode_isreg(vp)) { + return (EPERM); + } + if ((result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (result); + } + cp = VTOC(vp); + namelen = cp->c_desc.cd_namelen; + + if (hfs_has_rsrc(cp)) { + /* attr exists and "create" was specified. */ + if (ap->a_options & XATTR_CREATE) { + hfs_unlock(cp); + return (EEXIST); + } + } else { + /* attr doesn't exists and "replace" was specified. */ + if (ap->a_options & XATTR_REPLACE) { + hfs_unlock(cp); + return (ENOATTR); + } + } + + /* + * Note that we could be called on to grab the rsrc fork vnode + * for a file that has become open-unlinked. 
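+ * That case is detected just below via C_DELETED plus an empty
+ * descriptor name, and the resource fork vnode is forced to recycle
+ * once the write completes.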
+ */ + if ((cp->c_flag & C_DELETED) && (namelen == 0)) { + openunlinked = 1; + } + + result = hfs_vgetrsrc(hfsmp, vp, &rvp); + hfs_unlock(cp); + if (result) { + return (result); + } + /* VNOP_WRITE marks cnode as needing a modtime update */ + result = VNOP_WRITE(rvp, uio, 0, ap->a_context); + + /* if open unlinked, force it inactive */ + if (openunlinked) { + int vref; + vref = vnode_ref (rvp); + if (vref == 0) { + vnode_rele(rvp); + } + vnode_recycle (rvp); + } else { + /* cnode is not open-unlinked, so re-lock cnode to sync */ + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + vnode_recycle (rvp); + vnode_put(rvp); + return result; + } + + /* hfs fsync rsrc fork to force to disk and update modtime */ + result = hfs_fsync (rvp, MNT_NOWAIT, 0, vfs_context_proc (ap->a_context)); + hfs_unlock (cp); + } + + vnode_put(rvp); + return (result); + } +#if CONFIG_HFS_STD + /* + * Standard HFS only supports native FinderInfo and Resource Forks. + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (EPERM); + } +#endif + attrsize = uio_resid(uio); + + /* Enforce an upper limit. */ + if (attrsize > HFS_XATTR_MAXSIZE) { + result = E2BIG; + goto exit; + } + + /* + * Attempt to copy the users attr data before taking any locks, + * only if it will be an inline attribute. For larger attributes, + * the data will be directly read from the uio. + */ + if (attrsize > 0 && + hfsmp->hfs_max_inline_attrsize != 0 && + attrsize < hfsmp->hfs_max_inline_attrsize) { + user_data_ptr = hfs_malloc(attrsize); + + result = uiomove((caddr_t)user_data_ptr, attrsize, uio); + if (result) { + goto exit; + } + } + + result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + if (result) { + goto exit; + } + cp = VTOC(vp); + + /* + * If we're trying to set a non-finderinfo, non-resourcefork EA, then + * call the breakout function. + */ + result = hfs_setxattr_internal (cp, user_data_ptr, attrsize, ap, VTOHFS(vp), 0); + + exit: + if (cp) { + hfs_unlock(cp); + } + if (user_data_ptr) { + hfs_free(user_data_ptr, attrsize); + } + + return (result == btNotFound ? ENOATTR : MacToVFSError(result)); +} + +// Has same limitations as hfs_setxattr_internal below +int hfs_xattr_write(vnode_t vp, const char *name, const void *data, size_t size) +{ + struct vnop_setxattr_args args = { + .a_vp = vp, + .a_name = name, + }; + + return hfs_setxattr_internal(VTOC(vp), data, size, &args, VTOHFS(vp), 0); +} + +/* + * hfs_setxattr_internal + * + * Internal function to set non-rsrc, non-finderinfo EAs to either the attribute B-Tree or + * extent-based EAs. + * + * See comments from hfs_getxattr_internal on why we need to pass 'hfsmp' and fileid here. + * The gist is that we could end up writing to the root folder which may not have a cnode. + * + * Assumptions: + * 1. cnode 'cp' is locked EXCLUSIVE before calling this function. + * 2. data_ptr contains data to be written. If gathering data from userland, this must be + * done before calling this function. + * 3. If data originates entirely in-kernel, use a null UIO, and ensure the size is less than + * hfsmp->hfs_max_inline_attrsize bytes long. 
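+ *
+ * In-kernel callers can use the hfs_xattr_write() wrapper above (which
+ * shares these assumptions) instead of filling out a vnop_setxattr_args
+ * structure by hand.  A sketch of such a call, where the attribute name
+ * and buffer are only examples:
+ *
+ *		error = hfs_xattr_write(vp, "com.example.attr", buf, buflen);
+ *
+ * Per assumption 3, buflen must stay below hfsmp->hfs_max_inline_attrsize
+ * since no UIO is supplied by that wrapper.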
+ */ +int hfs_setxattr_internal (struct cnode *cp, const void *data_ptr, size_t attrsize, + struct vnop_setxattr_args *ap, struct hfsmount *hfsmp, + u_int32_t fileid) +{ + uio_t uio = ap->a_uio; + struct vnode *vp = ap->a_vp; + int started_transaction = 0; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile = NULL; + FSBufferDescriptor btdata; + HFSPlusAttrRecord attrdata; /* 90 bytes */ + HFSPlusAttrRecord *recp = NULL; + size_t recp_size = 0; + HFSPlusExtentDescriptor *extentptr = NULL; + size_t extentbufsize = 0; + int result = 0; + int lockflags = 0; + int exists = 0; + int allocatedblks = 0; + u_int32_t target_id; + + if (cp) { + target_id = cp->c_fileid; + } else { + target_id = fileid; + } + + /* Start a transaction for our changes. */ + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit; + } + started_transaction = 1; + + /* + * Once we started the transaction, nobody can compete + * with us, so make sure this file is still there. + */ + if ((cp) && (cp->c_flag & C_NOEXISTS)) { + result = ENOENT; + goto exit; + } + + /* + * If there isn't an attributes b-tree then create one. + */ + if (hfsmp->hfs_attribute_vp == NULL) { + result = hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, + getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); + if (result) { + goto exit; + } + } + if (hfsmp->hfs_max_inline_attrsize == 0) { + hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); + } + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + /* Build the b-tree key. */ + iterator = hfs_mallocz(sizeof(*iterator)); + result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto exit; + } + + /* Preflight for replace/create semantics. */ + btfile = VTOF(hfsmp->hfs_attribute_vp); + btdata.bufferAddress = &attrdata; + btdata.itemSize = sizeof(attrdata); + btdata.itemCount = 1; + exists = BTSearchRecord(btfile, iterator, &btdata, NULL, NULL) == 0; + + /* Replace requires that the attribute already exists. */ + if ((ap->a_options & XATTR_REPLACE) && !exists) { + result = ENOATTR; + goto exit; + } + /* Create requires that the attribute doesn't exist. */ + if ((ap->a_options & XATTR_CREATE) && exists) { + result = EEXIST; + goto exit; + } + + /* If it won't fit inline then use extent-based attributes. */ + if (attrsize > hfsmp->hfs_max_inline_attrsize) { + int blkcnt; + int extentblks; + u_int32_t *keystartblk; + int i; + + if (uio == NULL) { + /* + * setxattrs originating from in-kernel are not supported if they are bigger + * than the inline max size. Just return ENOATTR and force them to do it with a + * smaller EA. + */ + result = EPERM; + goto exit; + } + + /* Get some blocks. */ + blkcnt = howmany(attrsize, hfsmp->blockSize); + extentbufsize = blkcnt * sizeof(HFSPlusExtentDescriptor); + extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); + extentptr = hfs_mallocz(extentbufsize); + result = alloc_attr_blks(hfsmp, attrsize, extentbufsize, extentptr, &allocatedblks); + if (result) { + allocatedblks = 0; + goto exit; /* no more space */ + } + /* Copy data into the blocks. */ + result = write_attr_data(hfsmp, uio, attrsize, extentptr); + if (result) { + if (vp) { + const char *name = vnode_getname(vp); + printf("hfs_setxattr: write_attr_data vol=%s err (%d) %s:%s\n", + hfsmp->vcbVN, result, name ? name : "", ap->a_name); + if (name) + vnode_putname(name); + } + goto exit; + } + + /* Now remove any previous attribute. 
*/ + if (exists) { + result = remove_attribute_records(hfsmp, iterator); + if (result) { + if (vp) { + const char *name = vnode_getname(vp); + printf("hfs_setxattr: remove_attribute_records vol=%s err (%d) %s:%s\n", + hfsmp->vcbVN, result, name ? name : "", ap->a_name); + if (name) + vnode_putname(name); + } + goto exit; + } + } + /* Create attribute fork data record. */ + recp = hfs_malloc(recp_size = sizeof(HFSPlusAttrRecord)); + + btdata.bufferAddress = recp; + btdata.itemCount = 1; + btdata.itemSize = sizeof(HFSPlusAttrForkData); + + recp->recordType = kHFSPlusAttrForkData; + recp->forkData.reserved = 0; + recp->forkData.theFork.logicalSize = attrsize; + recp->forkData.theFork.clumpSize = 0; + recp->forkData.theFork.totalBlocks = blkcnt; + bcopy(extentptr, recp->forkData.theFork.extents, sizeof(HFSPlusExtentRecord)); + + (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + + result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); + if (result) { + printf ("hfs_setxattr: BTInsertRecord(): vol=%s %d,%s err=%d\n", + hfsmp->vcbVN, target_id, ap->a_name, result); + goto exit; + } + extentblks = count_extent_blocks(blkcnt, recp->forkData.theFork.extents); + blkcnt -= extentblks; + keystartblk = &((HFSPlusAttrKey *)&iterator->key)->startBlock; + i = 0; + + /* Create overflow extents as needed. */ + while (blkcnt > 0) { + /* Initialize the key and record. */ + *keystartblk += (u_int32_t)extentblks; + btdata.itemSize = sizeof(HFSPlusAttrExtents); + recp->recordType = kHFSPlusAttrExtents; + recp->overflowExtents.reserved = 0; + + /* Copy the next set of extents. */ + i += kHFSPlusExtentDensity; + bcopy(&extentptr[i], recp->overflowExtents.extents, sizeof(HFSPlusExtentRecord)); + + result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); + if (result) { + printf ("hfs_setxattr: BTInsertRecord() overflow: vol=%s %d,%s err=%d\n", + hfsmp->vcbVN, target_id, ap->a_name, result); + goto exit; + } + extentblks = count_extent_blocks(blkcnt, recp->overflowExtents.extents); + blkcnt -= extentblks; + } + } else { /* Inline data */ + if (exists) { + result = remove_attribute_records(hfsmp, iterator); + if (result) { + goto exit; + } + } + + /* Calculate size of record rounded up to multiple of 2 bytes. */ + btdata.itemSize = sizeof(HFSPlusAttrData) - 2 + attrsize + ((attrsize & 1) ? 1 : 0); + recp = hfs_malloc(recp_size = btdata.itemSize); + + recp->recordType = kHFSPlusAttrInlineData; + recp->attrData.reserved[0] = 0; + recp->attrData.reserved[1] = 0; + recp->attrData.attrSize = attrsize; + + /* Copy in the attribute data (if any). */ + if (attrsize > 0) { + if (data_ptr) { + bcopy(data_ptr, &recp->attrData.attrData, attrsize); + } else { + /* + * A null UIO meant it originated in-kernel. If they didn't supply data_ptr + * then deny the copy operation. + */ + if (uio == NULL) { + result = EPERM; + goto exit; + } + result = uiomove((caddr_t)&recp->attrData.attrData, attrsize, uio); + } + + if (result) { + goto exit; + } + } + + (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + + btdata.bufferAddress = recp; + btdata.itemCount = 1; + result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); + } + +exit: + if (btfile && started_transaction) { + (void) BTFlushPath(btfile); + } + hfs_systemfile_unlock(hfsmp, lockflags); + if (result == 0) { + if (vp) { + cp = VTOC(vp); + /* Setting an attribute only updates change time and not + * modified time of the file. 
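+ * Hence only c_touch_chgtime is set below.  The record is also
+ * flagged kHFSHasAttributesMask (and kHFSHasSecurityMask when the
+ * ACL attribute KAUTH_FILESEC_XATTR is written) before hfs_update()
+ * pushes the change out.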
+ */ + cp->c_touch_chgtime = TRUE; + cp->c_flag |= C_MODIFIED; + cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + if ((strcmp(ap->a_name, KAUTH_FILESEC_XATTR) == 0)) { + cp->c_attr.ca_recflags |= kHFSHasSecurityMask; + } + (void) hfs_update(vp, 0); + } + } + if (started_transaction) { + if (result && allocatedblks) { + free_attr_blks(hfsmp, allocatedblks, extentptr); + } + hfs_end_transaction(hfsmp); + } + + hfs_free(recp, recp_size); + hfs_free(extentptr, extentbufsize); + hfs_free(iterator, sizeof(*iterator)); + + return result; +} + + + + +/* + * Remove an extended attribute. + */ +int +hfs_vnop_removexattr(struct vnop_removexattr_args *ap) +/* + struct vnop_removexattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + char * a_name; + int a_options; + vfs_context_t a_context; + }; +*/ +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp = VTOC(vp); + struct hfsmount *hfsmp; + struct BTreeIterator * iterator = NULL; + int lockflags; + int result; + time_t orig_ctime=VTOC(vp)->c_ctime; + + if (ap->a_name == NULL || ap->a_name[0] == '\0') { + return (EINVAL); /* invalid name */ + } + hfsmp = VTOHFS(vp); + if (VNODE_IS_RSRC(vp)) { + return (EPERM); + } + +#if HFS_COMPRESSION + if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) && !(ap->a_options & XATTR_SHOWCOMPRESSION)) { + return ENOATTR; + } +#endif /* HFS_COMPRESSION */ + + nspace_snapshot_event(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_DELETE_OP, NSPACE_REARM_NO_ARG); + + /* If Resource Fork is non-empty then truncate it. */ + if (strcmp(ap->a_name, XATTR_RESOURCEFORK_NAME) == 0) { + struct vnode *rvp = NULL; + + if ( !vnode_isreg(vp) ) { + return (EPERM); + } + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (result); + } + if (!hfs_has_rsrc(cp)) { + hfs_unlock(cp); + return (ENOATTR); + } + result = hfs_vgetrsrc(hfsmp, vp, &rvp); + hfs_unlock(cp); + if (result) { + return (result); + } + + hfs_lock_truncate(VTOC(rvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + + // Tell UBC now before we take the cnode lock and start the transaction + hfs_ubc_setsize(rvp, 0, false); + + if ((result = hfs_lock(VTOC(rvp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + vnode_put(rvp); + return (result); + } + + /* Start a transaction for encapsulating changes in + * hfs_truncate() and hfs_update() + */ + if ((result = hfs_start_transaction(hfsmp))) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + hfs_unlock(cp); + vnode_put(rvp); + return (result); + } + + result = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 0, ap->a_context); + if (result == 0) { + cp->c_touch_chgtime = TRUE; + cp->c_flag |= C_MODIFIED; + result = hfs_update(vp, 0); + } + + hfs_end_transaction(hfsmp); + hfs_unlock_truncate(VTOC(rvp), HFS_LOCK_DEFAULT); + hfs_unlock(VTOC(rvp)); + + vnode_put(rvp); + return (result); + } + /* Clear out the Finder Info. */ + if (strcmp(ap->a_name, XATTR_FINDERINFO_NAME) == 0) { + void * finderinfo_start; + int finderinfo_size; + u_int8_t finderinfo[32]; + u_int32_t date_added = 0, write_gen_counter = 0, document_id = 0; + u_int8_t *finfo = NULL; + + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + return (result); + } + + /* Use the local copy to store our temporary changes. */ + bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); + + + /* Zero out the date added field in the local copy */ + hfs_zero_hidden_fields (cp, finderinfo); + + /* Don't expose a symlink's private type/creator. 
*/ + if (vnode_islnk(vp)) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* Do the byte compare against the local copy */ + if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { + hfs_unlock(cp); + return (ENOATTR); + } + + /* + * If there was other content, zero out everything except + * type/creator and date added. First, save the date added. + */ + finfo = cp->c_finderinfo; + finfo = finfo + 16; + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + date_added = extinfo->date_added; + write_gen_counter = extinfo->write_gen_counter; + document_id = extinfo->document_id; + } else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + date_added = extinfo->date_added; + write_gen_counter = extinfo->write_gen_counter; + document_id = extinfo->document_id; + } + + if (vnode_islnk(vp)) { + /* Ignore type/creator */ + finderinfo_start = &cp->c_finderinfo[8]; + finderinfo_size = sizeof(cp->c_finderinfo) - 8; + } else { + finderinfo_start = &cp->c_finderinfo[0]; + finderinfo_size = sizeof(cp->c_finderinfo); + } + bzero(finderinfo_start, finderinfo_size); + + + /* Now restore the date added */ + if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = date_added; + extinfo->write_gen_counter = write_gen_counter; + extinfo->document_id = document_id; + } else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = date_added; + extinfo->write_gen_counter = write_gen_counter; + extinfo->document_id = document_id; + } + + /* Updating finderInfo updates change time and modified time */ + cp->c_touch_chgtime = TRUE; + cp->c_flag |= C_MODIFIED; + hfs_update(vp, 0); + + hfs_unlock(cp); + + return (0); + } +#if CONFIG_HFS_STD + /* + * Standard HFS only supports native FinderInfo and Resource Forks. 
+ */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (EPERM); + } +#endif + if (hfsmp->hfs_attribute_vp == NULL) { + return (ENOATTR); + } + + iterator = hfs_mallocz(sizeof(*iterator)); + + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) { + goto exit_nolock; + } + + result = hfs_buildattrkey(cp->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto exit; + } + + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit; + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + result = remove_attribute_records(hfsmp, iterator); + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (result == 0) { + cp->c_touch_chgtime = TRUE; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + + /* If no more attributes exist, clear attribute bit */ + result = file_attribute_exist(hfsmp, cp->c_fileid); + if (result == 0) { + cp->c_attr.ca_recflags &= ~kHFSHasAttributesMask; + cp->c_flag |= C_MODIFIED; + } + if (result == EEXIST) { + result = 0; + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + /* If ACL was removed, clear security bit */ + if (strcmp(ap->a_name, KAUTH_FILESEC_XATTR) == 0) { + cp->c_attr.ca_recflags &= ~kHFSHasSecurityMask; + cp->c_flag |= C_MODIFIED; + } + (void) hfs_update(vp, 0); + } + + hfs_end_transaction(hfsmp); +exit: + hfs_unlock(cp); +exit_nolock: + hfs_free(iterator, sizeof(*iterator)); + return MacToVFSError(result); +} + +/* + * Removes a non rsrc-fork, non-finderinfo EA from the specified file ID. + * Note that this results in a bit of code duplication for the xattr removal + * path. This is done because it's a bit messy to deal with things without the + * cnode. This function is used by exchangedata to port XATTRS to alternate + * fileIDs while everything is locked, and the cnodes are in a transitional state. + * + * Assumes that the cnode backing the fileid specified is LOCKED. + */ + +int +hfs_removexattr_by_id (struct hfsmount *hfsmp, uint32_t fileid, const char *xattr_name ) { + struct BTreeIterator iter; // allocated on the stack to avoid heap allocation mid-txn + int ret = 0; + int started_txn = 0; + int lockflags; + + memset (&iter, 0, sizeof(iter)); + + //position the B-Tree iter key before grabbing locks and starting a txn + ret = hfs_buildattrkey (fileid, xattr_name, (HFSPlusAttrKey*)&iter.key); + if (ret) { + goto xattr_out; + } + + //note: this is likely a nested transaction since there is a global transaction cover + if (hfs_start_transaction (hfsmp) != 0) { + ret = EINVAL; + goto xattr_out; + } + started_txn = 1; + + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + //actually remove the EA from the tree + ret = remove_attribute_records(hfsmp, &iter); + + hfs_systemfile_unlock(hfsmp, lockflags); + + /* + * NOTE: Responsibility of the caller to remove the "has XATTRs" bit in the catalog record + * if this was the last EA. + */ + + +xattr_out: + if (started_txn) { + hfs_end_transaction(hfsmp); + } + + return MacToVFSError(ret); + +} + + +/* Check if any attribute record exist for given fileID. This function + * is called by hfs_vnop_removexattr to determine if it should clear the + * attribute bit in the catalog record or not. + * + * Note - you must acquire a shared lock on the attribute btree before + * calling this function. 
+ * + * Output: + * EEXIST - If attribute record was found + * 0 - Attribute was not found + * (other) - Other error (such as EIO) + */ +int +file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID) +{ + HFSPlusAttrKey *key; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + int result = 0; + + // if there's no attribute b-tree we sure as heck + // can't have any attributes! + if (hfsmp->hfs_attribute_vp == NULL) { + return false; + } + + iterator = hfs_mallocz(sizeof(*iterator)); + + key = (HFSPlusAttrKey *)&iterator->key; + + result = hfs_buildattrkey(fileID, NULL, key); + if (result) { + goto out; + } + + btfile = VTOF(hfsmp->hfs_attribute_vp); + result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); + if (result && (result != btNotFound)) { + goto out; + } + + result = BTIterateRecord(btfile, kBTreeNextRecord, iterator, NULL, NULL); + /* If no next record was found or fileID for next record did not match, + * no more attributes exist for this fileID + */ + if ((result && (result == btNotFound)) || (key->fileID != fileID)) { + result = 0; + } else { + result = EEXIST; + } + +out: + hfs_free(iterator, sizeof(*iterator)); + return result; +} + + +/* + * Remove all the records for a given attribute. + * + * - Used by hfs_vnop_removexattr, hfs_vnop_setxattr and hfs_removeallattr. + * - A transaction must have been started. + * - The Attribute b-tree file must be locked exclusive. + * - The Allocation Bitmap file must be locked exclusive. + * - The iterator key must be initialized. + */ +int +remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator) +{ + struct filefork *btfile; + FSBufferDescriptor btdata; + HFSPlusAttrRecord attrdata; /* 90 bytes */ + u_int16_t datasize; + int result; + + btfile = VTOF(hfsmp->hfs_attribute_vp); + + btdata.bufferAddress = &attrdata; + btdata.itemSize = sizeof(attrdata); + btdata.itemCount = 1; + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + if (result) { + goto exit; /* no records. */ + } + /* + * Free the blocks from extent based attributes. + * + * Note that the block references (btree records) are removed + * before releasing the blocks in the allocation bitmap. + */ + if (attrdata.recordType == kHFSPlusAttrForkData) { + int totalblks; + int extentblks; + u_int32_t *keystartblk; + + if (datasize < sizeof(HFSPlusAttrForkData)) { + printf("hfs: remove_attribute_records: bad record size %d (expecting %lu)\n", datasize, sizeof(HFSPlusAttrForkData)); + } + totalblks = attrdata.forkData.theFork.totalBlocks; + + /* Process the first 8 extents. */ + extentblks = count_extent_blocks(totalblks, attrdata.forkData.theFork.extents); + if (extentblks > totalblks) + panic("hfs: remove_attribute_records: corruption..."); + if (BTDeleteRecord(btfile, iterator) == 0) { + free_attr_blks(hfsmp, extentblks, attrdata.forkData.theFork.extents); + } + totalblks -= extentblks; + keystartblk = &((HFSPlusAttrKey *)&iterator->key)->startBlock; + + /* Process any overflow extents. */ + while (totalblks) { + *keystartblk += (u_int32_t)extentblks; + + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + if (result || + (attrdata.recordType != kHFSPlusAttrExtents) || + (datasize < sizeof(HFSPlusAttrExtents))) { + printf("hfs: remove_attribute_records: BTSearchRecord: vol=%s, err=%d (%d), totalblks %d\n", + hfsmp->vcbVN, MacToVFSError(result), attrdata.recordType != kHFSPlusAttrExtents, totalblks); + result = ENOATTR; + break; /* break from while */ + } + /* Process the next 8 extents. 
*/ + extentblks = count_extent_blocks(totalblks, attrdata.overflowExtents.extents); + if (extentblks > totalblks) + panic("hfs: remove_attribute_records: corruption..."); + if (BTDeleteRecord(btfile, iterator) == 0) { + free_attr_blks(hfsmp, extentblks, attrdata.overflowExtents.extents); + } + totalblks -= extentblks; + } + } else { + result = BTDeleteRecord(btfile, iterator); + } + (void) BTFlushPath(btfile); +exit: + return (result == btNotFound ? ENOATTR : MacToVFSError(result)); +} + + +/* + * Retrieve the list of extended attribute names. + */ +int +hfs_vnop_listxattr(struct vnop_listxattr_args *ap) +/* + struct vnop_listxattr_args { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + uio_t a_uio; + size_t *a_size; + int a_options; + vfs_context_t a_context; +*/ +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp = VTOC(vp); + struct hfsmount *hfsmp; + uio_t uio = ap->a_uio; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + struct listattr_callback_state state; + user_addr_t user_start = 0; + user_size_t user_len = 0; + int lockflags; + int result; + u_int8_t finderinfo[32]; + + + if (VNODE_IS_RSRC(vp)) { + return (EPERM); + } + +#if HFS_COMPRESSION + int compressed = hfs_file_is_compressed(cp, 1); /* 1 == don't take the cnode lock */ +#endif /* HFS_COMPRESSION */ + + hfsmp = VTOHFS(vp); + *ap->a_size = 0; + + /* + * Take the truncate lock; this serializes us against the ioctl + * to truncate data & reset the decmpfs state + * in the compressed file handler. + */ + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + + /* Now the regular cnode lock (shared) */ + if ((result = hfs_lock(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT))) { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + return (result); + } + + /* + * Make a copy of the cnode's finderinfo to a local so we can + * zero out the date added field. Also zero out the private type/creator + * for symlinks. + */ + bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); + hfs_zero_hidden_fields (cp, finderinfo); + + /* Don't expose a symlink's private type/creator. */ + if (vnode_islnk(vp)) { + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + + /* If Finder Info is non-empty then export it's name. */ + if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) != 0) { + if (uio == NULL) { + *ap->a_size += sizeof(XATTR_FINDERINFO_NAME); + } else if ((user_size_t)uio_resid(uio) < sizeof(XATTR_FINDERINFO_NAME)) { + result = ERANGE; + goto exit; + } else { + result = uiomove(XATTR_FINDERINFO_NAME, + sizeof(XATTR_FINDERINFO_NAME), uio); + if (result) + goto exit; + } + } + /* If Resource Fork is non-empty then export it's name. */ + if (S_ISREG(cp->c_mode) && hfs_has_rsrc(cp)) { +#if HFS_COMPRESSION + if ((ap->a_options & XATTR_SHOWCOMPRESSION) || + !compressed || + !decmpfs_hides_rsrc(ap->a_context, VTOCMP(vp)) + ) +#endif /* HFS_COMPRESSION */ + { + if (uio == NULL) { + *ap->a_size += sizeof(XATTR_RESOURCEFORK_NAME); + } else if ((user_size_t)uio_resid(uio) < sizeof(XATTR_RESOURCEFORK_NAME)) { + result = ERANGE; + goto exit; + } else { + result = uiomove(XATTR_RESOURCEFORK_NAME, + sizeof(XATTR_RESOURCEFORK_NAME), uio); + if (result) + goto exit; + } + } + } +#if CONFIG_HFS_STD + /* + * Standard HFS only supports native FinderInfo and Resource Forks. + * Return at this point. + */ + if (hfsmp->hfs_flags & HFS_STANDARD) { + result = 0; + goto exit; + } +#endif + /* Bail if we don't have any extended attributes. 
*/ + if ((hfsmp->hfs_attribute_vp == NULL) || + (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0) { + result = 0; + goto exit; + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + iterator = hfs_mallocz(sizeof(*iterator)); + + result = hfs_buildattrkey(cp->c_fileid, NULL, (HFSPlusAttrKey *)&iterator->key); + if (result) + goto exit; + + /* + * Lock the user's buffer here so that we won't fault on + * it in uiomove while holding the attributes b-tree lock. + */ + if (uio && uio_isuserspace(uio)) { + user_start = uio_curriovbase(uio); + user_len = uio_curriovlen(uio); + + if ((result = vslock(user_start, user_len)) != 0) { + user_start = 0; + goto exit; + } + } + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + + result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); + if (result && result != btNotFound) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto exit; + } + + state.fileID = cp->c_fileid; + state.result = 0; + state.uio = uio; + state.size = 0; +#if HFS_COMPRESSION + state.showcompressed = !compressed || ap->a_options & XATTR_SHOWCOMPRESSION; + state.ctx = ap->a_context; + state.vp = vp; +#endif /* HFS_COMPRESSION */ + + /* + * Process entries starting just after iterator->key. + */ + result = BTIterateRecords(btfile, kBTreeNextRecord, iterator, + (IterateCallBackProcPtr)listattr_callback, &state); + hfs_systemfile_unlock(hfsmp, lockflags); + if (uio == NULL) { + *ap->a_size += state.size; + } + + if (state.result || result == btNotFound) + result = state.result; + +exit: + if (user_start) { + vsunlock(user_start, user_len, TRUE); + } + hfs_free(iterator, sizeof(*iterator)); + hfs_unlock(cp); + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + + return MacToVFSError(result); +} + + +/* + * Callback - called for each attribute record + */ +static int +listattr_callback(const HFSPlusAttrKey *key, __unused const HFSPlusAttrData *data, struct listattr_callback_state *state) +{ + char attrname[XATTR_MAXNAMELEN + 1]; + ssize_t bytecount; + int result; + + if (state->fileID != key->fileID) { + state->result = 0; + return (0); /* stop */ + } + /* + * Skip over non-primary keys + */ + if (key->startBlock != 0) { + return (1); /* continue */ + } + + /* Convert the attribute name into UTF-8. */ + result = utf8_encodestr(key->attrName, key->attrNameLen * sizeof(UniChar), + (u_int8_t *)attrname, (size_t *)&bytecount, sizeof(attrname), '/', 0); + if (result) { + state->result = result; + return (0); /* stop */ + } + bytecount++; /* account for null termination char */ + + if (xattr_protected(attrname)) + return (1); /* continue */ + +#if HFS_COMPRESSION + if (!state->showcompressed && decmpfs_hides_xattr(state->ctx, VTOCMP(state->vp), attrname) ) + return 1; /* continue */ +#endif /* HFS_COMPRESSION */ + + if (state->uio == NULL) { + state->size += bytecount; + } else { + if (bytecount > uio_resid(state->uio)) { + state->result = ERANGE; + return (0); /* stop */ + } + result = uiomove((caddr_t) attrname, bytecount, state->uio); + if (result) { + state->result = result; + return (0); /* stop */ + } + } + return (1); /* continue */ +} + +/* + * Remove all the attributes from a cnode. + * + * This function creates/ends its own transaction so that each + * attribute is deleted in its own transaction (to avoid having + * a transaction grow too large). + * + * This function takes the necessary locks on the attribute + * b-tree file and the allocation (bitmap) file. + * + * NOTE: Upon sucecss, this function will return with an open + * transaction. 
The reason we do it this way is because when we + * delete the last attribute, we must make sure the flag in the + * catalog record that indicates there are no more records is cleared. + * The caller is responsible for doing this and *must* do it before + * ending the transaction. + */ +int +hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid, + bool *open_transaction) +{ + BTreeIterator *iterator = NULL; + HFSPlusAttrKey *key; + struct filefork *btfile; + int result, lockflags = 0; + + *open_transaction = false; + + if (hfsmp->hfs_attribute_vp == NULL) + return 0; + + btfile = VTOF(hfsmp->hfs_attribute_vp); + + iterator = hfs_mallocz(sizeof(BTreeIterator)); + + key = (HFSPlusAttrKey *)&iterator->key; + + /* Loop until there are no more attributes for this file id */ + do { + if (!*open_transaction) + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + + (void) hfs_buildattrkey(fileid, NULL, key); + result = BTIterateRecord(btfile, kBTreeNextRecord, iterator, NULL, NULL); + if (result || key->fileID != fileid) + goto exit; + + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + + if (*open_transaction) { + hfs_end_transaction(hfsmp); + *open_transaction = false; + } + + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit; + } + + *open_transaction = true; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + result = remove_attribute_records(hfsmp, iterator); + +#if HFS_XATTR_VERBOSE + if (result) { + printf("hfs_removeallattr: unexpected err %d\n", result); + } +#endif + } while (!result); + +exit: + hfs_free(iterator, sizeof(*iterator)); + + if (lockflags) + hfs_systemfile_unlock(hfsmp, lockflags); + + result = result == btNotFound ? 0 : MacToVFSError(result); + + if (result && *open_transaction) { + hfs_end_transaction(hfsmp); + *open_transaction = false; + } + + return result; +} + +void +hfs_xattr_init(struct hfsmount * hfsmp) +{ +#if CONFIG_HFS_STD + if (ISSET(hfsmp->hfs_flags, HFS_STANDARD)) + return; +#endif + + /* + * If there isn't an attributes b-tree then create one. + */ + if ((hfsmp->hfs_attribute_vp == NULL) && + !(hfsmp->hfs_flags & HFS_READ_ONLY)) { + (void) hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, + getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); + } + if (hfsmp->hfs_attribute_vp) + hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); +} + +/* + * Enable/Disable volume attributes stored as EA for root file system. + * Supported attributes are - + * 1. Extent-based Extended Attributes + */ +int +hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) +{ + struct BTreeIterator * iterator = NULL; + struct filefork *btfile; + int lockflags; + int result; + +#if CONFIG_HFS_STD + if (hfsmp->hfs_flags & HFS_STANDARD) { + return (ENOTSUP); + } +#endif + if (xattrtype != HFSIOC_SET_XATTREXTENTS_STATE) { + return EINVAL; + } + + /* + * If there isn't an attributes b-tree then create one. + */ + if (hfsmp->hfs_attribute_vp == NULL) { + result = hfs_create_attr_btree(hfsmp, ATTRIBUTE_FILE_NODE_SIZE, + getnodecount(hfsmp, ATTRIBUTE_FILE_NODE_SIZE)); + if (result) { + return (result); + } + } + + iterator = hfs_mallocz(sizeof(*iterator)); + + /* + * Build a b-tree key. + * We use the root's parent id (1) to hold this volume attribute. + */ + (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, + (HFSPlusAttrKey *)&iterator->key); + + /* Start a transaction for our changes. 
*/ + if (hfs_start_transaction(hfsmp) != 0) { + result = EINVAL; + goto exit; + } + btfile = VTOF(hfsmp->hfs_attribute_vp); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + if (state == 0) { + /* Remove the attribute. */ + result = BTDeleteRecord(btfile, iterator); + if (result == btNotFound) + result = 0; + } else { + FSBufferDescriptor btdata; + HFSPlusAttrData attrdata; + u_int16_t datasize; + + datasize = sizeof(attrdata); + btdata.bufferAddress = &attrdata; + btdata.itemSize = datasize; + btdata.itemCount = 1; + attrdata.recordType = kHFSPlusAttrInlineData; + attrdata.reserved[0] = 0; + attrdata.reserved[1] = 0; + attrdata.attrSize = 2; + attrdata.attrData[0] = 0; + attrdata.attrData[1] = 0; + + /* Insert the attribute. */ + result = BTInsertRecord(btfile, iterator, &btdata, datasize); + if (result == btExists) + result = 0; + } + (void) BTFlushPath(btfile); + + hfs_systemfile_unlock(hfsmp, lockflags); + + /* Finish the transaction of our changes. */ + hfs_end_transaction(hfsmp); + + /* Update the state in the mount point */ + hfs_lock_mount (hfsmp); + if (state == 0) { + hfsmp->hfs_flags &= ~HFS_XATTR_EXTENTS; + } else { + hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; + } + hfs_unlock_mount (hfsmp); + +exit: + hfs_free(iterator, sizeof(*iterator)); + return MacToVFSError(result); +} + + +/* + * hfs_attrkeycompare - compare two attribute b-tree keys. + * + * The name portion of the key is compared using a 16-bit binary comparison. + * This is called from the b-tree code. + */ +int +hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey) +{ + u_int32_t searchFileID, trialFileID; + int result; + + searchFileID = searchKey->fileID; + trialFileID = trialKey->fileID; + result = 0; + + if (searchFileID > trialFileID) { + ++result; + } else if (searchFileID < trialFileID) { + --result; + } else { + u_int16_t * str1 = &searchKey->attrName[0]; + u_int16_t * str2 = &trialKey->attrName[0]; + int length1 = searchKey->attrNameLen; + int length2 = trialKey->attrNameLen; + u_int16_t c1, c2; + int length; + + if (length1 < length2) { + length = length1; + --result; + } else if (length1 > length2) { + length = length2; + ++result; + } else { + length = length1; + } + + while (length--) { + c1 = *(str1++); + c2 = *(str2++); + + if (c1 > c2) { + result = 1; + break; + } + if (c1 < c2) { + result = -1; + break; + } + } + if (result) + return (result); + /* + * Names are equal; compare startBlock + */ + if (searchKey->startBlock == trialKey->startBlock) { + return (0); + } else { + return (searchKey->startBlock < trialKey->startBlock ? -1 : 1); + } + } + + return result; +} + + +/* + * hfs_buildattrkey - build an Attribute b-tree key + */ +int +hfs_buildattrkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key) +{ + int result = 0; + size_t unicodeBytes = 0; + + if (attrname != NULL) { + /* + * Convert filename from UTF-8 into Unicode + */ + result = utf8_decodestr((const u_int8_t *)attrname, strlen(attrname), key->attrName, + &unicodeBytes, sizeof(key->attrName), 0, 0); + if (result) { + if (result != ENAMETOOLONG) + result = EINVAL; /* name has invalid characters */ + return (result); + } + key->attrNameLen = unicodeBytes / sizeof(UniChar); + key->keyLength = kHFSPlusAttrKeyMinimumLength + unicodeBytes; + } else { + key->attrNameLen = 0; + key->keyLength = kHFSPlusAttrKeyMinimumLength; + } + key->pad = 0; + key->fileID = fileID; + key->startBlock = 0; + + return (0); + } + +/* + * getnodecount - calculate starting node count for attributes b-tree. 
+ */ +static int +getnodecount(struct hfsmount *hfsmp, size_t nodesize) +{ + u_int64_t freebytes; + u_int64_t calcbytes; + + /* + * 10.4: Scale base on current catalog file size (20 %) up to 20 MB. + * 10.5: Attempt to be as big as the catalog clump size. + * + * Use no more than 10 % of the remaining free space. + */ + freebytes = (u_int64_t)hfs_freeblks(hfsmp, 0) * (u_int64_t)hfsmp->blockSize; + + calcbytes = MIN(hfsmp->hfs_catalog_cp->c_datafork->ff_size / 5, 20 * 1024 * 1024); + + calcbytes = MAX(calcbytes, hfsmp->hfs_catalog_cp->c_datafork->ff_clumpsize); + + calcbytes = MIN(calcbytes, freebytes / 10); + + return (MAX(2, (int)(calcbytes / nodesize))); +} + + +/* + * getmaxinlineattrsize - calculate maximum inline attribute size. + * + * This yields 3,802 bytes for an 8K node size. + */ +static size_t +getmaxinlineattrsize(struct vnode * attrvp) +{ + struct BTreeInfoRec btinfo; + size_t nodesize = ATTRIBUTE_FILE_NODE_SIZE; + size_t maxsize; + + if (attrvp != NULL) { + (void) hfs_lock(VTOC(attrvp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + if (BTGetInformation(VTOF(attrvp), 0, &btinfo) == 0) + nodesize = btinfo.nodeSize; + hfs_unlock(VTOC(attrvp)); + } + maxsize = nodesize; + maxsize -= sizeof(BTNodeDescriptor); /* minus node descriptor */ + maxsize -= 3 * sizeof(u_int16_t); /* minus 3 index slots */ + maxsize /= 2; /* 2 key/rec pairs minumum */ + maxsize -= sizeof(HFSPlusAttrKey); /* minus maximum key size */ + maxsize -= sizeof(HFSPlusAttrData) - 2; /* minus data header */ + maxsize &= 0xFFFFFFFE; /* multiple of 2 bytes */ + + return (maxsize); +} + +/* + * Initialize vnode for attribute data I/O. + * + * On success, + * - returns zero + * - the attrdata vnode is initialized as hfsmp->hfs_attrdata_vp + * - an iocount is taken on the attrdata vnode which exists + * for the entire duration of the mount. It is only dropped + * during unmount + * - the attrdata cnode is not locked + * + * On failure, + * - returns non-zero value + * - the caller does not have to worry about any locks or references + */ +int init_attrdata_vnode(struct hfsmount *hfsmp) +{ + vnode_t vp; + int result = 0; + struct cat_desc cat_desc; + struct cat_attr cat_attr; + struct cat_fork cat_fork; + int newvnode_flags = 0; + + bzero(&cat_desc, sizeof(cat_desc)); + cat_desc.cd_parentcnid = kHFSRootParentID; + cat_desc.cd_nameptr = (const u_int8_t *)hfs_attrdatafilename; + cat_desc.cd_namelen = strlen(hfs_attrdatafilename); + cat_desc.cd_cnid = kHFSAttributeDataFileID; + /* Tag vnode as system file, note that we can still use cluster I/O */ + cat_desc.cd_flags |= CD_ISMETA; + + bzero(&cat_attr, sizeof(cat_attr)); + cat_attr.ca_linkcount = 1; + cat_attr.ca_mode = S_IFREG; + cat_attr.ca_fileid = cat_desc.cd_cnid; + cat_attr.ca_blocks = hfsmp->totalBlocks; + + /* + * The attribute data file is a virtual file that spans the + * entire file system space. + * + * Each extent-based attribute occupies a unique portion of + * in this virtual file. The cluster I/O is done using actual + * allocation block offsets so no additional mapping is needed + * for the VNOP_BLOCKMAP call. + * + * This approach allows the attribute data to be cached without + * incurring the high cost of using a separate vnode per attribute. + * + * Since we need to acquire the attribute b-tree file lock anyways, + * the virtual file doesn't introduce any additional serialization. 
+ */ + bzero(&cat_fork, sizeof(cat_fork)); + cat_fork.cf_size = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + cat_fork.cf_blocks = hfsmp->totalBlocks; + cat_fork.cf_extents[0].startBlock = 0; + cat_fork.cf_extents[0].blockCount = cat_fork.cf_blocks; + + result = hfs_getnewvnode(hfsmp, NULL, NULL, &cat_desc, 0, &cat_attr, + &cat_fork, &vp, &newvnode_flags); + if (result == 0) { + hfsmp->hfs_attrdata_vp = vp; + hfs_unlock(VTOC(vp)); + } + return (result); +} + +/* + * Read an extent based attribute. + */ +static int +read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) +{ + vnode_t evp = hfsmp->hfs_attrdata_vp; + int bufsize; + int64_t iosize; + int attrsize; + int blksize; + int i; + int result = 0; + + hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + + bufsize = (int)uio_resid(uio); + attrsize = (int)datasize; + blksize = (int)hfsmp->blockSize; + + /* + * Read the attribute data one extent at a time. + * For the typical case there is only one extent. + */ + for (i = 0; (attrsize > 0) && (bufsize > 0) && (extents[i].startBlock != 0); ++i) { + iosize = extents[i].blockCount * blksize; + iosize = MIN(iosize, attrsize); + iosize = MIN(iosize, bufsize); + uio_setresid(uio, iosize); + uio_setoffset(uio, (u_int64_t)extents[i].startBlock * (u_int64_t)blksize); + + result = cluster_read(evp, uio, VTOF(evp)->ff_size, IO_SYNC | IO_UNIT); + +#if HFS_XATTR_VERBOSE + printf("hfs: read_attr_data: cr iosize %lld [%d, %d] (%d)\n", + iosize, extents[i].startBlock, extents[i].blockCount, result); +#endif + if (result) + break; + attrsize -= iosize; + bufsize -= iosize; + } + uio_setresid(uio, bufsize); + uio_setoffset(uio, datasize); + + hfs_unlock_truncate(VTOC(evp), HFS_LOCK_DEFAULT); + return (result); +} + +/* + * Write an extent based attribute. + */ +static int +write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) +{ + vnode_t evp = hfsmp->hfs_attrdata_vp; + off_t filesize; + int bufsize; + int attrsize; + int64_t iosize; + int blksize; + int i; + int result = 0; + + hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + + bufsize = uio_resid(uio); + attrsize = (int) datasize; + blksize = (int) hfsmp->blockSize; + filesize = VTOF(evp)->ff_size; + + /* + * Write the attribute data one extent at a time. + */ + for (i = 0; (attrsize > 0) && (bufsize > 0) && (extents[i].startBlock != 0); ++i) { + iosize = extents[i].blockCount * blksize; + iosize = MIN(iosize, attrsize); + iosize = MIN(iosize, bufsize); + uio_setresid(uio, iosize); + uio_setoffset(uio, (u_int64_t)extents[i].startBlock * (u_int64_t)blksize); + + result = cluster_write(evp, uio, filesize, filesize, filesize, + (off_t) 0, IO_SYNC | IO_UNIT); +#if HFS_XATTR_VERBOSE + printf("hfs: write_attr_data: cw iosize %lld [%d, %d] (%d)\n", + iosize, extents[i].startBlock, extents[i].blockCount, result); +#endif + if (result) + break; + attrsize -= iosize; + bufsize -= iosize; + } + uio_setresid(uio, bufsize); + uio_setoffset(uio, datasize); + + hfs_unlock_truncate(VTOC(evp), HFS_LOCK_DEFAULT); + return (result); +} + +/* + * Allocate blocks for an extent based attribute. 
+ */ +static int +alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, HFSPlusExtentDescriptor *extents, int *blocks) +{ + int blkcnt; + int startblk; + int lockflags; + int i; + int maxextents; + int result = 0; + + startblk = hfsmp->hfs_metazone_end; + blkcnt = howmany(attrsize, hfsmp->blockSize); + if (blkcnt > (int)hfs_freeblks(hfsmp, 0)) { + return (ENOSPC); + } + *blocks = blkcnt; + maxextents = extentbufsize / sizeof(HFSPlusExtentDescriptor); + + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + for (i = 0; (blkcnt > 0) && (i < maxextents); i++) { + /* Try allocating and see if we find something decent */ + result = BlockAllocate(hfsmp, startblk, blkcnt, blkcnt, 0, + &extents[i].startBlock, &extents[i].blockCount); + /* + * If we couldn't find anything, then re-try the allocation but allow + * journal flushes. + */ + if (result == dskFulErr) { + result = BlockAllocate(hfsmp, startblk, blkcnt, blkcnt, HFS_ALLOC_FLUSHTXN, + &extents[i].startBlock, &extents[i].blockCount); + } + + +#if HFS_XATTR_VERBOSE + printf("hfs: alloc_attr_blks: BA blkcnt %d [%d, %d] (%d)\n", + blkcnt, extents[i].startBlock, extents[i].blockCount, result); +#endif + if (result) { + extents[i].startBlock = 0; + extents[i].blockCount = 0; + break; + } + blkcnt -= extents[i].blockCount; + startblk = extents[i].startBlock + extents[i].blockCount; + } + /* + * If it didn't fit in the extents buffer then bail. + */ + if (blkcnt) { + result = ENOSPC; + +#if HFS_XATTR_VERBOSE + printf("hfs: alloc_attr_blks: unexpected failure, %d blocks unallocated\n", blkcnt); +#endif + for (; i >= 0; i--) { + if ((blkcnt = extents[i].blockCount) != 0) { + (void) BlockDeallocate(hfsmp, extents[i].startBlock, blkcnt, 0); + extents[i].startBlock = 0; + extents[i].blockCount = 0; + } + } + } + + hfs_systemfile_unlock(hfsmp, lockflags); + return MacToVFSError(result); +} + +/* + * Release blocks from an extent based attribute. + */ +static void +free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *extents) +{ + vnode_t evp = hfsmp->hfs_attrdata_vp; + int remblks = blkcnt; + int lockflags; + int i; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + for (i = 0; (remblks > 0) && (extents[i].blockCount != 0); i++) { + if (extents[i].blockCount > (u_int32_t)blkcnt) { +#if HFS_XATTR_VERBOSE + printf("hfs: free_attr_blks: skipping bad extent [%d, %d]\n", + extents[i].startBlock, extents[i].blockCount); +#endif + extents[i].blockCount = 0; + continue; + } + if (extents[i].startBlock == 0) { + break; + } + (void)BlockDeallocate(hfsmp, extents[i].startBlock, extents[i].blockCount, 0); + remblks -= extents[i].blockCount; + extents[i].startBlock = 0; + extents[i].blockCount = 0; + +#if HFS_XATTR_VERBOSE + printf("hfs: free_attr_blks: BlockDeallocate [%d, %d]\n", + extents[i].startBlock, extents[i].blockCount); +#endif + /* Discard any resident pages for this block range. 
*/ + if (evp) { + off_t start, end; + + start = (u_int64_t)extents[i].startBlock * (u_int64_t)hfsmp->blockSize; + end = start + (u_int64_t)extents[i].blockCount * (u_int64_t)hfsmp->blockSize; + (void) ubc_msync(hfsmp->hfs_attrdata_vp, start, end, &start, UBC_INVALIDATE); + } + } + + hfs_systemfile_unlock(hfsmp, lockflags); +} + +static int +has_overflow_extents(HFSPlusForkData *forkdata) +{ + u_int32_t blocks; + + if (forkdata->extents[7].blockCount == 0) + return (0); + + blocks = forkdata->extents[0].blockCount + + forkdata->extents[1].blockCount + + forkdata->extents[2].blockCount + + forkdata->extents[3].blockCount + + forkdata->extents[4].blockCount + + forkdata->extents[5].blockCount + + forkdata->extents[6].blockCount + + forkdata->extents[7].blockCount; + + return (forkdata->totalBlocks > blocks); +} + +static int +count_extent_blocks(int maxblks, HFSPlusExtentRecord extents) +{ + int blocks; + int i; + + for (i = 0, blocks = 0; i < kHFSPlusExtentDensity; ++i) { + /* Ignore obvious bogus extents. */ + if (extents[i].blockCount > (u_int32_t)maxblks) + continue; + if (extents[i].startBlock == 0 || extents[i].blockCount == 0) + break; + blocks += extents[i].blockCount; + } + return (blocks); +} + diff --git a/core/install b/core/install new file mode 100755 index 0000000..3975ef9 --- /dev/null +++ b/core/install @@ -0,0 +1,35 @@ +#!/bin/sh + +# install +# hfs +# +# Created by Chris Suter on 4/30/15. +# + +if [ ! "$MACHINE" ] ; then + echo "MACHINE not specified" + exit 1 +fi + +ROOT=hfs-root.tbz + +# Virtual machine stuff +[ "$VM" != "" -a "$VMRUN" != "" ] && { ping -c 1 -t 2 $MACHINE || { + VMX="$HOME/Documents/Virtual Machines.localized/$VM.vmwarevm/$VM.vmx" + + "$VMRUN" revertToSnapshot "$VMX" "Safe" || exit 1 + "$VMRUN" start "$VMX" || exit 1 + } +} + +rsync -P "$BUILT_PRODUCTS_DIR/$ROOT" $MACHINE: || exit 1 + +ssh $MACHINE bash -x -s </dev/null 2>/dev/null + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + KEXT + CFBundleShortVersionString + HFS_KEXT_VERSION + CFBundleSignature + ???? + CFBundleVersion + HFS_KEXT_VERSION + IOKitPersonalities + + com.apple.filesystems.hfs.kext + + CFBundleIdentifier + com.apple.filesystems.hfs.kext + IOClass + com_apple_filesystems_hfs + IOMatchCategory + com_apple_filesystems_hfs + IOProviderClass + IOResources + IOResourceMatch + IOBSD + + + NSHumanReadableCopyright + Copyright © 2015 Apple Inc. All rights reserved. + OSBundleLibraries + + com.apple.kpi.bsd + 14.1 + com.apple.kpi.iokit + 14.1 + com.apple.kpi.libkern + 14.1 + com.apple.kpi.mach + 14.1 + com.apple.kpi.private + 14.1 + com.apple.kpi.unsupported + 14.1 + + OSBundleRequired + Local-Root + + diff --git a/core/kext-config.h b/core/kext-config.h new file mode 100644 index 0000000..fa3b3c2 --- /dev/null +++ b/core/kext-config.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _hfs_config_ +#define _hfs_config_ + +#include + +#define HFS_COMPRESSION 1 +#define FIFO 1 + +// #define HFS_MALLOC_DEBUG 1 +// #define HFS_LEAK_DEBUG 1 + +#if (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) // iOS (real hardware) + +#define QUOTA 0 +#define CONFIG_PROTECT 1 +#define CONFIG_SECLUDED_RENAME 1 + + +#else // OS X + +#define QUOTA 1 +#define NAMEDSTREAMS 1 +#define CONFIG_HFS_DIRLINK 1 +#define CONFIG_SEARCHFS 1 + +#endif + +#endif /* defined(_hfs_config_) */ diff --git a/core/kext.xcconfig b/core/kext.xcconfig new file mode 100644 index 0000000..56ab404 --- /dev/null +++ b/core/kext.xcconfig @@ -0,0 +1,52 @@ +// +// Copyright (c) 2015 Apple Inc. All rights reserved. +// +// @APPLE_OSREFERENCE_LICENSE_HEADER_START@ +// +// This file contains Original Code and/or Modifications of Original Code +// as defined in and that are subject to the Apple Public Source License +// Version 2.0 (the 'License'). You may not use this file except in +// compliance with the License. The rights granted to you under the License +// may not be used to create, or enable the creation or redistribution of, +// unlawful or unlicensed copies of an Apple operating system, or to +// circumvent, violate, or enable the circumvention or violation of, any +// terms of an Apple operating system software license agreement. +// +// Please obtain a copy of the License at +// http://www.opensource.apple.com/apsl/ and read it before using this file. +// +// The Original Code and all software distributed under the License are +// distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +// EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +// INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +// Please see the License for the specific language governing rights and +// limitations under the License. 
+// +// @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +// + +// Used if building combined xnu & hfs roots +//DEVICES= +//MACHINE= +//XNU_PATH= +//XNU_DST_PATH=$(XNU_PATH)/BUILD/dst +//KERNEL_PATH=$(XNU_DST_PATH)/System/Library/Kernels/kernel.development +//KERNEL_FRAMEWORK_PATH=$(XNU_DST_PATH)/System/Library/Frameworks/Kernel.framework +//HEADER_SEARCH_PATHS=$(KERNEL_FRAMEWORK_PATH)/PrivateHeaders $(KERNEL_FRAMEWORK_PATH)/Headers + +// Enable this to see if Clang has any new warnings +// WARNING_CFLAGS=-Weverything -Wno-unused-parameter -Wno-shorten-64-to-32 -Wno-reserved-id-macro -Wno-undef -Wno-missing-variable-declarations -Wno-padded -Wno-c11-extensions -Wno- documentation -Wno-variadic-macros -Wno-zero-length-array -Wno-documentation-unknown-command -Wno-packed -Wno-pedantic -Wno-format-non-iso -Wno-bad-function-cast -Wno-cast-align -Wno-disabled-macro-expansion -Wno-used-but-marked-unused -Wno-c++98-compat-pedantic -Wno-old-style-cast -Wno-c++98-compat -Wno-vla -Wno-switch-enum -Wno-c++-compat -Wno-global-constructors -Wno-shift-sign-overflow -Wno-covered-switch-default + +GCC_PREFIX_HEADER=core/kext-config.h +GCC_PREPROCESSOR_DEFINITIONS=$(PREPROC_DEFN_$(CONFIGURATION)) +PREPROC_DEFN_Debug=DEBUG +PRIVATE_HEADERS_FOLDER_PATH=/usr/local/include/hfs +PUBLIC_HEADERS_FOLDER_PATH=/usr/include/hfs +OTHER_CFLAGS=$(OTHER_CFLAGS_$(CONFIGURATION)) +OTHER_CFLAGS_Coverage=-fprofile-instr-generate -fcoverage-mapping + +BUILD_VARIANTS[sdk=macosx*] = normal kasan +BUILD_VARIANTS[sdk=iphoneos*] = normal kasan +CODE_SIGN_IDENTITY = - +OTHER_CFLAGS_kasan = $(KASAN_DEFAULT_CFLAGS) diff --git a/core/macosx-Info.plist b/core/macosx-Info.plist new file mode 100644 index 0000000..7d3fca3 --- /dev/null +++ b/core/macosx-Info.plist @@ -0,0 +1,61 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + KEXT + CFBundleShortVersionString + HFS_KEXT_VERSION + CFBundleSignature + ???? + CFBundleVersion + HFS_KEXT_VERSION + IOKitPersonalities + + com.apple.filesystems.hfs.kext + + CFBundleIdentifier + com.apple.filesystems.hfs.kext + IOClass + com_apple_filesystems_hfs + IOMatchCategory + com_apple_filesystems_hfs + IOProviderClass + IOResources + IOResourceMatch + IOBSD + + + NSHumanReadableCopyright + Copyright © 2015 Apple Inc. All rights reserved. + OSBundleLibraries + + com.apple.kpi.bsd + 14.1 + com.apple.kpi.iokit + 14.1 + com.apple.kpi.libkern + 14.1 + com.apple.kpi.mach + 14.1 + com.apple.kpi.private + 14.1 + com.apple.kpi.unsupported + 14.1 + com.apple.filesystems.hfs.encodings.kext + 1.0 + + OSBundleRequired + Local-Root + + diff --git a/core/mk-root.sh b/core/mk-root.sh new file mode 100755 index 0000000..fb62b25 --- /dev/null +++ b/core/mk-root.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# mk-root.sh +# hfs +# +# Created by Chris Suter on 5/3/15. +# + +shopt -s nocasematch + +set -e + +if [[ "$SDKROOT" =~ macosx ]] ; then + if [ ! "$KERNEL_PATH" ] ; then + KERNEL_PATH=$SDKROOT/System/Library/Kernels/kernel.development + fi + + EXTS_PATH="`dirname \"$KERNEL_PATH\"`"/../Extensions + + kextutil -no-load -t -k "$KERNEL_PATH" -no-authentication "$BUILT_PRODUCTS_DIR/HFSEncodings.kext" -d "$EXTS_PATH/System.kext" + kextutil -no-load -t -k "$KERNEL_PATH" -no-authentication "$BUILT_PRODUCTS_DIR/HFS.kext" -d "$EXTS_PATH/System.kext" -d "$BUILT_PRODUCTS_DIR/HFSEncodings.kext" + + if [ "$XNU_PATH" ] ; then + extra_args=(-C "$XNU_PATH/BUILD/dst" .) 
+ fi + gnutar --owner 0 --group 0 --transform 's|^([^/]+.kext)|System/Library/Extensions/\1|x' -C "$BUILT_PRODUCTS_DIR" HFS.kext HFSEncodings.kext "${extra_args[@]}" -cjf "$BUILT_PRODUCTS_DIR/hfs-root.tbz" + echo "Created $BUILT_PRODUCTS_DIR/hfs-root.tbz" + ln -sf $BUILT_PRODUCTS_DIR/hfs-root.tbz /tmp/ +else + ~/bin/copy-kernel-cache-builder + pushd /tmp/KernelCacheBuilder + if [ "$XNU_PATH" ] ; then + extra_args=(KERNEL_PATH="$XNU_DST_PATH") + extra_kext_paths="$BUILT_PRODUCTS_DIR $XNU_PATH/BUILD/dst/System/Library/Extensions" + else + extra_kext_paths="$BUILT_PRODUCTS_DIR" + fi + env -i make TARGETS="$DEVICES" "${extra_args[@]}" BUILDS=development VERBOSE=YES SDKROOT=iphoneos.internal EXTRA_KEXT_PATHS="$BUILT_PRODUCTS_DIR $XNU_PATH/BUILD/dst/System/Library/Extensions" EXTRA_BUNDLES=com.apple.filesystems.hfs.kext 2> >(sed -E '/^.*duplicate BUNDLE_IDS$/d' 1>&2) +fi diff --git a/core/rangelist.c b/core/rangelist.c new file mode 100644 index 0000000..a38ea83 --- /dev/null +++ b/core/rangelist.c @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2001-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#if !RANGELIST_TEST +#include +#include "hfs.h" +#endif + +#include "rangelist.h" + +static enum rl_overlaptype rl_scan_from(struct rl_head *rangelist, off_t start, off_t end, struct rl_entry **overlap, struct rl_entry *range); +static void rl_collapse_forwards(struct rl_head *rangelist, struct rl_entry *range); +static void rl_collapse_backwards(struct rl_head *rangelist, struct rl_entry *range); +static void rl_collapse_neighbors(struct rl_head *rangelist, struct rl_entry *range); + + +#ifdef RL_DIAGNOSTIC +static void +rl_verify(struct rl_head *rangelist) { + struct rl_entry *entry; + struct rl_entry *next; + off_t limit = 0; + + TAILQ_FOREACH_SAFE(rangelist, entry, rl_link, next) { + if ((limit > 0) && (entry->rl_start <= limit)) panic("hfs: rl_verify: bad entry start?!"); + if (entry->rl_end < entry->rl_start) panic("hfs: rl_verify: bad entry end?!"); + limit = entry->rl_end; + }; +} +#endif + + + +/* + * Initialize a range list head + */ +void +rl_init(struct rl_head *rangelist) +{ + TAILQ_INIT(rangelist); +} + +/* + * Add a range to the list + */ +void +rl_add(off_t start, off_t end, struct rl_head *rangelist) +{ + struct rl_entry *range; + struct rl_entry *overlap; + enum rl_overlaptype ovcase; + +#ifdef RL_DIAGNOSTIC + if (end < start) panic("hfs: rl_add: end < start?!"); +#endif + + ovcase = rl_scan(rangelist, start, end, &overlap); + + /* + * Six cases: + * 0) no overlap + * 1) overlap == range + * 2) overlap contains range + * 3) range contains overlap + * 4) overlap starts before range + * 5) overlap ends after range + */ + switch (ovcase) { + case RL_NOOVERLAP: /* 0: no overlap */ + /* + * overlap points to the entry we should insert before, or + * if NULL, we should insert at the end. + */ + range = hfs_malloc(sizeof(*range)); + range->rl_start = start; + range->rl_end = end; + + /* Link in the new range: */ + if (overlap) { + TAILQ_INSERT_BEFORE(overlap, range, rl_link); + } else { + TAILQ_INSERT_TAIL(rangelist, range, rl_link); + } + + /* Check to see if any ranges can be combined (possibly including the immediately + preceding range entry) + */ + rl_collapse_neighbors(rangelist, range); + break; + + case RL_MATCHINGOVERLAP: /* 1: overlap == range */ + case RL_OVERLAPCONTAINSRANGE: /* 2: overlap contains range */ + break; + + case RL_OVERLAPISCONTAINED: /* 3: range contains overlap */ + /* + * Replace the overlap with the new, larger range: + */ + overlap->rl_start = start; + overlap->rl_end = end; + rl_collapse_neighbors(rangelist, overlap); + break; + + case RL_OVERLAPSTARTSBEFORE: /* 4: overlap starts before range */ + /* + * Expand the overlap area to cover the new range: + */ + overlap->rl_end = end; + rl_collapse_forwards(rangelist, overlap); + break; + + case RL_OVERLAPENDSAFTER: /* 5: overlap ends after range */ + /* + * Expand the overlap area to cover the new range: + */ + overlap->rl_start = start; + rl_collapse_backwards(rangelist, overlap); + break; + } + +#ifdef RL_DIAGNOSTIC + rl_verify(rangelist); +#endif +} + + + +/* + * Remove a range from a range list. + * + * Generally, find the range (or an overlap to that range) + * and remove it (or shrink it), then wakeup anyone we can. 
+ */ +void +rl_remove(off_t start, off_t end, struct rl_head *rangelist) +{ + struct rl_entry *range, *next_range, *overlap, *splitrange; + int ovcase; + +#ifdef RL_DIAGNOSTIC + if (end < start) panic("hfs: rl_remove: end < start?!"); +#endif + + if (TAILQ_EMPTY(rangelist)) { + return; + }; + + range = TAILQ_FIRST(rangelist); + while ((ovcase = rl_scan_from(rangelist, start, end, &overlap, range))) { + switch (ovcase) { + + case RL_MATCHINGOVERLAP: /* 1: overlap == range */ + TAILQ_REMOVE(rangelist, overlap, rl_link); + hfs_free(overlap, sizeof(*overlap)); + break; + + case RL_OVERLAPCONTAINSRANGE: /* 2: overlap contains range: split it */ + if (overlap->rl_start == start) { + overlap->rl_start = end + 1; + break; + }; + + if (overlap->rl_end == end) { + overlap->rl_end = start - 1; + break; + }; + + /* + * Make a new range consisting of the last part of the encompassing range + */ + splitrange = hfs_malloc(sizeof *splitrange); + splitrange->rl_start = end + 1; + splitrange->rl_end = overlap->rl_end; + overlap->rl_end = start - 1; + + /* + * Now link the new entry into the range list after the range from which it was split: + */ + TAILQ_INSERT_AFTER(rangelist, overlap, splitrange, rl_link); + break; + + case RL_OVERLAPISCONTAINED: /* 3: range contains overlap */ + /* Check before discarding overlap entry */ + next_range = TAILQ_NEXT(overlap, rl_link); + TAILQ_REMOVE(rangelist, overlap, rl_link); + hfs_free(overlap, sizeof(*overlap)); + if (next_range) { + range = next_range; + continue; + }; + break; + + case RL_OVERLAPSTARTSBEFORE: /* 4: overlap starts before range */ + overlap->rl_end = start - 1; + range = TAILQ_NEXT(overlap, rl_link); + if (range) { + continue; + } + break; + + case RL_OVERLAPENDSAFTER: /* 5: overlap ends after range */ + overlap->rl_start = (end == RL_INFINITY ? RL_INFINITY : end + 1); + break; + } + break; + } + +#ifdef RL_DIAGNOSTIC + rl_verify(rangelist); +#endif +} + + + +/* + * Scan a range list for an entry in a specified range (if any): + * + * NOTE: this returns only the FIRST overlapping range. + * There may be more than one. 
+ */ + +enum rl_overlaptype +rl_scan(struct rl_head *rangelist, + off_t start, + off_t end, + struct rl_entry **overlap) { + + return rl_scan_from(rangelist, start, end, overlap, TAILQ_FIRST(rangelist)); +} + +enum rl_overlaptype +rl_overlap(const struct rl_entry *range, off_t start, off_t end) +{ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap (RL_NOOVERLAP) + * 1) overlap == range (RL_MATCHINGOVERLAP) + * 2) overlap contains range (RL_OVERLAPCONTAINSRANGE) + * 3) range contains overlap (RL_OVERLAPISCONTAINED) + * 4) overlap starts before range (RL_OVERLAPSTARTSBEFORE) + * 5) overlap ends after range (RL_OVERLAPENDSAFTER) + */ + if (start > range->rl_end || range->rl_start > end) { + /* Case 0 (RL_NOOVERLAP) */ + return RL_NOOVERLAP; + } + + if (range->rl_start == start && range->rl_end == end) { + /* Case 1 (RL_MATCHINGOVERLAP) */ + return RL_MATCHINGOVERLAP; + } + + if (range->rl_start <= start && range->rl_end >= end) { + /* Case 2 (RL_OVERLAPCONTAINSRANGE) */ + return RL_OVERLAPCONTAINSRANGE; + } + + if (start <= range->rl_start && end >= range->rl_end) { + /* Case 3 (RL_OVERLAPISCONTAINED) */ + return RL_OVERLAPISCONTAINED; + } + + if (range->rl_start < start && range->rl_end < end) { + /* Case 4 (RL_OVERLAPSTARTSBEFORE) */ + return RL_OVERLAPSTARTSBEFORE; + } + + /* Case 5 (RL_OVERLAPENDSAFTER) */ + // range->rl_start > start && range->rl_end > end + return RL_OVERLAPENDSAFTER; +} + +/* + * Walk the list of ranges for an entry to + * find an overlapping range (if any). + * + * NOTE: this returns only the FIRST overlapping range. + * There may be more than one. + */ +static enum rl_overlaptype +rl_scan_from(struct rl_head *rangelist __unused, + off_t start, + off_t end, + struct rl_entry **overlap, + struct rl_entry *range) +{ +#ifdef RL_DIAGNOSTIC + rl_verify(rangelist); +#endif + + while (range) { + enum rl_overlaptype ot = rl_overlap(range, start, end); + + if (ot != RL_NOOVERLAP || range->rl_start > end) { + *overlap = range; + return ot; + } + + range = TAILQ_NEXT(range, rl_link); + } + + *overlap = NULL; + return RL_NOOVERLAP; +} + + +static void +rl_collapse_forwards(struct rl_head *rangelist, struct rl_entry *range) { + struct rl_entry *next_range; + + while ((next_range = TAILQ_NEXT(range, rl_link))) { + if ((range->rl_end != RL_INFINITY) && (range->rl_end < next_range->rl_start - 1)) return; + + /* Expand this range to include the next range: */ + range->rl_end = next_range->rl_end; + + /* Remove the now covered range from the list: */ + TAILQ_REMOVE(rangelist, next_range, rl_link); + hfs_free(next_range, sizeof(*next_range)); + +#ifdef RL_DIAGNOSTIC + rl_verify(rangelist); +#endif + }; +} + + + +static void +rl_collapse_backwards(struct rl_head *rangelist, struct rl_entry *range) { + struct rl_entry *prev_range; + + while ((prev_range = TAILQ_PREV(range, rl_head, rl_link))) { + if (prev_range->rl_end < range->rl_start -1) { +#ifdef RL_DIAGNOSTIC + rl_verify(rangelist); +#endif + return; + }; + + /* Expand this range to include the previous range: */ + range->rl_start = prev_range->rl_start; + + /* Remove the now covered range from the list: */ + TAILQ_REMOVE(rangelist, prev_range, rl_link); + hfs_free(prev_range, sizeof(*prev_range)); + }; +} + + + +static void +rl_collapse_neighbors(struct rl_head *rangelist, struct rl_entry *range) +{ + rl_collapse_forwards(rangelist, range); + rl_collapse_backwards(rangelist, range); +} + +void rl_remove_all(struct rl_head *rangelist) +{ + struct rl_entry *r, *nextr; + TAILQ_FOREACH_SAFE(r, rangelist, rl_link, nextr) 
+	hfs_free(r, sizeof(*r));
+	TAILQ_INIT(rangelist);
+}
+
+/*
+ * In the case where b is contained by a, we return the largest part
+ * remaining. The result is stored in a.
+ */
+void rl_subtract(struct rl_entry *a, const struct rl_entry *b)
+{
+	switch (rl_overlap(b, a->rl_start, a->rl_end)) {
+	case RL_MATCHINGOVERLAP:
+	case RL_OVERLAPCONTAINSRANGE:
+		a->rl_end = a->rl_start - 1;
+		break;
+	case RL_OVERLAPISCONTAINED:
+		// Keep the bigger part
+		if (b->rl_start - a->rl_start >= a->rl_end - b->rl_end) {
+			// Keep left
+			a->rl_end = b->rl_start - 1;
+		} else {
+			// Keep right
+			a->rl_start = b->rl_end + 1;
+		}
+		break;
+	case RL_OVERLAPSTARTSBEFORE:
+		a->rl_start = b->rl_end + 1;
+		break;
+	case RL_OVERLAPENDSAFTER:
+		a->rl_end = b->rl_start - 1;
+		break;
+	case RL_NOOVERLAP:
+		break;
+	}
+}
diff --git a/core/rangelist.h b/core/rangelist.h
new file mode 100644
index 0000000..b0bb7d1
--- /dev/null
+++ b/core/rangelist.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2001-2014 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _HFS_RANGELIST_H_ +#define _HFS_RANGELIST_H_ + +#include + +#ifdef KERNEL +#ifdef __APPLE_API_PRIVATE +#include +#include + +enum rl_overlaptype { + RL_NOOVERLAP = 0, /* 0 */ + RL_MATCHINGOVERLAP, /* 1 */ + RL_OVERLAPCONTAINSRANGE, /* 2 */ + RL_OVERLAPISCONTAINED, /* 3 */ + RL_OVERLAPSTARTSBEFORE, /* 4 */ + RL_OVERLAPENDSAFTER /* 5 */ +}; + +#define RL_INFINITY INT64_MAX + +TAILQ_HEAD(rl_head, rl_entry); + +struct rl_entry { + TAILQ_ENTRY(rl_entry) rl_link; + off_t rl_start; + off_t rl_end; +}; + +__BEGIN_DECLS +void rl_init(struct rl_head *rangelist); +void rl_add(off_t start, off_t end, struct rl_head *rangelist); +void rl_remove(off_t start, off_t end, struct rl_head *rangelist); +void rl_remove_all(struct rl_head *rangelist); +enum rl_overlaptype rl_scan(struct rl_head *rangelist, + off_t start, + off_t end, + struct rl_entry **overlap); +enum rl_overlaptype rl_overlap(const struct rl_entry *range, + off_t start, off_t end); + +static __attribute__((pure)) inline +off_t rl_len(const struct rl_entry *range) +{ + return range->rl_end - range->rl_start + 1; +} + +void rl_subtract(struct rl_entry *a, const struct rl_entry *b); + +static inline struct rl_entry rl_make(off_t start, off_t end) +{ + return (struct rl_entry){ .rl_start = start, .rl_end = end }; +} + +__END_DECLS + +#endif /* __APPLE_API_PRIVATE */ +#endif /* KERNEL */ +#endif /* ! _HFS_RANGELIST_H_ */ diff --git a/hfs.xcodeproj/project.pbxproj b/hfs.xcodeproj/project.pbxproj index a6b84e0..ace6a65 100644 --- a/hfs.xcodeproj/project.pbxproj +++ b/hfs.xcodeproj/project.pbxproj @@ -344,6 +344,7 @@ D7978426205FC09A00E93B37 /* lf_hfs_endian.h in Headers */ = {isa = PBXBuildFile; fileRef = D7978424205FC09A00E93B37 /* lf_hfs_endian.h */; }; D79784412060037400E93B37 /* lf_hfs_raw_read_write.h in Headers */ = {isa = PBXBuildFile; fileRef = D797843F2060037400E93B37 /* lf_hfs_raw_read_write.h */; }; D79784422060037400E93B37 /* lf_hfs_raw_read_write.c in Sources */ = {isa = PBXBuildFile; fileRef = D79784402060037400E93B37 /* lf_hfs_raw_read_write.c */; }; + D7B2DC81233A3F5B00F12230 /* livefiles_hfs.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 900BDED41FF919C2002F7EC0 /* livefiles_hfs.dylib */; }; D7BD8F9C20AC388E00E93640 /* lf_hfs_catalog.c in Sources */ = {isa = PBXBuildFile; fileRef = 906EBF82206409B800B21E94 /* lf_hfs_catalog.c */; }; EE73740520644328004C2F0E /* lf_hfs_sbunicode.h in Headers */ = {isa = PBXBuildFile; fileRef = EE73740320644328004C2F0E /* lf_hfs_sbunicode.h */; }; EE73740620644328004C2F0E /* lf_hfs_sbunicode.c in Sources */ = {isa = PBXBuildFile; fileRef = EE73740420644328004C2F0E /* lf_hfs_sbunicode.c */; }; @@ -1284,7 +1285,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 9022D171205EC18500D9A2AE /* livefiles_hfs.dylib in Frameworks */, + D7B2DC81233A3F5B00F12230 /* livefiles_hfs.dylib in Frameworks */, 9022D170205EC16900D9A2AE /* CoreFoundation.framework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/hfs.xcodeproj/xcshareddata/xcschemes/livefiles_hfs_tester.xcscheme b/hfs.xcodeproj/xcshareddata/xcschemes/livefiles_hfs_tester.xcscheme index 965e177..8413325 100644 --- a/hfs.xcodeproj/xcshareddata/xcschemes/livefiles_hfs_tester.xcscheme +++ b/hfs.xcodeproj/xcshareddata/xcschemes/livefiles_hfs_tester.xcscheme @@ -27,8 +27,6 @@ selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB" selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB" shouldUseLaunchSchemeArgsEnv = 
"YES"> - - - - + + - - buffer = nil; + nodePtr->buffer = nil; nodePtr->blockHeader = nil; return err; diff --git a/livefiles_hfs_plugin/lf_hfs_btree_tree_ops.c b/livefiles_hfs_plugin/lf_hfs_btree_tree_ops.c index ecbe3fe..c3accc3 100644 --- a/livefiles_hfs_plugin/lf_hfs_btree_tree_ops.c +++ b/livefiles_hfs_plugin/lf_hfs_btree_tree_ops.c @@ -251,7 +251,7 @@ ReleaseAndExit: ErrorExit: *nodeNum = 0; - nodePtr->buffer = nil; + nodePtr->buffer = nil; nodePtr->blockHeader = nil; *returnIndex = 0; return err; diff --git a/livefiles_hfs_plugin/lf_hfs_chash.c b/livefiles_hfs_plugin/lf_hfs_chash.c index 70d99ca..60f4fd5 100644 --- a/livefiles_hfs_plugin/lf_hfs_chash.c +++ b/livefiles_hfs_plugin/lf_hfs_chash.c @@ -181,7 +181,7 @@ loop_with_lock: } } vp = wantrsrc ? cp->c_rsrc_vp : cp->c_vp; - + /* * Skip cnodes that are not in the name space anymore * we need to check with the cnode lock held because @@ -193,9 +193,10 @@ loop_with_lock: * is no longer valid for lookups. */ if (((cp->c_flag & (C_NOEXISTS | C_DELETED)) && !wantrsrc) || - (cp->uOpenLookupRefCount == 0) || + ((vp != NULL) && + ((cp->uOpenLookupRefCount == 0) || (vp->uValidNodeMagic1 == VALID_NODE_BADMAGIC) || - (vp->uValidNodeMagic2 == VALID_NODE_BADMAGIC)) + (vp->uValidNodeMagic2 == VALID_NODE_BADMAGIC)))) { int renamed = 0; if (cp->c_flag & C_RENAMED) diff --git a/livefiles_hfs_plugin/lf_hfs_cnode.c b/livefiles_hfs_plugin/lf_hfs_cnode.c index be2582d..2e15868 100644 --- a/livefiles_hfs_plugin/lf_hfs_cnode.c +++ b/livefiles_hfs_plugin/lf_hfs_cnode.c @@ -465,36 +465,6 @@ hfs_getnewvnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname } } } -#if LF_HFS_FULL_VNODE_SUPPORT - if (tvp != NULL) - { - /* - * grab an iocount on the vnode we weren't - * interested in (i.e. we want the resource fork - * but the cnode already has the data fork) - * to prevent it from being - * recycled by us when we call vnode_create - * which will result in a deadlock when we - * try to take the cnode lock in hfs_vnop_fsync or - * hfs_vnop_reclaim... vnode_get can be called here - * because we already hold the cnode lock which will - * prevent the vnode from changing identity until - * we drop it.. vnode_get will not block waiting for - * a change of state... however, it will return an - * error if the current iocount == 0 and we've already - * started to terminate the vnode... we don't need/want to - * grab an iocount in the case since we can't cause - * the fileystem to be re-entered on this thread for this vp - * - * the matching vnode_put will happen in hfs_unlock - * after we've dropped the cnode lock - */ - if ( vnode_get(tvp) != 0) - { - cp->c_flag &= ~(C_NEED_RVNODE_PUT | C_NEED_DVNODE_PUT); - } - } -#endif vfsp.vnfs_mp = mp; vfsp.vnfs_vtype = vtype; @@ -671,7 +641,7 @@ hfs_getnewvnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname */ if (vp && VNODE_IS_RSRC(vp)) { - vnode_rele(vp); + vp->is_rsrc = true; } hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_ATTACH); @@ -937,32 +907,6 @@ hfs_unlock(struct cnode *cp) cp->c_lockowner = NULL; lf_lck_rw_unlock_shared(&cp->c_rwlock); } - -#if LF_HFS_FULL_VNODE_SUPPORT - /* Perform any vnode post processing after cnode lock is dropped. 
*/ - if (vp) - { - if (c_flag & C_NEED_DATA_SETSIZE) - { - ubc_setsize(vp, VTOF(vp)->ff_size); - } - if (c_flag & C_NEED_DVNODE_PUT) - { - vnode_put(vp); - } - } - if (rvp) - { - if (c_flag & C_NEED_RSRC_SETSIZE) - { - ubc_setsize(rvp, VTOF(rvp)->ff_size); - } - if (c_flag & C_NEED_RVNODE_PUT) - { - vnode_put(rvp); - } - } -#endif } /* @@ -1900,47 +1844,13 @@ out: return error; } - -/* - * Reclaim a cnode so that it can be used for other purposes. - */ int -hfs_vnop_reclaim(struct vnode *vp) +hfs_fork_release(struct cnode* cp, struct vnode *vp, bool bIsRsc, int* piErr) { - struct cnode* cp = VTOC(vp); + struct hfsmount *hfsmp = VTOHFS(vp); struct filefork *fp = NULL; struct filefork *altfp = NULL; - struct hfsmount *hfsmp = VTOHFS(vp); int reclaim_cnode = 0; - int err = 0; - - /* - * We don't take the truncate lock since by the time reclaim comes along, - * all dirty pages have been synced and nobody should be competing - * with us for this thread. - */ - hfs_chash_mark_in_transit(hfsmp, cp); - - hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); - lf_hfs_generic_buf_cache_LockBufCache(); - - //In case we have other open lookups - //We need to decrease the counter and exit - if (cp->uOpenLookupRefCount > 1) - { - hfs_chash_lower_OpenLookupCounter(cp); - hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_TRANSIT); - lf_hfs_generic_buf_cache_UnLockBufCache(); - hfs_unlock(cp); - return err; - } - - if (cp->uOpenLookupRefCount == 0) assert(0); - - hfs_chash_lower_OpenLookupCounter(cp); - lf_hfs_generic_buf_cache_remove_vnode(vp); - - lf_hfs_generic_buf_cache_UnLockBufCache(); /* * Sync to disk any remaining data in the cnode/vnode. This includes @@ -1950,12 +1860,12 @@ hfs_vnop_reclaim(struct vnode *vp) * because the catalog entry for this cnode is already gone. */ INVALIDATE_NODE(vp); - + if (!ISSET(cp->c_flag, C_NOEXISTS)) { - err = hfs_cnode_teardown(vp, 1); - if (err) + *piErr = hfs_cnode_teardown(vp, 1); + if (*piErr) { - return err; + return 0; } } @@ -1966,28 +1876,21 @@ hfs_vnop_reclaim(struct vnode *vp) hfs_free(vp->sFSParams.vnfs_cnp); } - - /* - * Find file fork for this vnode (if any) - * Also check if another fork is active - */ - if (cp->c_vp == vp) { + + if (!bIsRsc) { fp = cp->c_datafork; altfp = cp->c_rsrcfork; - + cp->c_datafork = NULL; cp->c_vp = NULL; - } else if (cp->c_rsrc_vp == vp) { + } else { fp = cp->c_rsrcfork; altfp = cp->c_datafork; - + cp->c_rsrcfork = NULL; cp->c_rsrc_vp = NULL; - } else { - LFHFS_LOG(LEVEL_ERROR, "hfs_vnop_reclaim: vp points to wrong cnode (vp=%p cp->c_vp=%p cp->c_rsrc_vp=%p)\n", vp, cp->c_vp, cp->c_rsrc_vp); - hfs_assert(0); } - + /* * On the last fork, remove the cnode from its hash chain. */ @@ -2001,11 +1904,12 @@ hfs_vnop_reclaim(struct vnode *vp) if (vnode_isdir(vp)) { hfs_reldirhints(cp, 0); } - + if(cp->c_flag & C_HARDLINK) { hfs_relorigins(cp); } } + /* Release the file fork and related data */ if (fp) { @@ -2016,6 +1920,80 @@ hfs_vnop_reclaim(struct vnode *vp) rl_remove_all(&fp->ff_invalidranges); hfs_free(fp); } + + return reclaim_cnode; +} + + +/* + * Reclaim a cnode so that it can be used for other purposes. + */ +int +hfs_vnop_reclaim(struct vnode *vp) +{ + struct cnode* cp = VTOC(vp); + struct hfsmount *hfsmp = VTOHFS(vp); + struct vnode *altvp = NULL; + int reclaim_cnode = 0; + int err = 0; + + /* + * We don't take the truncate lock since by the time reclaim comes along, + * all dirty pages have been synced and nobody should be competing + * with us for this thread. 
+ */ + hfs_chash_mark_in_transit(hfsmp, cp); + + hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT); + lf_hfs_generic_buf_cache_LockBufCache(); + + //In case we have other open lookups + //We need to decrease the counter and exit + if (cp->uOpenLookupRefCount > 1) + { + hfs_chash_lower_OpenLookupCounter(cp); + hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_TRANSIT); + lf_hfs_generic_buf_cache_UnLockBufCache(); + hfs_unlock(cp); + return err; + } + + if (cp->uOpenLookupRefCount == 0) assert(0); + + hfs_chash_lower_OpenLookupCounter(cp); + lf_hfs_generic_buf_cache_remove_vnode(vp); + + lf_hfs_generic_buf_cache_UnLockBufCache(); + + /* + * Find file fork for this vnode (if any) + * Also check if another fork is active + */ + if (cp->c_vp == vp) { + + reclaim_cnode = hfs_fork_release(cp, vp, false, &err); + if (err) return err; + + if (!reclaim_cnode && cp->c_rsrc_vp != NULL) + { + altvp = cp->c_rsrc_vp; + reclaim_cnode = hfs_fork_release(cp, altvp, true, &err); + if (err) return err; + } + } else if (cp->c_rsrc_vp == vp) { + reclaim_cnode = hfs_fork_release(cp, vp, true, &err); + if (err) return err; + + if (!reclaim_cnode && cp->c_vp != NULL) + { + altvp = cp->c_vp; + reclaim_cnode = hfs_fork_release(cp, altvp, false, &err); + if (err) return err; + } + } else { + LFHFS_LOG(LEVEL_ERROR, "hfs_vnop_reclaim: vp points to wrong cnode (vp=%p cp->c_vp=%p cp->c_rsrc_vp=%p)\n", vp, cp->c_vp, cp->c_rsrc_vp); + hfs_assert(0); + } /* * If there was only one active fork then we can release the cnode. @@ -2035,6 +2013,9 @@ hfs_vnop_reclaim(struct vnode *vp) } hfs_free(vp); + if (altvp) + hfs_free(altvp); + vp = NULL; return (0); } diff --git a/livefiles_hfs_plugin/lf_hfs_common.h b/livefiles_hfs_plugin/lf_hfs_common.h index 2150b09..ee9fb95 100644 --- a/livefiles_hfs_plugin/lf_hfs_common.h +++ b/livefiles_hfs_plugin/lf_hfs_common.h @@ -35,13 +35,14 @@ typedef struct { - int iFD; // File descriptor as received from usbstoraged - + int iFD; // File descriptor as received from usbstoraged + unsigned uUnmountHint; // Unmount hint (passed on in LFHFS_UNMOUNT, cleared on LFHFS_MOUNT) } FileSystemRecord_s; #define VPTOFSRECORD(vp) (vp->sFSParams.vnfs_mp->psHfsmount->hfs_devvp->psFSRecord) -#define VNODE_TO_IFD(vp) ((vp->bIsMountVnode)? (vp->psFSRecord->iFD) : ((VPTOFSRECORD(vp))->iFD)) +#define VNODE_TO_IFD(vp) ((vp->bIsMountVnode)? (vp->psFSRecord->iFD) : ((VPTOFSRECORD(vp))->iFD)) +#define VNODE_TO_UNMOUNT_HINT(vp) ((vp->bIsMountVnode)? (vp->psFSRecord->uUnmountHint) : ((VPTOFSRECORD(vp))->uUnmountHint)) /* Macros to clear/set/test flags. 
*/ #define SET(t, f) (t) |= (f) diff --git a/livefiles_hfs_plugin/lf_hfs_dirops_handler.c b/livefiles_hfs_plugin/lf_hfs_dirops_handler.c index befe576..e13ec2a 100644 --- a/livefiles_hfs_plugin/lf_hfs_dirops_handler.c +++ b/livefiles_hfs_plugin/lf_hfs_dirops_handler.c @@ -201,7 +201,7 @@ exit: } int -LFHFS_Remove ( UVFSFileNode psDirNode, const char *pcUTF8Name, __unused UVFSFileNode victimNode) +LFHFS_Remove ( UVFSFileNode psDirNode, const char *pcUTF8Name, __unused UVFSFileNode victimNode) { LFHFS_LOG(LEVEL_DEBUG, "LFHFS_Remove\n"); VERIFY_NODE_IS_VALID(psDirNode); diff --git a/livefiles_hfs_plugin/lf_hfs_endian.c b/livefiles_hfs_plugin/lf_hfs_endian.c index 9e4a7d0..9ed6591 100644 --- a/livefiles_hfs_plugin/lf_hfs_endian.c +++ b/livefiles_hfs_plugin/lf_hfs_endian.c @@ -228,10 +228,7 @@ hfs_swap_BTNode ( if ((srcDesc->kind == kBTIndexNode) || (srcDesc->kind == kBTLeafNode)) { - if (VTOVCB(vp)->vcbSigWord == kHFSPlusSigWord) { - error = hfs_swap_HFSPlusBTInternalNode (src, VTOC(vp)->c_fileid, direction); - } - + error = hfs_swap_HFSPlusBTInternalNode (src, VTOC(vp)->c_fileid, direction); if (error) goto fail; } else if (srcDesc-> kind == kBTMapNode) { diff --git a/livefiles_hfs_plugin/lf_hfs_file_extent_mapping.c b/livefiles_hfs_plugin/lf_hfs_file_extent_mapping.c index 2a680f8..14c58c4 100644 --- a/livefiles_hfs_plugin/lf_hfs_file_extent_mapping.c +++ b/livefiles_hfs_plugin/lf_hfs_file_extent_mapping.c @@ -446,7 +446,7 @@ static OSErr TruncateExtents( Boolean releasedLastExtent; u_int32_t hint; HFSPlusExtentKey key; - HFSPlusExtentRecord extents = {0}; + HFSPlusExtentRecord extents = {{0}}; int lockflags; /* @@ -1397,6 +1397,7 @@ static OSErr SearchExtentRecord( u_int32_t *endingFABNPlusOne, Boolean *noMoreExtents) { +#pragma unused (vcb) OSErr err = noErr; u_int32_t extentIndex; /* Set it to the HFS std value */ diff --git a/livefiles_hfs_plugin/lf_hfs_fileops_handler.c b/livefiles_hfs_plugin/lf_hfs_fileops_handler.c index 44aee23..edeae7e 100644 --- a/livefiles_hfs_plugin/lf_hfs_fileops_handler.c +++ b/livefiles_hfs_plugin/lf_hfs_fileops_handler.c @@ -654,3 +654,105 @@ int LFHFS_ListXAttr ( UVFSFileNode psNode, void *pvOutBuf, size_t iBufSize, size return iErr; } + +int +LFHFS_StreamLookup ( UVFSFileNode psFileNode, UVFSStreamNode *ppsOutNode ) +{ + LFHFS_LOG(LEVEL_DEBUG, "LFHFS_StreamLookup\n"); + VERIFY_NODE_IS_VALID(psFileNode); + + vnode_t psVnode = (vnode_t)psFileNode; + vnode_t psRscVnode = NULL; + + if (IS_DIR(psVnode)) { + return EISDIR; + } + + int iError = hfs_vgetrsrc(psVnode, &psRscVnode); + + if (!iError) + hfs_unlock (VTOC(psRscVnode)); + + *ppsOutNode = (UVFSStreamNode) psRscVnode; + + return iError; +} + +int +LFHFS_StreamReclaim (UVFSStreamNode psStreamNode ) +{ + LFHFS_LOG(LEVEL_DEBUG, "LFHFS_StreamReclaim\n"); + + int iError = 0; + vnode_t psVnode = (vnode_t) psStreamNode; + + if ( psVnode != NULL ) + { + VERIFY_NODE_IS_VALID_FOR_RECLAIM(psVnode); + + iError = hfs_vnop_reclaim(psVnode); + psVnode = NULL; + } + + return iError; +} + +int +LFHFS_StreamRead (UVFSStreamNode psStreamNode, uint64_t uOffset, size_t iLength, void *pvBuf, size_t *iActuallyRead ) +{ + LFHFS_LOG(LEVEL_DEBUG, "LFHFS_StreamRead (psNode %p, uOffset %llu, iLength %lu)\n", psStreamNode, uOffset, iLength); + VERIFY_NODE_IS_VALID(psStreamNode); + + struct vnode *vp = (vnode_t)psStreamNode; + struct cnode *cp; + struct filefork *fp; + uint64_t filesize; + int retval = 0; + int took_truncate_lock = 0; + *iActuallyRead = 0; + + /* Preflight checks */ + if (!vnode_isreg(vp)) { + /* can only read regular 
files */ + return ( vnode_isdir(vp) ? EISDIR : EPERM ); + } + + cp = VTOC(vp); + fp = VTOF(vp); + + /* Protect against a size change. */ + hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT); + took_truncate_lock = 1; + + filesize = fp->ff_size; + /* + * Check the file size. Note that per POSIX spec, we return 0 at + * file EOF, so attempting a read at an offset that is too big + * should just return 0 on HFS+. Since the return value was initialized + * to 0 above, we just jump to exit. HFS Standard has its own behavior. + */ + if (uOffset > filesize) + { + LFHFS_LOG( LEVEL_ERROR, "LFHFS_Read: wanted offset is greater then file size\n" ); + goto exit; + } + + // If we asked to read above the file size, adjust the read size; + if ( uOffset + iLength > filesize ) + { + iLength = filesize - uOffset; + } + + uint64_t uReadStartCluster; + retval = raw_readwrite_read( vp, uOffset, pvBuf, iLength, iActuallyRead, &uReadStartCluster ); + + cp->c_touch_acctime = TRUE; + +exit: + if (took_truncate_lock) + { + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + } + return retval; +} + diff --git a/livefiles_hfs_plugin/lf_hfs_fileops_handler.h b/livefiles_hfs_plugin/lf_hfs_fileops_handler.h index bf709a7..dedd70e 100644 --- a/livefiles_hfs_plugin/lf_hfs_fileops_handler.h +++ b/livefiles_hfs_plugin/lf_hfs_fileops_handler.h @@ -63,4 +63,7 @@ int LFHFS_GetXAttr ( UVFSFileNode psNode, const char *pcAttr, void *pvOutBuf, int LFHFS_SetXAttr ( UVFSFileNode psNode, const char *pcAttr, const void *pvInBuf, size_t iBufSize, UVFSXattrHow How ); int LFHFS_ListXAttr ( UVFSFileNode psNode, void *pvOutBuf, size_t iBufSize, size_t *iActualSize ); +int LFHFS_StreamLookup ( UVFSFileNode psFileNode, UVFSStreamNode *ppsOutNode ); +int LFHFS_StreamReclaim (UVFSStreamNode psStreamNode ); +int LFHFS_StreamRead (UVFSStreamNode psStreamNode, uint64_t uOffset, size_t iLength, void *pvBuf, size_t *iActuallyRead ); #endif /* lf_hfs_fileops_handler_h */ diff --git a/livefiles_hfs_plugin/lf_hfs_fsops_handler.c b/livefiles_hfs_plugin/lf_hfs_fsops_handler.c index 40acd59..b4bb873 100644 --- a/livefiles_hfs_plugin/lf_hfs_fsops_handler.c +++ b/livefiles_hfs_plugin/lf_hfs_fsops_handler.c @@ -22,6 +22,9 @@ #include "lf_hfs_journal.h" #include "lf_hfs_vfsops.h" #include "lf_hfs_mount.h" +#include "lf_hfs_readwrite_ops.h" + +#include "lf_hfs_vnops.h" static int FSOPS_GetRootVnode(struct vnode* psDevVnode, struct vnode** ppsRootVnode) @@ -261,14 +264,15 @@ LFHFS_Unmount ( UVFSFileNode psRootNode, UVFSUnmountHint hint ) { VERIFY_NODE_IS_VALID(psRootNode); LFHFS_LOG(LEVEL_DEBUG, "HFS_Unmount (psRootNode %p) (hint %u)\n", psRootNode, hint); - + int iError = 0; struct vnode *psRootVnode = (struct vnode*) psRootNode; FileSystemRecord_s *psFSRecord = VPTOFSRECORD(psRootVnode); struct mount *psMount = psRootVnode->sFSParams.vnfs_mp; struct cnode *psDevCnode = VTOHFS(psRootVnode)->hfs_devvp->sFSParams.vnfs_fsnode; struct hfsmount *psHfsMp = psMount->psHfsmount; - + psFSRecord->uUnmountHint = hint; + #if HFS_CRASH_TEST CRASH_ABORT(CRASH_ABORT_ON_UNMOUNT, psHfsMp, NULL); #endif @@ -290,11 +294,24 @@ LFHFS_Unmount ( UVFSFileNode psRootNode, UVFSUnmountHint hint ) } int -LFHFS_SetFSAttr ( UVFSFileNode psNode, const char *pcAttr, const UVFSFSAttributeValue *psAttrVal, size_t uLen ) +LFHFS_SetFSAttr ( UVFSFileNode psNode, const char *pcAttr, const UVFSFSAttributeValue *psAttrVal, size_t uLen, UVFSFSAttributeValue *psOutAttrVal, size_t uOutLen ) { #pragma unused (psNode, pcAttr, psAttrVal, uLen) VERIFY_NODE_IS_VALID(psNode); - LFHFS_LOG(LEVEL_DEBUG, 
"LFHFS_SetFSAttr (ENOTSUP)\n"); + + if (pcAttr == NULL || psAttrVal == NULL || psOutAttrVal == NULL) return EINVAL; + + if (strcmp(pcAttr, LI_FSATTR_PREALLOCATE) == 0) + { + if (uLen < sizeof (UVFSFSAttributeValue) || uOutLen < sizeof (UVFSFSAttributeValue)) + return EINVAL; + + LIFilePreallocateArgs_t* psPreAllocReq = (LIFilePreallocateArgs_t *) ((void *) psAttrVal->fsa_opaque); + LIFilePreallocateArgs_t* psPreAllocRes = (LIFilePreallocateArgs_t *) ((void *) psOutAttrVal->fsa_opaque); + + memcpy (psPreAllocRes, psPreAllocReq, sizeof(LIFilePreallocateArgs_t)); + return hfs_vnop_preallocate(psNode, psPreAllocReq, psPreAllocRes); + } return ENOTSUP; } @@ -690,7 +707,11 @@ UVFSFSOps HFS_fsOps = { .fsops_listxattr = LFHFS_ListXAttr, .fsops_scandir = LFHFS_ScanDir, - .fsops_scanids = LFHFS_ScanIDs + .fsops_scanids = LFHFS_ScanIDs, + + .fsops_stream_lookup = LFHFS_StreamLookup, + .fsops_stream_reclaim = LFHFS_StreamReclaim, + .fsops_stream_read = LFHFS_StreamRead, }; #if HFS_CRASH_TEST diff --git a/livefiles_hfs_plugin/lf_hfs_raw_read_write.c b/livefiles_hfs_plugin/lf_hfs_raw_read_write.c index f3ab6fd..f4c93f2 100644 --- a/livefiles_hfs_plugin/lf_hfs_raw_read_write.c +++ b/livefiles_hfs_plugin/lf_hfs_raw_read_write.c @@ -12,6 +12,7 @@ #include "lf_hfs_file_mgr_internal.h" #include "lf_hfs_file_extent_mapping.h" #include "lf_hfs_vfsutils.h" +#include #define MAX_READ_WRITE_LENGTH (0x7ffff000) @@ -62,7 +63,8 @@ errno_t raw_readwrite_read_mount( vnode_t psMountVnode, uint64_t uBlockN, uint64 if ( iReadBytes != (ssize_t)uBufLen ) { iErr = ( (iReadBytes < 0) ? errno : EIO ); - LFHFS_LOG( LEVEL_ERROR, "raw_readwrite_read_mount failed [%d]\n", iErr ); + HFSLogLevel_e eLogLevel = (VNODE_TO_UNMOUNT_HINT(psMountVnode)==UVFSUnmountHintForce)?LEVEL_DEBUG:LEVEL_ERROR; + LFHFS_LOG( eLogLevel, "raw_readwrite_read_mount failed [%d]\n", iErr ); } if (puActuallyRead) @@ -85,7 +87,8 @@ errno_t raw_readwrite_write_mount( vnode_t psMountVnode, uint64_t uBlockN, uint6 uActuallyWritten = pwrite(iFD, pvBuf, (size_t)uBufLen, uWantedOffset); if ( uActuallyWritten != (ssize_t)uBufLen ) { iErr = ( (uActuallyWritten < 0) ? errno : EIO ); - LFHFS_LOG( LEVEL_ERROR, "raw_readwrite_write_mount failed [%d]\n", iErr ); + HFSLogLevel_e eLogLevel = (VNODE_TO_UNMOUNT_HINT(psMountVnode)==UVFSUnmountHintForce)?LEVEL_DEBUG:LEVEL_ERROR; + LFHFS_LOG( eLogLevel, "raw_readwrite_write_mount failed [%d]\n", iErr ); } if (piActuallyWritten) diff --git a/livefiles_hfs_plugin/lf_hfs_readwrite_ops.c b/livefiles_hfs_plugin/lf_hfs_readwrite_ops.c index ea0b44c..45d1eff 100644 --- a/livefiles_hfs_plugin/lf_hfs_readwrite_ops.c +++ b/livefiles_hfs_plugin/lf_hfs_readwrite_ops.c @@ -753,3 +753,144 @@ hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags) return error; } + +/* + * Preallocate file storage space. 
+ */
+int
+hfs_vnop_preallocate(struct vnode * vp, LIFilePreallocateArgs_t* psPreAllocReq, LIFilePreallocateArgs_t* psPreAllocRes)
+{
+    struct cnode *cp = VTOC(vp);
+    struct filefork *fp = VTOF(vp);
+    struct hfsmount *hfsmp = VTOHFS(vp);
+    ExtendedVCB *vcb = VTOVCB(vp);
+    int retval = E_NONE , retval2 = E_NONE;
+
+    off_t length = psPreAllocReq->length;
+    psPreAllocRes->bytesallocated = 0;
+
+    if (vnode_isdir(vp) || vnode_islnk(vp)) {
+        LFHFS_LOG(LEVEL_ERROR, "hfs_vnop_preallocate: Cannot change size of a directory or symlink!");
+        return EPERM;
+    }
+
+    if (length == 0)
+        return (0);
+
+    if (psPreAllocReq->flags & LI_PREALLOCATE_ALLOCATEFROMVOL){
+        LFHFS_LOG(LEVEL_ERROR, "hfs_vnop_preallocate: Not supporting LI_PREALLOCATE_ALLOCATEFROMVOL mode\n");
+        return ENOTSUP;
+    }
+
+    hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+
+    if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
+        goto err_exit;
+    }
+
+    off_t filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
+    off_t startingPEOF = filebytes;
+
+    /* If no changes are necessary, then we're done */
+    if (filebytes == length)
+        goto exit;
+
+    u_int32_t extendFlags = kEFNoClumpMask;
+    if (psPreAllocReq->flags & LI_PREALLOCATE_ALLOCATECONTIG)
+        extendFlags |= kEFContigMask;
+    if (psPreAllocReq->flags & LI_PREALLOCATE_ALLOCATEALL)
+        extendFlags |= kEFAllMask;
+
+
+    /*
+     * Lengthen the size of the file. We must ensure that the
+     * last byte of the file is allocated. Since the smallest
+     * value of filebytes is 0, length will be at least 1.
+     */
+    if (length > filebytes)
+    {
+        off_t total_bytes_added = 0, orig_request_size, moreBytesRequested, actualBytesAdded;
+        orig_request_size = moreBytesRequested = length - filebytes;
+
+        while ((length > filebytes) && (retval == E_NONE))
+        {
+            off_t bytesRequested;
+
+            if (hfs_start_transaction(hfsmp) != 0)
+            {
+                retval = EINVAL;
+                goto err_exit;
+            }
+
+            /* Protect extents b-tree and allocation bitmap */
+            int lockflags = SFL_BITMAP;
+            if (overflow_extents(fp))
+                lockflags |= SFL_EXTENTS;
+            lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+
+            if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
+                bytesRequested = HFS_BIGFILE_SIZE;
+            } else {
+                bytesRequested = moreBytesRequested;
+            }
+
+            retval = MacToVFSError(ExtendFileC(vcb,
+                                               (FCB*)fp,
+                                               bytesRequested,
+                                               0,
+                                               extendFlags,
+                                               &actualBytesAdded));
+
+            if (retval == E_NONE)
+            {
+                psPreAllocRes->bytesallocated += actualBytesAdded;
+                total_bytes_added += actualBytesAdded;
+                moreBytesRequested -= actualBytesAdded;
+            }
+
+            filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
+            hfs_systemfile_unlock(hfsmp, lockflags);
+
+            if (hfsmp->jnl) {
+                (void) hfs_update(vp, 0);
+                (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+            }
+
+            hfs_end_transaction(hfsmp);
+        }
+
+        /*
+         * if we get an error and no changes were made then exit
+         * otherwise we must do the hfs_update to reflect the changes
+         */
+        if (retval && (startingPEOF == filebytes))
+            goto err_exit;
+
+        /*
+         * Adjust actualBytesAdded to be allocation block aligned, not
+         * clump size aligned.
+         * NOTE: So what we are reporting does not affect reality
+         * until the file is closed, when we truncate the file to allocation
+         * block size.
+ */ + if (total_bytes_added != 0 && orig_request_size < total_bytes_added) + psPreAllocRes->bytesallocated = roundup(orig_request_size, (off_t)vcb->blockSize); + } else { + //No need to touch anything else, just unlock and go out + goto err_exit; + } + +exit: + cp->c_flag |= C_MODIFIED; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + retval2 = hfs_update(vp, 0); + + if (retval == 0) + retval = retval2; + +err_exit: + hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); + hfs_unlock(cp); + return (retval); +} diff --git a/livefiles_hfs_plugin/lf_hfs_readwrite_ops.h b/livefiles_hfs_plugin/lf_hfs_readwrite_ops.h index 491d31f..7a021f6 100644 --- a/livefiles_hfs_plugin/lf_hfs_readwrite_ops.h +++ b/livefiles_hfs_plugin/lf_hfs_readwrite_ops.h @@ -28,5 +28,6 @@ int hfs_vnop_blockmap(struct vnop_blockmap_args *ap); int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp); int hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, struct filefork *rsrcfork, u_int32_t fileid); int hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags); +int hfs_vnop_preallocate(struct vnode * vp, LIFilePreallocateArgs_t* psPreAllocReq, LIFilePreallocateArgs_t* psPreAllocRes); #endif /* lf_hfs_readwrite_ops_h */ diff --git a/livefiles_hfs_plugin/lf_hfs_vfsops.c b/livefiles_hfs_plugin/lf_hfs_vfsops.c index 4dab9c8..4689cad 100644 --- a/livefiles_hfs_plugin/lf_hfs_vfsops.c +++ b/livefiles_hfs_plugin/lf_hfs_vfsops.c @@ -839,6 +839,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args) retval = hfs_CollectBtreeStats(hfsmp, vhp, embeddedOffset, args); free(vhp); + vhp = NULL; if ( retval ) { LFHFS_LOG(LEVEL_DEBUG, "hfs_mountfs: hfs_CollectBtreeStats encountered failure %d \n", retval); diff --git a/livefiles_hfs_plugin/lf_hfs_vnode.c b/livefiles_hfs_plugin/lf_hfs_vnode.c index 292a7a2..0bc73b6 100644 --- a/livefiles_hfs_plugin/lf_hfs_vnode.c +++ b/livefiles_hfs_plugin/lf_hfs_vnode.c @@ -11,6 +11,8 @@ #include "lf_hfs_vfsutils.h" #include "lf_hfs_generic_buf.h" #include "lf_hfs_fileops_handler.h" +#include "lf_hfs_xattr.h" +#include int VTtoUVFS_tab[16] = { @@ -166,8 +168,72 @@ void vnode_GetAttrInternal (vnode_t vp, UVFSFileAttributes *psOutAttr ) } else { - psOutAttr->fa_allocsize = VCTOF(vp, cp)->ff_blocks * VTOHFS(vp)->blockSize; - psOutAttr->fa_size = VCTOF(vp, cp)->ff_size; + if (psOutAttr->fa_bsd_flags & UF_COMPRESSED) + { + if (VNODE_IS_RSRC(vp)) + { + psOutAttr->fa_allocsize = VTOF(vp)->ff_blocks * VTOHFS(vp)->blockSize; + psOutAttr->fa_size = VTOF(vp)->ff_size; + } + else + { + hfs_unlock(VTOC(vp)); + void* data = NULL; + size_t attr_size; + int iErr = hfs_vnop_getxattr(vp, "com.apple.decmpfs", NULL, 0, &attr_size); + if (iErr != 0) { + goto fail; + } + + if (attr_size < sizeof(decmpfs_disk_header) || attr_size > MAX_DECMPFS_XATTR_SIZE) { + iErr = EINVAL; + goto fail; + } + /* allocation includes space for the extra attr_size field of a compressed_header */ + data = (char *) malloc(attr_size); + if (!data) { + iErr = ENOMEM; + goto fail; + } + + /* read the xattr into our buffer, skipping over the attr_size field at the beginning */ + size_t read_size; + iErr = hfs_vnop_getxattr(vp, "com.apple.decmpfs", data, attr_size, &read_size); + if (iErr != 0) { + goto fail; + } + if (read_size != attr_size) { + iErr = EINVAL; + goto fail; + } + + decmpfs_header Hdr; + Hdr.attr_size = (uint32_t) attr_size; + Hdr.compression_magic = *((uint32_t*)data); + Hdr.compression_type = *((uint32_t*)(data + sizeof(uint32_t))); + 
Hdr.uncompressed_size = *((uint32_t*)(data + sizeof(uint64_t))); + +fail: + if (iErr) + { + psOutAttr->fa_allocsize = VCTOF(vp, cp)->ff_blocks * VTOHFS(vp)->blockSize; + psOutAttr->fa_size = VCTOF(vp, cp)->ff_size; + } + else + { + psOutAttr->fa_allocsize = ROUND_UP(Hdr.uncompressed_size,VTOHFS(vp)->blockSize); + psOutAttr->fa_size = Hdr.uncompressed_size; + } + + if (data) free(data); + hfs_lock(VTOC(vp), 0, 0); + } + } + else + { + psOutAttr->fa_allocsize = VCTOF(vp, cp)->ff_blocks * VTOHFS(vp)->blockSize; + psOutAttr->fa_size = VCTOF(vp, cp)->ff_size; + } psOutAttr->fa_nlink = (cp->c_flag & C_HARDLINK)? cp->c_linkcount : 1; } } diff --git a/livefiles_hfs_plugin/lf_hfs_vnode.h b/livefiles_hfs_plugin/lf_hfs_vnode.h index ece2e07..d3afabe 100644 --- a/livefiles_hfs_plugin/lf_hfs_vnode.h +++ b/livefiles_hfs_plugin/lf_hfs_vnode.h @@ -12,18 +12,7 @@ #include #include "lf_hfs_common.h" - -/* - * Vnode types. VNON means no type. - */ -enum vtype { - /* 0 */ - VNON, - /* 1 - 5 */ - VREG, VDIR, VBLK, VCHR, VLNK, - /* 6 - 10 */ - VSOCK, VFIFO, VBAD, VSTR, VCPLX -}; +#include extern int VTtoUVFS_tab[]; diff --git a/livefiles_hfs_plugin/lf_hfs_vnops.c b/livefiles_hfs_plugin/lf_hfs_vnops.c index d040e97..a30b815 100644 --- a/livefiles_hfs_plugin/lf_hfs_vnops.c +++ b/livefiles_hfs_plugin/lf_hfs_vnops.c @@ -1202,7 +1202,8 @@ relock: hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); if (rvp) { - hfs_free(rvp); + hfs_chash_lower_OpenLookupCounter(cp); + rvp = NULL; } return (error); } @@ -1228,7 +1229,7 @@ relock: { /* We need to acquire the rsrc vnode */ rvp = cp->c_rsrc_vp; - + hfs_chash_raise_OpenLookupCounter(cp); /* Unlock everything to acquire iocount on the rsrc vnode */ hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT); hfs_unlockpair (dcp, cp); @@ -1263,8 +1264,10 @@ rm_done: hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT); if (rvp) - hfs_free(rvp); - + { + hfs_chash_lower_OpenLookupCounter(cp); + rvp = NULL; + } return (error); } @@ -2342,7 +2345,6 @@ retry: if (tvp_rsrc && tcp) { hfs_chash_lower_OpenLookupCounter(tcp); - hfs_free(tvp_rsrc); tvp_rsrc = NULL; } @@ -2399,7 +2401,6 @@ retry: took_trunc_lock = 0; } - hfs_unlockfour(fdcp, fcp, tdcp, tcp); goto retry; @@ -2760,7 +2761,7 @@ retry: if ((error == 0) && (tcp->c_flag & C_DELETED) && (tvp_rsrc)) { hfs_chash_lower_OpenLookupCounter(tcp); - hfs_free(tvp_rsrc); + tvp_rsrc = NULL; } } @@ -2947,7 +2948,6 @@ out: if (tvp_rsrc) { hfs_chash_lower_OpenLookupCounter(tcp); - hfs_free(tvp_rsrc); tvp_rsrc = NULL; } @@ -3171,3 +3171,175 @@ int hfs_removefile_callback(GenericLFBuf *psBuff, void *pvArgs) { return (0); } + +/* + * hfs_vgetrsrc acquires a resource fork vnode corresponding to the + * cnode that is found in 'vp'. The cnode should be locked upon entry + * and will be returned locked, but it may be dropped temporarily. + * + * If the resource fork vnode does not exist, HFS will attempt to acquire an + * empty (uninitialized) vnode from VFS so as to avoid deadlocks with + * jetsam. If we let the normal getnewvnode code produce the vnode for us + * we would be doing so while holding the cnode lock of our cnode. + * + * On success, *rvpp wlll hold the resource fork vnode with an + * iocount. *Don't* forget the vnode_put. 
+ */
+int
+hfs_vgetrsrc( struct vnode *vp, struct vnode **rvpp)
+{
+    struct hfsmount *hfsmp = VTOHFS(vp);
+    struct vnode *rvp = NULL;
+    struct cnode *cp = VTOC(vp);
+    int error = 0;
+
+restart:
+    /* Attempt to use existing vnode */
+    if ((rvp = cp->c_rsrc_vp)) {
+        hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+        hfs_chash_raise_OpenLookupCounter(cp);
+
+    } else {
+        struct cat_fork rsrcfork;
+        struct cat_desc *descptr = NULL;
+        struct cat_desc to_desc;
+        int newvnode_flags = 0;
+
+        hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
+
+        /*
+         * We could have raced with another thread here while we dropped our cnode
+         * lock. See if the cnode now has a resource fork vnode and restart if appropriate.
+         *
+         * Note: We just released the cnode lock, so there is a possibility that the
+         * cnode that we just acquired has been deleted or even removed from disk
+         * completely, though this is unlikely. If the file is open-unlinked, the
+         * check below will resolve it for us. If it has been completely
+         * removed (even from the catalog!), then when we examine the catalog
+         * directly, below, while holding the catalog lock, we will not find the
+         * item and we can fail out properly.
+         */
+        if (cp->c_rsrc_vp) {
+            /* Drop the empty vnode before restarting */
+            hfs_unlock(cp);
+            rvp = NULL;
+            goto restart;
+        }
+
+        /*
+         * hfs_vgetrsrc may be invoked for a cnode that has already been marked
+         * C_DELETED. This is because we need to continue to provide rsrc
+         * fork access to open-unlinked files. In this case, build a fake descriptor
+         * like in hfs_removefile. If we don't do this, buildkey will fail in
+         * cat_lookup because this cnode has no name in its descriptor.
+         */
+        if ((cp->c_flag & C_DELETED ) && (cp->c_desc.cd_namelen == 0)) {
+            char delname[32];
+            bzero (&to_desc, sizeof(to_desc));
+            bzero (delname, 32);
+            MAKE_DELETED_NAME(delname, sizeof(delname), cp->c_fileid);
+            to_desc.cd_nameptr = (const u_int8_t*) delname;
+            to_desc.cd_namelen = strlen(delname);
+            to_desc.cd_parentcnid = hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid;
+            to_desc.cd_flags = 0;
+            to_desc.cd_cnid = cp->c_cnid;
+
+            descptr = &to_desc;
+        }
+        else {
+            descptr = &cp->c_desc;
+        }
+
+
+        int lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+
+        /*
+         * We call cat_idlookup (instead of cat_lookup) below because we can't
+         * trust the descriptor in the provided cnode for lookups at this point.
+         * Between the time of the original lookup of this vnode and now, the
+         * descriptor could have gotten swapped or replaced. If this occurred,
+         * the parent/name combo originally desired may not necessarily be provided
+         * if we use the descriptor. Even worse, if the vnode represents
+         * a hardlink, we could have removed one of the links from the namespace
+         * but left the descriptor alone, since hfs_unlink does not invalidate
+         * the descriptor in the cnode if other links still point to the inode.
+         *
+         * Consider the following (slightly contrived) scenario:
+         * /tmp/a <--> /tmp/b (hardlinks).
+         * 1. Thread A: open rsrc fork on /tmp/b.
+         * 1a. Thread A: does lookup, goes out to lunch right before calling getnamedstream.
+         * 2. Thread B does 'mv /foo/b /tmp/b'
+         * 3. Thread B succeeds.
+         * 4. Thread A comes back and wants rsrc fork info for /tmp/b.
+         *
+         * Even though the hardlink backing /tmp/b is now eliminated, the descriptor
+         * is not removed/updated during the unlink process. So, if you were to
+         * do a lookup on /tmp/b, you'd acquire an entirely different record's resource
+         * fork.
+ * + * As a result, we use the fileid, which should be invariant for the lifetime + * of the cnode (possibly barring calls to exchangedata). + * + * Addendum: We can't do the above for HFS standard since we aren't guaranteed to + * have thread records for files. They were only required for directories. So + * we need to do the lookup with the catalog name. This is OK since hardlinks were + * never allowed on HFS standard. + */ + + /* Get resource fork data */ + error = cat_idlookup (hfsmp, cp->c_fileid, 0, 1, NULL, NULL, &rsrcfork); + + hfs_systemfile_unlock(hfsmp, lockflags); + if (error) { + LFHFS_LOG(LEVEL_ERROR, "hfs_vgetrsrc: cat_idlookup failed with error [%d]\n", error); + hfs_unlock(cp); + hfs_chash_lower_OpenLookupCounter(cp); + return (error); + } + /* + * Supply hfs_getnewvnode with a component name. + */ + struct componentname cn; + cn.cn_pnbuf = NULL; + if (descptr->cd_nameptr) { + void *buf = hfs_malloc(MAXPATHLEN); + + cn = (struct componentname){ + .cn_nameiop = LOOKUP, + .cn_flags = ISLASTCN, + .cn_pnlen = MAXPATHLEN, + .cn_pnbuf = buf, + .cn_nameptr = buf, + .cn_namelen = snprintf(buf, MAXPATHLEN, + "%s%s", descptr->cd_nameptr, + _PATH_RSRCFORKSPEC) + }; + + // Should never happen because cn.cn_nameptr won't ever be long... + if (cn.cn_namelen >= MAXPATHLEN) { + hfs_free(buf); + LFHFS_LOG(LEVEL_ERROR, "hfs_vgetrsrc: cnode name too long [ENAMETOOLONG]\n"); + hfs_unlock(cp); + hfs_chash_lower_OpenLookupCounter(cp); + return ENAMETOOLONG; + } + } + + /* + * We are about to call hfs_getnewvnode and pass in the vnode that we acquired + * earlier when we were not holding any locks. The semantics of GNV_USE_VP require that + * either hfs_getnewvnode consume the vnode and vend it back to us, properly initialized, + * or it will consume/dispose of it properly if it errors out. + */ + error = hfs_getnewvnode(hfsmp, NULL, cn.cn_pnbuf ? &cn : NULL, + descptr, (GNV_WANTRSRC | GNV_SKIPLOCK), + &cp->c_attr, &rsrcfork, &rvp, &newvnode_flags); + + hfs_free(cn.cn_pnbuf); + if (error) + return (error); + } /* End 'else' for rsrc fork not existing */ + + *rvpp = rvp; + return (0); +} diff --git a/livefiles_hfs_plugin/lf_hfs_vnops.h b/livefiles_hfs_plugin/lf_hfs_vnops.h index 6c1d1f6..5db2b8b 100644 --- a/livefiles_hfs_plugin/lf_hfs_vnops.h +++ b/livefiles_hfs_plugin/lf_hfs_vnops.h @@ -48,4 +48,6 @@ int hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cn int hfs_vnop_renamex(struct vnode *fdvp,struct vnode *fvp, struct componentname *fcnp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp); int hfs_vnop_link(vnode_t vp, vnode_t tdvp, struct componentname *cnp); int hfs_removefile_callback(GenericLFBuf *psBuff, void *pvArgs); + +int hfs_vgetrsrc( struct vnode *vp, struct vnode **rvpp); #endif /* lf_hfs_vnops_h */ diff --git a/make_opensource.sh b/make_opensource.sh deleted file mode 100755 index 5b40db1..0000000 --- a/make_opensource.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/sh - -# -# This script processes the directory hierarchy -# passed to it and eliminates all source code, -# makefile fragments, and documentation that is -# not suitable for open source posting. -# - -OPENSOURCE=1 - -DST=/tmp/hfs-open-source - -rm -rf $DST -mkdir $DST -xcodebuild installsrc SRCROOT=$DST - -SRCROOT="$DST" - -if [ ! 
-d "${SRCROOT}" ]; then - echo "Could not access ${SRCROOT}" 1>&2 - exit 1 -fi - - -UNIFDEF_FLAGS="" -if [ "$OPENSOURCE" -eq 1 ]; then - UNIFDEF_FLAGS="$UNIFDEF_FLAGS -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ -U__arm__ -Uarm -UARM -U__ARM__ -U__arm64__ -Uarm64 -UARM64 -U__ARM64__ -UTARGET_OS_EMBEDDED -UHFS_CONFIG_KEY_ROLL" -fi - -# From this point forward, all paths are ./-relative -cd "${SRCROOT}" - -find -d . -name .open_source_exclude | while read f; do - dir=`dirname $f` - if [ -s $f ]; then - cat $f | while read g; do - if [ -n "$g" ]; then - echo "Removing $dir/$g (Listed in $f)" - rm -f "$dir/$g" || exit 1 - else - echo "Bad entry '$g' in $f" - exit 1 - fi - done - if [ $? -ne 0 ]; then - exit 1 - fi - else - echo "Removing $dir (Contains empty $f)" - rm -rf "$dir" - fi - rm -f "$f" -done - -if [ $? -ne 0 ]; then - # Propagate error from sub-shell invocation above - exit 1 -fi - -function stripfile() { - local extraflags="$1" - local path="$2" - - unifdef $extraflags $UNIFDEF_FLAGS $path > $path.new - if [ $? -eq 0 ]; then - # no change - rm $path.new - else - if [ $? -eq 2 ]; then - echo "Problems parsing $path, removing..." - rm $path.new $path - else - if [ -s $path.new ]; then - echo "Modified $path" - mv -f $path.new $path - else - echo "Removing empty $path" - rm -f $path.new $path - fi - fi - fi -} - -# C/C++ Source files -find . \( -type f -o -type l \) -a \( -name "*.[chy]" -o -name "*.cpp" \) | while read f; do - stripfile "" "$f" -done - -# Free-form plain text files -find . \( -type f -o -type l \) -a \( -name "*.[sS]" -o -name "*.sh" -o -name "README" -o -name "*.py" \) | while read f; do - stripfile "-t" "$f" - case "$f" in - *.sh) - chmod +x "$f" - ;; - esac -done - -# Remove project references -grep -i -v -E '(hfs_key_roll)' ./hfs.xcodeproj/project.pbxproj > ./hfs.xcodeproj/project.pbxproj.new -mv -f ./hfs.xcodeproj/project.pbxproj.new ./hfs.xcodeproj/project.pbxproj - -# Check for remaining bad file names -BADFILES=`find . \( -name "*.arm*" -o -name "arm*" \) | xargs echo`; -if [ -n "$BADFILES" ]; then - echo "Bad file names $BADFILES" - exit 1 -fi - -# Check for remaining bad file contents -if grep -iEr '([^UD_]_?_OPEN_SOURCE_|XNU_HIDE_SEED|XNU_HIDE_HARDWARE|CONFIG_EMBEDDED)' .; then - echo "cleanup FAILURE" - exit 1 -else - echo "done" - exit 0 -fi -- 2.45.2
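
For readers tracing the new core/rangelist.c code in this patch: the sketch below restates its six-way overlap classification as a small standalone C program that compiles and runs in user space. It is illustrative only and is not part of the patch; the enum values mirror core/rangelist.h, while the classify() helper, the sample ranges, and main() are hypothetical names introduced here for demonstration.

/*
 * Standalone sketch (not part of the hfs sources): the same six-way
 * range-overlap classification that core/rangelist.c performs in
 * rl_overlap(), restated on plain [start, end] pairs.
 */
#include <stdio.h>
#include <stdint.h>

enum rl_overlaptype {
    RL_NOOVERLAP = 0,           /* 0: ranges are disjoint                 */
    RL_MATCHINGOVERLAP,         /* 1: ranges are identical                */
    RL_OVERLAPCONTAINSRANGE,    /* 2: existing range contains the query   */
    RL_OVERLAPISCONTAINED,      /* 3: query contains the existing range   */
    RL_OVERLAPSTARTSBEFORE,     /* 4: existing range straddles query start */
    RL_OVERLAPENDSAFTER         /* 5: existing range straddles query end  */
};

/* Same decision order as rl_overlap(): disjoint, equal, containing,
 * contained, straddling on the left, straddling on the right. */
static enum rl_overlaptype
classify(int64_t r_start, int64_t r_end, int64_t start, int64_t end)
{
    if (start > r_end || r_start > end)   return RL_NOOVERLAP;
    if (r_start == start && r_end == end) return RL_MATCHINGOVERLAP;
    if (r_start <= start && r_end >= end) return RL_OVERLAPCONTAINSRANGE;
    if (start <= r_start && end >= r_end) return RL_OVERLAPISCONTAINED;
    if (r_start < start && r_end < end)   return RL_OVERLAPSTARTSBEFORE;
    return RL_OVERLAPENDSAFTER;
}

int main(void)
{
    /* An existing range [100, 199] probed with one query per case. */
    printf("%d\n", classify(100, 199, 300, 400)); /* 0: no overlap       */
    printf("%d\n", classify(100, 199, 100, 199)); /* 1: matching         */
    printf("%d\n", classify(100, 199, 150, 160)); /* 2: contains range   */
    printf("%d\n", classify(100, 199,  50, 250)); /* 3: is contained     */
    printf("%d\n", classify(100, 199, 150, 250)); /* 4: starts before    */
    printf("%d\n", classify(100, 199,  50, 150)); /* 5: ends after       */
    return 0;
}

The order of the tests matters: disjointness is ruled out first, then exact match, then containment in both directions, so only the two straddling cases remain at the end. This is also why rl_subtract() in the patch can key its trimming decisions directly off the returned case.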