/*
* ntfs_index.c - NTFS kernel index handling.
*
* Copyright (c) 2006-2008 Anton Altaparmakov. All Rights Reserved.
* Portions Copyright (c) 2006-2008 Apple Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of Apple Inc. ("Apple") nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ALTERNATIVELY, provided that this notice and licensing terms are retained in
* full, this file may be redistributed and/or modified under the terms of the
* GNU General Public License (GPL) Version 2, in which case the provisions of
* that version of the GPL will apply to you instead of the license terms
* above. You can obtain a copy of the GPL Version 2 at
* http://developer.apple.com/opensource/licenses/gpl-2.txt.
*/
#include <sys/errno.h>
#include <string.h>
#include <libkern/libkern.h>
#include <libkern/OSMalloc.h>
#include <kern/debug.h>
#include <kern/locks.h>

#include "ntfs.h"
#include "ntfs_attr.h"
#include "ntfs_attr_list.h"
#include "ntfs_bitmap.h"
#include "ntfs_collate.h"
#include "ntfs_debug.h"
#include "ntfs_dir.h"
#include "ntfs_endian.h"
#include "ntfs_index.h"
#include "ntfs_inode.h"
#include "ntfs_layout.h"
#include "ntfs_lcnalloc.h"
#include "ntfs_mft.h"
#include "ntfs_page.h"
#include "ntfs_runlist.h"
#include "ntfs_types.h"
#include "ntfs_unistr.h"
#include "ntfs_volume.h"

/**
* ntfs_index_ctx_unlock - unlock an index context
* @ictx: index context to unlock
*
* Unlock the index context @ictx. We also unmap the mft record (in index root
* case) or the page (in index allocation block case) thus all pointers into
* the index node need to be revalidated when the mft record or page is mapped
* again in ntfs_index_ctx_relock().
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
* - The index context @ictx must be locked.
 */
void ntfs_index_ctx_unlock(ntfs_index_context *ictx)
{
if (!ictx)
panic("%s(): !ictx\n", __FUNCTION__);
if (!ictx->is_locked)
panic("%s(): !ictx->is_locked\n", __FUNCTION__);
if (ictx->is_root) {
if (ictx->actx) {
ntfs_attr_search_ctx_put(ictx->actx);
if (!ictx->base_ni)
panic("%s(): !ictx->base_ni\n", __FUNCTION__);
ntfs_mft_record_unmap(ictx->base_ni);
ictx->actx = NULL;
}
} else {
if (ictx->upl) {
ntfs_page_unmap(ictx->idx_ni, ictx->upl, ictx->pl,
ictx->is_dirty);
ictx->upl = NULL;
ictx->is_dirty = 0;
}
}
ictx->is_locked = 0;
}
/**
* ntfs_index_ctx_path_unlock - unlock an entire B+tree index context path
* @index_ctx: index context whose whole path to unlock
*
* Unlock all index contexts attached to the B+tree path to which @index_ctx
* belongs.
*
* This function is only called in error handling to ensure nothing is held
* busy so the error handling code cannot deadlock.
*
* Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode.
 */
static void ntfs_index_ctx_path_unlock(ntfs_index_context *index_ctx)
{
ntfs_index_context *ictx;
ictx = index_ctx;
if (!ictx)
panic("%s(): !ictx\n", __FUNCTION__);
/*
* Note we traverse the tree path backwards (upwards) because @up is
* the first element in the index_context structure thus doing things
* this way generates faster/better machine code.
	 */
	do {
if (ictx->is_locked)
ntfs_index_ctx_unlock(ictx);
} while ((ictx = ictx->up) != index_ctx);
}
/**
* ntfs_index_ctx_relock - relock an index context that was unlocked earlier
* @ictx: index context to relock
*
* Relock the index context @ictx after it was unlocked with
* ntfs_index_ctx_unlock(). We also remap the mft record (in index root case)
* or the page (in index allocation block case) after which we revalidate all
* pointers into the index node because the page may have been mapped into a
* different virtual address and the mft record may have been changed with the
* result that the index root attribute is moved within the mft record or even
* to a completely different mft record.
*
* Note the check whether to revalidate or not is very simple because the index
 * node content cannot have changed thus all pointers change by a fixed offset
* delta which once determined can be applied to all pointers.
*
* In the index root case, there is also a non-pointer index context field that
* can have changed (and it does so irrespective of the index root position).
* This is @ictx->bytes_free as that is dependent on the other attributes in
* the mft record which can have changed legitimately under our feet which of
* course is the reason why the index root can have moved about, too.
*
* Return 0 on success and errno on error.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
* - The index context @ictx must not be locked.
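 *
 * A minimal usage sketch (illustrative only; the work done while unlocked is
 * an assumption, not taken from this file):
 *
 *	ntfs_index_ctx_unlock(ictx);
 *	... do work that must not hold the mft record or page mapped ...
 *	err = ntfs_index_ctx_relock(ictx);
 *	if (err)
 *		return err;
 *	... all pointers cached in @ictx are valid again here ...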
*/
errno_t ntfs_index_ctx_relock(ntfs_index_context *ictx)
{
long delta;
errno_t err;
if (!ictx)
panic("%s(): !ictx\n", __FUNCTION__);
if (ictx->is_locked)
panic("%s(): ictx->is_locked\n", __FUNCTION__);
if (ictx->is_root) {
MFT_RECORD *m;
ntfs_attr_search_ctx *actx;
if (ictx->actx)
panic("%s(): ictx->actx\n", __FUNCTION__);
if (!ictx->base_ni)
panic("%s(): !ictx->base_ni\n", __FUNCTION__);
err = ntfs_mft_record_map(ictx->base_ni, &m);
if (err) {
ntfs_error(ictx->idx_ni->vol->mp, "Failed to lock ""index root because ""ntfs_mft_record_map() failed (error ""%d).", err);
return err;
}
ictx->actx = actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
if (!actx) {
ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate ""attribute search context.");
err = ENOMEM;
			goto actx_err;
}
err = ntfs_attr_lookup(AT_INDEX_ROOT, ictx->idx_ni->name,
ictx->idx_ni->name_len, 0, NULL, 0, actx);
if (err) {
if (err == ENOENT)
panic("%s(): err == ENOENT\n", __FUNCTION__);
ntfs_error(ictx->idx_ni->vol->mp, "Failed to look up ""index root attribute (error %d).",
err);
			goto actx_err;
}
ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
le32_to_cpu(actx->m->bytes_in_use);
/* Get to the index root value. */
ictx->ir = (INDEX_ROOT*)((u8*)actx->a +
le16_to_cpu(actx->a->value_offset));
delta = (u8*)&ictx->ir->index - (u8*)ictx->index;
ictx->index = &ictx->ir->index;
} else {
u8 *addr;
if (ictx->upl)
panic("%s(): ictx->upl\n", __FUNCTION__);
if (ictx->is_dirty)
panic("%s(): ictx->is_dirty\n", __FUNCTION__);
err = ntfs_page_map(ictx->idx_ni, ictx->upl_ofs, &ictx->upl,
&ictx->pl, &addr, TRUE);
if (err) {
ntfs_error(ictx->idx_ni->vol->mp, "Failed to map ""index page (error %d).", err);
ictx->upl = NULL;
return err;
}
/* Get to the index allocation block. */
delta = addr - ictx->addr;
if (delta) {
ictx->addr = addr;
ictx->ia = (INDEX_ALLOCATION*)((u8*)ictx->ia + delta);
ictx->index = &ictx->ia->index;
}
}
if (delta) {
INDEX_ENTRY **entries;
unsigned u;
/*
* The index node has moved thus we have to update all stored
* pointers so they point into the new memory location.
*/
ictx->entry = (INDEX_ENTRY*)((u8*)ictx->entry + delta);
entries = ictx->entries;
for (u = 0; u < ictx->nr_entries; u++)
entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta);
}
ictx->is_locked = 1;
return 0;
actx_err:
if (ictx->actx) {
ntfs_attr_search_ctx_put(ictx->actx);
ictx->actx = NULL;
}
ntfs_mft_record_unmap(ictx->base_ni);
return err;
}
/**
* ntfs_index_ctx_get - allocate and initialize a new index context
* @idx_ni: ntfs index inode with which to initialize the context
*
* Allocate a new index context, initialize it with @idx_ni and return it.
*
* Return NULL if the allocation failed.
*
* Locking: Caller must hold @idx_ni->lock on the index inode.
*/
ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
{
ntfs_index_context *ictx;
ictx = ntfs_index_ctx_alloc();
if (ictx)
ntfs_index_ctx_init(ictx, idx_ni);
return ictx;
}
/**
* ntfs_index_ctx_put_reuse_single - release an index context but do not free it
* @ictx: index context to release
*
* Release the index context @ictx, releasing all associated resources but keep
* the index context itself allocated so it can be reused with a call to
* ntfs_index_ctx_init().
*
* This function ignores the tree path which this entry may be a part of and is
* only a helper function for ntfs_index_ctx_put_reuse().
*
* Locking: Caller must hold @ictx->idx_ni->lock on the index inode.
 */
void ntfs_index_ctx_put_reuse_single(ntfs_index_context *ictx)
{
if (ictx->entry && ictx->is_locked)
ntfs_index_ctx_unlock(ictx);
if (ictx->entries)
OSFree(ictx->entries, ictx->max_entries * sizeof(INDEX_ENTRY*),
ntfs_malloc_tag);
}
/**
* ntfs_index_ctx_put_reuse - release an index context but do not free it
* @index_ctx: index context to release
*
* Release the index context @index_ctx, releasing all associated resources but
* keep the index context itself allocated so it can be reused with a call to
* ntfs_index_ctx_init().
*
* Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode.
 */
void ntfs_index_ctx_put_reuse(ntfs_index_context *index_ctx)
{
ntfs_index_context *ictx, *next;
/*
* Destroy all tree path components except @index_ctx itself. We need
* the temporary index context pointer @next because we deallocate the
* current index context in each iteration of the loop thus we would no
* longer have any means of finding the next index context in the tree
* path if we had not already stored a pointer to it in @next. Note we
* actually traverse the tree path backwards (upwards) because @up is
* the first element in the index_context structure thus doing things
* this way generates faster/better machine code.
	 */
	for (ictx = index_ctx->up, next = ictx->up; ictx != index_ctx;
ictx = next, next = next->up) {
/*
* Disconnect the current index context from the tree and
* release it and all its resources.
*/
ntfs_index_ctx_put_single(ictx);
}
/* Reuse the only remaining, bottom entry. */
ntfs_index_ctx_put_reuse_single(index_ctx);
}
/**
* ntfs_index_get_entries - get the entries for the index node into the context
* @ictx: index context for which to get the entries
*
* Loop through the entries in the index node described by @ictx and gather all
* the index entries into the @ictx->entries array which is also allocated by
* this function.
*
* Return 0 on success and errno on error.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
* - The index context @ictx must be locked.
 */
static errno_t ntfs_index_get_entries(ntfs_index_context *ictx)
{
u8 *index_end;
INDEX_HEADER *index;
INDEX_ENTRY *ie, **entries;
unsigned nr_entries, max_entries;
errno_t err;
BOOL is_view;
ntfs_debug("Entering.");
if (!ictx->is_locked)
panic("%s(): !ictx->is_locked\n", __FUNCTION__);
is_view = FALSE;
if (ictx->idx_ni->name != I30)
is_view = TRUE;
nr_entries = 0;
max_entries = ictx->max_entries;
/* Allocate memory for the index entry pointers in the index node. */
entries = OSMalloc(max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag);
if (!entries) {
ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate index ""entry pointer array.");
return ENOMEM;
}
index = ictx->index;
index_end = (u8*)index + le32_to_cpu(index->index_length);
/* The first index entry. */
ie = (INDEX_ENTRY*)((u8*)index + le32_to_cpu(index->entries_offset));
/*
* Loop until we exceed valid memory (corruption case) or until we
* reach the last entry and for each entry place a pointer to it into
* our array of entry pointers.
	 */
	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
		/* Bounds checks. */
		if ((u8*)ie < (u8*)index || (u8*)ie +
sizeof(INDEX_ENTRY_HEADER) > index_end ||
(u8*)ie + le16_to_cpu(ie->length) > index_end ||
(u32)sizeof(INDEX_ENTRY_HEADER) +
le16_to_cpu(ie->key_length) >
le16_to_cpu(ie->length))
			goto err;
		/* Add this entry to the array of entry pointers. */
		if (nr_entries >= max_entries)
panic("%s(): nr_entries >= max_entries\n",
__FUNCTION__);
entries[nr_entries++] = ie;
if (ie->flags & INDEX_ENTRY_END)
break;
		/* Further bounds checks for view indexes. */
		if (is_view && ((u32)sizeof(INDEX_ENTRY_HEADER) +
le16_to_cpu(ie->key_length) >
le16_to_cpu(ie->data_offset) ||
(u32)le16_to_cpu(ie->data_offset) +
le16_to_cpu(ie->data_length) >
le16_to_cpu(ie->length)))
			goto err;
}
	/* Except for the index root, leaf nodes are not allowed to be empty. */
	if (nr_entries < 2 && !ictx->is_root && !(index->flags & INDEX_NODE)) {
ntfs_error(ictx->idx_ni->vol->mp, "Illegal empty leaf node ""found in index.");
err = EIO;
		goto err;
}
ictx->entries = entries;
ictx->nr_entries = nr_entries;
ntfs_debug("Done.");
return 0;
err:
OSFree(entries, max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag);
ntfs_error(ictx->idx_ni->vol->mp, "Corrupt index in inode 0x%llx. ""Run chkdsk.",
			(unsigned long long)ictx->idx_ni->mft_no);
NVolSetErrors(ictx->idx_ni->vol);
return EIO;
}
/**
* ntfs_index_lookup_init - prepare an index context for a lookup
* @ictx: [IN] index context to prepare
* @key_len: [IN] size of the key ntfs_index_lookup() is called with in bytes
*
* Prepare the index context @ictx for a call to ntfs_index_lookup() or
* ntfs_index_lookup_by_position().
*
* @key_len is the length of the key ntfs_index_lookup() will be called with.
* If the index @ictx is a directory index this is ignored.
*
* Return 0 on success and errno on error.
 */
static errno_t ntfs_index_lookup_init(ntfs_index_context *ictx,
		const int key_len)
{
ntfs_inode *idx_ni;
MFT_RECORD *m;
ntfs_attr_search_ctx *actx;
INDEX_ROOT *ir;
unsigned min_entry_size, max_entries;
errno_t err;
ntfs_debug("Entering.");
if (!ictx->base_ni)
panic("%s(): !ictx->base_ni\n", __FUNCTION__);
idx_ni = ictx->idx_ni;
if (idx_ni->type != AT_INDEX_ALLOCATION)
panic("%s(): idx_ni->type != AT_INDEX_ALLOCATION\n",
__FUNCTION__);
/*
* Ensure the index context is still uninitialized, i.e. it is not
* legal to call ntfs_index_lookup*() with a search context that has
* been used already.
	 */
	if (ictx->up != ictx || ictx->max_entries)
panic("%s(): Called for already used index context.\n",
__FUNCTION__);
if (!ntfs_is_collation_rule_supported(idx_ni->collation_rule)) {
ntfs_error(idx_ni->vol->mp, "Index uses unsupported collation ""rule 0x%x. Aborting.",
le32_to_cpu(idx_ni->collation_rule));
return ENOTSUP;
}
/* Get hold of the mft record for the index inode. */
err = ntfs_mft_record_map(ictx->base_ni, &m);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to map mft record (error ""%d.", err);
return err;
}
actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
if (!actx) {
err = ENOMEM;
		goto err;
}
/* Find the index root attribute in the mft record. */
err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
0, NULL, 0, actx);
if (err) {
if (err == ENOENT) {
ntfs_error(idx_ni->vol->mp, "Index root attribute ""missing in inode 0x%llx. Inode is ""corrupt. Run chkdsk.",
					(unsigned long long)idx_ni->mft_no);
/*
* This will cause the returned error to be EIO and the
* volume to be marked as containing errors.
*/
err = 0;
}
		goto err;
}
/*
* The entry size is made up of the index entry header plus the index
* key plus the index data plus optionally the sub-node vcn pointer.
*
* For most index types the index key is constant size and is simply
* given by the caller supplied @key_len.
*
* The only collation types with variable sized keys are
* COLLATION_FILENAME and COLLATION_NTOFS_SID.
*
* Determining the index data size is more complicated than that so we
* simply ignore it. This means we waste some memory but it is not too
* bad as only directory indexes appear more than once on a volume thus
* only they can have more than one lookup in progress at any one time.
*/
min_entry_size = sizeof(INDEX_ENTRY_HEADER) + sizeof(FILENAME_ATTR);
if (idx_ni->collation_rule != COLLATION_FILENAME) {
if (idx_ni->collation_rule == COLLATION_NTOFS_SID)
min_entry_size = sizeof(INDEX_ENTRY_HEADER) +
sizeof(SID);
else
min_entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len;
}
/*
* Work out the absolute maximum number of entries there can be in an
* index allocation block. We add one for the end entry which does not
* contain a key.
*/
max_entries = 1 + ((idx_ni->block_size - sizeof(INDEX_BLOCK) -
sizeof(INDEX_ENTRY_HEADER)) / min_entry_size);
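	/*
	 * Illustrative note (the block size quoted is an assumption, not read
	 * from this volume): for a directory index with the common 4096 byte
	 * index block size, @min_entry_size is sizeof(INDEX_ENTRY_HEADER) +
	 * sizeof(FILENAME_ATTR), so @max_entries bounds the number of entry
	 * pointer slots ntfs_index_get_entries() will allocate per node.
	 */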
/*
* Should the mft record size exceed the size of an index block (this
* should never really happen) then calculate the maximum number of
* entries there can be in the index root and if they are more than the
* ones in the index block use that as the maximum value.
	 */
	if (idx_ni->vol->mft_record_size > idx_ni->block_size) {
unsigned max_root_entries;
max_root_entries = 1 + ((idx_ni->vol->mft_record_size -
le16_to_cpu(m->attrs_offset) -
offsetof(ATTR_RECORD, reservedR) -
sizeof(((ATTR_RECORD*)NULL)->reservedR) -
sizeof(INDEX_ROOT) -
sizeof(INDEX_ENTRY_HEADER)) / min_entry_size);
if (max_root_entries > max_entries)
max_entries = max_root_entries;
}
ictx->max_entries = max_entries;
/*
* Get to the index root value (it has been verified when the inode was
* read in ntfs_index_inode_read()).
*/
ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset));
ictx->index = &ir->index;
/*
* Gather the index entry pointers and finish setting up the index
* context.
*/
ictx->is_root = 1;
ictx->is_locked = 1;
err = ntfs_index_get_entries(ictx);
if (err) {
ictx->is_locked = 0;
		goto err;
}
ictx->ir = ir;
ictx->actx = actx;
ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
le32_to_cpu(actx->m->bytes_in_use);
ntfs_debug("Done.");
return 0;
err:
if (actx)
ntfs_attr_search_ctx_put(actx);
ntfs_mft_record_unmap(ictx->base_ni);
ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
return err;
}
/**
 * ntfs_index_descend_into_child_node - descend into an index entry's child node
* @index_ctx: pointer to index context whose child node to return
*
* Descend into the child node pointed to by (*@index_ctx)->entry and return
* its fully set up index context in *@index_ctx.
*
* Return 0 on success and errno on error.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
* - The index context @ictx must be locked.
 */
static errno_t ntfs_index_descend_into_child_node(
ntfs_index_context **index_ctx)
{
VCN vcn;
s64 ofs;
ntfs_index_context *ictx, *new_ictx;
ntfs_inode *idx_ni;
upl_t upl;
upl_page_info_array_t pl;
u8 *addr;
INDEX_ALLOCATION *ia;
errno_t err = 0;
int is_dirty = 0;
	static const char es[] = "%s. Inode 0x%llx is corrupt. Run chkdsk.";
ntfs_debug("Entering.");
if (!index_ctx)
panic("%s(): !index_ctx\n", __FUNCTION__);
ictx = *index_ctx;
if (!ictx)
panic("%s(): !ictx\n", __FUNCTION__);
idx_ni = ictx->idx_ni;
if (!ictx->is_locked)
panic("%s(): !ictx->is_locked\n", __FUNCTION__);
if (!(ictx->entry->flags & INDEX_ENTRY_NODE))
panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n",
__FUNCTION__);
/*
* Since INDEX_NODE == LARGE_INDEX this check is ok for the index root
* as well.
	 */
	if (!(ictx->index->flags & INDEX_NODE)) {
		ntfs_error(idx_ni->vol->mp, es, "Index entry with child node ""found in a leaf node",
				(unsigned long long)idx_ni->mft_no);
		goto err;
}
/* Get the starting vcn of the child index block to descend into. */
vcn = sle64_to_cpup((sle64*)((u8*)ictx->entry +
le16_to_cpu(ictx->entry->length) - sizeof(VCN)));
if (vcn < 0) {
ntfs_error(idx_ni->vol->mp, es, "Negative child node VCN",
				(unsigned long long)idx_ni->mft_no);
		goto err;
}
/* Determine the offset of the page containing the child index block. */
ofs = (vcn << idx_ni->vcn_size_shift) & ~PAGE_MASK_64;
/*
* If the entry whose sub-node we are descending into is in the index
* root, release the index root unlocking its node or we can deadlock
* with ntfs_page_map().
	 */
	if (ictx->is_root) {
/*
* As a sanity check verify that the index allocation attribute
* exists.
		 */
		if (!NInoIndexAllocPresent(idx_ni)) {
			ntfs_error(idx_ni->vol->mp, "No index allocation ""attribute but index root entry ""requires one. Inode 0x%llx is ""corrupt. Run chkdsk.",
					(unsigned long long)idx_ni->mft_no);
			goto err;
}
ntfs_index_ctx_unlock(ictx);
	} else /* if (!ictx->is_root) */ {
/*
* If @vcn is in the same VM page as the existing page we reuse
* the mapped page, otherwise we release the page so we can get
* the new one.
*/
upl = ictx->upl;
pl = ictx->pl;
addr = ictx->addr;
is_dirty = ictx->is_dirty;
ictx->upl = NULL;
ictx->is_dirty = 0;
ictx->is_locked = 0;
if (ofs == ictx->upl_ofs)
			goto have_page;
ntfs_page_unmap(idx_ni, upl, pl, is_dirty);
is_dirty = 0;
}
/* We did not reuse the old page, get the new page. */
err = ntfs_page_map(idx_ni, ofs, &upl, &pl, &addr, TRUE);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to map index page (error ""%d).", err);
		goto err;
}
have_page:
/* Get to the index allocation block. */
ia = (INDEX_ALLOCATION*)(addr + ((unsigned)(vcn <<
idx_ni->vcn_size_shift) & PAGE_MASK));
	/* Bounds checks. */
	if ((u8*)ia < addr || (u8*)ia > addr + PAGE_SIZE) {
		ntfs_error(idx_ni->vol->mp, es, "Out of bounds check failed",
				(unsigned long long)idx_ni->mft_no);
		goto unm_err;
	}
	/* Catch multi sector transfer fixup errors. */
	if (!ntfs_is_indx_record(ia->magic)) {
		ntfs_error(idx_ni->vol->mp, "Index record with VCN 0x%llx is ""corrupt. Inode 0x%llx is corrupt. Run ""chkdsk.", (unsigned long long)vcn,
				(unsigned long long)idx_ni->mft_no);
		goto unm_err;
}
if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
ntfs_error(idx_ni->vol->mp, "Actual VCN (0x%llx) of index ""buffer is different from expected VCN ""(0x%llx). Inode 0x%llx is corrupt. Run ""chkdsk.", (unsignedlonglong)
sle64_to_cpu(ia->index_block_vcn),
(unsignedlonglong)vcn,
(unsignedlonglong)idx_ni->mft_no);
gotounm_err;
}
if (offsetof(INDEX_BLOCK, index) + le32_to_cpu(
ia->index.allocated_size) != idx_ni->block_size) {
ntfs_error(idx_ni->vol->mp, "Index buffer (VCN 0x%llx) of ""inode 0x%llx has a size (%u) differing from ""the index specified size (%u). Inode is ""corrupt. Run chkdsk.",
				(unsigned long long)vcn,
				(unsigned long long)idx_ni->mft_no,
				(unsigned)offsetof(INDEX_BLOCK, index) +
				le32_to_cpu(ia->index.allocated_size),
				(unsigned)idx_ni->block_size);
		goto unm_err;
}
if ((u8*)ia + idx_ni->block_size > addr + PAGE_SIZE)
panic("%s(): (u8*)ia + idx_ni->block_size > kaddr + ""PAGE_SIZE\n", __FUNCTION__);
if ((u8*)&ia->index + le32_to_cpu(ia->index.index_length) >
(u8*)ia + idx_ni->block_size) {
ntfs_error(idx_ni->vol->mp, "Size of index buffer (VCN ""0x%llx) of inode 0x%llx exceeds maximum ""size.", (unsignedlonglong)vcn,
(unsignedlonglong)idx_ni->mft_no);
gotounm_err;
}
/* Allocate a new index context. */
new_ictx = ntfs_index_ctx_alloc();
if (!new_ictx) {
err = ENOMEM;
ntfs_error(idx_ni->vol->mp, "Failed to allocate index ""context.");
		goto unm_err;
}
/*
* Attach the new index context between the current index context
* (which is the bottom of the tree) and the index context below it
* (which is the top of the tree), i.e. place the new index context at
* the bottom of the tree.
*
* Gcc is broken so it fails to see both the members of the anonymous
* union(s) and the bitfield inside the C99 initializer. Work around
* this by setting @is_locked, @is_dirty, @ia, and @vcn afterwards.
*/
*new_ictx = (ntfs_index_context) {
.up = ictx,
.down = ictx->down,
.idx_ni = idx_ni,
.base_ni = ictx->base_ni,
.index = &ia->index,
.bytes_free = le32_to_cpu(ia->index.allocated_size) -
le32_to_cpu(ia->index.index_length),
.max_entries = ictx->max_entries,
};
new_ictx->is_locked = 1;
new_ictx->is_dirty = is_dirty;
new_ictx->ia = ia;
new_ictx->vcn = vcn;
new_ictx->upl_ofs = ofs;
new_ictx->upl = upl;
new_ictx->pl = pl;
new_ictx->addr = addr;
ictx->down->up = new_ictx;
ictx->down = new_ictx;
/*
* Gather the index entry pointers and finish setting up the new index
* context.
*/
err = ntfs_index_get_entries(new_ictx);
if (!err) {
*index_ctx = new_ictx;
ntfs_debug("Done.");
return 0;
}
new_ictx->is_locked = 0;
new_ictx->is_dirty = 0;
new_ictx->upl = NULL;
unm_err:
ntfs_page_unmap(idx_ni, upl, pl, is_dirty);
err:
if (!err) {
NVolSetErrors(idx_ni->vol);
err = EIO;
}
return err;
}
/**
* ntfs_index_lookup_in_node - search for an entry in an index node
* @ictx: index context in which to search for the entry
* @match_key: index entry key data to search for
* @match_key_len: length of @match_key in bytes
* @key: index entry key to search for
* @key_len: length of @key in bytes
*
* Perform a binary search through the index entries in the index node
* described by @ictx looking for the correct entry or if not found for the
* index entry whose sub-node pointer to descend into.
*
* @key and @key_len is the complete key to search for. This is used when
* doing the full blown collation.
*
* When doing exact matching for filenames we need to compare the actual
* filenames rather than the filename attributes whereas for view indexes we
* need to compare the whole key. To make this function simpler we let the
* caller specify the appropriate data to use for exact matching in @match_key
* and @match_key_len. For view indexes @match_key and @match_key_len are the
* same as @key and @key_len respectively.
*
* Return 0 on success and errno on error.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
* - The index context @ictx must be locked.
 */
static errno_t ntfs_index_lookup_in_node(ntfs_index_context *ictx,
		const void *match_key, const int match_key_len,
		const void *key, const int key_len)
{
ntfs_inode *idx_ni;
INDEX_ENTRY *ie, **entries;
unsigned min_left, max_right, cur_entry;
int rc;
BOOL is_view;
ntfs_debug("Entering.");
idx_ni = ictx->idx_ni;
is_view = FALSE;
if (idx_ni->name != I30)
is_view = TRUE;
entries = ictx->entries;
/*
* If there is only one entry, i.e. the index node is empty, we need to
* descend into the sub-node pointer of the end entry if present thus
* return the end entry.
	 */
	if (ictx->nr_entries == 1) {
		cur_entry = 0;
		goto not_found;
}
/*
* Now do a binary search through the index entries looking for the
* correct entry or if not found for the index entry whose sub-node
* pointer to descend into.
*
* Note we exclude the end entry from the search as it does not include
* a key we can compare. That makes the search algorithm simpler and
* more efficient.
*/
min_left = 0;
max_right = ictx->nr_entries - 2;
cur_entry = max_right / 2;
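	/*
	 * Illustrative example (not part of the algorithm): with
	 * @ictx->nr_entries == 8, i.e. seven real entries plus the end entry,
	 * the search covers entries 0..6, starts in the middle at entry 3 and
	 * halves the remaining range on each iteration of the loop below.
	 */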
do {
void *ie_match_key;
int ie_match_key_len;
ie = entries[cur_entry];
if (!is_view) {
ie_match_key_len = ie->key.filename.filename_length <<
NTFSCHAR_SIZE_SHIFT;
ie_match_key = &ie->key.filename.filename;
} else {
ie_match_key_len = le16_to_cpu(ie->key_length);
ie_match_key = &ie->key;
}
/*
* If the keys match perfectly, we setup @ictx to point to the
* matching entry and return.
		 */
		if ((match_key_len == ie_match_key_len) &&
!bcmp(match_key, ie_match_key,
ie_match_key_len)) {
found:
ictx->entry = ie;
ictx->entry_nr = cur_entry;
ictx->is_match = 1;
ntfs_debug("Done (found).");
return 0;
}
/*
* Not a perfect match, need to do full blown collation so we
* know which way in the B+tree we have to go.
*/
rc = ntfs_collate(idx_ni->vol, idx_ni->collation_rule, key,
key_len, &ie->key, le16_to_cpu(ie->key_length));
/*
* If @key collates before the key of the current entry, need
* to search on the left.
		 */
		if (rc == -1) {
/*
* If this is the left-most entry we need to descend
* into it if it has a sub-node.
			 */
			if (cur_entry == min_left)
break;
/* Exclude the wrong half or remaining entries. */
max_right = cur_entry - 1;
/* Find the middle of the remaining entries. */
cur_entry = (min_left + max_right) / 2;
continue;
}
/*
* A match should never happen as the bcmp() call should have
* caught it, but we still treat it correctly.
		 */
		if (!rc)
			goto found;
/*
* @key collates after the key of the current entry, need to
* search on the right.
*
* If this is the right-most entry we need to descend into the
* end entry on the right if it has a sub-node.
		 */
		if (cur_entry == max_right) {
cur_entry++;
break;
}
/* Exclude the wrong half or remaining entries. */
min_left = cur_entry + 1;
/* Find the middle of the remaining entries. */
cur_entry = (min_left + max_right) / 2;
} while (1);
not_found:
ictx->follow_entry = entries[cur_entry];
ictx->entry_nr = cur_entry;
ntfs_debug("Done (not found).");
return ENOENT;
}
/**
* ntfs_index_lookup - find a key in an index and return its index entry
* @key: [IN] key for which to search in the index
* @key_len: [IN] length of @key in bytes
* @index_ctx: [IN/OUT] context describing the index and the returned entry
*
* Before calling ntfs_index_lookup(), *@index_ctx must have been obtained
* from a call to ntfs_index_ctx_get().
*
* Look for the @key in the index specified by the index lookup context
* @index_ctx. ntfs_index_lookup() walks the contents of the index looking for
* the @key. As it does so, it records the path taken through the B+tree
* together with other useful information it obtains along the way. This is
* needed if the lookup is going to be followed by a complex index tree
* operation such as an index entry addition requiring one or more index block
* splits for example.
*
* If the @key is found in the index, 0 is returned and *@index_ctx is set up
* to describe the index entry containing the matching @key. The matching
* entry is pointed to by *@index_ctx->entry.
*
* If the @key is not found in the index, ENOENT is returned and *@index_ctx is
* setup to describe the index entry whose key collates immediately after the
* search @key, i.e. this is the position in the index at which an index entry
* with a key of @key would need to be inserted.
*
* If an error occurs return the error code. In this case *@index_ctx is
* undefined and must be freed via a call to ntfs_index_ctx_put() or
* reinitialized via a call to ntfs_index_ctx_put_reuse().
*
* When finished with the entry and its data, call ntfs_index_ctx_put() to free
* the context and other associated resources.
*
* If the index entry was modified, call ntfs_index_entry_mark_dirty() before
* the call to ntfs_index_ctx_put() to ensure that the changes are written to
* disk.
*
* Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
* - Caller must hold an iocount reference on the index inode.
*
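 * A minimal usage sketch (illustrative only; @idx_ni, @fn, and @fn_len are
 * assumed to have been prepared by the caller and error handling is omitted):
 *
 *	ictx = ntfs_index_ctx_get(idx_ni);
 *	if (!ictx)
 *		return ENOMEM;
 *	err = ntfs_index_lookup(fn, fn_len, &ictx);
 *	if (!err) {
 *		... work with the matching entry via ictx->entry ...
 *		ntfs_index_entry_mark_dirty(ictx);
 *	}
 *	ntfs_index_ctx_put(ictx);
 *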
* TODO: An optimization would be to take two new parameters, say @add and
* @del, which allow our caller to tell us if the search is for purposes of
* adding an entry (@add is true) or for removing an entry (@del is true) or if
* it is a simple lookup (read-only or overwrite without change in length, both
* @add and @del are false). For the lookup case we do not need to record the
* path so we can just use one single index context and only one array of index
* entry pointers and we keep reusing both instead of getting new ones each
* time we descend into a sub-node. Alternatively take a single parameter
* @reason or @intent or something and define some constants like NTFS_LOOKUP,
* NTFS_ADD, and NTFS_DEL or something to go with it to serve the same purpose
* as above.
*/
errno_t ntfs_index_lookup(const void *key, const int key_len,
ntfs_index_context **index_ctx)
{
ntfs_index_context *ictx;
	const void *match_key;
int match_key_len;
errno_t err;
ntfs_debug("Entering.");
if (!key)
panic("%s(): !key\n", __FUNCTION__);
if (key_len <= 0)
panic("%s(): key_len <= 0\n", __FUNCTION__);
ictx = *index_ctx;
/*
* When doing exact matching for filenames we need to compare the
* actual filenames rather than the filename attributes whereas for
* view indexes we need to compare the whole key.
	 */
	if (ictx->idx_ni->name == I30) {
match_key_len = ((FILENAME_ATTR*)key)->filename_length <<
NTFSCHAR_SIZE_SHIFT;
match_key = &((FILENAME_ATTR*)key)->filename;
} else {
match_key_len = key_len;
match_key = key;
}
/* Prepare the search context for its first lookup. */
err = ntfs_index_lookup_init(ictx, key_len);
if (err)
		goto err;
do {
/*
* Look for the @key in the current index node. If found, the
* index context points to the found entry so we are done. If
* not found, the index context points to the entry whose
* sub-node pointer we need to descend into if it is present
* and if not we have failed to find the entry and are also
* done.
*/
err = ntfs_index_lookup_in_node(ictx, match_key, match_key_len,
key, key_len);
if (err && err != ENOENT)
panic("%s(): err && err != ENOENT\n", __FUNCTION__);
if (!err || !(ictx->entry->flags & INDEX_ENTRY_NODE))
break;
/* Not found but child node present, descend into it. */
err = ntfs_index_descend_into_child_node(&ictx);
if (err)
			goto err;
/*
* Replace the caller's index context with the new one so the
* caller always gets the bottom-most index context.
*/
*index_ctx = ictx;
} while (1);
ntfs_debug("Done (%s in index %s).",
!err ? "found matching entry" : "entry not found",
ictx->is_root ? "root" : "allocation block");
return err;
err:
ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
return err;
}
/**
* ntfs_index_lookup_by_position - find an entry by its position in the B+tree
* @pos: [IN] position of index entry to find in the B+tree
* @key_len: [IN] size of the key ntfs_index_lookup() is called with in bytes
* @index_ctx: [IN/OUT] context describing the index and the returned entry
*
* Before calling ntfs_index_lookup_by_position(), *@index_ctx must have been
* obtained from a call to ntfs_index_ctx_get().
*
* Start searching at the beginning of the B+tree, counting each of the entries
* and when the entry with position number @pos has been reached return that in
* *@index_ctx.
*
* @key_len is the length of the key ntfs_index_lookup() would be called with.
 * If the index *@index_ctx is a directory index this is ignored.
*
* As the search progresses, the path taken through the B+tree is recorded
* together with other useful information obtained along the way. This is
* needed so the lookup can proceed to the next entry if/when desired, for
* example during a ntfs_readdir() call.
*
* If the position @pos is found in the index, 0 is returned and *@index_ctx is
 * set up to describe the index entry at position @pos. The
* matching entry is pointed to by *@index_ctx->entry.
*
* If the position @pos is not found in the index, ENOENT is returned.
*
* If an error occurs the error code is returned (cannot be ENOENT).
*
* If any values other than 0 are returned, *@index_ctx is undefined and must
* be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to
* ntfs_index_ctx_put_reuse().
*
* When finished with the entry and its data, call ntfs_index_ctx_put() to free
* the context and other associated resources.
*
* If the index entry was modified, call ntfs_index_entry_mark_dirty() before
* the call to ntfs_index_ctx_put() to ensure that the changes are written to
* disk.
*
* Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
* - Caller must hold an iocount reference on the index inode.
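 *
 * A minimal enumeration sketch (illustrative only; ntfs_readdir() is the real
 * caller and performs considerably more work):
 *
 *	err = ntfs_index_lookup_by_position(pos, key_len, &ictx);
 *	while (!err) {
 *		... consume ictx->entry ...
 *		err = ntfs_index_lookup_next(&ictx);
 *	}
 *	... err == ENOENT here means the end of the index was reached ...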
*/
errno_t ntfs_index_lookup_by_position(const s64 pos, const int key_len,
ntfs_index_context **index_ctx)
{
s64 current_pos;
ntfs_index_context *ictx;
errno_t err;
ntfs_debug("Entering for position 0x%llx.", (unsignedlonglong)pos);
if (pos < 0)
panic("%s(): pos < 0\n", __FUNCTION__);
ictx = *index_ctx;
/* Prepare the search context for its first lookup. */
err = ntfs_index_lookup_init(ictx, key_len);
if (err)
		goto err;
/* Start at the first index entry in the index root. */
ictx->entry = ictx->entries[0];
ictx->entry_nr = 0;
current_pos = 0;
/*
* Iterate over the entries in the B+tree counting them up until we
* reach entry position @pos in which case we are done.
	 */
	do {
/*
* Keep descending into the sub-node pointers until a leaf node
* is found.
		 */
		while (ictx->entry->flags & INDEX_ENTRY_NODE) {
/* Child node present, descend into it. */
err = ntfs_index_descend_into_child_node(&ictx);
if (err)
				goto err;
/* Start at the first index entry in the index node. */
ictx->entry = ictx->entries[0];
ictx->entry_nr = 0;
}
/*
* We have reached the next leaf node. If @pos is in this node
* skip to the correct entry and we are done.
*
* Note we need the -1 because @ictx->nr_entries counts the end
* entry which does not contain any data thus we do not count
* it as a real entry.
		 */
		if (current_pos + ictx->nr_entries - 1 > pos) {
/*
* No need to skip any entries if the current entry is
* the one matching @pos.
			 */
			if (current_pos < pos) {
s64 nr;
nr = ictx->entry_nr + (pos - current_pos);
if (nr >= ictx->nr_entries - 1)
panic("%s(): nr >= ictx->nr_entries - ""1\n", __FUNCTION__);
ictx->entry_nr = nr;
ictx->entry = ictx->entries[nr];
current_pos = pos;
			} else if (current_pos > pos)
panic("%s(): current_pos > pos\n",
__FUNCTION__);
break;
}
/*
* @pos is not in this node. Skip the whole node, i.e. skip
* @ictx->nr_entries - 1 real index entries.
*/
current_pos += ictx->nr_entries - 1;
/*
* Keep moving up the B+tree until we find an index node that
* we have not finished yet.
		 */
		do {
			ntfs_index_context *itmp;
			/* If we are in the index root, we are done. */
			if (ictx->is_root)
				goto not_found;
/* Save the current index context so we can free it. */
itmp = ictx;
/* Move up to the parent node. */
ictx = ictx->up;
/*
* Disconnect the old index context from its path and
* free it.
*/
ntfs_index_ctx_disconnect(itmp);
ntfs_index_ctx_put_reuse_single(itmp);
ntfs_index_ctx_free(itmp);
} while (ictx->entry_nr == ictx->nr_entries - 1);
/*
* We have reached a node which we have not finished with yet.
* Lock it so we can work on it.
*/
err = ntfs_index_ctx_relock(ictx);
if (err)
			goto err;
/*
* Check if the current entry is the entry matching @pos and if
* so we are done.
		 */
		if (current_pos == pos)
break;
/*
* Move to the next entry in this node and continue descending
* into the sub-node pointers until a leaf node is found. The
* first entry in the leaf node will be the next entry thus
* increment @current_pos now.
*/
ictx->entry = ictx->entries[++ictx->entry_nr];
current_pos++;
} while (1);
/*
* Indicate that the current entry is the one the caller was looking
* for and return the index context containing it to the caller.
*/
ictx->is_match = 1;
*index_ctx = ictx;
ntfs_debug("Done (entry with position 0x%llx found in index %s).",
			(unsigned long long)pos,
ictx->is_root ? "root" : "allocation block");
return 0;
not_found:
ntfs_debug("Done (entry with position 0x%llx not found in index, ""returning ENOENT).", (unsignedlonglong)pos);
/* Ensure we return a valid index context. */
*index_ctx = ictx;
return ENOENT;
err:
ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
/* Ensure we return a valid index context. */
*index_ctx = ictx;
return err;
}
/**
* ntfs_index_lookup_next - find the next entry in the B+tree
* @index_ctx: [IN/OUT] context describing the index and the returned entry
*
* Before calling ntfs_index_lookup_next(), *@index_ctx must have been obtained
* from a call to ntfs_index_lookup() or ntfs_index_lookup_by_position().
*
* If the next entry is found in the index, 0 is returned and *@index_ctx is
* set up to describe the next index entry. The matching entry is pointed to
* by *@index_ctx->entry.
*
 * If there is no next entry in the index, ENOENT is returned.
*
* If an error occurs the error code is returned (cannot be ENOENT).
*
* If any values other than 0 are returned, *@index_ctx is undefined and must
* be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to
* ntfs_index_ctx_put_reuse().
*
* When finished with the entry and its data, call ntfs_index_ctx_put() to free
* the context and other associated resources.
*
* If the index entry was modified, call ntfs_index_entry_mark_dirty() before
* the call to ntfs_index_ctx_put() to ensure that the changes are written to
* disk.
*
* Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
* - Caller must hold an iocount reference on the index inode.
*/
errno_t ntfs_index_lookup_next(ntfs_index_context **index_ctx)
{
ntfs_index_context *ictx;
errno_t err;
ntfs_debug("Entering.");
ictx = *index_ctx;
if (!ictx->base_ni)
panic("%s(): !ictx->base_ni\n", __FUNCTION__);
/*
* Ensure the index context is initialized, i.e. it is not legal to
* call ntfs_index_lookup_next() with a search context that has not
* been used for a call to ntfs_index_lookup_by_position() or
* ntfs_index_lookup() yet.
	 */
	if (!ictx->max_entries)
panic("%s(): Called for uninitialized index context.\n",
__FUNCTION__);
/*
	 * @ictx must currently point to a real entry thus @ictx->nr_entries must
* be greater or equal to 2 as it includes the end entry which cannot
* contain any data.
	 */
	if (ictx->nr_entries < 2)
panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__);
if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) {
/*
* The current entry is in a leaf node.
*
* If it is not the last entry in the node return the next
* entry in the node.
		 */
		if (ictx->entry_nr < ictx->nr_entries - 2) {
ictx->entry = ictx->entries[++ictx->entry_nr];
ntfs_debug("Done (same leaf node).");
return 0;
}
/*
* The current entry is in a leaf node and it is the last entry
* in the node. The next entry is the first real entry above
* the current node thus keep moving up the B+tree until we
* find a real entry.
		 */
		do {
			ntfs_index_context *itmp;
			/* If we are in the index root, we are done. */
			if (ictx->is_root) {
ntfs_debug("Done (was at last entry already, ""returning ENOENT).");
/* Ensure we return a valid index context. */
*index_ctx = ictx;
return ENOENT;
}
/* Save the current index context so we can free it. */
itmp = ictx;
/* Move up to the parent node. */
ictx = ictx->up;
/*
* Disconnect the old index context from its path and
* free it.
*/
ntfs_index_ctx_disconnect(itmp);
ntfs_index_ctx_put_reuse_single(itmp);
ntfs_index_ctx_free(itmp);
} while (ictx->entry_nr == ictx->nr_entries - 1);
/*
* We have reached a node with a real index entry. Lock it so
* we can work on it.
*/
err = ntfs_index_ctx_relock(ictx);
if (err)
			goto err;
/*
* Indicate that the current entry is the next entry and return
* the index context containing it to the caller.
*/
ictx->is_match = 1;
*index_ctx = ictx;
ntfs_debug("Done (upper index node).");
return 0;
}
/*
* The current entry is in an index node.
*
* To find the next entry we need to switch to the next entry in this
* node and then descend into the sub-node pointers until a leaf node
* is found. The first entry of that leaf node is the next entry.
*
* First, unmark this node as being the matching one and switch to the
* next entry in this node.
*/
ictx->is_match = 0;
ictx->entry = ictx->entries[++ictx->entry_nr];
/*
* Keep descending into the sub-node pointers until a leaf node is
* found.
	 */
	while (ictx->entry->flags & INDEX_ENTRY_NODE) {
/* Child node present, descend into it. */
err = ntfs_index_descend_into_child_node(&ictx);
if (err)
			goto err;
/* Start at the first index entry in the index node. */
ictx->entry = ictx->entries[0];
ictx->entry_nr = 0;
}
/*
* We have reached the next leaf node. The next entry is the first
* entry in this node which we have already set up to be the current
* entry.
*
* Indicate that the current entry is the one the caller was looking
* for and return the index context containing it to the caller.
*/
ictx->is_match = 1;
*index_ctx = ictx;
ntfs_debug("Done (next leaf node).");
return 0;
err:
ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
/* Ensure we return a valid index context. */
*index_ctx = ictx;
return err;
}
/**
* ntfs_index_entry_mark_dirty - mark an index entry dirty
* @ictx: ntfs index context describing the index entry
*
* Mark the index entry described by the index entry context @ictx dirty.
*
* If the index entry is in the index root attribute, simply mark the mft
* record containing the index root attribute dirty. This ensures the mft
* record, and hence the index root attribute, will be written out to disk
* later.
*
* If the index entry is in an index block belonging to the index allocation
* attribute, mark the page the index block is in dirty.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
 */
void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
{
if (!ictx->is_locked)
panic("%s(): Index context is not locked.\n", __FUNCTION__);
if (ictx->is_root) {
if (!ictx->actx)
panic("%s(): Attribute context is missing from index ""context.\n", __FUNCTION__);
NInoSetMrecNeedsDirtying(ictx->actx->ni);
} else {
if (!ictx->upl)
panic("%s(): Page list is missing from index ""context.\n", __FUNCTION__);
ictx->is_dirty = 1;
}
}
/**
* ntfs_insert_index_allocation_attribute - add the index allocation attribute
* @ni: base ntfs inode to which the attribute is being added
* @ctx: search context describing where to insert the attribute
* @idx_ni: index inode for which to add the index allocation attribute
*
* Insert a new index allocation attribute in the base ntfs inode @ni at the
* position indicated by the attribute search context @ctx and add an attribute
* list attribute entry for it if the inode uses the attribute list attribute.
*
* The new attribute type and name are given by the index inode @idx_ni.
*
* If @idx_ni contains a runlist, it is used to create the mapping pairs array
* for the non-resident index allocation attribute.
*
* Return 0 on success and errno on error.
*
* WARNING: Regardless of whether success or failure is returned, you need to
* check @ctx->is_error and if true the @ctx is no longer valid, i.e.
* you need to either call ntfs_attr_search_ctx_reinit() or
* ntfs_attr_search_ctx_put() on it. In that case @ctx->error will
* give the error code for why the mapping of the inode failed.
*
* Locking: Caller must hold @idx_ni->lock on the index inode for writing.
 */
static errno_t ntfs_insert_index_allocation_attribute(ntfs_inode *ni,
ntfs_attr_search_ctx *ctx, ntfs_inode *idx_ni)
{
ntfs_volume *vol;
MFT_RECORD *base_m, *m;
ATTR_RECORD *a;
ATTR_LIST_ENTRY *al_entry;
unsigned mp_ofs, mp_size, al_entry_used, al_entry_len;
unsigned new_al_size, new_al_alloc;
errno_t err;
BOOL al_entry_added;
ntfs_debug("Entering for mft_no 0x%llx, name_len 0x%x.",
			(unsigned long long)ni->mft_no, idx_ni->name_len);
vol = idx_ni->vol;
/*
* Calculate the offset into the new attribute at which the mapping
* pairs array begins. The mapping pairs array is placed after the
* name aligned to an 8-byte boundary which in turn is placed
* immediately after the non-resident attribute record itself.
*/
mp_ofs = offsetof(ATTR_RECORD, compressed_size) +
(((idx_ni->name_len << NTFSCHAR_SIZE_SHIFT) + 7) & ~7);
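	/*
	 * Illustrative example (assuming a directory index): the attribute
	 * name is "$I30", i.e. four NTFS characters or eight bytes, which is
	 * already 8-byte aligned, so @mp_ofs is offsetof(ATTR_RECORD,
	 * compressed_size) + 8.
	 */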
/* Work out the size for the mapping pairs array. */
err = ntfs_get_size_for_mapping_pairs(vol,
idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0, -1,
&mp_size);
if (err)
panic("%s(): err\n", __FUNCTION__);
/*
* Work out the size for the attribute record. We simply take the
* offset to the mapping pairs array we worked out above and add the
* size of the mapping pairs array in bytes aligned to an 8-byte
* boundary. Note we do not need to do the alignment as
* ntfs_attr_record_make_space() does it anyway.
*
* The current implementation of ntfs_attr_lookup() will always return
* pointing into the base mft record when an attribute is not found.
*/
base_m = ctx->m;
retry:
if (ni != ctx->ni)
panic("%s(): ni != ctx->ni\n", __FUNCTION__);
m = ctx->m;
a = ctx->a;
err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
if (err) {
ntfs_inode *eni;
if (err != ENOSPC)
panic("%s(): err != ENOSPC\n", __FUNCTION__);
/*
* There was not enough space in the mft record to insert the
* new attribute record which means we will need to insert it
* into an extent mft record.
*
* To avoid bugs and impossible situations, check that the
* attribute is not already the only attribute in the mft
* record otherwise moving it would not give us anything.
		 */
		if (ntfs_attr_record_is_only_one(m, a))
panic("%s(): ntfs_attr_record_is_only_one()\n",
__FUNCTION__);
/*
* Before we can allocate an extent mft record, we need to
* ensure that the inode has an attribute list attribute.
		 */
		if (!NInoAttrList(ni)) {
err = ntfs_attr_list_add(ni, m, NULL);
if (err) {
ntfs_error(vol->mp, "Failed to add attribute ""list attribute to mft_no ""0x%llx (error %d).",
						(unsigned long long)ni->mft_no,
err);
return err;
}
/*
* Adding the attribute list attribute may have
* generated enough space in the base mft record to
* fit the attribute so try again.
*/
ntfs_attr_search_ctx_reinit(ctx);
err = ntfs_attr_lookup(AT_INDEX_ALLOCATION,
idx_ni->name, idx_ni->name_len, 0,
NULL, 0, ctx);
if (err == ENOENT) {
/*
* The current implementation of
* ntfs_attr_lookup() will always return
* pointing into the base mft record when an
* attribute is not found.
				 */
				if (m != ctx->m)
					panic("%s(): m != ctx->m\n",
							__FUNCTION__);
				goto retry;
}
/*
* We cannot have found the attribute as we have
* exclusive access and know that it does not exist
* already.
			 */
			if (!err)
panic("%s() !err\n", __FUNCTION__);
/*
* Something has gone wrong. Note we have to bail out
* as a failing attribute lookup indicates corruption
* and/or disk failure and/or not enough memory all of
* which would prevent us from rolling back the
* attribute list attribute addition.
*/
ntfs_error(vol->mp, "Failed to add index allocation ""attribute to mft_no 0x%llx because ""looking up the attribute failed ""(error %d).",
					(unsigned long long)ni->mft_no, err);
return err;
}
/*
* We now need to allocate a new extent mft record, attach it
* to the base ntfs inode and set up the search context to
* point to it, then insert the new attribute into it.
*/
err = ntfs_mft_record_alloc(vol, NULL, NULL, ni, &eni, &m, &a);
if (err) {
ntfs_error(vol->mp, "Failed to add index allocation ""attribute to mft_no 0x%llx because ""allocating a new extent mft record ""failed (error %d).",
					(unsigned long long)ni->mft_no, err);
/*
* TODO: If we added the attribute list attribute above
* we could now remove it again but this may require
* moving attributes back into the base mft record so
* is not a trivial amount of work and in the end it
* does not really matter if we leave an inode with an
* attribute list attribute that does not really need
* it. Especially so since this is error handling and
* it is not an error to have an attribute list
* attribute when not strictly required.
			 */
			return err;
}
ctx->ni = eni;
ctx->m = m;
ctx->a = a;
/*
* Make space for the new attribute. This cannot fail as we
* now have an empty mft record which by definition can hold
* a maximum size resident attribute record.
*/
err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
if (err)
panic("%s(): err (ntfs_attr_record_make_space())\n",
__FUNCTION__);
}
/*
* Now setup the new attribute record. The entire attribute has been
* zeroed and the length of the attribute record has been set up by
* ntfs_attr_record_make_space().
*/
a->type = AT_INDEX_ALLOCATION;
a->non_resident = 1;
a->name_length = idx_ni->name_len;
a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
compressed_size));
a->instance = m->next_attr_instance;
/*
* Increment the next attribute instance number in the mft record as we
* consumed the old one.
*/
m->next_attr_instance = cpu_to_le16(
(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
a->highest_vcn = cpu_to_sle64((idx_ni->allocated_size >>
vol->cluster_size_shift) - 1);
a->mapping_pairs_offset = cpu_to_le16(mp_ofs);
lck_spin_lock(&idx_ni->size_lock);
a->allocated_size = cpu_to_sle64(idx_ni->allocated_size);
a->data_size = cpu_to_sle64(idx_ni->data_size);
a->initialized_size = cpu_to_sle64(idx_ni->initialized_size);
lck_spin_unlock(&idx_ni->size_lock);
	/* Copy the attribute name into place. */
	if (idx_ni->name_len)
memcpy((u8*)a + offsetof(ATTR_RECORD, compressed_size),
idx_ni->name, idx_ni->name_len <<
NTFSCHAR_SIZE_SHIFT);
/* Generate the mapping pairs array into place. */
err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs, mp_size,
idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0,
-1, NULL);
if (err)
panic("%s(): err (ntfs_mapping_pairs_build())\n", __FUNCTION__);
/*
* If the inode does not use the attribute list attribute we are done.
*
* If the inode uses the attribute list attribute (including the case
* where we just created it), we need to add an attribute list
* attribute entry for the attribute.
	 */
	if (!NInoAttrList(ni))
		goto done;
/* Add an attribute list attribute entry for the inserted attribute. */
al_entry = ctx->al_entry;
al_entry_used = offsetof(ATTR_LIST_ENTRY, name) +
(idx_ni->name_len << NTFSCHAR_SIZE_SHIFT);
al_entry_len = (al_entry_used + 7) & ~7;
new_al_size = ni->attr_list_size + al_entry_len;
	/* Out of bounds checks. */
	if ((u8*)al_entry < ni->attr_list ||
(u8*)al_entry > ni->attr_list + new_al_size ||
(u8*)al_entry + al_entry_len >
ni->attr_list + new_al_size) {
/* Inode is corrupt. */
ntfs_error(vol->mp, "Mft_no 0x%llx is corrupt. Run chkdsk.",
				(unsigned long long)ni->mft_no);
		err = EIO;
		goto undo;
}
err = ntfs_attr_size_bounds_check(vol, AT_ATTRIBUTE_LIST, new_al_size);
if (err) {
if (err == ERANGE) {
ntfs_error(vol->mp, "Cannot insert attribute into ""mft_no 0x%llx because the attribute ""list attribute would become too ""large. You need to defragment your ""volume and then try again.",
					(unsigned long long)ni->mft_no);
err = ENOSPC;
} else {
ntfs_error(vol->mp, "Attribute list attribute is ""unknown on the volume. The volume ""is corrupt. Run chkdsk.");
NVolSetErrors(vol);
err = EIO;
}
		goto undo;
}
new_al_alloc = (new_al_size + NTFS_ALLOC_BLOCK - 1) &
~(NTFS_ALLOC_BLOCK - 1);
/*
* Reallocate the memory buffer if needed and create space for the new
* entry.
	 */
	if (new_al_alloc > ni->attr_list_alloc) {
u8 *tmp, *al, *al_end;
unsigned al_entry_ofs;
tmp = OSMalloc(new_al_alloc, ntfs_malloc_tag);
if (!tmp) {
ntfs_error(vol->mp, "Not enough memory to extend ""attribute list attribute of mft_no ""0x%llx.",
					(unsigned long long)ni->mft_no);
			err = ENOMEM;
			goto undo;
}
al = ni->attr_list;
al_entry_ofs = (u8*)al_entry - al;
al_end = al + ni->attr_list_size;
memcpy(tmp, al, al_entry_ofs);
if ((u8*)al_entry < al_end)
memcpy(tmp + al_entry_ofs + al_entry_len,
al + al_entry_ofs,
ni->attr_list_size - al_entry_ofs);
al_entry = ctx->al_entry = (ATTR_LIST_ENTRY*)(tmp +
al_entry_ofs);
OSFree(ni->attr_list, ni->attr_list_alloc, ntfs_malloc_tag);
ni->attr_list_alloc = new_al_alloc;
ni->attr_list = tmp;
	} else if ((u8*)al_entry < ni->attr_list + ni->attr_list_size)
memmove((u8*)al_entry + al_entry_len, al_entry,
ni->attr_list_size - ((u8*)al_entry -
ni->attr_list));
ni->attr_list_size = new_al_size;
/* Set up the attribute list entry. */
al_entry->type = AT_INDEX_ALLOCATION;
al_entry->length = cpu_to_le16(al_entry_len);
al_entry->name_length = idx_ni->name_len;
al_entry->name_offset = offsetof(ATTR_LIST_ENTRY, name);
al_entry->lowest_vcn = 0;
al_entry->mft_reference = MK_LE_MREF(ctx->ni->mft_no, ctx->ni->seq_no);
al_entry->instance = a->instance;
	/* Copy the attribute name into place. */
	if (idx_ni->name_len)
memcpy((u8*)&al_entry->name, idx_ni->name,
idx_ni->name_len << NTFSCHAR_SIZE_SHIFT);
	/* For tidiness, zero any unused space. */
	if (al_entry_len != al_entry_used) {
if (al_entry_len < al_entry_used)
panic("%s(): al_entry_len < al_entry_used\n",
__FUNCTION__);
memset((u8*)al_entry + al_entry_used, 0,
al_entry_len - al_entry_used);
}
/*
* Extend the attribute list attribute and copy in the modified
* value from the cache.
*/
err = ntfs_attr_list_sync_extend(ni, base_m,
(u8*)al_entry - ni->attr_list, ctx);
if (err) {
ntfs_error(vol->mp, "Failed to extend attribute list ""attribute of mft_no 0x%llx (error %d).",
				(unsigned long long)ni->mft_no, err);
		al_entry_added = TRUE;
		goto undo_al;
}
done:
ntfs_debug("Done.");
return 0;
undo:
al_entry_added = FALSE;
undo_al:
/*
* Need to remove the attribute again or free the extent mft record if
* there are no attributes remaining in it.
	 */
	if (m == base_m || !ntfs_attr_record_is_only_one(m, a)) {
ntfs_attr_record_delete_internal(m, a);
/*
* If the attribute was not in the base mft record mark the
* extent mft record dirty so it gets written out later. If
* the attribute was in the base mft record it will be marked
* dirty later.
*
* We also unmap the extent mft record and we set @ctx->ni to
* equal the base inode @ni so that the search context is
* initialized from scratch or simply freed if the caller
* reinitializes or releases the search context respectively.
		 */
		if (m != base_m) {
NInoSetMrecNeedsDirtying(ctx->ni);
ntfs_extent_mft_record_unmap(ctx->ni);
ctx->ni = ni;
}
} else {
errno_t err2;
BOOL al_needed;
err2 = ntfs_extent_mft_record_free(ni, ctx->ni, m);
if (err2) {
/*
* Ignore the error as we just end up with an unused
* mft record that is marked in use.
*/
ntfs_error(vol->mp, "Failed to free extent mft_no ""0x%llx (error %d). Unmount and run ""chkdsk to recover the lost inode.",
(unsignedlonglong)ctx->ni->mft_no,
err2);
NVolSetErrors(vol);
/*
* Release the extent mft record after dirtying it thus
* simulating the effect of freeing it.
*/
NInoSetMrecNeedsDirtying(ctx->ni);
ntfs_extent_mft_record_unmap(ctx->ni);
}
/*
* The attribute search context still points to the no longer
* mapped extent inode thus we need to change it to point to
* the base inode instead so the context can be reinitialized
* or released safely.
*/
ctx->ni = ni;
/*
* Check the attribute list attribute. If there are no other
* attribute list attribute entries referencing extent mft
* records delete the attribute list attribute altogether.
*
* If this fails it does not matter as we simply retain the
* attribute list attribute so we ignore the error and go on to
* delete the attribute list attribute entry instead.
*
* If there are other attribute list attribute entries
* referencing extent mft records we still need the attribute
* list attribute thus we go on to delete the attribute list
* entry corresponding to the attribute record we just deleted
* by freeing its extent mft record.
*/
err2 = ntfs_attr_list_is_needed(ni,
al_entry_added ? al_entry : NULL, &al_needed);
if (err2)
ntfs_warning(vol->mp, "Failed to determine if ""attribute list attribute of mft_no ""0x%llx if still needed (error %d). ""Assuming it is still needed and ""continuing.",
(unsignedlonglong)ni->mft_no, err2);
elseif (!al_needed) {
/*
* No more extent mft records are in use. Delete the
* attribute list attribute.
*/
ntfs_attr_search_ctx_reinit(ctx);
err2 = ntfs_attr_list_delete(ni, ctx);
if (!err2) {
/*
* We deleted the attribute list attribute and
* this will have updated the base inode
* appropriately thus we have restored
* everything as it was before.
*/
return err;
}
ntfs_warning(vol->mp, "Failed to delete attribute ""list attribute of mft_no 0x%llx ""(error %d). Continuing using ""alternative error recovery method.",
(unsignedlonglong)ni->mft_no, err2);
}
}
/*
* Both @ctx and @ni are now invalid and cannot be used any more which
* is fine as we have finished dealing with the attribute record.
*
* We now need to delete the corresponding attribute list attribute
* entry if we created it.
*
* Then we need to rewrite the attribute list attribute again because
* ntfs_attr_list_sync_extend() may have left it in an indeterminate
* state.
*/
if (al_entry_added) {
errno_t err2;
ntfs_attr_list_entry_delete(ni, al_entry);
ntfs_attr_search_ctx_reinit(ctx);
err2 = ntfs_attr_list_sync_shrink(ni, 0, ctx);
if (err2) {
ntfs_error(vol->mp, "Failed to restore attribute list ""attribute in base mft_no 0x%llx ""(error %d). Leaving inconsistent ""metadata. Unmount and run chkdsk.",
(unsignedlonglong)ni->mft_no, err2);
NVolSetErrors(vol);
}
}
/* Make sure any changes are written out. */
NInoSetMrecNeedsDirtying(ni);
return err;
}
/**
* ntfs_index_block_lay_out - lay out an index block into a memory buffer
* @idx_ni: index inode to which the index block will belong
* @vcn: vcn of index block
* @ia: destination buffer of size >= @idx_ni->block_size bytes
*
* Lay out an empty index allocation block with the @vcn into the buffer @ia.
* The index inode @idx_ni is needed because we need to know the size of an
* index block and the @vcn is needed because we need to record it in the index
* block.
*
* Locking: Caller must hold @idx_ni->lock on the index inode for writing.
*/
static void ntfs_index_block_lay_out(ntfs_inode *idx_ni, VCN vcn,
INDEX_BLOCK *ia)
{
INDEX_HEADER *ih;
INDEX_ENTRY *ie;
u32 ie_ofs;
bzero(ia, idx_ni->block_size);
ia->magic = magic_INDX;
ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK));
ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE));
/* Set the update sequence number to 1. */
*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1);
ia->index_block_vcn = cpu_to_sle64(vcn);
ih = &ia->index;
ie_ofs = (sizeof(INDEX_HEADER) +
(le16_to_cpu(ia->usa_count) << 1) + 7) & ~7;
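/*
 * Worked example (illustrative figures): for a 4096-byte index block
 * with 512-byte sectors, usa_count is 1 + 4096/512 = 9, so the update
 * sequence array takes 18 bytes after the 16-byte index header; rounding
 * 16 + 18 up to the next 8-byte boundary puts the first index entry at
 * offset 40 from the index header.
 */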
ih->entries_offset = cpu_to_le32(ie_ofs);
ih->index_length = cpu_to_le32(ie_ofs + sizeof(INDEX_ENTRY_HEADER));
ih->allocated_size = cpu_to_le32(idx_ni->block_size -
offsetof(INDEX_BLOCK, index));
ih->flags = LEAF_NODE;
ie = (INDEX_ENTRY*)((u8*)ih + ie_ofs);
ie->length = const_cpu_to_le16(sizeof(INDEX_ENTRY_HEADER));
ie->flags = INDEX_ENTRY_END;
}
/**
* ntfs_index_block_alloc - allocate and return an index allocation block
* @ictx: index context of the index for which to allocate a block
* @dst_vcn: pointer in which to return the VCN of the allocated index block
* @dst_ia: pointer in which to return the allocated index block
* @dst_upl_ofs: pointer in which to return the mapped address of the page data
* @dst_upl: pointer in which to return the page list containing @ia
* @dst_pl: pointer in which to return the array of pages containing @ia
* @dst_addr: pointer in which to return the mapped address
*
* Allocate an index allocation block for the index described by the index
* context @ictx and return it in *@dst_ia. *@dst_vcn, *@dst_upl_ofs,
* *@dst_upl, *@dst_pl, and *@dst_addr are set to the VCN of the allocated
* index block and the mapped page list and array of pages containing the
* returned index block respectively.
*
* Return 0 on success and errno on error. On error *@dst_vcn, *@dst_ia,
* *@dst_upl_ofs, *@dst_upl, *@dst_pl, and *@dst_addr are left untouched.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - All of the index contexts in the index must be unlocked (this
* includes @ictx, i.e. @ictx must not be locked).
*/
static errno_t ntfs_index_block_alloc(ntfs_index_context *ictx, VCN *dst_vcn,
INDEX_BLOCK **dst_ia, s64 *dst_upl_ofs, upl_t *dst_upl,
upl_page_info_array_t *dst_pl, u8 **dst_addr)
{
s64 bmp_pos, end_pos, init_size, upl_ofs;
ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
upl_t upl;
upl_page_info_array_t pl;
u8 *bmp, *bmp_end;
INDEX_ALLOCATION *ia;
errno_t err, err2;
lck_rw_type_t lock;
le16 usn;
BOOL have_resized;
u8 bit;
ntfs_debug("Entering for inode 0x%llx.",
(unsigned long long)idx_ni->mft_no);
/*
* Get the index bitmap inode.
*
* Note we do not lock the bitmap inode as the caller holds an
* exclusive lock on the index inode thus the bitmap cannot change
* under us as the only places the index bitmap is modified is as a
* side effect of a locked index inode.
*
* The inode will be locked later if/when we are going to modify its
* size to ensure any relevant VM activity that may be happening at the
* same time is excluded.
*
* We need to take the ntfs inode lock on the bitmap inode for writing.
* This causes a complication because there is a valid case in which we
* can recurse (once) back into ntfs_index_block_alloc() by calling:
* ntfs_attr_resize(bmp_ni) ->
* ntfs_index_move_root_to_allocation_block() ->
* ntfs_index_block_alloc()
* In this case we have to not take the lock again or we will deadlock
* (or with lock debugging enabled the kernel will panic()). Thus we
* set @ictx->bmp_is_locked so that when we get back here we know not
* to take the lock again.
*/
lock = 0;
if (!ictx->bmp_is_locked) {
lock = LCK_RW_TYPE_EXCLUSIVE;
ictx->bmp_is_locked = 1;
}
err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
idx_ni->name_len, FALSE, lock, &bmp_ni);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode ""(error %d).", err);
if (lock)
ictx->bmp_is_locked = 0;
gotoerr;
}
/* Find the first zero bit in the index bitmap. */
bmp_pos = 0;
lck_spin_lock(&bmp_ni->size_lock);
end_pos = bmp_ni->initialized_size;
lck_spin_unlock(&bmp_ni->size_lock);
for (bmp_pos = 0; bmp_pos < end_pos;) {
err = ntfs_page_map(bmp_ni, bmp_pos, &upl, &pl, &bmp, TRUE);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to read index ""bitmap (error %d).", err);
gotoput_err;
}
bmp_end = bmp + PAGE_SIZE;
if (bmp_pos + PAGE_SIZE > end_pos)
bmp_end = bmp + (end_pos - bmp_pos);
/* Check the next bit(s). */
for (; bmp < bmp_end; bmp++, bmp_pos++) {
if (*bmp == 0xff)
continue;
/*
* TODO: There does not appear to be a ffz() function
* in the kernel. )-: If/when the kernel has an ffz()
* function, switch the below code to use it.
*
* So emulate "ffz(x)" using "ffs(~x) - 1" which gives
* the same result but incurs extra CPU overhead.
*/
bit = ffs(~(unsigned long)*bmp) - 1;
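/*
 * Worked example (for illustration only): if *bmp is 0x07, ~*bmp ends
 * in ...f8, ffs() returns 4 and bit becomes 3, i.e. the lowest clear
 * bit in the byte.
 */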
if (bit < 8)
goto allocated_bit;
}
ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
}
/*
* There are no zero bits in the initialized part of the bitmap. Thus
* we extend it by 8 bytes and allocate the first bit of the extension.
*/
bmp_pos = end_pos;
lck_spin_lock(&bmp_ni->size_lock);
end_pos = bmp_ni->data_size;
lck_spin_unlock(&bmp_ni->size_lock);
/* If we are exceeding the bitmap size we need to extend it. */
have_resized = FALSE;
if (bmp_pos + 8 >= end_pos) {
ntfs_debug("Extending index bitmap, old size 0x%llx, ""requested size 0x%llx.",
(unsignedlonglong)end_pos,
(unsignedlonglong)end_pos + 8);
err = ntfs_attr_resize(bmp_ni, end_pos + 8, 0, ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to extend index ""bitmap (error %d).", err);
gotoput_err;
}
have_resized = TRUE;
}
/*
* Get the page containing the bit we are allocating. Note this has to
* happen before we call ntfs_attr_set_initialized_size() as the latter
* only sets the initialized size without zeroing the area between the
* old initialized size and the new one thus we need to map the page
* now so that its tail is zeroed due to the old value of the
* initialized size.
*/
err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl, &pl, &bmp,
TRUE);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to read index bitmap ""(error %d).", err);
/*
* There is no need to resize the bitmap back to its old size.
* It does not change the metadata consistency to have a bigger
* bitmap.
*/gotoput_err;
}
bmp += (unsigned)bmp_pos & PAGE_MASK;
/*
* Extend the initialized size if the bitmap is non-resident. If it is
* resident this was already done by the ntfs_attr_resize() call.
*/
if (NInoNonResident(bmp_ni)) {
#ifdef DEBUG
lck_spin_lock(&bmp_ni->size_lock);
init_size = bmp_ni->initialized_size;
lck_spin_unlock(&bmp_ni->size_lock);
ntfs_debug("Setting initialized size of index bitmap, old ""size 0x%llx, requested size 0x%llx.",
(unsignedlonglong)init_size,
(unsignedlonglong)end_pos + 8);
#endif
err = ntfs_attr_set_initialized_size(bmp_ni, end_pos + 8);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to update ""initialized size of index bitmap ""(error %d).", err);
ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
gotoput_err;
}
}
/* Finally, allocate the bit in the page. */
bit = 0;
/*
* If we used ntfs_attr_resize() to extend the bitmap attribute it is
* possible that this caused the index root node to be moved out to an
* index allocation block which very likely will have allocated the
* very bit we want to allocate so we test for this case and if it has
* happened we allocate the next bit along which must be free or
* something has gone wrong.
*/
if (have_resized && *bmp & 1) {
if (*bmp & 2)
panic("%s(): *bmp & 2\n", __FUNCTION__);
bit++;
}
allocated_bit:
*bmp |= 1 << bit;
ntfs_page_unmap(bmp_ni, upl, pl, TRUE);
/* Set @bmp_pos to the allocated index bitmap bit. */
bmp_pos = (bmp_pos << 3) + bit;
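/*
 * Each bitmap byte covers eight index blocks, so the allocated block
 * number is the byte offset within the bitmap times eight plus the bit
 * number within that byte.
 */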
/*
* If we are caching the last set bit in the bitmap in the index inode
* and we allocated beyond the last set bit, update the cached value.
*/
if (idx_ni->last_set_bit >= 0 && bmp_pos > idx_ni->last_set_bit)
idx_ni->last_set_bit = bmp_pos;
ntfs_debug("Allocated index bitmap bit 0x%llx.", bmp_pos);
/*
* We are done with the bitmap; the index allocation attribute is next.
*
* If the allocated bit is outside the data size we need to extend it.
*/
lck_spin_lock(&idx_ni->size_lock);
end_pos = idx_ni->data_size;
lck_spin_unlock(&idx_ni->size_lock);
if (bmp_pos >= end_pos >> idx_ni->block_size_shift) {
ntfs_debug("Extending index allocation, old size 0x%llx, ""requested size 0x%llx.", (unsignedlonglong)
end_pos, (unsignedlonglong)(bmp_pos + 1) <<
idx_ni->block_size_shift);
err = ntfs_attr_resize(idx_ni, (bmp_pos + 1) <<
idx_ni->block_size_shift, 0, ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to extend index ""allocation (error %d).", err);
gotoalloc_err;
}
}
/*
* Map the page containing the allocated index block. As above for the
* bitmap we need to get the page before we set the initialized size so
* that the tail of the page is zeroed for us.
*/
upl_ofs = (bmp_pos << idx_ni->block_size_shift) & ~PAGE_MASK_64;
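/*
 * Illustration (assumed sizes): with 4096-byte index blocks and
 * 4096-byte pages, block n starts at byte n * 4096 which is already
 * page aligned, so upl_ofs equals the block's byte offset; with
 * 2048-byte index blocks two blocks share a page and upl_ofs is the
 * offset of the page containing the block.
 */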
err = ntfs_page_map(idx_ni, upl_ofs, &upl, &pl, &bmp, TRUE);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to read index allocation ""block (error %d).", err);
/*
* There is no need to truncate the index allocation back to
* its old size. It does not change the metadata consistency
* to have a bigger index allocation.
*/gotoalloc_err;
}
/* Extend the initialized size if needed. */
lck_spin_lock(&idx_ni->size_lock);
init_size = idx_ni->initialized_size;
lck_spin_unlock(&idx_ni->size_lock);
if (bmp_pos >= init_size >> idx_ni->block_size_shift) {
ntfs_debug("Setting initialized size of index allocation, old ""size 0x%llx, requested size 0x%llx.",
(unsignedlonglong)init_size,
(unsignedlonglong)(bmp_pos + 1) <<
idx_ni->block_size_shift);
err = ntfs_attr_set_initialized_size(idx_ni,
(bmp_pos + 1) << idx_ni->block_size_shift);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to update ""initialized size of index allocation ""(error %d).", err);
gotounm_err;
}
}
if (lock) {
ictx->bmp_is_locked = 0;
lck_rw_unlock_exclusive(&bmp_ni->lock);
}
(void)vnode_put(bmp_ni->vn);
/* Lay out an empty index block into the allocated space. */
ia = (INDEX_ALLOCATION*)(bmp + (((unsigned)bmp_pos <<
idx_ni->block_size_shift) & PAGE_MASK));
/* Preserve the update sequence number across the layout. */
usn = 0;
if (le16_to_cpu(ia->usa_ofs) < NTFS_BLOCK_SIZE - sizeof(u16))
usn = *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs));
/* Calculate the index block vcn from the index block number. */
*dst_vcn = bmp_pos = bmp_pos << idx_ni->block_size_shift >>
idx_ni->vcn_size_shift;
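/*
 * For example (assumed sizes): with 4096-byte index blocks and a VCN
 * unit of 4096 bytes, block n maps to VCN n; with a 512-byte VCN unit
 * the same block maps to VCN 8 * n.
 */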
ntfs_index_block_lay_out(idx_ni, bmp_pos, ia);
if (usn && usn != const_cpu_to_le16(0xffff))
*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = usn;
*dst_ia = ia;
*dst_upl_ofs = upl_ofs;
*dst_upl = upl;
*dst_pl = pl;
*dst_addr = bmp;
ntfs_debug("Done (allocated index block with vcn 0x%llx.",
(unsignedlonglong)bmp_pos);
return 0;
unm_err:
ntfs_page_unmap(idx_ni, upl, pl, FALSE);
alloc_err:
/* Free the index bitmap bit that we allocated. */
err2 = ntfs_bitmap_clear_bit(bmp_ni, bmp_pos);
if (err2) {
ntfs_error(idx_ni->vol->mp, "Failed to undo index block ""allocation in index bitmap (error %d). ""Leaving inconsistent metadata. Run chkdsk.",
err2);
NVolSetErrors(idx_ni->vol);
}
put_err:
if (lock) {
ictx->bmp_is_locked = 0;
lck_rw_unlock_exclusive(&bmp_ni->lock);
}
(void)vnode_put(bmp_ni->vn);
err:
ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
return err;
}
/**
* ntfs_index_ctx_disconnect_reinit - disconnect and reinit an index context
* @ictx: index context to disconnect
*
* Disconnect the index context @ictx from its tree path and reinitialize its
* @up and @down pointers to point to itself.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
*/
static inline void ntfs_index_ctx_disconnect_reinit(ntfs_index_context *ictx)
{
ntfs_index_ctx_disconnect(ictx);
ictx->down = ictx->up = ictx;
}
/**
* ntfs_index_move_root_to_allocation_block - move index root to allocation
* @ictx: index lookup context describing index root to move
*
* Move the index root to an index allocation block in the process switching
* the index from a small to a large index if necessary.
*
* If no index allocation and/or bitmap attributes exist they are created.
*
* An index block is then allocated and if necessary the index bitmap and/or
* the allocation attributes are extended in the process.
*
* Finally all index entries contained in the index root attribute are moved
* into the allocated index block in the index allocation attribute.
*
* We need to potentially create the index allocation and/or bitmap attributes
* so we can move the entries from the index root attribute to the index
* allocation attribute and then shrink the index root attribute. However,
* there is not enough space in the mft record to do this. Also we already
* have the index root attribute looked up so it makes sense to deal with it
* first.
*
* Thus, if there is no index allocation attribute, we allocate the space to be
* used by the index allocation attribute and setup the directory inode for a
* large index including the allocated space but leaving the initialized size
* to zero. We then map and lock the first page containing the now allocated
* first index block and move the index entries from the index root into it.
* We then shrink the index root and set it up to point to the allocated index
* block.
*
* Having shrunk the index root attribute there is now hopefully enough space
* in the mft record to create the index allocation attribute and the index
* bitmap attribute in the mft record and the conversion is complete.
*
* If there is not enough space we create the index allocation and/or index
* bitmap attributes in an extent mft record. TODO: This is not implemented
* yet.
*
* If there is an index allocation attribute already, we allocate a temporary
* buffer to hold the index block and then copy from there into the allocated
* index block later.
*
* Return 0 on success and errno on error. On error the index context is no
* longer usable and must be released or reinitialized.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
errno_t ntfs_index_move_root_to_allocation_block(ntfs_index_context *ictx)
{
VCN vcn;
ntfs_inode *base_ni, *idx_ni;
ntfs_volume *vol;
ntfs_index_context *ir_ictx;
upl_t upl;
upl_page_info_array_t pl;
INDEX_ALLOCATION *ia;
INDEX_HEADER *ih;
INDEX_ROOT *ir;
INDEX_ENTRY *ie;
ntfs_attr_search_ctx *actx;
MFT_RECORD *m;
u32 u, ia_ie_ofs, ir_ie_ofs, clusters = 0;
errno_t err, err2;
struct {
unsigned is_large_index:1;
} old;
BOOL need_ubc_setsize = FALSE;
ntfs_debug("Entering.");
if (!ictx->is_locked)
panic("%s(): !ictx->is_locked\n", __FUNCTION__);
base_ni = ictx->base_ni;
idx_ni = ictx->idx_ni;
vol = base_ni->vol;
/*
* The current index context is going to turn from describing the index
* root to describing the newly allocated index block. Thus, allocate
* a new index context for the emptied index root.
*/
ir_ictx = ntfs_index_ctx_get(idx_ni);
if (!ir_ictx)
return ENOMEM;
ir_ictx->entries = OSMalloc(ictx->max_entries * sizeof(INDEX_ENTRY*),
ntfs_malloc_tag);
if (!ir_ictx->entries) {
err = ENOMEM;
goto err;
}
/*
* If there is no index allocation attribute, we need to allocate an
* index block worth of clusters. Thus if the cluster size is less
* than the index block size the number of clusters we need to allocate
* is the index block size divided by the cluster size and if the
* cluster size is greater than or equal to the index block size we
* simply allocate one cluster.
*
* Otherwise we allocate a temporary buffer for the index block.
*/
if (!NInoIndexAllocPresent(idx_ni)) {
clusters = idx_ni->block_size >> vol->cluster_size_shift;
if (!clusters)
clusters = 1;
/*
* We cannot lock the runlist as we are holding the mft record
* lock. That is fortunately ok as we also have @idx_ni->lock
* on the index inode and there is no index allocation
* attribute at present so no-one can be using the runlist yet.
*/
err = ntfs_cluster_alloc(vol, 0, clusters, -1, DATA_ZONE, TRUE,
&idx_ni->rl);
if (err) {
ntfs_error(vol->mp, "Failed to allocate %u clusters ""for the index allocation block ""(error %d).", (unsigned)clusters,
err);
gotoerr;
}
/* Allocate/get the first page and zero it out. */
err = ntfs_page_grab(idx_ni, 0, &upl, &pl, (u8**)&ia, TRUE);
if (err) {
ntfs_error(vol->mp, "Failed to grab first page.");
goto page_err;
}
bzero(ia, PAGE_SIZE);
} else {
/* Allocate a temporary buffer and zero it out. */
ia = OSMalloc(idx_ni->block_size, ntfs_malloc_tag);
if (!ia) {
err = ENOMEM;
goto err;
}
bzero(ia, idx_ni->block_size);
upl = NULL;
pl = NULL;
}
/*
* Set up the index block and copy the index entries from the index
* root.
*/
ia->magic = magic_INDX;
ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK));
ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE));
/* Set the update sequence number to 1. */
*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1);
ih = &ia->index;
ia_ie_ofs = (sizeof(INDEX_HEADER) +
(le16_to_cpu(ia->usa_count) << 1) + 7) & ~7;
ih->entries_offset = cpu_to_le32(ia_ie_ofs);
/* Work out the size of the index entries in the index root. */
ir = ictx->ir;
ir_ie_ofs = le32_to_cpu(ir->index.entries_offset);
/*
* FIXME: Should we scan through the index root looking for the last
* entry and use that to determine the size instead? Then we could fix
* any space being wasted due to an index that is too long for its entries.
*/
u = le32_to_cpu(ir->index.index_length) - ir_ie_ofs;
ih->index_length = cpu_to_le32(ia_ie_ofs + u);
ih->allocated_size = cpu_to_le32(idx_ni->block_size -
offsetof(INDEX_BLOCK, index));
ih->flags = LEAF_NODE;
ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs);
memcpy((u8*)ih + ia_ie_ofs, ie, u);
/*
* If the index root is a large index, then the index block is an index
* node, i.e. not a leaf node.
*/
if (ir->index.flags & LARGE_INDEX) {
old.is_large_index = 1;
ih->flags |= INDEX_NODE;
}
/*
* Set up the index context to point into the index allocation block
* instead of into the index root.
*/
ictx->entry = (INDEX_ENTRY*)((u8*)ih + ia_ie_ofs +
((u8*)ictx->entry - (u8*)ie));
ictx->is_root = 0;
actx = ictx->actx;
ictx->ia = ia;
ictx->vcn = 0;
ictx->index = ih;
ictx->upl_ofs = 0;
ictx->upl = upl;
ictx->pl = pl;
ictx->addr = (u8*)ia;
ictx->bytes_free = le32_to_cpu(ia->index.allocated_size) -
le32_to_cpu(ia->index.index_length);
/*
* We have copied the index entries and switched the index context so
* we can go ahead and shrink the index root by moving the end entry on
* top of the first entry. We only need to do the move if the first
* index root entry is not the end entry, i.e. the index is not empty.
*/
ih = &ir->index;
if (!(ie->flags & INDEX_ENTRY_END)) {
u8 *index_end;
INDEX_ENTRY *end_ie;
index_end = (u8*)&ir->index + le32_to_cpu(ih->index_length);
for (end_ie = ie;; end_ie = (INDEX_ENTRY*)((u8*)end_ie +
le16_to_cpu(end_ie->length))) {
/* Bounds checks. */
if ((u8*)end_ie < (u8*)ie || (u8*)end_ie +
sizeof(INDEX_ENTRY_HEADER) >
index_end || (u8*)end_ie +
le16_to_cpu(end_ie->length) >
index_end ||
(u32)sizeof(INDEX_ENTRY_HEADER) +
le16_to_cpu(end_ie->key_length) >
le16_to_cpu(end_ie->length)) {
ntfs_error(vol->mp, "Corrupt index. Run ""chkdsk.");
NVolSetErrors(vol);
err = EIO;
gotoictx_err;
}
/* Stop when we have found the last entry. */
if (end_ie->flags & INDEX_ENTRY_END)
break;
}
memmove(ie, end_ie, le16_to_cpu(end_ie->length));
}
/*
* If the end entry is a leaf we need to extend it by 8 bytes in order
* to turn it into a node entry. To do this we need to extend the
* index root attribute itself in the case that the index root was
* empty to start with or we need to do nothing if the index root was
* not empty as we have not shrunk the index root attribute yet and we
* moved the end index entry forward by at least one index entry which
* is much bigger than 8 bytes.
*
* We do the index root attribute resize unconditionally because it
* takes care of the size increase needed in the empty index root case
* as well as of the size decrease needed in the non-empty index root
* case.
*/
retry_resize:
u = le16_to_cpu(ie->length);
if (!(ie->flags & INDEX_ENTRY_NODE))
u += sizeof(VCN);
err = ntfs_resident_attr_value_resize(actx->m, actx->a,
offsetof(INDEX_ROOT, index) +
le32_to_cpu(ih->entries_offset) + u);
if (err) {
leMFT_REF mref;
ATTR_RECORD *a;
ntfschar *a_name;
ATTR_LIST_ENTRY *al_entry;
u8 *al_end;
ntfs_attr_search_ctx ctx;
/*
* This can only happen when the first entry is the last entry,
* i.e. this is an empty directory and thus the above memmove()
* has not been done, in which case we only need eight bytes
* (sizeof(VCN)) more space which means that if we manage to
* move out a single attribute out of the mft record we would
* gain that much space. In the worst case scenario we can
* always move out the index root attribute itself in which
* case we are guaranteed to have enough space as an empty
* index root is much smaller than an mft record. The only
* complication is when there is no attribute list attribute as
* we have to add it first. On the bright side that in itself
* can cause some space to be freed up and we may have enough to
* extend the index root by eight bytes.
*
* Add an attribute list attribute if it is not present.
*/
if (!NInoAttrList(base_ni)) {
/*
* Take a copy of the current attribute record pointer
* so we can update all our pointers below.
*/
a = actx->a;
err = ntfs_attr_list_add(base_ni, actx->m, actx);
if (err || actx->is_error) {
if (!err)
err = actx->error;
ntfs_error(vol->mp, "Failed to add attribute ""list attribute to mft_no ""0x%llx (error %d).",
(unsignedlonglong)
base_ni->mft_no, err);
gotoictx_err;
}
update_and_retry_resize:
/*
* Need to update our cached pointers as the index root
* attribute is likely to have moved.
*/
ir = (INDEX_ROOT*)((u8*)actx->a +
le16_to_cpu(actx->a->value_offset));
ih = &ir->index;
ir_ie_ofs = le32_to_cpu(ir->index.entries_offset);
ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs);
/*
* Retry the resize in case we freed eight bytes in the
* mft record which is all we need.
*/
goto retry_resize;
}
/*
* If this is the only attribute record in the mft record we
* cannot gain anything by moving it or anything else. This
* really cannot happen as we have emptied the index root thus
* have made the attribute as small as possible thus it must
* fit just fine in an otherwise empty mft record thus we must
* have enough space to do the resize thus we cannot have
* gotten here.
*/
if (ntfs_attr_record_is_only_one(actx->m, actx->a))
panic("%s(): ntfs_attr_record_is_only_one()\n",
__FUNCTION__);
/*
* We now know we have an attribute list attribute and that we
* still do not have enough space to extend the index root
* attribute by eight bytes.
*
* First, if the index root attribute is already in an extent
* mft record move it into a new, empty extent mft record.
* This is guaranteed to give us the needed eight bytes.
*
* Second, look through the mft record to see if there are any
* attribute records we can move out into an extent mft record
* and if yes move one. That will also definitely give us the
* needed eight bytes of space.
*
* Finally, if no attributes are available for moving then move
* the index root attribute itself.
*/
if (actx->ni != base_ni) {
move_idx_root:
lck_rw_lock_shared(&base_ni->attr_list_rl.lock);
err = ntfs_attr_record_move(actx);
lck_rw_unlock_shared(&base_ni->attr_list_rl.lock);
if (err) {
ntfs_error(vol->mp, "Failed to move index ""root attribute from mft ""record 0x%llx to an extent ""mft record (error %d).",
(unsignedlonglong)
actx->ni->mft_no, err);
gotoictx_err;
}
/*
* Need to update our cached pointers as the index root
* attribute has now moved after which we need to retry
* the resize which will now succeed.
*/
goto update_and_retry_resize;
}
/*
* We know the index root is in the base mft record.
*
* Look through the base mft record to see if there are any
* attribute records we can move out into an extent mft record
* and if yes move one. That will also definitely give us the
* needed eight bytes of space.
*/
ntfs_attr_search_ctx_init(&ctx, base_ni, actx->m);
ctx.is_iteration = 1;
do {
err = ntfs_attr_find_in_mft_record(AT_UNUSED, NULL, 0,
NULL, 0, &ctx);
if (err) {
if (err == ENOENT) {
/*
* No attributes are available for
* moving thus move the index root
* attribute out of the base mft record
* into a new extent.
*
* TODO: Need to get this case
* triggered and then need to run
* chkdsk to check for validity of
* moving the index root attribute out
* of the base mft record.
*/
goto move_idx_root;
}
ntfs_error(vol->mp, "Failed to iterate over ""attribute records in base ""mft record 0x%llx (error %d).",
(unsignedlonglong)
base_ni->mft_no, err);
gotoictx_err;
}
/*
* Skip the standard information attribute, the
* attribute list attribute, and the index root
* attribute as we are looking for lower priority
* attributes to move out and the attribute list
* attribute is of course not movable.
*/
a = ctx.a;
} while (a->type == AT_STANDARD_INFORMATION ||
a->type == AT_ATTRIBUTE_LIST ||
a->type == AT_INDEX_ROOT);
/*
* Move the found attribute out to an extent mft record and
* update its attribute list entry.
*
* But first find the attribute list entry matching the
* attribute record so it can be updated.
*/
a_name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset));
al_entry = (ATTR_LIST_ENTRY*)base_ni->attr_list;
al_end = (u8*)al_entry + base_ni->attr_list_size;
mref = MK_LE_MREF(base_ni->mft_no, base_ni->seq_no);
do {
if ((u8*)al_entry >= al_end || !al_entry->length) {
ntfs_error(vol->mp, "Attribute list attribute ""in mft_no 0x%llx is ""corrupt. Run chkdsk.",
(unsignedlonglong)
base_ni->mft_no);
err = EIO;
gotoictx_err;
}
if (al_entry->mft_reference == mref &&
al_entry->instance == a->instance) {
/*
* We found the entry, stop looking but first
* perform a quick sanity check that we really
* do have the correct attribute record.
*/
if (al_entry->type == a->type &&
ntfs_are_names_equal(
(ntfschar*)((u8*)al_entry +
al_entry->name_offset),
al_entry->name_length, a_name,
a->name_length, TRUE,
vol->upcase, vol->upcase_len))
break;
ntfs_error(vol->mp, "Found corrupt attribute ""list attribute when looking ""for attribute type 0x%x in ""attribute list attribute of ""base mft record 0x%llx. Run ""chkdsk.",
(unsigned)le32_to_cpu(a->type),
(unsignedlonglong)
base_ni->mft_no);
NVolSetErrors(vol);
err = EIO;
gotoictx_err;
}
/* Go to the next attribute list entry. */
al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
le16_to_cpu(al_entry->length));
} while (1);
/* Finally, move the attribute to an extent record. */
err = ntfs_attr_record_move_for_attr_list_attribute(&ctx,
al_entry, actx, NULL);
if (err) {
ntfs_error(vol->mp, "Failed to move attribute type ""0x%x out of base mft record 0x%llx ""and into an extent mft record (error ""%d). Run chkdsk.",
(unsigned)le32_to_cpu(a->type),
(unsignedlonglong)base_ni->mft_no,
err);
NVolSetErrors(vol);
gotoictx_err;
}
/*
* Sync the modified attribute list attribute value to
* metadata/disk.
*/
ntfs_attr_search_ctx_reinit(&ctx);
err = ntfs_attr_list_sync(base_ni, (u8*)al_entry -
base_ni->attr_list, &ctx);
if (!err) {
/*
* Need to update our cached pointers as the index root
* attribute has likely moved after which we need to
* retry the resize which will now succeed.
*/
goto update_and_retry_resize;
}
/*
* FIXME: We could try and revert the move of the attribute and
* the attribute list attribute value change but that is a lot
* of work and the only real errors that could happen at this
* stage are either that we have run out of memory or that the
* disk has gone bad or has been disconnected or there must be
* bad memory corruption. In all those cases we are extremely
* unlikely to be able to undo what we have done so far due to
* further errors so at least for now we do not bother trying.
*/
ntfs_error(vol->mp, "Failed to sync attribute list attribute ""in mft_no 0x%llx (error %d). Leaving ""corrupt metadata. Run chkdsk.",
(unsignedlonglong)base_ni->mft_no, err);
/*
* Need to update our cached pointers as the index root
* attribute has likely moved.
*/
ir = (INDEX_ROOT*)((u8*)actx->a +
le16_to_cpu(actx->a->value_offset));
ih = &ir->index;
ir_ie_ofs = le32_to_cpu(ir->index.entries_offset);
ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs);
goto ictx_err;
}
/*
* Update the end index entry and the index header to reflect the
* changes in size.
*/
ie->length = cpu_to_le16(u);
ih->allocated_size = ih->index_length = cpu_to_le32(
le32_to_cpu(ih->entries_offset) + u);
/*
* Update the index root and end index entry to reflect the fact that
* the directory now is a large index and that the end index entry
* points to a sub-node and set the sub-node pointer to vcn 0.
*/
ih->flags |= LARGE_INDEX;
ie->flags |= INDEX_ENTRY_NODE;
*(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) = 0;
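/*
 * The sub-node VCN of an entry flagged INDEX_ENTRY_NODE occupies the
 * last eight bytes of the entry, which is why sizeof(VCN) was added to
 * the entry length during the resize above.
 */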
/*
* Now setup the new index context for the emptied index root and
* attach it at the start of the tree path.
*/
ir_ictx->entries[0] = ir_ictx->entry = ie;
ir_ictx->is_root = 1;
ir_ictx->is_locked = 1;
ir_ictx->ir = ir;
ir_ictx->index = ih;
ir_ictx->actx = actx;
ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
le32_to_cpu(actx->m->bytes_in_use);
ir_ictx->max_entries = ictx->max_entries;
ir_ictx->nr_entries = 1;
/* Ensure the modified index root attribute is written to disk. */
ntfs_index_entry_mark_dirty(ir_ictx);
/*
* Attach the new index context between the current index context
* (which is the top of the tree) and the index context above it (which
* is the bottom of the tree), i.e. make the new index context the top
* of the tree.
*/
ictx->up->down = ir_ictx;
ir_ictx->down = ictx;
ir_ictx->up = ictx->up;
ictx->up = ir_ictx;
/*
* If the index allocation attribute is not present yet, we need to
* create it and the index bitmap attribute.
*/
if (upl) {
/*
* The page is now done, mark the index context as dirty so it
* gets written out later.
*/
ntfs_index_entry_mark_dirty(ictx);
/*
* Set up the index inode to reflect that it has an index
* allocation attribute.
*/
NInoSetIndexAllocPresent(idx_ni);
lck_spin_lock(&idx_ni->size_lock);
idx_ni->allocated_size = (s64)clusters <<
vol->cluster_size_shift;
idx_ni->initialized_size = idx_ni->data_size =
idx_ni->block_size;
lck_spin_unlock(&idx_ni->size_lock);
if (idx_ni->name == I30) {
lck_spin_lock(&base_ni->size_lock);
base_ni->allocated_size = idx_ni->allocated_size;
base_ni->initialized_size = base_ni->data_size =
idx_ni->block_size;
lck_spin_unlock(&base_ni->size_lock);
}
if (!ubc_setsize(idx_ni->vn, idx_ni->data_size))
panic("%s(): ubc_setsize() failed.\n", __FUNCTION__);
/*
* Find the position at which we need to insert the index
* allocation attribute.
*/
err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name,
idx_ni->name_len, 0, NULL, 0, actx);
if (err != ENOENT) {
if (!err) {
ntfs_error(vol->mp, "Index allocation ""attribute already present ""when it should not be. ""Corrupt index or driver ""bug. Run chkdsk.");
NVolSetErrors(vol);
err = EIO;
} else
ntfs_error(vol->mp, "Failed to look up ""position at which to insert ""the index allocation ""attribute (error %d).", err);
gotoia_err;
}
/* Insert the index allocation attribute. */
err = ntfs_insert_index_allocation_attribute(base_ni, actx,
idx_ni);
if (err || actx->is_error) {
ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx ""(error %d).", actx->is_error ?
"remap extent mft record of" :
"insert index allocation attribute in ",
(unsignedlonglong)base_ni->mft_no,
err);
gotoia_err;
}
/*
* Ensure the created index allocation attribute is written to
* disk.
*/
NInoSetMrecNeedsDirtying(actx->ni);
/*
* Find the position at which we need to insert the index
* bitmap attribute.
*/
ntfs_attr_search_ctx_reinit(actx);
err = ntfs_attr_lookup(AT_BITMAP, idx_ni->name,
idx_ni->name_len, 0, NULL, 0, actx);
if (err != ENOENT) {
if (!err) {
ntfs_error(vol->mp, "Index bitmap attribute ""attribute already present ""when it should not be. ""Corrupt index or driver ""bug. Run chkdsk.");
NVolSetErrors(vol);
err = EIO;
} else
ntfs_error(vol->mp, "Failed to look up ""position at which to insert ""the index bitmap attribute ""(error %d).", err);
gotobmp_err;
}
/*
* Insert the index bitmap attribute as a resident attribute
* with a value length sufficient to cover the number of
* allocated index blocks rounded up to a multiple of 8 bytes
* which are initialized to zero. We then set the first bit in
* the bitmap to reflect the fact that the first index
* allocation block is in use.
*/
err = ntfs_resident_attr_record_insert(base_ni, actx,
AT_BITMAP, idx_ni->name, idx_ni->name_len,
NULL, (((idx_ni->allocated_size >>
idx_ni->block_size_shift) + 63) >> 3) &
~(s64)7);
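/*
 * Worked example (assumed sizes): with a single 4096-byte index block
 * allocated, allocated_size >> block_size_shift is 1, so the value
 * length is ((1 + 63) >> 3) & ~7 = 8 bytes, i.e. the bitmap is rounded
 * up to a whole multiple of 8 bytes (64 index blocks).
 */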
if (err || actx->is_error) {
if (!err)
err = actx->error;
ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx ""(error %d).", actx->is_error ?
"remap extent mft record of" :
"insert index bitmap attribute in",
(unsignedlonglong)base_ni->mft_no,
err);
gotobmp_err;
}
*((u8*)actx->a + le16_to_cpu(actx->a->value_offset)) = 1;
/*
* Ensure the created index bitmap attribute is written to
* disk.
*/
NInoSetMrecNeedsDirtying(actx->ni);
/*
* We successfully created the index allocation and bitmap
* attributes thus we can set up our cache of the last set bit
* in the bitmap in the index inode to 0 to reflect the fact
* that we just allocated the first index block.
*/
idx_ni->last_set_bit = 0;
}
/*
* We are either completely done and can release the attribute search
* context and the mft record or we now need to call functions which
* will deadlock with us if we do not release the attribute search
* context and the mft record so we have to release them now thus we
* now unlock the index root context.
*/
ntfs_index_ctx_unlock(ir_ictx);
if (upl) {
INDEX_ENTRY **entries;
long delta;
unsigned i;
update_ie_pointers:
/*
* Note we still hold the locked and referenced index
* allocation page which is now attached to the index context
* and will be released later when the index context is
* released.
*
* We need to update the index entry pointers in the array to
* point into the index block instead of the old index root.
*/
entries = ictx->entries;
delta = (u8*)ictx->entry - (u8*)entries[ictx->entry_nr];
for (i = 0; i < ictx->nr_entries; i++)
entries[i] = (INDEX_ENTRY*)((u8*)entries[i] + delta);
if (ictx->entry != entries[ictx->entry_nr])
panic("%s(): ictx->entry != entries[ictx->entry_nr]\n",
__FUNCTION__);
ntfs_debug("Done.");
return 0;
}
/*
* We have an index allocation attribute already thus we need to
* allocate an index block from it for us to transfer the temporary
* index block to.
*
* We need to set @is_locked to zero because we are using the temporary
* buffer for @ia and hence @upl and @pl are zero thus as far as the
* index context code is concerned the context is not actually locked
* at all. Not doing this can lead to a panic() in
* ntfs_index_block_alloc() under some circumstances due to an
* assertion check that verifies that @is_locked is zero in
* ntfs_attr_resize().
*/
ictx->is_locked = 0;
err = ntfs_index_block_alloc(ictx, &vcn, &ia, &ictx->upl_ofs, &upl,
&ictx->pl, &ictx->addr);
if (err) {
ntfs_error(vol->mp, "Failed to allocate index allocation ""block (error %d).", err);
gotoalloc_err;
}
/* Copy the update sequence number into our temporary buffer. */
*(le16*)((u8*)ictx->ia + le16_to_cpu(ictx->ia->usa_ofs)) =
*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs));
/*
* Copy our temporary buffer into the allocated index block, free the
* temporary buffer and setup the index context to point to the
* allocated index block instead of the temporary one.
*/
memcpy(ia, ictx->ia, idx_ni->block_size);
OSFree(ictx->ia, idx_ni->block_size, ntfs_malloc_tag);
ictx->entry = ie = (INDEX_ENTRY*)((u8*)ia +
((u8*)ictx->entry - (u8*)ictx->ia));
ictx->ia = ia;
ictx->index = &ia->index;
ictx->upl = upl;
ictx->is_locked = 1;
/*
* If the vcn of the allocated index block is not zero, we need to
* update the vcn in the index block itself as well as the sub-node
* vcn pointer in the index root attribute.
*/
if (vcn) {
ictx->vcn = vcn;
ia->index_block_vcn = cpu_to_sle64(vcn);
ictx->upl_ofs = (vcn << idx_ni->vcn_size_shift) &
~PAGE_MASK_64;
/* Get hold of the mft record for the index inode. */
err = ntfs_mft_record_map(base_ni, &m);
if (err) {
/*
* The only thing that is now incorrect is the vcn
* sub-node pointer in the empty index root attribute
* but we cannot correct it as we are failing to map
* the mft record which we need to be able to rollback.
* We leave it to chkdsk to sort this out later.
*/
ntfs_error(vol->mp, "Cannot rollback partial index ""upgrade because ""ntfs_mft_record_map() failed (error ""%d). Leaving inconsistent ""metadata. Run chkdsk.", err);
gotomap_err;
}
actx = ntfs_attr_search_ctx_get(base_ni, m);
if (!actx) {
err = ENOMEM;
ntfs_error(vol->mp, "Cannot rollback partial index ""upgrade because there was not enough ""memory to obtain an attribute search ""context. Leaving inconsistent ""metadata. Run chkdsk.");
gotoactx_err;
}
/* Find the index root attribute in the mft record. */
err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name,
idx_ni->name_len, 0, NULL, 0, actx);
if (err)
goto lookup_err;
/* Get to the index root value. */
ir = (INDEX_ROOT*)((u8*)actx->a +
le16_to_cpu(actx->a->value_offset));
/* The first index entry which is also the last one. */
ie = (INDEX_ENTRY*)((u8*)&ir->index +
le32_to_cpu(ir->index.entries_offset));
if (!(ie->flags & INDEX_ENTRY_END))
panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n",
__FUNCTION__);
if (!(ie->flags & INDEX_ENTRY_NODE))
panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n",
__FUNCTION__);
/* Finally, update the vcn pointer of the index entry. */
*(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) =
cpu_to_sle64(vcn);
/*
* Ensure the updated index entry is written to disk and
* release the attribute search context and the mft record.
*/
NInoSetMrecNeedsDirtying(actx->ni);
ntfs_attr_search_ctx_put(actx);
ntfs_mft_record_unmap(base_ni);
}
/*
* The page is now done, mark the index context as dirty so it gets
* written out later.
*/
ntfs_index_entry_mark_dirty(ictx);
goto update_ie_pointers;
lookup_err:
ntfs_error(vol->mp, "Cannot rollback partial index upgrade because we ""failed to look up the index root attribute (error ""%d). Leaving inconsistent metadata. Run chkdsk.",
err);
if (err == ENOENT)
err = EIO;
ntfs_attr_search_ctx_put(actx);
actx_err:
ntfs_mft_record_unmap(base_ni);
map_err:
/*
* The page is now done, mark the index context as dirty so it gets
* written out later.
*/
ntfs_index_entry_mark_dirty(ictx);
goto err;
alloc_err:
/*
* We need to get hold of the mft record and get a new search context
* so we can restore the index root attribute.
*/
err2 = ntfs_mft_record_map(base_ni, &m);
if (err2) {
ntfs_error(vol->mp, "Cannot rollback partial index upgrade ""because ntfs_mft_record_map() failed (error ""%d). Leaving inconsistent metadata. Run ""chkdsk.", err2);
undo_alloc_err2:
/*
* Mark the index context invalid given it neither has an index
* block and page nor an index root and mft record attached.
*/
ictx->entry = NULL;
goto undo_alloc_err;
}
actx = ntfs_attr_search_ctx_get(base_ni, m);
if (!actx) {
ntfs_error(vol->mp, "Cannot rollback partial index upgrade ""because there was not enough memory to ""obtain an attribute search context. Leaving ""inconsistent metadata. Run chkdsk.");
ntfs_mft_record_unmap(base_ni);
gotoundo_alloc_err2;
}
goto restore_ir;
bmp_err:
ntfs_attr_search_ctx_reinit(actx);
err2 = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name,
idx_ni->name_len, 0, NULL, 0, actx);
if (err2) {
ntfs_error(vol->mp, "Cannot rollback partial index upgrade ""because looking up the index allocation ""attribute failed (error %d). Leaving ""inconsistent metadata. Run chkdsk.", err2);
bmp_err_err:
NVolSetErrors(vol);
/*
* Everything is actually consistent except for the fact that
* the index bitmap attribute is missing so it is better if we
* do not do any kind of rollback. Hopefully, chkdsk will fix
* this by adding the missing index bitmap attribute.
*/
goto err;
}
/* Remove the added index allocation attribute. */
err2 = ntfs_attr_record_delete(base_ni, actx);
if (err2) {
ntfs_error(vol->mp, "Cannot rollback partial index upgrade ""because deleting the index allocation ""attribute failed (error %d). Leaving ""inconsistent metadata. Run chkdsk.", err2);
gotobmp_err_err;
}
ia_err:
/* Reset the inode. */
lck_spin_lock(&idx_ni->size_lock);
idx_ni->initialized_size = idx_ni->data_size = idx_ni->allocated_size =
0;
lck_spin_unlock(&idx_ni->size_lock);
if (idx_ni->name == I30) {
lck_spin_lock(&base_ni->size_lock);
base_ni->initialized_size = base_ni->data_size =
base_ni->allocated_size = 0;
lck_spin_unlock(&base_ni->size_lock);
}
NInoClearIndexAllocPresent(idx_ni);
/*
* Cannot call ubc_setsize() because we have the page pinned so delay
* until we dump the page later on.
*/
need_ubc_setsize = TRUE;
/*
* Pretend the @ir_ictx is not locked as we are going to transfer the
* index root back to @ictx.
*/
ir_ictx->is_locked = 0;
ir_ictx->actx = NULL;
/* Restore the index root attribute. */
ntfs_attr_search_ctx_reinit(actx);
restore_ir:
err2 = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
0, NULL, 0, actx);
if (err2) {
ntfs_error(vol->mp, "Cannot rollback partial index upgrade ""because looking up the index root attribute ""failed (error %d). Leaving inconsistent ""metadata. Run chkdsk.", err2);
NVolSetErrors(vol);
gotoictx_err;
}
ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset));
ih = &ir->index;
ie = (INDEX_ENTRY*)((u8*)ih + ir_ie_ofs);
u = ir_ie_ofs + (le32_to_cpu(ia->index.index_length) - ia_ie_ofs);
err2 = ntfs_resident_attr_value_resize(actx->m, actx->a,
offsetof(INDEX_ROOT, index) + u);
if (err2) {
ntfs_error(vol->mp, "Cannot rollback partial index upgrade ""because resizing the index root attribute ""failed (error %d). Leaving inconsistent ""metadata. Run chkdsk.", err2);
NVolSetErrors(vol);
gotoictx_err;
}
if (!old.is_large_index)
ih->flags &= ~LARGE_INDEX;
ih->allocated_size = ih->index_length = cpu_to_le32(u);
memcpy(ie, (u8*)&ia->index + ia_ie_ofs,
le32_to_cpu(ia->index.index_length) - ia_ie_ofs);
/* Ensure the restored index root attribute is written to disk. */
NInoSetMrecNeedsDirtying(actx->ni);
ictx_err:
/*
* Reset the index context. We may be setting @ictx->entry to a bogus
* value but it does not matter because we are returning an error code
* thus the caller must not use the index context and while the value
* may be bogus it is correctly non-NULL thus the index context will be
* cleaned up correctly when the caller releases it.
*/
ictx->entry = ictx->entries[ictx->entry_nr];
ictx->is_root = 1;
ictx->is_locked = 1;
ictx->ir = ir;
ictx->index = ih;
ictx->actx = actx;
if (!actx->is_error)
ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
le32_to_cpu(actx->m->bytes_in_use);
if (!upl) {
undo_alloc_err:
OSFree(ia, idx_ni->block_size, ntfs_malloc_tag);
} else {
/* Destroy the page. */
ntfs_page_dump(idx_ni, upl, pl);
if (need_ubc_setsize && !ubc_setsize(idx_ni->vn, 0))
panic("%s(): ubc_setsize() failed.\n", __FUNCTION__);
page_err:
err2 = ntfs_cluster_free_from_rl(vol, idx_ni->rl.rl, 0, -1,
NULL);
if (err2) {
ntfs_error(vol->mp, "Failed to rollback cluster ""allocation (error %d). Run chkdsk ""to recover the lost space.", err2);
NVolSetErrors(vol);
}
err2 = ntfs_rl_truncate_nolock(vol, &idx_ni->rl, 0);
if (err2)
panic("%s(): err2\n", __FUNCTION__);
}
err:
/*
* Dissociate the allocated index root context from the tree path and
* throw it away. We do this here unconditionally as it works both for
* the case where we have not attached it to the tree path yet and for
* the case where we have already attached it to the tree path. We
* have to deal with the former case here so cannot defer the cleanup
* to the ntfs_index_ctx_put() of the index context @ictx that will be
* done by the caller.
*/
ntfs_index_ctx_disconnect_reinit(ir_ictx);
ntfs_index_ctx_put(ir_ictx);
ntfs_error(vol->mp, "Failed (error %d).", err);
return err;
}
/**
* ntfs_index_root_make_space - make space for an index entry in the index root
* @ictx: index entry in front of which to make space
* @ie_size: size of the index entry to make space for
*
* Return 0 on success and ENOSPC if there is not enough space in the mft
* record to insert an index entry of size @ie_size in the index root
* attribute.
*
* Note that we do not update the array of index entry pointers nor the number
* of entries in the array because on success it means that the index entry
* being added will be created and the function is then done, i.e. the array of
* index entry pointers will not be used any more and on error we have not done
* anything at all so there is no need to update any of the pointers.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
static errno_t ntfs_index_root_make_space(ntfs_index_context *ictx,
const u32 ie_size)
{
MFT_RECORD *m = ictx->actx->m;
const u32 muse = le32_to_cpu(m->bytes_in_use);
const u32 new_muse = muse + ie_size;
ntfs_debug("Entering.");
if (new_muse <= le32_to_cpu(m->bytes_allocated)) {
INDEX_ENTRY *ie = ictx->entry;
ATTR_RECORD *a;
INDEX_HEADER *ih;
/*
* Extend the index root attribute so it has enough space for
* the new index entry. As an optimization we combine the
* resizing of the index root attribute and the moving of index
* entries within the attribute into a single operation.
*/
memmove((u8*)ie + ie_size, ie, muse - ((u8*)ie - (u8*)m));
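/*
 * Note the move shifts not just the index entries that follow the
 * insertion point but also any attribute records that come after the
 * index root within the mft record, since the length copied runs to the
 * end of the in-use part of the record.
 */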
/* Adjust the mft record to reflect the change in used space. */
m->bytes_in_use = cpu_to_le32(new_muse);
/*
* Adjust the attribute record to reflect the changes in the
* size of the attribute record and in the size of the
* attribute value.
*/
a = ictx->actx->a;
a->length = cpu_to_le32(le32_to_cpu(a->length) + ie_size);
a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) +
ie_size);
/* Adjust the index header to reflect the change in length. */
ih = ictx->index;
ih->allocated_size = ih->index_length = cpu_to_le32(
le32_to_cpu(ih->index_length) + ie_size);
ntfs_debug("Done.");
return 0;
}
ntfs_debug("Failed (not enough space in mft record to insert index ""entry into index root attribute).");
return ENOSPC;
}
/**
* ntfs_index_block_make_space - make space for an index entry in an index block
* @ictx: index entry in front of which to make space
* @ie_size: size of the index entry to make space for
*
* Return 0 on success and ENOSPC if there is not enough space in the index
* block to insert an index entry of size @ie_size.
*
* Note that we do not update the array of index entry pointers nor the number
* of entries in the array because on success it means that the index entry
* being added will be created and the function is then done, i.e. the array of
* index entry pointers will not be used any more and on error we have not done
* anything at all so there is no need to update any of the pointers.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
static errno_t ntfs_index_block_make_space(ntfs_index_context *ictx,
const u32 ie_size)
{
INDEX_HEADER *ih = ictx->index;
const u32 ilen = le32_to_cpu(ih->index_length);
const u32 new_ilen = ilen + ie_size;
ntfs_debug("Entering.");
if (new_ilen <= le32_to_cpu(ih->allocated_size)) {
INDEX_ENTRY *ie = ictx->entry;
/* Move the index entries to make space for the new entry. */
memmove((u8*)ie + ie_size, ie, ilen - ((u8*)ie - (u8*)ih));
/* Adjust the index header to reflect the change in length. */
ih->index_length = cpu_to_le32(new_ilen);
ntfs_debug("Done.");
return 0;
}
ntfs_debug("Failed (not enough space in index block to insert index ""entry).");
return ENOSPC;
}
/**
* ntfs_index_node_make_space - make space for an index entry in an index node
* @ictx: index entry in front of which to make space
* @ie_size: size of the index entry to make space for
*
* Return 0 on success and ENOSPC if there is not enough space in the index
* node to insert an index entry of size @ie_size.
*
* Note that we do not update the array of index entry pointers nor the number
* of entries in the array because on success it means that the index entry
* being added will be created and the function is then done, i.e. the array of
* index entry pointers will not be used any more and on error we have not done
* anything at all so there is no need to update any of the pointers.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
static errno_t ntfs_index_node_make_space(ntfs_index_context *ictx,
const u32 ie_size)
{
errno_t err;
if (ictx->is_root)
err = ntfs_index_root_make_space(ictx, ie_size);
else
err = ntfs_index_block_make_space(ictx, ie_size);
return err;
}
/**
* ntfs_index_ctx_revalidate - revalidate the pointers in an index context
* @new: index context with the new mapped virtual address
* @old: index context to revalidate
*
* Revalidate all pointers in the index context @old and adjust them for the
* changed mapped virtual address in the index context @new.
*
* Note both @new and @old must be in the same VM page.
*
* We need to revalidate all the pointers in @old just as if we were locking it
* because @new may have been mapped into a different virtual address than @old
* was last time thus the pointers in @old may be stale.
*
* Whether revalidation is needed is determined by comparing @old->addr and
* @new->addr. If they match all is ok; if they do not, @old needs to be
* revalidated against the new address @new->addr.
*
* Locking: - Caller must hold @new->idx_ni->lock on the index inode.
* - The index context @old must be locked.
* - The index context @new must not be locked.
*/
static void ntfs_index_ctx_revalidate(ntfs_index_context *new,
ntfs_index_context *old)
{
long delta;
INDEX_ENTRY **entries;
unsigned u;
if (!new->is_locked)
panic("%s(): !new->is_locked\n", __FUNCTION__);
if (new->is_root)
panic("%s(): new->is_root\n", __FUNCTION__);
if (old->is_locked)
panic("%s(): old->is_locked\n", __FUNCTION__);
if (old->is_root)
panic("%s(): old->is_root\n", __FUNCTION__);
delta = new->addr - old->addr;
if (!delta)
return;
/* Revalidation is needed. */
old->entry = (INDEX_ENTRY*)((u8*)old->entry + delta);
old->ia = (INDEX_ALLOCATION*)((u8*)old->ia + delta);
old->index = &old->ia->index;
old->addr = new->addr;
entries = old->entries;
for (u = 0; u < old->nr_entries; u++)
entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta);
}
/**
* ntfs_index_ctx_lock_two - lock two index contexts in a deadlock-free fashion
* @a: first index context to lock (this may be locked)
* @b: second index context to lock (this must not be locked)
*
* Lock the two index contexts @a and @b. @a may already be locked and it may
* need to be unlocked if it has to be locked after @b is locked.
*
* The following lock ordering rules are applied:
*
* - An index block node must be locked before an index root node.
*
* - Two index block nodes must be locked in ascending page offset order.
*
* - Two index block nodes that are physically in the same VM page must share
* the lock. The way this is implemented is that @a is really locked and @b
* is not actually locked but all its pointers are revalidated as if it were
* locked. This is ok because @a is locked and therefore the VM page is
* mapped, locked, and pinned and will not go anywhere until we are done.
* The reason we need to do the revalidation is because @b when it was mapped
* previously may have been mapped at a different virtual address than the
* one @a is mapped at now. This means that when you are unlocking @b you
* must check if @b is locked and only unlock it if so.
*
* Return 0 on success or errno on error. On error @a and @b may be both left
* unlocked or one can be left locked and one unlocked.
*/
static errno_t ntfs_index_ctx_lock_two(ntfs_index_context *a,
ntfs_index_context *b)
{
errno_t err;
if (b->is_locked)
panic("%s(): b->is_locked\n", __FUNCTION__);
if (a->is_root) {
/*
* @a is the index root so it has to be locked second.
*
* Unlock @a if it is already locked.
*/
if (a->is_locked)
ntfs_index_ctx_unlock(a);
} else if (b->is_root) {
/* @b is the index root. If @a is not locked, lock it. */
if (!a->is_locked) {
err = ntfs_index_ctx_relock(a);
if (err)
return err;
}
} else {
/*
* Both @a and @b are index blocks.
*
* Do we need to share the lock because both index nodes are in
* the same VM page?
*/
if (a->upl_ofs == b->upl_ofs) {
if (!a->is_locked) {
err = ntfs_index_ctx_relock(a);
if (err)
return err;
}
ntfs_index_ctx_revalidate(a, b);
return 0;
}
if (a->is_locked) {
/* Do we have to unlock @a before locking @b? */
if (a->upl_ofs > b->upl_ofs)
ntfs_index_ctx_unlock(a);
} else {
/* Do we need to lock @a first? */
if (a->upl_ofs < b->upl_ofs) {
err = ntfs_index_ctx_relock(a);
if (err)
return err;
}
}
}
/* Lock @b. */
err = ntfs_index_ctx_relock(b);
/* If @a is currently locked or there was an error we are done. */
if (a->is_locked || err)
return err;
/*
* We unlocked @a so we could lock @b or @a was not locked.
*
* Lock @a and we are done.
*/
return ntfs_index_ctx_relock(a);
}
/**
* ntfs_index_entry_add_or_node_split - add a key to an index
* @ictx: index context specifying the node to split/position to add at
* @split_only: if true do not insert, only split the index node
* @entry_size: size of the entry that will be added after the split
* @key: key to add to the directory or view index
* @key_len: size of key @key in bytes
* @data: data to associate with the key @key if a view index
* @data_len: size of data @data in bytes if a view index
*
* If @split_only is true, split the index node pointed to by @ictx. @ictx
* also points to the entry in the index node at which an entry will be
* inserted after the split is completed. @entry_size is the size of the entry
* that will be added at that position. These are used so that the split is
* performed with consideration for the insertion that is likely to come. If
* the insertion does not happen it does not matter much; it just means the
* split was not quite on the median entry but off by one.
*
* In this case @key, @key_len, @data, and @data_len are ignored.
*
* If @split_only is false @entry_size is ignored and @key, @key_len, @data,
* and @data_len are used.
*
* In this case, if @ictx belongs to a directory index, insert the filename
* attribute @key of length @key_len bytes in the directory index at the
* position specified by the index context @ictx and point the inserted index
* entry at the mft reference *@data which is the mft reference of the inode to
* which the filename @key belongs. @data_len must be zero in this case.
*
* If @ictx belongs to a view index, insert the key @key of length @key_len
* bytes in the view index at the position specified by the index context @ictx
* and associate the data @data of size @data_len bytes with the key @key.
*
* Return 0 on success and errno on error. On error the index context is no
* longer usable and must be released or reinitialized.
*
* Note that we do not update the array of index entry pointers nor the number
* of entries in the array because on success it means that the index node has
* been split and the function is done, i.e. the array of index entry pointers
* will not be used any more and on error the index context becomes invalid so
* there is no need to update any of the pointers. The caller is expected to
* restart its operations by doing a new index lookup if it wants to continue
* working on the index for that reason.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
errno_t ntfs_index_entry_add_or_node_split(ntfs_index_context *ictx,
const BOOL split_only, u32 entry_size, const void *key,
const u32 key_len, const void *data, const u32 data_len)
{
VCN vcn = 0;
ntfs_index_context *cur_ictx, *child_ictx;
ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
u32 data_ofs = 0;
errno_t err, err2;
const BOOL is_view = (idx_ni->name != I30);
ntfs_debug("Entering.");
if (!ictx->is_locked)
panic("%s(): !ictx->is_locked\n", __FUNCTION__);
/*
* If this is only a split @entry_size contains the size of the entry
* and the @key and @data are not set (and not needed/used).
*
* If this is an addition, @entry_size is not set and we need to work
* it out from the supplied @key and @data.
*/
if (!split_only) {
/*
* Calculate the size for the index entry to be added. For a
* directory index, the mft reference @data is embedded inside
* the directory index entry thus we only need space for the
* index entry header and the filename attribute @key.
*
* For a view index, the data follows the key aligned to a
* 4-byte boundary.
*
* As an additional complication, the $SDH index in the $Secure
* system file contains an extra magic which is not counted in
* the data length @data_len so we need to add it by hand here.
*/
entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len;
if (is_view) {
data_ofs = (entry_size + 3) & ~3;
entry_size = data_ofs + data_len;
if (idx_ni == idx_ni->vol->secure_sdh_ni)
entry_size += sizeof(((SDH_INDEX_DATA*)NULL)->
magic);
}
/*
* Align the index entry size to an 8-byte boundary and add
* another 8 bytes to the entry size if the insertion is to
* happen in an index node.
*/
entry_size = ((entry_size + 7) & ~7) +
((ictx->index->flags & INDEX_NODE) ?
sizeof(leVCN): 0);
}
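/*
* Worked example (directory case, illustrative sizes): a FILENAME key for
* an 8-character name is 66 + 2 * 8 = 82 bytes, so @entry_size starts at
* sizeof(INDEX_ENTRY_HEADER) + 82 = 16 + 82 = 98 bytes, is rounded up to
* 104 bytes, and a further 8 bytes are added for the sub-node VCN pointer
* if the insertion is into an index node, giving 112 bytes.
*/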
/* Set the current entry to be the entry to be added to the index. */
cur_ictx = ictx;
cur_ictx->promote_inserted = 0;
cur_ictx->insert_to_add = 1;
cur_ictx->insert_entry_size = entry_size;
/*
* We need to loop over each index node on the tree path starting at
* the bottom. For each traversed node, we check if the current entry
* to be inserted fits and if it does not we need to split that node.
* We make the arrangements to be able to do so and then move onto the
* next node up the tree and so on until we find a node which does have
* enough space to fit the index entry to be inserted. We can then
* break out of the loop and begin work on the actual addition of the
* entry and the splitting of the so marked nodes.
*/
while (cur_ictx->insert_entry_size > cur_ictx->bytes_free) {
ntfs_index_context *insert_ictx;
unsigned median_entry_nr, insert_entry_nr, entry_nr;
u32 insert_entry_size;
BOOL insert_to_add;
/*
* The entry to be inserted into this node @cur_ictx is
* specified by its @insert_entry and @insert_entry_size.
*/
if (!cur_ictx->is_locked)
panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__);
/*
* We do not have enough space in the current node to insert
* the entry to be inserted.
*
* If the current node is the index root we need to move the
* index root to an index block and if successful restart the
* loop to determine if we now have enough space to insert the
* entry.
*
* This causes the B+tree to grow in height by one and is in
* fact the only operation that can cause the tree to grow in
* height. All other operations can only cause the tree to
* grow in width.
*/
if (cur_ictx->is_root) {
ntfs_debug("Moving index root into index allocation block.");
/*
* If we speculatively locked the index node containing
* the index entry to insert into the index root,
* unlock it now so we can move the index root to an
* index block.
*/
insert_ictx = cur_ictx->insert_ictx;
if (insert_ictx && insert_ictx->is_locked) {
if (insert_ictx->is_root)
panic("%s(): insert_ictx->is_root\n",
__FUNCTION__);
ntfs_index_ctx_unlock(insert_ictx);
}
err = ntfs_index_move_root_to_allocation_block(
cur_ictx);
if (!err)
continue;
ntfs_error(idx_ni->vol->mp, "Failed to move index ""root to index allocation block ""(error %d).", err);
gotoerr;
}
ntfs_debug("Need to split index block with VCN 0x%llx.",
(unsigned long long)cur_ictx->vcn);
/*
* We do not have enough space in the current node which we now
* know is an index allocation block. We have to split the
* node and promote the median entry into the parent node which
* may in turn involve a split. We do not perform the split
* but instead work out what needs to be done and allocate any
* needed index blocks in advance so we cannot run out of disk
* space and/or memory half-way through and only then do we do
* the actual work on the index nodes. This preparation
* involves the following steps:
*
* 1. Work out the median entry to be promoted into the parent
* node of the current node and unlock the current node.
* 2. If the entry to be promoted is not the entry to be
* inserted, make a note of the fact that the entry to be
* inserted is to be inserted into the current node in the
* index context.
* 3. Allocate a new index block and make a note of it in the
* current index context.
* 4. Set the parent node as the current node.
* 5. Set the median entry up as the current entry to be
* inserted.
*
* Then go round the loop again checking whether there is
* enough space to insert the now current entry into the now
* current node...
*
* First of all, mark the node as needing to be split.
*/
cur_ictx->split_node = 1;
/*
* 1. Determine the median index entry.
*
* Note we exclude the end entry from the median calculation
* because it is not eligible for promotion thus we should use
* @ictx->nr_entries - 1 but we need to take into account the
* not yet inserted entry for which we are doing the split in
* the first place so that is a + 1 and we also start counting
* entries at 0 and not 1 thus we apply a - 1 for that. Thus
* we have - 1 + 1 - 1 = -1.
*/
median_entry_nr = (cur_ictx->nr_entries - 1) / 2;
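/*
* For example, with five real entries plus the end entry @nr_entries is 6;
* counting the not yet inserted entry there are six candidates numbered 0
* to 5 and (6 - 1) / 2 = 2 selects the lower of the two possible medians.
*/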
/*
* @entry_nr is the index into the array of index entry
* pointers of the index entry in front of which the entry to
* be inserted @cur_ictx->insert_entry needs to be inserted.
*/
entry_nr = cur_ictx->entry_nr;
/*
* If the position at which to insert the entry to be inserted
* is the median promote the entry to be inserted. If the
* number of entries is even there are two possible medians.
* We choose the first one for simplicity.
*
* If the entry to be inserted is before the median, subtract 1
* from the median.
*
* Otherwise promote the median entry.
*
* The only exception is when @split_only is true and the
* current node is the node we are meant to split in which
* case there is no entry to be inserted and thus it cannot be
* promoted. In this case we recalculate the median ignoring
* the entry to be inserted and promote that.
*/
if (entry_nr == median_entry_nr && (!split_only ||
cur_ictx != ictx)) {
insert_to_add = cur_ictx->insert_to_add;
insert_entry_nr = cur_ictx->insert_entry_nr;
insert_entry_size = cur_ictx->insert_entry_size;
insert_ictx = cur_ictx->insert_ictx;
/*
* 2. The entry to be promoted is the entry to be
* inserted, make a note of that fact.
*/
cur_ictx->promote_inserted = 1;
} else {
if (entry_nr < median_entry_nr)
median_entry_nr--;
else if (entry_nr == median_entry_nr) {
if (!split_only)
panic("%s(): !split_only\n",
__FUNCTION__);
if (cur_ictx != ictx)
panic("%s(): cur_ictx != ictx\n",
__FUNCTION__);
/*
* We must have at least one real entry or
* there is nothing to promote. This cannot
* happen unless something has gone wrong.
*/
if (cur_ictx->nr_entries < 2)
panic("%s(): cur_ictx->nr_entries < 2\n", __FUNCTION__);
median_entry_nr = (cur_ictx->nr_entries - 2) /
2;
}
insert_to_add = FALSE;
insert_entry_nr = median_entry_nr;
insert_entry_size = le16_to_cpu(cur_ictx->
entries[median_entry_nr]->length);
insert_ictx = cur_ictx;
}
/*
* If this is the very first promotion and we are promoting a
* leaf entry we need to add 8 bytes to the size of the entry
* being promoted to allow for the VCN sub-node pointer that
* will be added at the end of the entry when it is inserted
* into the parent index node.
*/
if (cur_ictx == ictx &&
!(cur_ictx->index->flags & INDEX_NODE))
insert_entry_size += sizeof(VCN);
/*
* Record which entry is being promoted, i.e. where we need to
* perform the split of the node.
*/
cur_ictx->promote_entry_nr = median_entry_nr;
// TODO: Possible optimization: Allow ntfs_index_block_alloc()
// to do the unlocking and thus to consume the lock if the new
// index block is in the same page as the current index block.
if (!cur_ictx->is_locked)
panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__);
ntfs_index_ctx_unlock(cur_ictx);
/*
* 3. Allocate a new index block and make a note of it in the
* current index context.
*
* The call may cause the index root attribute to have its
* entries moved out to an index block. That is fine as we
* have not looked at any of our parent entries yet and the
* index root must be above us given we are a child node.
*/
err = ntfs_index_block_alloc(cur_ictx, &cur_ictx->right_vcn,
&cur_ictx->right_ia, &cur_ictx->right_upl_ofs,
&cur_ictx->right_upl, &cur_ictx->right_pl,
&cur_ictx->right_addr);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to allocate index ""allocation block (error %d).", err);
gotoerr;
}
ntfs_debug("Allocated index block for right-hand node with ""VCN 0x%llx.", (unsignedlonglong)
cur_ictx->right_vcn);
/* We need to unmap the newly allocated index block. */
ntfs_page_unmap(idx_ni, cur_ictx->right_upl,
cur_ictx->right_pl, TRUE);
cur_ictx->right_upl = NULL;
/* 4. Set the parent node as the current node and lock it. */
cur_ictx = cur_ictx->up;
if (cur_ictx->is_locked)
panic("%s(): cur_ictx->is_locked\n", __FUNCTION__);
if (cur_ictx->is_root) {
/*
* We have reached the index root. We speculatively
* lock the index node containing the index entry to
* insert into the index root, as we cannot lock it
* once we have locked the mft record (which happens
* when we lock the current index context below) as
* that would cause lock reversal and thus deadlocks.
*/
if (insert_ictx) {
if (insert_ictx->is_root)
panic("%s(): insert_ictx->is_root\n",
__FUNCTION__);
if (insert_ictx->is_locked)
panic("%s(): insert_ictx->is_locked\n",
__FUNCTION__);
err = ntfs_index_ctx_relock(insert_ictx);
if (err)
goto err;
}
}
/* Lock the current index context. */
err = ntfs_index_ctx_relock(cur_ictx);
if (err)
goto err;
/*
* 5. Set the median entry up as the current entry to be
* inserted.
*/
cur_ictx->insert_to_add = insert_to_add;
cur_ictx->insert_entry_nr = insert_entry_nr;
cur_ictx->insert_entry_size = insert_entry_size;
cur_ictx->insert_ictx = insert_ictx;
}
/*
* Get the child node index context if we are not at the bottom of the
* tree already or rather at the node for which we were called (which
* may not actually be at the bottom of the tree).
*/
child_ictx = NULL;
if (cur_ictx != ictx) {
child_ictx = cur_ictx->down;
if (child_ictx->is_root)
panic("%s(): child_ictx->is_root\n", __FUNCTION__);
}
/*
* If the node we were called for was the index root then it will have
* been moved to an index allocation block which will likely have
* created enough space to fit the entry to be inserted thus if this is
* a split only call nothing further needs to be done.
*/
if (cur_ictx == ictx && split_only) {
ntfs_debug("Done (index root was upgraded to index allocation, no further split needed).");
return 0;
}
/*
* We now have prepared everything so we are guaranteed not to run out
* of disk space and can begin doing the work.
*
* TODO: We can still fail if relocking of an index context fails but
* that can really only happen under extreme memory pressure and there
* is not much we can do about that without <rdar://problem/4992358>
* being implemented. Thus for now we simply bomb out with an error
* message and leave the index potentially badly corrupted and also we
* potentially leave some allocated index blocks actually unused. This
* is highly unsatisfactory but rolling back once the modifications
* have begun is pretty much impossible without keeping a lot more
* state so we do not implement rollback. Once the aforementioned
* radar is implemented we will no longer suffer from this problem and
* it will no longer be possible to fail once we get here.
*
* We start with the current node which is the top-most node we need to
* deal with. We know from above it has enough space and does not need
* to be split.
*/
do {
ntfs_index_context *insert_ictx;
INDEX_ENTRY *entry, *right_entry, *end_entry;
INDEX_HEADER *right_index;
unsigned u;
if (!cur_ictx->is_locked)
panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__);
/*
* The current node is either the top-most node we need to deal
* with thus we know it has enough space or we have just split
* it and thus also know that it must have enough space.
*
* Thus the insertion is now a matter of doing a simple insert
* into the node unless the current node has had its entry to
* be inserted promoted into the parent node in which case
* there is nothing to insert into this node.
*
* Note: Use goto to reduce indentation.
*/
insert_ictx = cur_ictx->insert_ictx;
if (cur_ictx->promote_inserted)
goto skip_insert;
/*
* We now need to begin to do the insertion but there is one
* complication we need to deal with first and that is that if
* we are inserting a promoted entry, we have to lock the index
* node it is in also.
*
* If the current node is an index block we may need to unlock
* it so that we can ensure to always take the node lock in
* ascending page offset order. Also we have to consider the
* case where the two index nodes are in the same page in which
* case we need to share the index lock.
*
* If the current node is the index root and we are inserting a
* promoted entry, we speculatively locked the index node
* containing the entry in the above loop thus we do not need
* to do anything here.
*/
if (!cur_ictx->insert_to_add) {
if (!insert_ictx)
panic("%s(): !insert_ictx\n", __FUNCTION__);
if (cur_ictx->is_root) {
if (!insert_ictx->is_locked)
panic("%s(): !insert_ictx->""is_locked\n",
__FUNCTION__);
} else {
err = ntfs_index_ctx_lock_two(cur_ictx,
insert_ictx);
if (err)
goto relock_err;
}
}
entry_size = cur_ictx->insert_entry_size;
/*
* Everything we need is locked and we know there is enough
* space in the index node thus make space for the entry to be
* inserted at the appropriate place and then insert the index
* entry by either copying it from the appropriate, locked
* child node or by creating it in place. The latter happens
* when the entry being inserted is the entry being added with
* this call to ntfs_index_entry_add(), i.e. it is the reason
* we have been called in the first place and thus there is no
* index entry to copy from.
*/
err = ntfs_index_node_make_space(cur_ictx, entry_size);
if (err)
panic("%s(): err\n", __FUNCTION__);
/*
* We have modified the index node. Make sure it is written
* out later.
*/
ntfs_index_entry_mark_dirty(cur_ictx);
if (!cur_ictx->insert_to_add) {
entry = insert_ictx->entries[
cur_ictx->insert_entry_nr];
/* Copy the index entry into the created space. */
memcpy(cur_ictx->entry, entry,
le16_to_cpu(entry->length));
entry = cur_ictx->entry;
} else {
if (split_only)
panic("%s(): split_only\n", __FUNCTION__);
entry = cur_ictx->entry;
/*
* Clear the created space so we start with a clean
* slate and do not need to worry about initializing
* all the zero fields.
*/
bzero(entry, entry_size);
/* Create the index entry in the created space. */
if (!is_view)
entry->indexed_file = *(leMFT_REF*)data;
else {
u8 *new_data;
new_data = (u8*)entry + data_ofs;
entry->data_offset = cpu_to_le16(data_ofs);
entry->data_length = cpu_to_le16(data_len);
if (data_len)
memcpy(new_data, data, data_len);
/*
* In the case of $Secure/$SDH we leave the
* extra magic to zero rather than setting it
* to "II" in Unicode. This could easily be
* changed if deemed better and/or necessary by
* uncommenting the below code.
*/
#if 0
if (idx_ni == idx_ni->vol->secure_sdh_ni) {
static const ntfschar SDH_magic[2] = {
const_cpu_to_le16('I'),
const_cpu_to_le16('I')
};
memcpy(((SDH_INDEX_DATA*)data)->magic,
SDH_magic,
sizeof(SDH_magic));
}
#endif
}
entry->key_length = cpu_to_le16(key_len);
memcpy(&entry->key, key, key_len);
}
/*
* If the copied entry is a leaf entry and it is being inserted
* into a non-leaf node, we need to rewrite its size with the
* new size which includes the VCN sub-node pointer.
*
* We just overwrite the length unconditionally and use it to
* set the length of the just created index entry, too.
*
* There is no harm in doing this as we are either updating the
* size which we must do or we are overwriting the size with
* the same value it already has which has no effect.
*/
entry->length = cpu_to_le16(entry_size);
/*
* If the current node is not a leaf node, we have to fix up
* the VCN sub-node pointers both in the entry we just inserted
* and in the entry in front of which we inserted.
*
* The inserted index entry needs to point to what will be the
* left-hand index node after our child node is split.
*
* The index entry in front of which we inserted needs to point
* to what will be the right-hand index node after our child
* node is split.
*
* The INDEX_NODE check is fine for the index root, too,
* because as it happens LARGE_INDEX == INDEX_NODE.
*/
if (cur_ictx->index->flags & INDEX_NODE) {
if (!child_ictx)
panic("%s(): !child_ictx\n", __FUNCTION__);
if (entry->flags & INDEX_ENTRY_END)
panic("%s(): entry->flags & INDEX_ENTRY_END\n",
__FUNCTION__);
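/*
* The sub-node VCN pointer lives in the last 8 bytes of a node entry,
* which is why @entry_size already includes sizeof(leVCN) for entries
* destined for an index node.
*/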
entry->flags |= INDEX_ENTRY_NODE;
*(leVCN*)((u8*)entry + entry_size - sizeof(VCN)) =
cpu_to_sle64(child_ictx->vcn);
ntfs_debug("Setting VCN sub-node pointer of inserted ""index entry to 0x%llx.",
(unsignedlonglong)child_ictx->vcn);
entry = (INDEX_ENTRY*)((u8*)entry + entry_size);
if (!(entry->flags & INDEX_ENTRY_NODE))
panic("%s(): !(entry->flags & ""INDEX_ENTRY_NODE)\n",
__FUNCTION__);
*(leVCN*)((u8*)entry + le16_to_cpu(entry->length) -
sizeof(VCN)) = cpu_to_sle64(
child_ictx->right_vcn);
ntfs_debug("Setting VCN sub-node pointer of index ""entry after inserted entry to ""0x%llx.", (unsignedlonglong)
child_ictx->right_vcn);
}
skip_insert:
/*
* If there are no child nodes left we are done. We will dirty
* the index node once we have broken out of the loop. When
* the index context is released later all locked nodes will be
* unlocked so no need to do it now.
*/
if (!child_ictx)
break;
if (!child_ictx->split_node)
panic("%s(): !child_ictx->split_node\n", __FUNCTION__);
/*
* TODO: @child_ictx->right_upl and @child_ictx->right_pl are
* currently not valid as @child_ictx is not locked. Once
* <rdar://problem/4992358> is implemented we can re-enable
* this check and change the code to leave the right_upl mapped
* at all times.
*/
#if 0
if (!child_ictx->right_upl || !child_ictx->right_pl)
panic("%s(): !child_ictx->right_upl ||
!child_ictx->right_pl\n",
__FUNCTION__);
#endif/*
* We are done with the current node and have a child node.
* Switch to the child node, and sort out the needed locks.
*
* First, unlock the @insert_ictx node if it exists and is
* locked.
*
* Note we do not bother with trying to transfer the lock from
* @insert_ictx onto @child_ictx or @child_ictx->right_*
* because index blocks are 4096 bytes in size in majority of
* cases and the PAGE_SIZE is 4096 bytes both on x86 and PPC
* thus in majority of cases each page will contain a separate
* index block thus no sharing will be possible and there is no
* point in adding extra complexity for an extremely unlikely
* event.
*/
if (insert_ictx && insert_ictx->is_locked)
ntfs_index_ctx_unlock(insert_ictx);
/*
* Unlock the current node. Again we do not bother with trying
* to share the lock with @child_ictx or @child_ictx->right_*.
*/
ntfs_index_ctx_unlock(cur_ictx);
/* Set the child node to be the current node. */
cur_ictx = child_ictx;
/*
* We need to ensure both the current node and its right-hand
* node are locked. Both are currently unlocked.
*
* If both nodes share the same page, lock the current node and
* share the lock with the right one.
*/
if (cur_ictx->is_locked)
panic("%s(): cur_ictx->is_locked\n", __FUNCTION__);
if (cur_ictx->right_is_locked)
panic("%s(): cur_ictx->right_is_locked\n",
__FUNCTION__);
if (cur_ictx->upl_ofs <= cur_ictx->right_upl_ofs) {
err = ntfs_index_ctx_relock(cur_ictx);
if (err)
goto relock_err;
}
if (cur_ictx->upl_ofs == cur_ictx->right_upl_ofs) {
cur_ictx->right_ia = (INDEX_ALLOCATION*)(
(u8*)cur_ictx->right_ia +
(cur_ictx->addr -
cur_ictx->right_addr));
cur_ictx->right_addr = cur_ictx->addr;
} else {
u8 *addr;
err = ntfs_page_map(idx_ni, cur_ictx->right_upl_ofs,
&cur_ictx->right_upl,
&cur_ictx->right_pl, &addr, TRUE);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to map ""index page (error %d).", err);
cur_ictx->right_upl = NULL;
gotorelock_err;
}
cur_ictx->right_is_locked = 1;
cur_ictx->right_ia = (INDEX_ALLOCATION*)(
(u8*)cur_ictx->right_ia + (addr -
cur_ictx->right_addr));
cur_ictx->right_addr = addr;
}
if (!cur_ictx->is_locked) {
err = ntfs_index_ctx_relock(cur_ictx);
if (err) {
if (cur_ictx->right_is_locked) {
ntfs_page_unmap(idx_ni,
cur_ictx->right_upl,
cur_ictx->right_pl,
FALSE);
cur_ictx->right_upl = NULL;
cur_ictx->right_is_locked = 0;
}
goto relock_err;
}
}
/*
* Having obtained the needed locks, we can now split the
* current node.
*
* We have recorded the split point in @promote_entry_nr and
* @promote_inserted tells us whether to remove the entry
* specified by @promote_entry_nr and move all entries after it
* to the right-hand node (@promote_inserted is 0) or whether
* to move the entry specified by @promote_entry_nr and all
* entries after it to the right-hand node (@promote_inserted
* is 1).
*
* The split results in the creation of a new end entry in the
* left-hand node as its old end entry is moved to the right
* hand node. This means we need to determine what we need to
* set the VCN sub-node pointer to if this is an index node.
*
* If @promote_inserted is 0, i.e. we are promoting an existing
* entry from this node, we need to use the VCN sub-node
* pointer of the entry we are about to promote. Because we
* are promoting the entry it is going to disappear altogether
* from this node thus we need to make a note of its sub-node
* VCN pointer if it has one now before doing the actual split.
*
* In this case we do not need to modify the VCN sub-node
* pointer of the first entry in the (new) right-hand node as
* it does not change.
*
* If @promote_inserted is 1, i.e. we are promoting the entry
* that was going to be inserted into this node, we need to use
* the VCN of our child node which is the VCN of the entry in
* front of which we are inserting and splitting, i.e. the VCN
* of the first entry we are about to move to the right-hand
* node.
*
* In this case we also need to modify the VCN sub-node pointer
* of the first entry in the (new) right-hand node to point to
* the (new) right-hand child node. We do not know what that
* is yet so we determine it later once we have obtained our
* child node index context containing the needed information.
*
* First, copy the appropriate entries to the right-hand node
* and switch the node to be an index, i.e. non-leaf, node if
* the node being split is an index node.
*/
right_index = &cur_ictx->right_ia->index;
right_entry = (INDEX_ENTRY*)((u8*)right_index +
le32_to_cpu(right_index->entries_offset));
u = cur_ictx->promote_entry_nr;
entry = cur_ictx->entries[u];
if (!cur_ictx->promote_inserted) {
if (entry->flags & INDEX_ENTRY_NODE) {
if (!(cur_ictx->index->flags & INDEX_NODE))
panic("%s(): !(cur_ictx->index->flags ""& INDEX_NODE)\n",
__FUNCTION__);
vcn = sle64_to_cpu(*(leVCN*)((u8*)entry +
le16_to_cpu(entry->length) -
sizeof(VCN)));
}
u++;
entry = cur_ictx->entries[u];
}
end_entry = cur_ictx->entries[cur_ictx->nr_entries - 1];
if (!(end_entry->flags & INDEX_ENTRY_END))
panic("%s(): !(end_entry->flags & INDEX_ENTRY_END)\n",
__FUNCTION__);
u = (u8*)end_entry - (u8*)entry +
le16_to_cpu(end_entry->length);
memcpy(right_entry, entry, u);
right_index->index_length = cpu_to_le32(
le32_to_cpu(right_index->entries_offset) + u);
right_index->flags = cur_ictx->index->flags;
/*
* Move the end entry of the left-hand node forward thus
* truncating the left-hand node.
*/
if (!cur_ictx->promote_inserted)
entry = cur_ictx->entries[cur_ictx->promote_entry_nr];
if (entry != end_entry) {
u = le16_to_cpu(end_entry->length);
memmove(entry, end_entry, u);
u += (u8*)entry - (u8*)cur_ictx->index;
cur_ictx->index->index_length = cpu_to_le32(u);
}
/*
* If the current, left-hand node is not a leaf node, we have
* to replace the VCN sub-node pointer in its end entry.
*
* If @promote_inserted is 0, we use the VCN we made a note of
* above.
*
* If @promote_inserted is 1, we take the VCN of the left-hand
* node of our child node.
*
* A side effect of getting the child node in loop scope here
* is that we do not need to reget it when we go round the loop
* again which is when we need to know the child node in order
* to be able to insert the appropriate entry into the current
* node.
*/
child_ictx = NULL;
if (entry->flags & INDEX_ENTRY_NODE) {
if (cur_ictx != ictx) {
child_ictx = cur_ictx->down;
if (child_ictx->is_root)
panic("%s(): child_ictx->is_root\n",
__FUNCTION__);
}
if (cur_ictx->promote_inserted) {
if (!child_ictx)
panic("%s(): !child_ictx\n",
__FUNCTION__);
/*
* As described take the VCN of the left-hand
* node of our child node index context for the
* new end entry.
*/
vcn = child_ictx->vcn;
/*
* Again, as described, update the VCN of the
* first entry we just moved to the (new) right
* hand node to the right-hand node of our
* child node index context.
*/
*(leVCN*)((u8*)right_entry + le16_to_cpu(
right_entry->length) -
sizeof(VCN)) = cpu_to_sle64(
child_ictx->right_vcn);
ntfs_debug("Setting VCN sub-node pointer of ""first index entry of ""right-hand index block node ""after splitting it off from ""the left-hand node to ""0x%llx.", (unsignedlonglong)
child_ictx->right_vcn);
}
if (!(cur_ictx->index->flags & INDEX_NODE))
panic("%s(): !(cur_ictx->index->flags & ""INDEX_NODE)\n", __FUNCTION__);
*(leVCN*)((u8*)entry + le16_to_cpu(entry->length) -
sizeof(VCN)) = cpu_to_sle64(vcn);
ntfs_debug("Setting VCN sub-node pointer of end index ""entry of left-hand index block node ""after splitting off the right-hand ""node to 0x%llx.", (unsignedlonglong)
vcn);
}
/*
* The index context still describes a single node thus if we
* are going to insert an entry into either of the two split
* nodes we have to update the index context appropriately.
*
* If @cur_ictx->entry_nr <= @cur_ictx->promote_entry_nr, we
* have to insert the entry into the left-hand node and if
* @cur_ictx->entry_nr > @cur_ictx->promote_entry_nr we have to
* insert the entry into the right-hand node.
*
* If inserting into the left-hand node we do not need to do
* anything as the insertion process does not make use of
* anything in the index context that has changed.
*
* If inserting into the right-hand node we have to switch the
* index context to describe the right-hand node and place the
* left-hand node into the place of the right-hand node, i.e.
* swap the left- and right-hand nodes in the index context.
*
* If we are switching the left- and right-hand nodes, we also
* have to update the index entry pointer to point to the
* correct insertion location in the now current page.
*
* Note we do not bother to update the array of index entry
* pointers as that is no longer used.
*/
if (!cur_ictx->promote_inserted && cur_ictx->entry_nr >
cur_ictx->promote_entry_nr) {
union {
VCN vcn;
INDEX_BLOCK *ia;
s64 upl_ofs;
upl_t upl;
upl_page_info_array_t pl;
u8 *addr;
} tmp;
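/*
* Note the union is only scratch space for swapping the left- and
* right-hand node fields one member at a time; it never holds more
* than one live value at once.
*/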
tmp.vcn = cur_ictx->right_vcn;
cur_ictx->right_vcn = cur_ictx->vcn;
cur_ictx->vcn = tmp.vcn;
tmp.ia = cur_ictx->right_ia;
cur_ictx->right_ia = cur_ictx->ia;
cur_ictx->ia = tmp.ia;
cur_ictx->index = right_index = &tmp.ia->index;
if (cur_ictx->right_is_locked) {
tmp.upl_ofs = cur_ictx->right_upl_ofs;
cur_ictx->right_upl_ofs = cur_ictx->upl_ofs;
cur_ictx->upl_ofs = tmp.upl_ofs;
tmp.upl = cur_ictx->right_upl;
cur_ictx->right_upl = cur_ictx->upl;
cur_ictx->upl = tmp.upl;
tmp.pl = cur_ictx->right_pl;
cur_ictx->right_pl = cur_ictx->pl;
cur_ictx->pl = tmp.pl;
tmp.addr = cur_ictx->right_addr;
cur_ictx->right_addr = cur_ictx->addr;
cur_ictx->addr = tmp.addr;
}
/*
* Get the location in the left-hand page of the first
* entry that was moved to the right-hand page.
*/
entry = cur_ictx->entries[cur_ictx->promote_entry_nr +
1];
cur_ictx->entry = (INDEX_ENTRY*)((u8*)right_index +
le32_to_cpu(right_index->
entries_offset) +
((u8*)cur_ictx->entry - (u8*)entry));
}
/*
* We are done with the node that is now set up as the
* right-hand node. Unless it is sharing the lock with the
* left-hand node, unmap/release it marking it dirty so it gets
* written out later.
*/
if (cur_ictx->right_is_locked) {
ntfs_page_unmap(idx_ni, cur_ictx->right_upl,
cur_ictx->right_pl, TRUE);
cur_ictx->right_upl = NULL;
cur_ictx->right_is_locked = 0;
}
/*
* We may be done with the current node. Mark it dirty so it
* gets written out later.
*/
ntfs_index_entry_mark_dirty(cur_ictx);
/*
* If we have reached the original node (@child_ictx will be
* NULL) and we are only splitting it there is nothing to
* insert and hence nothing at all left to do so we break out
* of the loop.
*/
} while (child_ictx || !split_only);
ntfs_debug("Done (%s).", cur_ictx->split_node ?
"insert with split" : "simple insert");
return 0;
err:
if (!NInoIndexAllocPresent(idx_ni))
goto err_out;
/*
* Unlock all index contexts in the B+tree path otherwise the call to
* ntfs_attr_inode_get() can deadlock.
*/
ntfs_index_ctx_path_unlock(ictx);
/* Get the index bitmap inode. */
err2 = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
idx_ni->name_len, FALSE, LCK_RW_TYPE_SHARED, &bmp_ni);
if (err2) {
ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode ""(error %d). Cannot undo index block ""allocation. Leaving inconsistent metadata. ""Run chkdsk.", err2);
NVolSetErrors(idx_ni->vol);
goto err_out;
}
/* Free all the index block allocations we have done. */
do {
if (cur_ictx->right_addr) {
if (!cur_ictx->right_ia)
panic("%s(): !cur_ictx->right_ia\n",
__FUNCTION__);
if (cur_ictx->right_vcn < 0)
panic("%s(): cur_ictx->right_vcn < 0\n",
__FUNCTION__);
err2 = ntfs_bitmap_clear_bit(bmp_ni,
cur_ictx->right_vcn <<
idx_ni->vcn_size_shift >>
idx_ni->block_size_shift);
if (err2) {
ntfs_error(idx_ni->vol->mp, "Failed to undo ""index block allocation in ""(error %d). Leaving ""inconsistent metadata. Run ""chkdsk.", err2);
NVolSetErrors(idx_ni->vol);
}
}
/* When we have dealt with the bottom entry we are done. */
if (cur_ictx == ictx)
break;
cur_ictx = cur_ictx->down;
} while (1);
lck_rw_unlock_shared(&bmp_ni->lock);
(void)vnode_put(bmp_ni->vn);
err_out:
ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
return err;
relock_err:
ntfs_error(idx_ni->vol->mp, "Failed to relock index context (error ""%d). Leaving corrupt index. Run chkdsk.", err);
NVolSetErrors(idx_ni->vol);
return err;
}
/**
* ntfs_index_node_split - split an index node
* @ictx: index context specifying the node to split
* @entry_size: size of the entry that will be added after the split
*
* Split the index node pointed to by @ictx. @ictx also points to the entry
* in the index node at which an entry will be inserted after the split is
* completed. @entry_size is the size of the entry that will be added at that
* position. These are used so that the split is performed with consideration
* for the insertion that is likely to come. If the insertion does not
* happen it does not matter much; it just means the split was not quite on the
* median entry but off by one.
*
* Return 0 on success and errno on error. On error the index context is no
* longer usable and must be released or reinitialized.
*
* Note that we do not update the array of index entry pointers nor the number
* of entries in the array because on success it means that the index node has
* been split and the function is done, i.e. the array of index entry pointers
* will not be used any more and on error the index context becomes invalid so
* there is no need to update any of the pointers. The caller is expected to
* restart its operations by doing a new index lookup for that reason.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
static inline errno_t ntfs_index_node_split(ntfs_index_context *ictx,
const u32 entry_size)
{
return ntfs_index_entry_add_or_node_split(ictx, TRUE, entry_size,
NULL, 0, NULL, 0);
}
/**
* ntfs_index_lookup_predecessor - find the predecessor of an index node entry
* @ictx: index context whose predecessor node to return
* @pred_ictx: pointer in which to return the found predecessor index context
*
* Descend into the child node pointed to by @ictx->entry and then keep
* descending into the child node of the child node pointed to by the end entry
* of the child node until we reach the bottom of the B+tree.
*
* On success return the predecessor entry, i.e. the last real (non-end) entry
* in the found leaf index node in *@pred_ictx.
*
* Return 0 on success and errno on error.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
* - The index context @ictx must be locked.
*/
static errno_t ntfs_index_lookup_predecessor(ntfs_index_context *ictx,
ntfs_index_context **pred_ictx)
{
unsigned entry_nr;
errno_t err;
ntfs_debug("Entering.");
if (!pred_ictx)
panic("%s(): !pred_ictx\n", __FUNCTION__);
if (!(ictx->entry->flags & INDEX_ENTRY_NODE))
panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n",
__FUNCTION__);
/*
* We must be at the bottom of the current tree path or things will get
* very confused.
*/
if (!ictx->down->is_root)
panic("%s(): !ictx->down->is_root\n", __FUNCTION__);
do {
err = ntfs_index_descend_into_child_node(&ictx);
if (err)
goto err;
/* If this child node is a leaf node we are done. */
if (!(ictx->index->flags & INDEX_NODE))
break;
/*
* This child node is an index node, descend into its end
* entry.
*/
ictx->entry_nr = entry_nr = ictx->nr_entries - 1;
ictx->follow_entry = ictx->entries[entry_nr];
} while (1);
/*
* We found the leaf node thus the predecessor entry we are looking for
* is the last entry before the end entry.
*/
if (ictx->nr_entries < 2)
panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__);
ictx->entry_nr = entry_nr = ictx->nr_entries - 2;
ictx->entry = ictx->entries[entry_nr];
ictx->is_match = 1;
*pred_ictx = ictx;
ntfs_debug("Done (found).");
return 0;
err:
ntfs_error(ictx->idx_ni->vol->mp, "Failed to descend into child node ""(error %d).", err);
return err;
}
/**
* ntfs_index_ctx_move - move an index context from its tree path to another one
* @ictx: index context to move
* @dst: destination index context below which to insert @ictx
*
* Disconnect the index context @ictx from its tree path and insert it into the
* tree path to which @dst belongs positioning it immediately below the index
* context @dst.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
*/
static inline void ntfs_index_ctx_move(ntfs_index_context *ictx,
ntfs_index_context *dst)
{
ntfs_index_ctx_disconnect(ictx);
dst->down->up = ictx;
ictx->down = dst->down;
ictx->up = dst;
dst->down = ictx;
}
/**
* ntfs_index_root_prepare_replace - prepare an index entry to be replaced
* @ictx: existing index entry that is going to be replaced
* @new_ie_size: size in bytes of the new index entry
*
* Resize the existing index entry to the size of the new index entry so that
* the index root is all set up and ready to receive the new entry.
*
* Return 0 on success and ENOSPC if there is not enough space in the index mft
* record for the new entry.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
static errno_t ntfs_index_root_prepare_replace(ntfs_index_context *ictx,
const unsigned new_ie_size)
{
MFT_RECORD *m = ictx->actx->m;
INDEX_ENTRY *ie = ictx->entry;
const u32 muse = le32_to_cpu(m->bytes_in_use);
const unsigned ie_size = le16_to_cpu(ie->length);
const int size_change = (int)new_ie_size - (int)ie_size;
const u32 new_muse = muse + size_change;
ntfs_debug("Entering.");
if (new_muse <= le32_to_cpu(m->bytes_allocated)) {
u8 *vcn_addr;
ATTR_RECORD *a;
INDEX_HEADER *ih;
/*
* Resize the index root attribute so it has the appropriate
* space for the new index entry to replace the existing entry.
* As an optimization we combine the resizing of the index root
* attribute and the moving of index entries within the
* attribute into a single operation.
*/
vcn_addr = (u8*)ie + ie_size - sizeof(VCN);
memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr,
muse - (vcn_addr - (u8*)m));
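/*
* Illustrative example: replacing a 0x50 byte entry with a 0x60 byte one
* gives @size_change = +0x10, so everything from the old entry's sub-node
* VCN to the end of the in-use part of the mft record is moved 0x10 bytes
* towards the end of the record.
*/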
/* Adjust the mft record to reflect the change in used space. */
m->bytes_in_use = cpu_to_le32(new_muse);
/*
* Adjust the attribute record to reflect the changes in the
* size of the attribute record and in the size of the
* attribute value.
*/
a = ictx->actx->a;
a->length = cpu_to_le32(le32_to_cpu(a->length) + size_change);
a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) +
size_change);
/* Adjust the index header to reflect the change in length. */
ih = ictx->index;
ih->allocated_size = ih->index_length = cpu_to_le32(
le32_to_cpu(ih->index_length) + size_change);
ntfs_debug("Done.");
return 0;
}
ntfs_debug("Failed (not enough space in mft record to replace index ""entry in index root attribute).");
return ENOSPC;
}
/**
* ntfs_index_block_prepare_replace - prepare an index entry to be replaced
* @ictx: existing index entry that is going to be replaced
* @new_ie_size: size in bytes of the new index entry
*
* Resize the existing index entry to the size of the new index entry so that
* the index node is all set up and ready to receive the new entry.
*
* Return 0 on success and ENOSPC if there is not enough space in the index
* node for the new entry.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
*/
static errno_t ntfs_index_block_prepare_replace(ntfs_index_context *ictx,
const unsigned new_ie_size)
{
INDEX_HEADER *ih = ictx->index;
INDEX_ENTRY *ie = ictx->entry;
const u32 ilen = le32_to_cpu(ih->index_length);
const unsigned ie_size = le16_to_cpu(ie->length);
const u32 new_ilen = ilen + new_ie_size - ie_size;
ntfs_debug("Entering.");
if (new_ilen <= le32_to_cpu(ih->allocated_size)) {
u8 *vcn_addr;
/*
* Move the VCN of the index entry to be replaced and
* everything that follows it to adapt the space for the new
* entry.
*/
vcn_addr = (u8*)ie + ie_size - sizeof(VCN);
memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr,
ilen - (vcn_addr - (u8*)ih));
/* Adjust the index header to reflect the change in length. */
ih->index_length = cpu_to_le32(new_ilen);
ntfs_debug("Done.");
return 0;
}
ntfs_debug("Failed (not enough space in index block to replace index ""entry).");
return ENOSPC;
}
/**
* ntfs_index_entry_replace - replace an existing index entry with a new one
* @ictx: existing index entry to replace
* @new_ictx: new index entry to replace the existing entry with
*
* Replace the existing node index entry @ictx->entry with the leaf index entry
* @new_ictx->entry.
*
* Return 0 on success and ENOSPC if there is not enough space for the new
* entry.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
* - The index context @new_ictx must be locked.
*/
static errno_t ntfs_index_entry_replace(ntfs_index_context *ictx,
ntfs_index_context *new_ictx)
{
INDEX_ENTRY *new_ie = new_ictx->entry;
INDEX_ENTRY *ie = ictx->entry;
const unsigned new_ie_size = le16_to_cpu(new_ie->length) + sizeof(VCN);
errno_t err;
if (!ictx->is_match || !new_ictx->is_match)
panic("%s(): !ictx->is_match || !new_ictx->is_match\n",
__FUNCTION__);
/* The destination entry is meant to be a node entry. */
if (!(ictx->entry->flags & INDEX_ENTRY_NODE))
panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n",
__FUNCTION__);
/* The new entry is meant to be a leaf entry. */
if (new_ictx->entry->flags & INDEX_ENTRY_NODE)
panic("%s(): new_ictx->entry->flags & INDEX_ENTRY_NODE\n",
__FUNCTION__);
if (ictx->is_root)
err = ntfs_index_root_prepare_replace(ictx, new_ie_size);
else
err = ntfs_index_block_prepare_replace(ictx, new_ie_size);
if (!err) {
/* Copy the new index entry into the adapted space. */
memcpy(ie, new_ie, new_ie_size - sizeof(VCN));
/*
* Update the copied index entry to reflect the fact that it is
* now an index node entry and has the VCN sub-node pointer at
* its tail.
*/
ie->length = cpu_to_le16(new_ie_size);
ie->flags |= INDEX_ENTRY_NODE;
/* Ensure the updates are written to disk. */
ntfs_index_entry_mark_dirty(ictx);
}
return err;
}
/**
* ntfs_index_block_free - free an index allocation block
* @ictx: index context of the index block to deallocate
*
* Deallocate the index allocation block for the index described by the index
* context @ictx and invalidate the context so the caller can safely release
* it.
*
* We also check if the index allocation attribute can be shrunk as a
* consequence of the deallocation of the index allocation block and if so and
* that would actually change the on-disk size of the attribute we shrink it
* now.
*
* If we shrunk the index allocation attribute and the index bitmap attribute
* is non-resident we shrink the index bitmap attribute also but again only if
* it would actually change the on-disk size of the attribute.
*
* Return 0 on success and errno on error.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - All of the index contexts in the index must be unlocked (this
* includes @ictx, i.e. @ictx must not be locked).
*/
static errno_t ntfs_index_block_free(ntfs_index_context *ictx)
{
s64 target_pos, bmp_pos, alloc_size;
ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
ntfs_volume *vol = idx_ni->vol;
unsigned page_ofs;
errno_t err;
ntfs_debug("Entering.");
if (ictx->is_locked)
panic("%s(): ictx->is_locked\n", __FUNCTION__);
/* Get the index bitmap inode. */
err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE,
&bmp_ni);
if (err) {
ntfs_error(vol->mp, "Failed to get index bitmap inode (error ""%d).", err);
return err;
}
/*
* Zero the bit in the index bitmap corresponding to the index block
* being deallocated.
*/
target_pos = ictx->vcn << idx_ni->vcn_size_shift >>
idx_ni->block_size_shift;
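/*
* The VCN is converted to a bit number by turning it into a byte offset
* (<< vcn_size_shift) and dividing by the index block size
* (>> block_size_shift). For example (illustrative sizes): with 512 byte
* index VCNs and 4096 byte index blocks, VCN 8 maps to byte offset 4096
* and thus to bit 1.
*/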
err = ntfs_bitmap_clear_bit(bmp_ni, target_pos);
if (err) {
ntfs_error(vol->mp, "Failed to deallocate index block in ""index bitmap (error %d).", err);
lck_rw_unlock_exclusive(&bmp_ni->lock);
(void)vnode_put(bmp_ni->vn);
return err;
}
/* If this is not the last set bit, we are done. */
if (target_pos < idx_ni->last_set_bit) {
done:
ntfs_debug("Done (index records are allocated beyond the deallocated one).");
out:
lck_rw_unlock_exclusive(&bmp_ni->lock);
(void)vnode_put(bmp_ni->vn);
return 0;
}
/*
* Scan backwards through the entire bitmap looking for the last set
* bit. If we do not know the old last set bit (@idx_ni->last_set_bit
* is -1), start at the end of the bitmap and if we do know it but we
* cleared it just now, start at the old last set bit.
*
* Note we ignore any errors as the truncation is just a disk saving
* optimization and is not actually required. However if an error
* occurs we invalidate the last set bit stored in the inode.
*/
if (idx_ni->last_set_bit >= 0)
bmp_pos = idx_ni->last_set_bit >> 3;
else {
lck_spin_lock(&bmp_ni->size_lock);
bmp_pos = bmp_ni->initialized_size - 1;
lck_spin_unlock(&bmp_ni->size_lock);
}
do {
upl_t upl;
upl_page_info_array_t pl;
u8 *bmp_start, *bmp;
err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl,
&pl, &bmp_start, FALSE);
if (err) {
ntfs_debug("Failed to read index bitmap (error %d).",
err);
idx_ni->last_set_bit = -1;
goto out;
}
page_ofs = (unsigned)bmp_pos & PAGE_MASK;
bmp = bmp_start + page_ofs;
/* Scan backwards through the page. */
do {
unsigned bit, byte = *bmp;
/* If this byte is zero skip it. */
if (!byte)
continue;
/*
* Determine the last set bit in the byte.
*
* TODO: There does not appear to be a fls() function
* in the kernel. )-: If/when the kernel has an fls()
* function, switch the below code to use it.
*
* So we do the "bit = fls(byte) - 1" by hand which is
* not very efficient but works.
*/
bit = 0;
if (byte & 0xf0) {
byte >>= 4;
bit += 4;
}
if (byte & 0x0c) {
byte >>= 2;
bit += 2;
}
if (byte & 0x02)
bit++;
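/*
* Worked example: for @byte = 0x2c (0010 1100 binary) the first test
* shifts the high nibble down (byte = 0x02, bit = 4), the second test
* fails, and the third adds one, giving bit = 5, i.e. fls(0x2c) - 1.
*/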
ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
/*
* @bit now contains the last set bit in the byte thus
* we can determine the last set bit in the bitmap.
*/
idx_ni->last_set_bit = (((bmp_pos & ~PAGE_MASK_64) +
(bmp - bmp_start)) << 3) + bit;
if (target_pos < idx_ni->last_set_bit)
goto done;
goto was_last_set_bit;
} while (--bmp >= bmp_start);
ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
} while ((bmp_pos -= page_ofs + 1) >= 0);
/*
* We scanned the entire bitmap and it was all zero. We do not do
* anything because truncation of indexes that become empty is done
* elsewhere.
*/
idx_ni->last_set_bit = -1;
ntfs_debug("Done (index bitmap has no set bits left).");
goto out;
was_last_set_bit:
/*
* This was the last set bit. Check if we would save disk space by
* truncating the index allocation attribute and if so do it. To do
* this determine which the first unused cluster is and compare it
* against the currently allocated last cluster.
*
* Note we ignore any errors because it is not essential to resize the
* index allocation attribute. In fact Windows and chkdsk are
* perfectly happy with it remaining allocated. It just means the
* index is wasting space on disk and that will be reclaimed when the
* index is deleted or when the index is filled again with entries.
*/
target_pos = (((idx_ni->last_set_bit + 1) <<
idx_ni->block_size_shift) + vol->cluster_size_mask) &
~(s64)vol->cluster_size_mask;
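/*
* Illustrative example: with 4096 byte index blocks and 4096 byte
* clusters, a last set bit of 5 gives a first unused byte offset of
* 6 * 4096 = 24576 which is already cluster aligned, so that becomes
* the truncation target.
*/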
lck_spin_lock(&idx_ni->size_lock);
alloc_size = idx_ni->allocated_size;
lck_spin_unlock(&idx_ni->size_lock);
if (target_pos >= alloc_size) {
ntfs_debug("Done (no space would be freed on disk by ""truncating the index allocation attribute).");
gotoout;
}
err = ntfs_attr_resize(idx_ni, target_pos, 0, NULL);
if (err) {
ntfs_debug("Failed to truncate index allocation attribute ""(error %d) thus this index will be wasting ""space on disk until it is deleted or ""repopulated with entries.", err);
gotoout;
}
ntfs_debug("Truncated index allocation attribute to reclaim 0x%llx ""bytes of disk space.",
(unsignedlonglong)(alloc_size - target_pos));
/*
* If the bitmap attribute is non-resident check if we would save disk
* space by truncating it, too, and if so do it. Again we ignore any
* errors as it is ok for the bitmap attribute to be left as it is.
*/
if (NInoNonResident(bmp_ni)) {
target_pos = ((((target_pos >> idx_ni->block_size_shift) +
7) >> 3) + vol->cluster_size_mask) &
~(s64)vol->cluster_size_mask;
lck_spin_lock(&bmp_ni->size_lock);
alloc_size = bmp_ni->allocated_size;
lck_spin_unlock(&bmp_ni->size_lock);
if (target_pos >= alloc_size) {
ntfs_debug("Done (truncated index allocation to free ""space on disk but not truncating ""index bitmap as no space would be ""freed on disk by doing so).");
gotoout;
}
err = ntfs_attr_resize(bmp_ni, target_pos, 0, NULL);
if (err) {
ntfs_debug("Failed to truncate index bitmap attribute ""(error %d) thus this index will be ""wasting space on disk until it is ""deleted or repopulated with entries.",
err);
gotoout;
}
ntfs_debug("Truncated index bitmap attribute to reclaim ""0x%llx bytes of disk space.",
(unsignedlonglong)(alloc_size - target_pos));
}
ntfs_debug("Done.");
goto out;
}
/**
* ntfs_index_make_empty - make an index empty discarding all entries in it
* @ictx: index context describing the index to empty
*
* Empty the index described by the index context @ictx. On failure, use the
* tree described by @ictx to re-allocate the freed index blocks if any have
* been freed at the point of failure.
*
* This is called when the last index entry in an index is being deleted.
*
* We need to remove the sub-node from the end entry of the index root and
* switch the entry to be a leaf entry.
*
* We also need to deallocate all index blocks and if possible shrink the index
* allocation attribute to zero size as well as the index bitmap attribute if
* it is non-resident.
*
* Return 0 on success and errno on error.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - All of the index contexts in the index must be unlocked (this
* includes @ictx, i.e. @ictx must not be locked).
*/
static errno_t ntfs_index_make_empty(ntfs_index_context *ictx)
{
s64 data_size;
ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
MFT_RECORD *m;
ntfs_attr_search_ctx *actx;
INDEX_ROOT *ir;
INDEX_HEADER *ih;
INDEX_ENTRY *ie;
ntfs_index_context *start_ictx;
u32 new_ilen;
errno_t err;
ntfs_debug("Entering.");
if (ictx->is_locked)
panic("%s(): ictx->is_locked\n", __FUNCTION__);
/*
* Start by zeroing the index bitmap bits corresponding to the index
* blocks being deallocated. For simplicity, we just zero the entire
* index bitmap attribute.
*/
err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE,
&bmp_ni);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode ""(error %d).", err);
return err;
}
lck_spin_lock(&bmp_ni->size_lock);
data_size = bmp_ni->data_size;
lck_spin_unlock(&bmp_ni->size_lock);
err = ntfs_attr_set(bmp_ni, 0, data_size, 0);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to deallocate index ""block(s) in index bitmap.");
gotoerr;
}
/*
* We need to get hold of the index root attribute in order to convert
* its end entry to a leaf node.
*/
err = ntfs_mft_record_map(ictx->base_ni, &m);
if (err) {
ntfs_error(idx_ni->vol->mp, "ntfs_mft_record_map() failed ""(error %d).", err);
gotoerr;
}
actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
if (!actx) {
err = ENOMEM;
goto unm_err;
}
/* Find the index root attribute in the mft record. */
err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
0, NULL, 0, actx);
if (err) {
if (err == ENOENT) {
ntfs_error(idx_ni->vol->mp, "Index root attribute ""missing in inode 0x%llx. Run ""chkdsk.",
(unsignedlonglong)idx_ni->mft_no);
err = EIO;
NVolSetErrors(idx_ni->vol);
}
goto put_err;
}
/* Get to the index root value. */
ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset));
ih = (INDEX_HEADER*)&ir->index;
/* The first and last index entry. */
ie = (INDEX_ENTRY*)((u8*)ih + le32_to_cpu(ih->entries_offset));
if (!(ie->flags & INDEX_ENTRY_END))
panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n", __FUNCTION__);
if (!(ie->flags & INDEX_ENTRY_NODE))
panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__);
/*
* Remove the sub-node pointer from the index entry and shrink the
* index root attribute appropriately.
*/
ie->length = cpu_to_le16(le16_to_cpu(ie->length) - sizeof(VCN));
ie->flags &= ~INDEX_ENTRY_NODE;
new_ilen = le32_to_cpu(ih->index_length) - sizeof(VCN);
ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen);
ih->flags &= ~LARGE_INDEX;
err = ntfs_resident_attr_value_resize(actx->m, actx->a,
offsetof(INDEX_ROOT, index) + new_ilen);
/* We are shrinking the index root so the resize cannot fail. */
if (err)
panic("%s(): err\n", __FUNCTION__);
/* Ensure the changes are written to disk. */
NInoSetMrecNeedsDirtying(actx->ni);
ntfs_attr_search_ctx_put(actx);
ntfs_mft_record_unmap(ictx->base_ni);
/*
* Now deal with the index allocation attribute by truncating it to
* zero length.
*
* Note we ignore any errors because it is not essential to resize the
* index allocation attribute. In fact Windows and chkdsk are
* perfectly happy with it remaining allocated. It just means the
* index is wasting space on disk and that will be reclaimed when the
* index is deleted or when the index is filled again with entries.
*/
err = ntfs_attr_resize(idx_ni, 0, 0, ictx);
if (err)
ntfs_debug("Failed to truncate index allocation attribute to ""zero size thus this index will be wasting ""space on disk until it is deleted or ""repopulated with entries.");
/*
* Finally, if the index bitmap attribute is non-resident truncate it
* to zero length, too. Again we ignore any errors as it is ok for the
* bitmap attribute to be left as it is.
*/
if (!err && NInoNonResident(bmp_ni)) {
err = ntfs_attr_resize(bmp_ni, 0, 0, ictx);
if (err)
ntfs_debug("Failed to truncate index bitmap attribute ""to zero size thus this index will be ""wasting space on disk until it is ""deleted or repopulated with ""entries.");
}
lck_rw_unlock_exclusive(&bmp_ni->lock);
(void)vnode_put(bmp_ni->vn);
/*
* We no longer have any index blocks allocated so invalidate our cache
* of the last set bit.
*/
idx_ni->last_set_bit = -1;
ntfs_debug("Done.");
return 0;
put_err:
ntfs_attr_search_ctx_put(actx);
unm_err:
ntfs_mft_record_unmap(ictx->base_ni);
err:
/*
* Re-allocate the deallocated index block(s). This is safe because
* the index inode mutex is held throughout.
*/
start_ictx = ictx;
do {
int err2;
/*
* Skip the index root as it does not have a bit in the
* bitmap.
*/
if (ictx->is_root)
continue;
err2 = ntfs_bitmap_set_bit(bmp_ni, ictx->vcn <<
idx_ni->vcn_size_shift >>
idx_ni->block_size_shift);
if (err2) {
ntfs_error(idx_ni->vol->mp, "Failed to undo ""deallocation of index block in index ""bitmap (error %d) of inode 0x%llx. ""Leaving inconsistent metadata. Run ""chkdsk.", err2,
(unsignedlonglong)idx_ni->mft_no);
NVolSetErrors(idx_ni->vol);
}
} while ((ictx = ictx->up) != start_ictx);
lck_rw_unlock_exclusive(&bmp_ni->lock);
(void)vnode_put(bmp_ni->vn);
ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
return err;
}
/**
* ntfs_index_entry_delete - delete an index entry
* @ictx: index context describing the index entry to delete
*
* Delete the index entry described by the index context @ictx from the index.
*
* Return 0 on success and errno on error. A special case is a return code of
* -EAGAIN (negative, not EAGAIN) which means that the B+tree was rearranged
* into a different consistent state to make the deletion possible but now the
* lookup and delete has to be repeated as the index entry to be deleted may
* have changed its position in the tree thus a new lookup is required to be
* able to delete it. Doing this is not terribly efficient but we are only
* talking about a handful of cases in a single delete of tens of thousands of
* files so it does not matter if that is inefficient. On the plus side doing
* things this way means we do not need to keep track of the entry to be
* deleted when rearranging the tree which saves time and makes the code much
* simpler.
*
* Note that we do not update the array of index entry pointers nor the number
* of entries in the array because on success it means that the index entry has
* been removed and the function is done, i.e. the array of index entry
* pointers will not be used any more and on error the index context becomes
* invalid so there is no need to update any of the pointers.
*
* Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
* writing.
* - The index context @ictx must be locked.
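*
* A minimal sketch of the retry convention a caller might follow (added
* for illustration only; lookup_entry_to_delete() is a placeholder for
* whatever lookup the caller performs, not a real function in this
* driver):
*
*	do {
*		err = lookup_entry_to_delete(&ictx);
*		if (!err)
*			err = ntfs_index_entry_delete(ictx);
*	} while (err == -EAGAIN);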
*/
int ntfs_index_entry_delete(ntfs_index_context *ictx)
{
leVCN vcn;
ntfs_inode *idx_ni = ictx->idx_ni;
ntfs_index_context *pred_ictx, *parent_ictx, *suc_ictx, *put_ictx;
ntfs_index_context *deallocate_ictx;
INDEX_HEADER *ih;
INDEX_ENTRY *ie, *next_ie, *pull_down_entry;
MFT_RECORD *m;
unsigned ie_size, pred_ie_size, pull_down_entry_nr, pull_down_ie_size;
unsigned old_parent_entry_nr, old_parent_ie_size, move_ie_size;
errno_t err;
u32 new_ilen;
BOOL is_root = ictx->is_root;
ntfs_debug("Entering.");
if (!ictx->is_locked)
panic("%s(): !ictx->is_locked\n", __FUNCTION__);
deallocate_ictx = parent_ictx = put_ictx = NULL;
ih = ictx->index;
ie = ictx->entry;
/* We cannot delete end entries as they do not contain any key/data. */
if (ie->flags & INDEX_ENTRY_END)
panic("%s(): ie->flags & INDEX_ENTRY_END\n", __FUNCTION__);
ie_size = le16_to_cpu(ie->length);
next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size);
/*
* There are two types of index entry: Either it is a leaf entry, i.e.
* it is in a leaf node and thus has no children or it is in an index
* node entry, i.e. it is in an index block node and thus has children
* which we need to deal with. If it is a leaf entry, skip onto leaf
* entry deletion.
*/
if (!(ie->flags & INDEX_ENTRY_NODE))
goto delete_leaf_entry;
/*
* The node the entry to be deleted is in has sub-node(s), i.e. it is
* not a leaf node, and the entry to be deleted is not a leaf entry.
* We have to replace it with its predecessor (which by definition is a
* leaf entry). There are three cases of increasing complexity:
*
* 1) The simplest case is when the predecessor is not the only entry
* in its (leaf) node thus it can be simply removed without any tree
* rebalancing needing to be done on its node and in addition the
* predecessor entry can replace the entry to be deleted without
* overflowing the node, i.e. there is enough space in the node we are
* deleting the entry from to fit its predecessor entry thus it is a
* matter of a simple replace without the need to change anything else
* in the tree.
*
* 2) The slightly more complicated case is the same as above case 1)
* except that there is not enough space in the node the entry to be
* deleted is in to fit its predecessor entry, thus we need to split
* the node the entry is being deleted from and promote the median
* entry to the parent node which may then overflow the parent node and
* so on up to the root node. A particular annoyance here is that
* depending on the implementation details it is possible for the entry
* to be deleted to be the median entry and thus be promoted to its
* parent or alternatively it is possible for its predecessor entry
* that is to replace the entry to be deleted to have to be promoted.
* A further pitfall is that if the entry to be deleted is behind the
* median entry, i.e. it is on the right of the median entry, then it
* will be moved to a different node (the new right-hand node to the
* old node being split) thus the replace needs to happen in a
* different place. If we use ntfs_index_entry_add() as it is to take
* care of the split&promote on insertion this last case would cause us
* the problem that the pointers would be all out of date. We would
* need ntfs_index_entry_add() to update the pointers and to switch the
* right and left nodes in that case. Perhaps we need an
* ntfs_index_entry_replace() that can share a lot of code with
* ntfs_index_entry_add() perhaps even just a flag to
* ntfs_index_entry_add() to indicate replace? Then we would not need
* to worry about the pointers becoming out of date as the replace
* would be done already for us.
*
* 3) The more complicated case is when the predecessor entry is the
* last entry in its (leaf) node thus we cannot use it to replace the
* entry to be deleted without rebalancing the tree. In this case we
* have to rebalance the tree so that the predecessor entry is no
* longer the last entry in its (leaf) node. Once that is successfully
* done we have reduced the problem to either case 1) or case 2) above
* or in fact to a completely different case (see below). The benefit
* of doing the rebalancing first without regard to the delete and
* replace that are to take place is that the tree ends up in a
* consistent state, just a different one to what it was before, thus
* we do not need to rollback to the original state any more thus error
* handling is greatly simplified. The pitfall here is that doing the
* balancing may profoundly change the tree to the extent that the
* entry to be deleted may be moved to somewhere completely different
* which we somehow need to be able to cope with. This can have
* positive side effects, too, as it for example can lead to the entry
* to be deleted being turned into a leaf entry thus its deletion turns
* into a delete of a leaf entry thus the planned replace does not need
* to happen at all. This is the "completely different case" mentioned
* above. Usually this case will have been caused by a merge of two
* neighbouring nodes with pulldown of the parent entry (which happens
* to be the entry to be deleted) thus the entry to be deleted will not
* only be a leaf entry but it will also not be the last entry in the
* leaf node thus it can be deleted with a simple delete. Otherwise it
* is not a disaster and we just need to rebalance the tree again as we
* did just now but this time making sure that the entry to be deleted
* is not the only entry in the node (compare to above where we were
* making sure that the leaf predecessor entry is not the only entry in
* its leaf node) and then we can proceed with a simple delete of the
* leaf node. Because the tree is balanced at all steps we do not
* actually care about handling the delete of the entry in the case
* that it turned into a leaf entry and we just leave it to the leaf
* entry deletion code to deal with.
*
* That is quite enough description now, let the games begin. First,
* investigate the index entry to be deleted and its predecessor to
* determine which of the above categories the deletion falls into.
*
* Locate the predecessor entry. Because the predecessor may be the
* only entry in its (leaf) node in which case it would cause the tree
* to be rebalanced, we record the path taken to the predecessor entry
* by simply extending the existing tree path that points to the entry
* to be deleted.
*/
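/*
 * Added illustration (not from the original sources) of case 1), the
 * simple replace.  'D' is the index node entry to be deleted and 'P'
 * is its leaf predecessor, which is not the only entry in its node:
 *
 *	before:	[ ... D ... ]		after:	[ ... P ... ]
 *		       |			       |
 *		  [ .. X P ]end			  [ .. X ]end
 *
 * P is cut out of its leaf node and overwrites D, inheriting D's VCN
 * sub-node pointer, so no node needs to be split, merged, allocated or
 * freed.
 */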
err = ntfs_index_lookup_predecessor(ictx, &pred_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to look up predecessor of ""index node entry to be deleted (error %d).",
err);
return err;
}
/*
* If the predecessor is not the only entry other than the end entry in
* its node we can simply remove it from its node and use it to replace
* the entry to be deleted.
*/
if (pred_ictx->nr_entries > 2)
goto simple_replace;
if (pred_ictx->nr_entries < 2)
panic("%s(): pred_ictx->nr_entries < 2\n", __FUNCTION__);
/*
* The predecessor is the only entry (other than the end entry) in its
* node thus we cannot simply remove it from its node as that would
* cause the B+tree to become imbalanced.
*
* There are two different cases we need to consider for which we need
* to look up the predecessor of the current predecessor leaf entry.
* To do this we ascend the tree looking for a node with entries in it.
* The first node containing any entries is the node containing the
* predecessor entry (which is not a leaf entry).
*
* 1) If we reach the original node containing the entry to be deleted
* without finding any non-empty nodes on the way the whole sub-tree
* below the entry to be deleted will become empty after using the
* predecessor leaf entry to replace the entry to be deleted. Thus we
* can deallocate all nodes in this sub-tree and instead of replacing
* the entry to be deleted we remove it altogether and we then move the
* predecessor leaf entry that is now homeless in front of its new
* successor leaf entry, thus effectively merging the node containing
* the predecessor with its right-hand sibling.
*
* Note that the insertion of the predecessor leaf entry into the node
* containing its successor leaf entry may require a split of the node
* we are inserting into.
*
* 2) If we find a predecessor for the predecessor entry before we
* reach the original node containing the entry to be deleted we break
* what we need to do into two operations. First, we rebalance the
* tree so that the predecessor entry of the entry to be deleted is no
* longer the only entry in its leaf node. This simplifies the
* deletion of the entry to a case we have already solved as we can now
* simply remove the predecessor entry from its node and use it to
* replace the entry to be deleted. Thus, second, we use the already
* existing code to do a simple replace. Should that fail it does not
* matter as we are leaving a fully consistent B+tree tree behind.
* Breaking this case up into two means we incur a little bit of
* overhead for the extra locking of the predecessor node and for the
* extra memmove()s and memcpy() of the predecessor entry. But this
* simplifies error handling and makes the code so much simpler that it
* is definitely worth paying the price in overhead.
*
* Enough discussion, time to do it. Start by going up the tree
* looking for a non-empty node or the original node containing the
* entry to be deleted so we can determine if we have the above case 1
* or 2.
*
* We must be at the bottom of the current tree path or things will get
* very confused.
*/
if (!pred_ictx->down->is_root)
panic("%s(): !pred_ictx->down->is_root\n", __FUNCTION__);
parent_ictx = pred_ictx->up;
put_ictx = deallocate_ictx = pred_ictx;
ntfs_index_ctx_disconnect_reinit(pred_ictx);
/*
* Make a note of the size of the predecessor entry that we are going
* to move and unlock it for now.
*/
move_ie_size = le16_to_cpu(pred_ictx->entry->length);
ntfs_index_ctx_unlock(pred_ictx);
/* Now ascend the tree until we find a non-empty node. */
while (parent_ictx->nr_entries <= 1) {
if (parent_ictx->nr_entries != 1)
panic("%s(): parent_ictx->nr_entries != 1\n",
__FUNCTION__);
/*
* Empty index node. Move it to the list of index nodes to be
* deallocated and proceed to its parent.
*/
pred_ictx = parent_ictx;
parent_ictx = parent_ictx->up;
ntfs_index_ctx_move(pred_ictx, deallocate_ictx);
}
/*
* We have a non-empty parent node. Check whether this is the node
* containing the entry to be deleted (case 1) or whether it is an
* intervening node (case 2). In the latter case we need to rearrange
* the tree.
*/
if (parent_ictx != ictx)
goto rearrange_tree;
/*
* Now that we know we have case 1, we need to find the leaf node
* containing the successor entry of the entry to be deleted. To do
* this we repoint the tree path to the entry on the right-hand side of
* the entry to be deleted and descend down into the first entry of
* each node until we reach the leaf-node.
*
* We need to lock the node containing the entry to be deleted and make
* a note of it as we are going to delete it later.
*/
err = ntfs_index_ctx_relock(parent_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to relock the parent ""index node (error %d).", err);
goto put_err;
}
/*
* NOTE: pull_down_* == to be deleted (we are just reusing existing
* code and variables hence the (mis)naming...
*/
pull_down_entry = parent_ictx->follow_entry;
pull_down_entry_nr = parent_ictx->entry_nr;
/*
* Repoint the path to the entry on the right-hand side of the entry to
* be deleted so we can descend to the leaf node containing the
* successor entry.
*/
parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry +
le16_to_cpu(pull_down_entry->length));
parent_ictx->entry_nr++;
/* Finally descend to the leaf node containing the successor entry. */
suc_ictx = parent_ictx;
do {
err = ntfs_index_descend_into_child_node(&suc_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to descend to ""node containing successor entry ""(error %d).", err);
goto put_err;
}
if (!suc_ictx->is_locked)
panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
suc_ictx->follow_entry = suc_ictx->entries[0];
} while (suc_ictx->index->flags & INDEX_NODE);
if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
panic("%s(): suc_ictx->follow_entry->flags & ""INDEX_ENTRY_NODE\n", __FUNCTION__);
/*
* Can the predecessor entry simply be inserted in front of the
* successor leaf entry without overflowing the node? If not we need
* to split the node and then restart the delete.
*/
if (move_ie_size > suc_ictx->bytes_free) {
split_and_restart_case_1:
ntfs_debug("Splitting index node before add on delete (case 1).");
ie_size = move_ie_size;
goto split_and_restart;
}
/*
* The predecessor entry can simply be inserted in front of the
* successor leaf entry.
*
* We need to have locked both nodes to be able to do the insert thus
* we may need to unlock the locked successor node to ensure that the
* node lock is always taken in ascending page offset order so we avoid
* deadlocks. Also we have to consider the case where the two index
* nodes are in the same page in which case we need to share the index
* lock.
*/
if (deallocate_ictx->is_locked)
panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__);
err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx);
if (err)
goto put_err;
/*
* We need to re-check the amount of free space as it can have changed
* whilst we dropped the lock on @suc_ictx if that was necessary. When
* this happens simply drop the lock on @deallocate_ictx if we took it
* and go back to the previous code dealing with the not enough space
* case.
*/
if (move_ie_size > suc_ictx->bytes_free) {
if (deallocate_ictx->is_locked)
ntfs_index_ctx_unlock(deallocate_ictx);
goto split_and_restart_case_1;
}
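/*
 * Added note (illustration only): the ntfs_index_ctx_lock_two() plus
 * re-check sequence above is the pattern used throughout this
 * function: take the two node locks in ascending page offset order
 * (sharing the lock when both nodes live in the same page) and then
 * re-verify the free space, because it may have changed while the
 * first lock was temporarily dropped to establish that order.
 */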
/*
* Both nodes are locked and this is a simple insert, so go ahead and
* do it. We have already checked that there is enough space so it
* cannot fail.
*
* Move all the entries out of the way to make space for the
* predecessor entry to be copied in.
*/
err = ntfs_index_node_make_space(suc_ictx, move_ie_size);
if (err)
panic("%s(): err\n", __FUNCTION__);
/*
* Copy the predecessor index entry into the created space in front of
* the successor entry.
*/
memcpy(suc_ictx->follow_entry, deallocate_ictx->entry, move_ie_size);
/* Ensure the updates are written to disk. */
ntfs_index_entry_mark_dirty(suc_ictx);
/*
* We are done both with the predecessor and successor nodes so release
* their locks.
*/
if (deallocate_ictx->is_locked)
ntfs_index_ctx_unlock(deallocate_ictx);
if (!suc_ictx->is_locked)
panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
ntfs_index_ctx_unlock(suc_ictx);
/*
* We now have to lock the original node containing the entry to be
* deleted and we need to switch it to be the current index context and
* to point to the entry to be deleted. Note this is a simple delete
* just like for a leaf entry as we have taken care of its child leaf
* node(s) already and will deallocate it(them) later.
*/
if (parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
err = ntfs_index_ctx_relock(parent_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to relock the parent ""index node (error %d).", err);
/*
* Undo the insertion of the predecessor entry in front of the
* successor entry by moving the successor and all following
* entries back into their old place and resetting the index
* length to the old size.
*/
err = ntfs_index_ctx_relock(suc_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to relock index ""context (error %d). Leaving corrupt ""index. Run chkdsk.", err);
NVolSetErrors(idx_ni->vol);
} else {
/*
* @suc_ictx cannot be the index root or the below
* code would be wrong.
*/
if (suc_ictx->is_root)
panic("%s(): suc_ictx->is_root\n", __FUNCTION__);
ie = suc_ictx->follow_entry;
ie_size = le16_to_cpu(ie->length);
next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size);
ih = suc_ictx->index;
new_ilen = le32_to_cpu(ih->index_length);
memmove(ie, next_ie, new_ilen - ((u8*)next_ie -
(u8*)ih));
ih->index_length = cpu_to_le32(new_ilen - ie_size);
ntfs_index_entry_mark_dirty(suc_ictx);
}
goto put_err;
}
ictx = parent_ictx;
parent_ictx->entry = parent_ictx->entries[pull_down_entry_nr];
parent_ictx->entry_nr = pull_down_entry_nr;
goto prep_simple_delete;
rearrange_tree:
/*
* We now know we have case 2, i.e. we have a non-empty, intervening
* node containing the predecessor (non-leaf) entry of the predecessor
* entry of the entry to be deleted and we need to use it to rearrange
* the tree such that the (leaf) predecessor entry of the entry to be
* deleted no longer is the only entry in its (leaf) node.
*
* To do this we merge the (leaf) node containing the predecessor entry
* of the entry to be deleted with its left-hand sibling (leaf) node.
* This involves pulling the (non-leaf) predecessor entry of the (leaf)
* predecessor entry of the entry to be deleted down to behind its
* (leaf) predecessor entry.
*
* Thus, we need to now locate the leaf node containing the predecessor
* entry behind which we need to insert the two predecessor entries.
*
* To be able to do this we need to lock the node containing the (non-
* leaf) predecessor of the predecessor of the entry to be deleted and
* make a note of it as we are going to delete it later. We also need
* to repoint the tree path to descend into the (non-leaf) predecessor
* entry.
*/
if (parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
if (parent_ictx->is_root)
panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
err = ntfs_index_ctx_relock(parent_ictx);
if (err)
goto put_err;
parent_ictx->entry_nr--;
parent_ictx->follow_entry = parent_ictx->entries[parent_ictx->entry_nr];
pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) -
sizeof(VCN);
/* Now descend to the leaf predecessor entry. */
suc_ictx = parent_ictx;
do {
err = ntfs_index_descend_into_child_node(&suc_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to descend to ""node containing leaf predecessor ""entry of non-leaf predecessor entry ""of leaf predecessor entry of entry ""to be deleted (error %d).", err);
goto put_err;
}
if (!suc_ictx->is_locked)
panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
suc_ictx->entry_nr = suc_ictx->nr_entries - 1;
suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr];
} while (suc_ictx->index->flags & INDEX_NODE);
if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
panic("%s(): suc_ictx->follow_entry->flags & ""INDEX_ENTRY_NODE\n", __FUNCTION__);
/*
* Can the entry to be pulled down and the original predecessor entry
* to be merged in simply be inserted after the located predecessor
* entry without overflowing the node? If not we need to split the
* node and then restart the delete.
*/
if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) {
split_and_restart_case_2:
ntfs_debug("Splitting index node before add on delete (case 2).");
ie_size = pull_down_ie_size + move_ie_size;
goto split_and_restart;
}
/*
* We know we have enough space so we cannot fail. Need to lock the
* non-leaf predecessor that we want to pull down.
*/
if (parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx);
if (err)
goto put_err;
/*
* We need to re-check the amount of free space as it can have changed
* whilst we dropped the lock on @suc_ictx if that was necessary. When
* this happens simply drop the lock on @parent_ictx if we took it and
* go back to the previous code dealing with the not enough space case.
*/
if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) {
if (parent_ictx->is_locked)
ntfs_index_ctx_unlock(parent_ictx);
goto split_and_restart_case_2;
}
/*
* Make enough space in the destination leaf node to accommodate both
* the entries.
*/
err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size +
move_ie_size);
if (err)
panic("%s(): err\n", __FUNCTION__);
/*
* Copy the non-leaf predecessor entry into place and convert it to a
* leaf entry.
*/
ie = suc_ictx->follow_entry;
pull_down_entry = parent_ictx->follow_entry;
memcpy(ie, pull_down_entry, pull_down_ie_size);
ie->length = cpu_to_le16(pull_down_ie_size);
ie->flags &= ~INDEX_ENTRY_NODE;
/*
* Now delete the entry from its original node and transfer its VCN
* sub-node pointer to the next entry (which is the end entry).
*/
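/*
 * Layout reminder (added illustration): an index entry that has a
 * sub-node stores the VCN of that sub-node in its last eight bytes,
 * i.e. at (u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN).  As
 * @pull_down_ie_size was reduced by sizeof(VCN) above, the expression
 * below reads exactly that trailing VCN before the entry is cut out
 * and the VCN is transferred to the entry that moves into its place
 * (the end entry).
 */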
vcn = *(leVCN*)((u8*)pull_down_entry + pull_down_ie_size);
ih = parent_ictx->index;
new_ilen = le32_to_cpu(ih->index_length) - (pull_down_ie_size +
sizeof(VCN));
memmove(pull_down_entry, (u8*)pull_down_entry + pull_down_ie_size +
sizeof(VCN), new_ilen - ((u8*)pull_down_entry -
(u8*)ih));
*(leVCN*)((u8*)pull_down_entry + le16_to_cpu(pull_down_entry->length) -
sizeof(VCN)) = vcn;
/* Update the index size, too. */
ih->index_length = cpu_to_le32(new_ilen);
/* Ensure the updates are written to disk. */
ntfs_index_entry_mark_dirty(parent_ictx);
/* Update the index context as well. */
parent_ictx->bytes_free += pull_down_ie_size + sizeof(VCN);
parent_ictx->nr_entries--;
/* We are done with the non-leaf predecessor node so unlock it. */
if (parent_ictx->is_locked)
ntfs_index_ctx_unlock(parent_ictx);
/*
* Now need to move over the original leaf predecessor entry. Lock its
* node again.
*/
if (deallocate_ictx->is_locked)
panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__);
err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx);
if (err)
goto put_err;
/* Copy the original leaf predecessor entry into place. */
memcpy((u8*)suc_ictx->follow_entry + pull_down_ie_size,
deallocate_ictx->follow_entry, move_ie_size);
/* Ensure the updates are written to disk. */
ntfs_index_entry_mark_dirty(suc_ictx);
/* We are finished rearranging the tree so unlock both nodes. */
ntfs_index_ctx_unlock(suc_ictx);
if (deallocate_ictx->is_locked)
ntfs_index_ctx_unlock(deallocate_ictx);
/*
* The last thing to do is to deallocate the disconnected index node(s)
* and then we can restart the delete operation which now involves a
* simple replace to complete the delete.
*/
parent_ictx = deallocate_ictx;
do {
if (parent_ictx->is_root)
panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
err = ntfs_index_block_free(parent_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to deallocate no ""longer used index block vcn 0x%llx ""(error %d). Run chkdsk to recover ""the lost index block.",
(unsigned long long)parent_ictx->vcn,
err);
NVolSetErrors(idx_ni->vol);
}
} while ((parent_ictx = parent_ictx->up) != deallocate_ictx);
/*
* Release the sub-tree paths that the caller no longer has a reference
* to.
*/
ntfs_index_ctx_put(put_ictx);
/* Reset the state machine to original values. */
put_ictx = deallocate_ictx = NULL;
/*
* The tree is now fully consistent, no nodes are locked, and @suc_ictx
* is the leaf node containing the predecessor entry of the entry to
* be deleted and the predecessor entry no longer is the only entry
* other than the end entry thus it can simply be used to replace the
* entry to be deleted.
*
* Setup everything so we can jump straight into the simple replace
* code. Note we lock the original entry to be deleted as well so that
* the simple replace code does not have to drop the lock on the
* predecessor node in order to lock the original node which it may
* have to do otherwise.
*
* The simple replace code requires the predecessor to be in
* @pred_ictx.
*/
pred_ictx = suc_ictx;
err = ntfs_index_ctx_lock_two(suc_ictx, ictx);
if (err)
return err;
/*
* Repoint the predecessor context to point to the correct entry and
* update it to reflect the addition of the two index entries.
*/
pred_ictx->entry = (INDEX_ENTRY*)((u8*)pred_ictx->entry +
pull_down_ie_size);
pred_ictx->entry_nr++;
pred_ictx->is_match = 1;
pred_ictx->bytes_free -= pull_down_ie_size + move_ie_size;
pred_ictx->nr_entries += 2;
if (pred_ictx->nr_entries > pred_ictx->max_entries)
panic("%s(): pred_ictx->nr_entries > pred_ictx->max_entries\n",
__FUNCTION__);
pred_ictx->entries[pred_ictx->entry_nr] = pred_ictx->entry;
pred_ictx->entries[pred_ictx->entry_nr + 1] = (INDEX_ENTRY*)(
(u8*)pred_ictx->entry + move_ie_size);
/* Everything is set up. Do the simple replace. */
/* goto simple_replace; */
simple_replace:
/*
* The predecessor can simply be removed from its node.
*
* We need to have locked both nodes to be able to do the replace thus
* we may need to unlock the locked predecessor node to ensure that
* page locks are only taken in ascending page offset order so we avoid
* deadlocks.
*/
if (!pred_ictx->is_locked)
panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__);
if (!ictx->is_locked) {
err = ntfs_index_ctx_lock_two(pred_ictx, ictx);
if (err)
return err;
}
/*
* Can the predecessor simply replace the entry to be deleted without
* overflowing the node containing the entry to be deleted? If not we
* need to split the node and then restart the delete.
*/
pred_ie_size = le16_to_cpu(pred_ictx->entry->length);
ie_size = le16_to_cpu(ictx->entry->length) - sizeof(VCN);
if ((int)pred_ie_size - (int)ie_size > (int)ictx->bytes_free) {
ntfs_debug("Splitting index node before add on delete (case ""3).");
/*
* Drop the lock on the predecessor or transfer it to the node
* containing the entry to be deleted if they share the same
* page (in which case @ictx is not marked locked).
*/
if (!pred_ictx->is_locked)
panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__);
if (ictx->is_locked) {
/*
* @ictx is locked thus it cannot be sharing the lock
* with @pred_ictx. Unlock @pred_ictx.
*/
ntfs_index_ctx_unlock(pred_ictx);
} else {
/*
* @ictx is not locked thus it must be sharing the lock
* with @pred_ictx. Transfer the lock across.
*/
if (ictx->is_root)
panic("%s(): ictx->is_root\n", __FUNCTION__);
if (ictx->upl_ofs != pred_ictx->upl_ofs)
panic("%s(): ictx->upl_ofs != ""pred_ictx->upl_ofs\n",
__FUNCTION__);
ictx->is_locked = 1;
pred_ictx->is_locked = 0;
}
if (!ictx->is_locked)
panic("%s(): !ictx->is_locked\n", __FUNCTION__);
suc_ictx = ictx;
ie_size = pred_ie_size - ie_size;
goto split_and_restart;
}
/*
* Both nodes are locked and this is a simple replace, so go ahead and
* do it. We have already checked that there is enough space so it
* cannot fail.
*/
err = ntfs_index_entry_replace(ictx, pred_ictx);
if (err)
panic("%s(): err\n", __FUNCTION__);
/* We are done with the current entry so release it. */
if (ictx->is_locked)
ntfs_index_ctx_unlock(ictx);
/*
* We have now replaced the entry to be deleted but at the moment we
* have two copies of the predecessor entry, so delete the original
* copy from its (leaf) node. We need to switch the predecessor index
* context to be the current context so we can just pretend it is the
* entry to be deleted. We only need to setup the fields relevant to
* index block node deletion as the predecessor cannot be in the index
* root node.
*/
ictx = pred_ictx;
goto prep_simple_delete;
delete_leaf_entry:
/*
* If the entry is in the index root or it is not the only entry (other
* than the end entry) in the leaf node it can simply be removed.
*/
if (ictx->nr_entries > 2 || is_root)
goto simple_delete;
/*
* The entry to be deleted is the only entry and it is not in the index
* root. We have to rebalance the tree so that the entry to be deleted
* is no longer the only entry in its node. There are several cases we
* need to consider:
*
* 1) If the parent entry is not the end entry of its index node then
* simply insert our parent entry at the front of the leaf node on
* our right, i.e. the child node of the entry following our parent
* entry. Then remove our parent entry from our parent node and
* free the leaf node the entry to be deleted is in (thus deleting
* the entry).
*
* What the above describes is effectively merging the leaf node
* containing the entry to be deleted with its right-hand sibling
* leaf node and in the process pulling down our parent entry into
* the merged node and placing it in front of its successor entry.
*
* Our parent node is of course turned from an index node entry to a
* leaf entry as it is pulled down.
*
* Note how if our parent entry is the only entry other than the end
* entry in its node this results in our parent node being empty,
* i.e. containing only the end entry. This is fine for NTFS
* B+trees, i.e. index nodes may be empty. It is only leaf nodes
* that may not be empty.
*
* 2) As point 1) above but the parent entry of the entry to be removed
* does not fit in the leaf node containing its successor thus that
* node needs to be split and the split may need to propagate up the
* tree which is of course the job of ntfs_index_entry_add().
*
* 3) If the parent entry is the end entry of its index node, i.e.
* there is no right-hand sibling leaf node then simply insert the
* entry on the left of our parent entry at the end of its own child
* node, i.e. the leaf node on our left. Then remove the entry on
* the left of our parent entry that we inserted into its child node
* from our parent node and change the VCN sub-node pointer of our
* parent node to point to our left-hand sibling. Finally free the
* leaf node the entry to be deleted is in which is no longer
* referenced from anywhere (thus deleting the entry).
*
* What the above describes is effectively merging the leaf node
* containing the entry to be deleted with its left-hand sibling
* leaf node and in the process pulling down the entry on the
* left-hand side of our parent into the merged node and placing it
* after its predecessor entry.
*
* The left-hand sibling of our parent node is of course turned from
* an index node entry to a leaf entry as it is pulled down.
*
* Note how if the left-hand sibling of our parent entry is the only
* entry other than the end entry, which is also our parent entry,
* in its node this results in our parent node being empty, i.e.
* containing only the end entry. This is fine for NTFS B+trees,
* i.e. index nodes may be empty. It is only leaf nodes that may
* not be empty.
*
* 4) As point 3) above but the left-hand sibling of the parent entry
* of the entry to be removed does not fit in the leaf node
* containing its predecessor thus that node needs to be split and
* the split may need to propagate up the tree which is of course
* the job of ntfs_index_entry_add().
*
* 5) If the parent entry is the end entry of its index node, i.e.
* there is no right-hand sibling leaf node, and there is no entry
* on the left of our parent entry, i.e. there is no left-hand
* sibling leaf node, i.e. the parent node is completely empty, we
* cannot merge with a sibling as there is none thus we need to
* record the current state and repeat a very similar procedure as
* above points 1-4 except applied to the parent node which is not
* a leaf node so we need to be careful with VCN sub-node pointers.
* The aim of doing this is to merge the parent node with one of its
* siblings so that it is no longer empty so that our leaf node
* containing the entry to be deleted has at least one sibling so
* that we can merge the leaf node with one of its siblings. If the
* parent node of the parent node has no siblings either we need to
* push the recorded state down into the stack of recorded states,
* record the new current state, and then go to the next parent and
* try to merge that with its sibling and so on until we find a
* parent with a sibling that can be merged. There are two possible
* outcomes (not counting errors):
* a) The going up and merging will eventually succeed in which case
* we go backwards through the stack of recorded states merging
* nodes as we go along until we reach the last recorded state at
* which point we have reduced the problem to one of the above
* points 1-4 thus we repeat them to finish.
* b) We reach the index root which by definition cannot have any
* siblings. Also because we reached the index root it means
* that the index root must be empty and the end entry is
* pointing to the VCN sub-node we came from and because the
* entry to be deleted is the last entry in its leaf-node and all
* parent nodes including the index root are empty the entry to
* be deleted is the very last entry of the directory thus it can
* simply be removed by truncating the index allocation attribute
* to zero size and if the index bitmap is non-resident
* truncating it to zero size, too, and if the index bitmap is
* resident zeroing its contents instead of truncating it and
* finally the index root is switched to be a small index, which
* is reflected in its index flags and in the removal of the VCN
* sub-node pointer in the end entry of the index root.
* NOTE: Instead of recursing, we do it in one go by simply freeing
* all the index node blocks in the tree path until we find a
* non-empty node, i.e. a node that can be merged, and we then
* perform the merge by pulling down the appropriate index entry but
* instead of pulling it down one level we pull it down to before its
* successor (if merging to the right) or to after its predecessor (if
* merging on the left) which by definition will be in a leaf node.
* That gives the same results as the recursion but it is much less
* work to do and it affects fewer index nodes.
* NOTE2: After implementing NOTE above, we ditch the code handling
* cases 1-4 above as they are just special cases of case 5 when
* implemented like NOTE.
*
* That is quite enough description now, let the games begin. First,
* investigate the parent index entry of the leaf entry to be deleted
* to see whether it is the end entry or not to determine which of the
* above categories the deletion falls into.
*
* We must be at the bottom of the current tree path or things will get
* very confused.
*/
if (!ictx->down->is_root)
panic("%s(): !ictx->down->is_root\n", __FUNCTION__);
/*
* We are going to disconnect @ictx thus the tree starting at the index
* root is no longer referenced by the caller so we need to free it
* later.
*/
put_ictx = ictx->down;
/*
* Obtain the parent node, unlock the current node, disconnect it from
* the tree, and mark it for deallocation later.
*/
parent_ictx = ictx->up;
ntfs_index_ctx_unlock(ictx);
ntfs_index_ctx_disconnect_reinit(ictx);
deallocate_ictx = ictx;
/*
* Investigate the parent node. If it is not empty we can immediately
* proceed with the merge to the right or left. If it is empty then we
* need to ascend the tree until we find a non-empty node or until we
* reach an empty index root in which case the index is becoming empty
* so we delete its contents by resetting it to zero size.
*/
while (parent_ictx->nr_entries <= 1) {
if (parent_ictx->nr_entries != 1)
panic("%s(): parent_ictx->nr_entries != 1\n",
__FUNCTION__);
if (parent_ictx->is_root) {
/*
* The index is becoming empty. To simplify things
* merge the two sub-trees back together and then empty
* the index.
*/
deallocate_ictx->up->down = parent_ictx->down;
parent_ictx->down->up = deallocate_ictx->up;
deallocate_ictx->up = parent_ictx;
parent_ictx->down = deallocate_ictx;
return ntfs_index_make_empty(deallocate_ictx);
}
/*
* Empty index node. Move it to the list of index nodes to be
* deallocated and proceed to its parent.
*/
ictx = parent_ictx;
parent_ictx = parent_ictx->up;
ntfs_index_ctx_move(ictx, deallocate_ictx);
}
/*
* We have a non-empty parent node which we need to lock as we need to
* access its contents.
*/
if (parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
err = ntfs_index_ctx_relock(parent_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to relock the parent ""index node (error %d).", err);
goto put_err;
}
/* INDEX_NODE == LARGE_INDEX */
if (!(parent_ictx->follow_entry->flags & INDEX_ENTRY_NODE) ||
!(parent_ictx->index->flags & INDEX_NODE))
panic("%s(): !(parent_ictx->follow_entry->flags & ""INDEX_ENTRY_NODE) || ""!(parent_ictx->index->flags & INDEX_NODE)\n",
__FUNCTION__);
/*
* If the parent entry is the end entry in its node we cannot merge to
* the right so merge to the left instead.
*/
if (parent_ictx->follow_entry->flags & INDEX_ENTRY_END)
goto merge_left;
/*
* The parent entry is not the end entry in its node so merge on the
* right. To do this we need to find the successor (leaf) entry for
* which we need to repoint the path so we descend into the sub-node of
* the entry on the right-hand side of the parent entry.
*
* We also make a note of the parent entry as we are going to pull it
* down in front of its successor entry and we then have to delete it
* from this node later.
*/
pull_down_entry = parent_ictx->follow_entry;
pull_down_entry_nr = parent_ictx->entry_nr;
pull_down_ie_size = le16_to_cpu(pull_down_entry->length);
parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry +
pull_down_ie_size);
pull_down_ie_size -= sizeof(VCN);
parent_ictx->entry_nr++;
suc_ictx = parent_ictx;
do {
err = ntfs_index_descend_into_child_node(&suc_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to descend to ""node containing successor entry ""(error %d).", err);
goto put_err;
}
if (!suc_ictx->is_locked)
panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
suc_ictx->follow_entry = suc_ictx->entries[0];
} while (suc_ictx->index->flags & INDEX_NODE);
if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
panic("%s(): suc_ictx->follow_entry->flags & ""INDEX_ENTRY_NODE\n", __FUNCTION__);
/*
* Can the parent entry simply be inserted in front of its successor
* leaf entry without overflowing the node? If not we need to split
* the node and then restart the delete.
*/
if (pull_down_ie_size > suc_ictx->bytes_free) {
split_and_restart_case_4:
ntfs_debug("Splitting index node before add on delete (case 4).");
ie_size = pull_down_ie_size;
goto split_and_restart;
}
/*
* The parent entry can simply be inserted in front of its successor
* leaf entry.
*
* We need to have locked both nodes to be able to do the insert thus
* we may need to unlock the locked successor node.
*/
if (parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx);
if (err)
goto put_err;
/*
* We need to re-check the amount of free space as it can have changed
* whilst we dropped the lock on @suc_ictx if that was necessary. When
* this happens simply drop the lock on @parent_ictx if we took it and
* go back to the previous code dealing with the not enough space case.
*/
if (pull_down_ie_size > suc_ictx->bytes_free) {
if (parent_ictx->is_locked)
ntfs_index_ctx_unlock(parent_ictx);
goto split_and_restart_case_4;
}
/*
* Both nodes are locked and this is a simple insert, so go ahead and
* do it. We have already checked that there is enough space so it
* cannot fail.
*
* Move all the entries out of the way to make space for the parent
* entry to be copied in.
*/
err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size);
if (err)
panic("%s(): err\n", __FUNCTION__);
/*
* Copy the parent index entry into the created space in front of its
* successor and switch the inserted entry to being a leaf entry.
*/
ie = suc_ictx->follow_entry;
pull_down_entry = parent_ictx->entries[pull_down_entry_nr];
memcpy(ie, pull_down_entry, pull_down_ie_size);
ie->length = cpu_to_le16(pull_down_ie_size);
ie->flags &= ~INDEX_ENTRY_NODE;
/* Ensure the updates are written to disk. */
ntfs_index_entry_mark_dirty(suc_ictx);
/*
* We are done with the successor node so release it or transfer it to
* the parent node if they share the same page (in which case
* @parent_ictx is not marked locked).
*/
if (!suc_ictx->is_locked)
panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
if (parent_ictx->is_locked) {
/*
* @parent_ictx is locked thus it cannot be sharing the lock
* with @suc_ictx. Unlock @suc_ictx.
*/
ntfs_index_ctx_unlock(suc_ictx);
} else {
/*
* @parent_ictx is not locked thus it must be sharing the lock
* with @suc_ictx. Transfer the lock across.
*/
if (parent_ictx->is_root)
panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
if (parent_ictx->upl_ofs != suc_ictx->upl_ofs)
panic("%s(): parent_ictx->upl_ofs != ""suc_ictx->upl_ofs\n", __FUNCTION__);
parent_ictx->is_locked = 1;
suc_ictx->is_locked = 0;
}
if (!parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
/*
* We have now inserted the parent entry in front of its successor but
* at the moment we have two copies of the parent entry, so delete the
* original copy from the parent node. We need to switch the parent
* index context to be the current context and to point to the parent
* entry we pulled down so we can just pretend it is the entry to be
* deleted. Note this is a simple delete just like for a leaf entry as
* we have taken care of its child leaf node(s) already and will
* deallocate it(them) later.
*/
ictx = parent_ictx;
parent_ictx->entry = pull_down_entry;
parent_ictx->entry_nr = pull_down_entry_nr;
goto prep_simple_delete;
merge_left:
/*
* The parent entry is the end entry in its node so merge on the left.
* To do this we need to find the predecessor (leaf) entry for which we
* need to repoint the path so we descend into the sub-node of the
* entry on the left-hand side of the parent entry. Once found we need
* to pull down the new parent entry, i.e. the entry on the left of the
* current parent entry, behind its predecessor entry.
*
* We also make a note of the original parent entry as we have to
* change its VCN sub-node pointer later.
*/
old_parent_entry_nr = parent_ictx->entry_nr;
old_parent_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) -
sizeof(VCN);
parent_ictx->entry_nr--;
parent_ictx->follow_entry =
parent_ictx->entries[parent_ictx->entry_nr];
pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) -
sizeof(VCN);
suc_ictx = parent_ictx;
do {
err = ntfs_index_descend_into_child_node(&suc_ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to descend to ""node containing predecessor entry of ""left-hand sibling entry (error %d).",
err);
goto put_err;
}
if (!suc_ictx->is_locked)
panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
suc_ictx->entry_nr = suc_ictx->nr_entries - 1;
suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr];
} while (suc_ictx->index->flags & INDEX_NODE);
if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
panic("%s(): suc_ictx->follow_entry->flags & ""INDEX_ENTRY_NODE\n", __FUNCTION__);
/*
* Can the entry on the left-hand side of the parent entry simply be
* inserted in after its predecessor leaf entry, i.e. before the end
* entry in the leaf node containing the predecessor leaf entry,
* without overflowing the node? If not we need to split the node and
* then restart the delete.
*/
if (pull_down_ie_size > suc_ictx->bytes_free) {
split_and_restart_case_5:
ntfs_debug("Splitting index node before add on delete (case 5).");
ie_size = pull_down_ie_size;
goto split_and_restart;
}
/*
* The entry on the left-hand side of the parent entry can simply be
* inserted in front of the end entry of the leaf node containing its
* predecessor.
*
* We need to have locked both nodes to be able to do the insert thus
* we may need to unlock the locked predecessor node to ensure that
* page locks are only taken in ascending page index order so we avoid
* deadlocks.
*/
if (parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx);
if (err)
goto put_err;
/*
* We need to re-check the amount of free space as it can have changed
* whilst we dropped the lock on @suc_ictx if that was necessary. When
* this happens simply drop the lock on @parent_ictx if we took it and
* go back to the previous code dealing with the not enough space case.
*/
if (pull_down_ie_size > suc_ictx->bytes_free) {
if (parent_ictx->is_locked)
ntfs_index_ctx_unlock(parent_ictx);
goto split_and_restart_case_5;
}
/*
* Both nodes are locked and this is a simple insert, so go ahead and
* do it. We have already checked that there is enough space so it
* cannot fail.
*
* Move the end entry out of the way to make space for the entry to be
* copied in.
*/
err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size);
if (err)
panic("%s(): err\n", __FUNCTION__);
/*
* Copy the parent index entry into the created space in front of its
* predecessor and switch the inserted entry to being a leaf entry.
*/
ie = suc_ictx->follow_entry;
memcpy(ie, parent_ictx->follow_entry, pull_down_ie_size);
ie->length = cpu_to_le16(pull_down_ie_size);
ie->flags &= ~INDEX_ENTRY_NODE;
/* Ensure the updates are written to disk. */
ntfs_index_entry_mark_dirty(suc_ictx);
/*
* We are done with the predecessor node so release it or transfer it
* to the parent node if they share the same page (in which case
* @parent_ictx is not marked locked).
*/
if (!suc_ictx->is_locked)
panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
if (parent_ictx->is_locked) {
/*
* @parent_ictx is locked thus it cannot be sharing the lock
* with @suc_ictx. Unlock @suc_ictx.
*/
ntfs_index_ctx_unlock(suc_ictx);
} else {
/*
* @parent_ictx is not locked thus it must be sharing the lock
* with @suc_ictx. Transfer the lock across.
*/
if (parent_ictx->is_root)
panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
if (parent_ictx->upl_ofs != suc_ictx->upl_ofs)
panic("%s(): parent_ictx->upl_ofs != ""suc_ictx->upl_ofs\n", __FUNCTION__);
parent_ictx->is_locked = 1;
suc_ictx->is_locked = 0;
}
if (!parent_ictx->is_locked)
panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
/*
* We have now inserted the left-hand sibling entry of the parent entry
* in front of the end entry of the leaf node containing its
* predecessor entry but at the moment we have two copies of the entry,
* so delete the original copy from its index node.
*
* Before we do that, update the VCN sub-node pointer of the original
* parent entry to point to the index node that is about to become
* parentless. Note we do not mark the index node dirty as that will
* happen when the original copy of the entry we pulled down is
* deleted.
*/
*(leVCN*)((u8*)parent_ictx->entries[old_parent_entry_nr] +
old_parent_ie_size) = *(leVCN*)(
(u8*)parent_ictx->follow_entry + pull_down_ie_size);
/*
* All that is left now is to delete the original copy of the entry we
* pulled down. We need to switch the parent index context to be the
* current context as it already points to the entry we pulled down
* thus we can just pretend it is the entry to be deleted. Note this
* is a simple delete just like for a leaf entry as we have taken care
* of transfering its child leaf node(s) to the original parent entry
* already and will deallocate the origial parent entry's child leaf
* node(s) later.
*/
ictx = parent_ictx;
prep_simple_delete:
ie = ictx->entry;
is_root = ictx->is_root;
ictx->is_match = 1;
ih = ictx->index;
ie_size = le16_to_cpu(ie->length);
next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size);
/* goto simple_delete; */
simple_delete:
new_ilen = le32_to_cpu(ih->index_length) - ie_size;
if (is_root) {
ATTR_RECORD *a;
u32 muse;
m = ictx->actx->m;
muse = le32_to_cpu(m->bytes_in_use);
/*
* Move the index entries following @ie into @ie's position.
* As an optimization we combine the moving of the index
* entries and the resizing of the index root attribute into a
* single operation.
*/
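/*
 * Worked example of the move size (added illustration): @next_ie lies
 * inside the index root attribute value which itself lies inside the
 * mft record @m, so (u8*)next_ie - (u8*)m is the offset of the first
 * byte to be kept and muse minus that offset is the number of bytes
 * from @next_ie to the end of the in-use part of the mft record.
 * Moving that many bytes down by @ie_size both closes the gap in the
 * index and shifts any attribute records following the index root,
 * which is why no separate resize memmove of the attribute record is
 * needed.
 */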
memmove(ie, next_ie, muse - ((u8*)next_ie - (u8*)m));
/* Adjust the mft record to reflect the change in used space. */
m->bytes_in_use = cpu_to_le32(muse - ie_size);
/*
* Adjust the attribute record to reflect the change in
* attribute value and attribute record size.
*/
a = ictx->actx->a;
a->length = cpu_to_le32(le32_to_cpu(a->length) - ie_size);
a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) -
ie_size);
/* Adjust the index header to reflect the change in length. */
ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen);
} else {
/* Move index entries following @ie into @ie's position. */
memmove(ie, next_ie, new_ilen - ((u8*)ie - (u8*)ih));
/* Adjust the index header to reflect the change in length. */
ih->index_length = cpu_to_le32(new_ilen);
}
/* Ensure the updates are written to disk. */
ntfs_index_entry_mark_dirty(ictx);
/*
* If we have scheduled any index block nodes to be deallocated do it
* now but first unlock the node we just deleted an entry from so we do
* not deadlock.
*/
if (deallocate_ictx) {
ntfs_index_ctx_unlock(ictx);
ictx = deallocate_ictx;
do {
if (ictx->is_root)
panic("%s(): ictx->is_root\n", __FUNCTION__);
err = ntfs_index_block_free(ictx);
if (err) {
ntfs_error(idx_ni->vol->mp, "Failed to ""deallocate no longer used ""index block VCN 0x%llx ""(error %d). Run chkdsk to ""recover the lost index ""block.",
(unsigned long long)ictx->vcn,
err);
NVolSetErrors(idx_ni->vol);
}
} while ((ictx = ictx->up) != deallocate_ictx);
}
/*
* Release any sub-tree paths that the caller no longer has a reference
* to.
*/
if (put_ictx)
ntfs_index_ctx_put(put_ictx);
ntfs_debug("Done.");
return 0;
split_and_restart:
/*
* Split the index node described by @suc_ictx taking into account the
* fact that it is very likely that an entry of @ie_size will be added
* to the index in front of the position pointed to by @suc_ictx.
*/
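/*
 * Added note (illustration only): passing @ie_size lets the split take
 * into account that an entry of roughly that size is expected to be
 * inserted at the position described by @suc_ictx, and the caller then
 * restarts the whole delete via the -EAGAIN convention documented at
 * the top of this function instead of trying to repair the now stale
 * tree path in place.
 */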
err = ntfs_index_node_split(suc_ictx, ie_size);
if (!err) {
/*
* Signal to the caller that they need to restart the delete
* from scratch.
*/
err = -EAGAIN;
}
put_err:
if (put_ictx)
ntfs_index_ctx_put(put_ictx);
return err;
}