/*
* ntfs_vnops.c - NTFS kernel vnode operations.
*
* Copyright (c) 2006-2011 Anton Altaparmakov. All Rights Reserved.
* Portions Copyright (c) 2006-2011 Apple Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of Apple Inc. ("Apple") nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ALTERNATIVELY, provided that this notice and licensing terms are retained in
* full, this file may be redistributed and/or modified under the terms of the
* GNU General Public License (GPL) Version 2, in which case the provisions of
* that version of the GPL will apply to you instead of the license terms
* above. You can obtain a copy of the GPL Version 2 at
* http://developer.apple.com/opensource/licenses/gpl-2.txt.
*/
#include<sys/attr.h>
#include<sys/buf.h>
#include<sys/errno.h>
#include<sys/param.h>
#include<sys/stat.h>
#include<sys/syslimits.h>
#include<sys/time.h>
#include<sys/ubc.h>
#include<sys/ucred.h>
#include<sys/uio.h>
#include<sys/unistd.h>
#include<sys/vnode.h>
#include<sys/vnode_if.h>
#include<sys/xattr.h>
#include<string.h>
#include<mach/kern_return.h>
#include<mach/memory_object_types.h>
#include<kern/debug.h>
#include<kern/locks.h>
#include<vfs/vfs_support.h>
#include"ntfs.h"
#include"ntfs_attr.h"
#include"ntfs_bitmap.h"
#include"ntfs_compress.h"
#include"ntfs_debug.h"
#include"ntfs_dir.h"
#include"ntfs_endian.h"
#include"ntfs_hash.h"
#include"ntfs_inode.h"
#include"ntfs_layout.h"
#include"ntfs_lcnalloc.h"
#include"ntfs_mft.h"
#include"ntfs_mst.h"
#include"ntfs_page.h"
#include"ntfs_sfm.h"
#include"ntfs_time.h"
#include"ntfs_unistr.h"
#include"ntfs_vnops.h"
#include"ntfs_volume.h"/* Global ntfs vnode operations. */
/*
 * Pointer to the ntfs vnode operations.  It is defined here without an
 * initializer and without "static", so it has external linkage and starts out
 * as NULL.
 *
 * NOTE(review): nothing in this part of the file assigns it; presumably it is
 * filled in when the ntfs vnode operations vector is registered -- confirm.
 */
vnop_t **ntfs_vnodeop_p;
/**
 * ntfs_cluster_iodone - post-process a cluster buffer once its i/o is complete
 * @cbp:	cluster head buffer for which i/o has completed
 * @arg:	callback argument (not used)
 *
 * The buffer is mapped and, for a multi sector transfer (mst) protected inode,
 * the mst protection is removed from every complete record in the buffer:
 *
 *	- After a read, ntfs_mst_fixup_post_read() is applied to each record.
 *	  Its return value is deliberately ignored.
 *	- After a write, ntfs_mst_fixup_post_write() is applied to each record
 *	  that carries the expected magic.  Only index allocation attributes
 *	  are accepted in the write case; anything else causes a panic().
 *
 * Records are only processed up to the initialized size of the attribute.  A
 * buffer beginning past the data size fails with EINVAL whilst a buffer
 * beginning past the initialized size (but not past the data size) is left
 * untouched and is not treated as an error.
 *
 * Encrypted inodes and inodes that are neither mst protected nor encrypted
 * must never get here and cause a panic().
 *
 * Return 0 on success and errno on error.  An error from buf_unmap() is only
 * returned if no earlier error was recorded.
 */
int ntfs_cluster_iodone(buf_t cbp, void *arg __unused)
{
	BOOL is_read = buf_flags(cbp) & B_READ;
	ntfs_inode *ni = NTFS_I(buf_vnode(cbp));
	long count = buf_count(cbp);
	u8 *cur, *end;
	errno_t err, unmap_err;

	if (count & (ni->block_size - 1))
		panic("%s(): Called with size not a multiple of the inode "
				"block size.\n", __FUNCTION__);
	err = buf_map(cbp, (caddr_t*)&cur);
	if (err) {
		ntfs_error(ni->vol->mp, "Failed to map buffer (error %d).",
				err);
		/* Nothing was mapped thus there is nothing to unmap. */
		return err;
	}
	if (NInoMstProtected(ni)) {
		s64 start, data_size, init_size;
		u32 rec_size = ni->block_size;
		NTFS_RECORD_TYPE magic = 0;

		/* The magic is only needed (and checked) after a write. */
		if (!is_read) {
			if (ni->type != AT_INDEX_ALLOCATION)
				panic("%s(): Unknown mst protected inode "
						"0x%llx, type 0x%x, name_len "
						"0x%x.", __FUNCTION__,
						(unsigned long long)ni->mft_no,
						(unsigned)le32_to_cpu(ni->type),
						(unsigned)ni->name_len);
			else
				magic = magic_INDX;
		}
		/* Byte offset in the attribute at which this buffer starts. */
		start = (s64)buf_lblkno(cbp) << PAGE_SHIFT;
		/* Take a consistent snapshot of the attribute sizes. */
		lck_spin_lock(&ni->size_lock);
		data_size = ni->data_size;
		init_size = ni->initialized_size;
		lck_spin_unlock(&ni->size_lock);
		/*
		 * Beyond the initialized size the data is zero so the
		 * deprotection would fail and, in the write case, would even
		 * lead to a kernel panic.  Thus clamp the range to process.
		 */
		if (start + count > init_size) {
			if (start > data_size) {
				ntfs_error(ni->vol->mp, "Buffer begins past "
						"the end of the data of the "
						"attribute (mft_no 0x%llx).",
						(unsigned long long)ni->mft_no);
				err = EINVAL;
				goto unmap;
			}
			if (start > init_size) {
				/* Nothing to do; @err is still zero here. */
				ntfs_debug("Buffer begins past the end of the "
						"initialized data of the "
						"attribute (mft_no 0x%llx).",
						(unsigned long long)ni->mft_no);
				goto unmap;
			}
			count = init_size - start;
		}
		end = cur + count;
		/*
		 * Only whole records are processed so a last record straddling
		 * the (clamped) end of the range is skipped.
		 */
		for (; cur + rec_size <= end; cur += rec_size) {
			NTFS_RECORD *rec = (NTFS_RECORD*)cur;

			if (is_read)
				(void)ntfs_mst_fixup_post_read(rec, rec_size);
			else if (__ntfs_is_magic(rec->magic, magic))
				ntfs_mst_fixup_post_write(rec);
		}
	} else if (NInoEncrypted(ni)) {
		// TODO: Need to decrypt the encrypted sectors here.  This
		// cannot happen at present as we deny opening/reading/writing/
		// paging encrypted vnodes.
		panic("%s(): Called for encrypted vnode.\n", __FUNCTION__);
	} else
		panic("%s(): Called for normal vnode.\n", __FUNCTION__);
unmap:
	unmap_err = buf_unmap(cbp);
	if (unmap_err) {
		/* Do not let the unmap error hide an earlier error. */
		if (!err)
			err = unmap_err;
		ntfs_error(ni->vol->mp, "Failed to unmap buffer (error %d).",
				unmap_err);
	}
	return err;
}
/**
 * ntfs_buf_iodone - remove the MST fixups when i/o is complete on a buffer
 * @buf:	buffer for which to remove the MST fixups
 * @arg:	unused, always NULL
 *
 * ntfs_buf_iodone() is the i/o completion handler for buffers belonging to
 * $MFT/$DATA.  It is invoked both after successful and after failed i/o.  Once
 * it returns the buffer busy state (BL_BUSY flag) is cleared and others can
 * access the buffer again.
 *
 * After a successful read the fixups are removed with
 * ntfs_mst_fixup_post_read() and a failure to do so is recorded in the buffer
 * via buf_seterror().  After a write (successful or not) the fixups are
 * removed with ntfs_mst_fixup_post_write().  After a failed read there are no
 * fixups to remove thus the buffer is left alone.
 *
 * Any other error is stored in the buffer with buf_seterror() unless the
 * buffer already carries an error.
 *
 * Note that ntfs_buf_iodone() is called deep from within the driver stack and
 * thus there are limitations on what it is allowed to do.  In particular it is
 * not allowed to initiate new i/o operations nor to allocate/free memory.
 *
 * WARNING: This function can be called whilst an unmount is in progress and
 * thus it may not look up nor use the ntfs_volume structure to which the inode
 * belongs.
 */
static void ntfs_buf_iodone(buf_t buf, void *arg __unused)
{
	vnode_t vn = buf_vnode(buf);
	mount_t mp = vnode_mount(vn);
	ntfs_inode *ni = NTFS_I(vn);
	NTFS_RECORD *rec;
	s64 ofs, data_size, init_size;
	unsigned size, b_flags, is_read;
	errno_t err;

	ntfs_debug("Entering for mft_no 0x%llx, lblkno 0x%llx.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)buf_lblkno(buf));
	/* Only the base inode of $MFT (mft_no 0) is handled here. */
	if (!NInoMstProtected(ni) || ni->mft_no || NInoAttr(ni))
		panic("%s(): Called not for $MFT!\n", __FUNCTION__);
	/* Each buffer must describe exactly one record. */
	size = buf_count(buf);
	if (size != ni->block_size)
		panic("%s(): size != ni->block_size\n", __FUNCTION__);
	/* Byte offset in the attribute at which this buffer begins. */
	ofs = (s64)buf_lblkno(buf) << ni->block_size_shift;
	/* Take a consistent snapshot of the attribute sizes. */
	lck_spin_lock(&ni->size_lock);
	data_size = ni->data_size;
	init_size = ni->initialized_size;
	lck_spin_unlock(&ni->size_lock);
	b_flags = buf_flags(buf);
	is_read = b_flags & B_READ;
	/*
	 * Beyond the initialized size the data is zero so the deprotection
	 * would fail and, in the write case, would even lead to a kernel
	 * panic.  Refuse buffers which begin out of bounds.
	 */
	if (ofs + size > init_size) {
		if (ofs > data_size) {
			ntfs_error(mp, "Buffer begins past the end of the "
					"data of the attribute (mft_no "
					"0x%llx).",
					(unsigned long long)ni->mft_no);
			err = EINVAL;
			goto fail;
		}
		if (ofs > init_size) {
			ntfs_error(mp, "Buffer begins past the end of the "
					"initialized data of the attribute "
					"(mft_no 0x%llx).",
					(unsigned long long)ni->mft_no);
			err = EINVAL;
			goto fail;
		}
	}
	/* A failed read left no fixups behind that could be removed. */
	if (buf_error(buf) && is_read) {
		ntfs_debug("Done.");
		return;
	}
	err = buf_map(buf, (caddr_t*)&rec);
	if (err) {
		ntfs_error(mp, "Failed to map buffer (error %d).", err);
		goto fail;
	}
	if (!is_read)
		ntfs_mst_fixup_post_write(rec);
	else {
		err = ntfs_mst_fixup_post_read(rec, size);
		if (err) {
			ntfs_error(mp, "Multi sector transfer error "
					"detected in mft_no 0x%llx "
					"(error %d). Run chkdsk",
					(unsigned long long)ni->mft_no,
					err);
			/* Record the error but still unmap the buffer. */
			buf_seterror(buf, err);
		}
	}
	err = buf_unmap(buf);
	if (err) {
		ntfs_error(mp, "Failed to unmap buffer (error %d).", err);
		goto fail;
	}
	ntfs_debug("Done.");
	return;
fail:
	/* Keep the first error if the buffer already has one. */
	if (!buf_error(buf))
		buf_seterror(buf, err);
	ntfs_debug("Failed.");
}
/**
* ntfs_vnop_strategy - prepare and issue the i/o described by a buffer
* @a: arguments to strategy function
*
* @a contains:
* buf_t a_bp; buffer for which to prepare and issue the i/o
*
* Prepare and issue the i/o described by the buffer @a->a_bp. Adapted from
* buf_strategy().
*
* In NTFS, we only ever get called for buffers which have a page list
* attached. The page list is mapped and the address of the mapping is stored
* in (u8*)buf_dataptr(@a->a_bp). The exception to this is i/o for $MFT/$DATA
* and $MFTMirr/$DATA which is issued via buf_meta_bread(), etc, and thus does
* not involve a page list at all.
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_strategy(struct vnop_strategy_args *a)
{
s64 ofs, max_end_io;
daddr64_t lblkno;
buf_t buf = a->a_bp;
vnode_t vn = buf_vnode(buf);
ntfs_inode *ni;
ntfs_volume *vol;
void (*old_iodone)(buf_t, void *);
void *old_transact;
unsigned b_flags;
errno_t err, err2;
BOOL do_fixup;
/* Same checks as in buf_strategy(). */if (!vn || vnode_ischr(vn) || vnode_isblk(vn))
panic("%s(): !vn || vnode_ischr(vn) || vnode_isblk(vn)\n",
__FUNCTION__);
ni = NTFS_I(vn);
if (!ni) {
err = EIO;
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
gotoerr;
}
ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, ""logical block 0x%llx.", (unsignedlonglong)ni->mft_no,
le32_to_cpu(ni->type), (unsigned)ni->name_len,
(unsignedlonglong)buf_lblkno(buf));
if (S_ISDIR(ni->mode))
panic("%s(): Called for directory vnode.\n", __FUNCTION__);
vol = ni->vol;
b_flags = buf_flags(buf);
/*
* If we are called from cluster_io() then pass the request down to the
* underlying device containing the NTFS volume. We have no KPI way of
* doing this directly so we invoke buf_strategy() and rely on the fact
* that it does not do anything other than associate the physical
* device with the buffer and then pass the buffer down to the device.
*/if (b_flags & B_CLUSTER)
gotodone;
/*
* If this i/o is for $MFTMirr/$DATA send it through straight without
* modifications. This is because we keep the $MFTMirr/$DATA buffers
* in memory with the fixups applied for simplicity.
*/if (ni->mft_no == FILE_MFTMirr && !NInoAttr(ni))
gotodone;
/*
* Except for $MFT/$DATA we never do i/o via file system buffers thus
* we should never get here.
*/if (ni->mft_no != FILE_MFT || NInoAttr(ni))
panic("%s(): Called for non-cluster i/o buffer.\n",
__FUNCTION__);
/*
* We are reading/writing $MFT/$DATA.
*
* For reads, i/o is allowed up to the data_size whilst for writes, i/o
* is only allowed up to the initialized_size.
*
* Further when reading past the initialized size we do not need to do
* i/o at all as we can simply clear the buffer and return success.
*/
lblkno = buf_lblkno(buf);
ofs = lblkno << ni->block_size_shift;
lck_spin_lock(&ni->size_lock);
max_end_io = ni->initialized_size;
do_fixup = FALSE;
if (b_flags & B_READ) {
if (ofs >= max_end_io) {
if (max_end_io > ni->data_size)
panic("%s() initialized_size > data_size\n",
__FUNCTION__);
if (ofs < ni->data_size) {
lck_spin_unlock(&ni->size_lock);
buf_clear(buf);
buf_biodone(buf);
ntfs_debug("Read past initialized size. ""Clearing buffer.");
return 0;
}
}
max_end_io = ni->data_size;
do_fixup = TRUE;
}
lck_spin_unlock(&ni->size_lock);
if (ofs >= max_end_io) {
/* I/o is out of range. This should never happen. */
ntfs_error(vol->mp, "Trying to %s buffer for $MFT/$DATA which ""is out of range, aborting.",
b_flags & B_READ ? "read" : "write");
err = EIO;
gotoerr;
}
/*
* For writes we need to apply the MST fixups before calling
* buf_strategy() which will perform the i/o and if the write is for an
* mft record that is also in the mft mirror we now need to write it to
* the mft mirror as well.
*
* Note B_WRITE is a pseudo flag and cannot be used for checking thus
* check that B_READ is not set which implies it is a write.
*/if (!(b_flags & B_READ)) {
NTFS_RECORD *rec;
NTFS_RECORD_TYPE magic;
BOOL need_mirr_sync;
err = buf_map(buf, (caddr_t*)&rec);
if (err) {
ntfs_error(vol->mp, "Failed to map buffer (error %d).",
err);
gotoerr;
}
if (!rec)
panic("%s(): buf_map() returned NULL.\n", __FUNCTION__);
#if 0
need_mirr_sync = FALSE;
if (ni->type == AT_INDEX_ALLOCATION)
magic = magic_INDX;
elseif (ni == mft_ni || ni == vol->mftmirr_ni) {
magic = magic_FILE;
if (ni == mft_ni)
need_mirr_sync = (lblkno < vol->mftmirr_size);
} else
panic("%s(): Unknown mst protected inode 0x%llx, type ""0x%x, name_len 0x%x.", __FUNCTION__,
(unsignedlonglong)ni->mft_no,
(unsigned)le32_to_cpu(ni->type),
(unsigned)ni->name_len);
#else
need_mirr_sync = (lblkno < vol->mftmirr_size);
magic = magic_FILE;
#endif/*
* Only apply fixups if the record has the correct magic. We
* may have detected a multi sector transfer error and are thus
* now writing a BAAD record in which case we do not want to
* touch its contents.
*
* Further, if there is an error do not sync the record to the
* mft mirror as that may still be intact and we do not want to
* overwrite the correct data with corrupt data.
*/if (__ntfs_is_magic(rec->magic, magic)) {
err = ntfs_mst_fixup_pre_write(rec, ni->block_size);
if (err) {
/* The record is corrupt, do not write it. */
ntfs_error(vol->mp, "Failed to apply mst ""fixups (mft_no 0x%llx, type ""0x%x, offset 0x%llx).",
(unsignedlonglong)ni->mft_no,
(unsigned)le32_to_cpu(ni->type),
(unsignedlonglong)ofs);
err = EIO;
gotounm_err;
}
do_fixup = TRUE;
if (need_mirr_sync) {
/*
* Note we continue despite an error as we may
* succeed to write the actual mft record.
*/
err = ntfs_mft_mirror_sync(vol, lblkno,
(MFT_RECORD*)rec,
!(b_flags & B_ASYNC));
if (err)
ntfs_error(vol->mp, "Failed to sync ""mft mirror (error ""%d). Run chkdsk.",
err);
}
}
err = buf_unmap(buf);
if (err)
ntfs_error(vol->mp, "Failed to unmap buffer (error ""%d).", err);
}
/*
* For both reads and writes we need to register our i/o completion
* handler which will be called after i/o is complete (including on i/o
* failure) and in which we will remove the MST fixups so the buffer in
* memory never has MST fixups applied unless it is under i/o in which
* case it is BL_BUSY and thus cannot be accessed by anyone so it is
* safe to have the MST fixups applied whilst i/o is in flight.
*/if (do_fixup) {
buf_setfilter(buf, ntfs_buf_iodone, NULL, &old_iodone,
&old_transact);
if (old_iodone || old_transact)
panic("%s(): Buffer for $MFT/$DATA already had an i/o ""completion handler assigned!\n",
__FUNCTION__);
}
/*
* Everything is set up. Pass the i/o onto the buffer layer.
*
* When the i/o is done it will call our i/o completion handler which
* will remove the mst fixups.
*/done:
return buf_strategy(vol->dev_vn, a);
unm_err:
err2 = buf_unmap(buf);
if (err2)
ntfs_error(vol->mp, "Failed to unmap buffer in error code ""path (error %d).", err2);
err:
buf_seterror(buf, err);
buf_biodone(buf);
return err;
}
/**
* ntfs_vnop_lookup - find a vnode inside an ntfs directory given its name
* @a: arguments to lookup function
*
* @a contains:
* vnode_t a_dvp; directory vnode in which to search
* vnode_t *a_vpp; destination pointer for the found vnode
* struct componentname *a_cnp; name to find in the directory vnode
* vfs_context_t a_context;
*
* In short, ntfs_vnop_lookup() looks for the vnode represented by the name
* @a->a_cnp in the directory vnode @a->a_dvp and if found returns the vnode in
* *@a->a_vpp.
*
* Return 0 on success and the error code on error. A return value of ENOENT
* does not signify an error as such but merely the fact that the name
* @a->a_cnp is not present in the directory @a->a_dvp. When the lookup is
* done for purposes of create, including for the destination of a rename, we
* return EJUSTRETURNED instead of ENOENT when the name is not found. This
* allows the VFS to proceed with the create/rename.
*
* To simplify matters for us, we do not treat the DOS and WIN32 filenames as
* two hard links but instead if the lookup matches a DOS filename, we return
* the corresponding WIN32 filename instead.
*
* There are three cases we need to distinguish here:
*
* 1) The name perfectly matches (i.e. including case) a directory entry with a
* filename in the WIN32 or POSIX namespaces. In this case
* ntfs_lookup_inode_by_name() will return with name set to NULL and we
* just use the name as supplied in @a->a_cnp.
* 2) The name matches (not including case) a directory entry with a filename
* in the WIN32 or POSIX namespaces. In this case
* ntfs_lookup_inode_by_name() will return with name set to point to an
* allocated ntfs_dir_lookup_name structure containing the properly cased
* little endian Unicode name. We convert the name to decomposed UTF-8 and
* use that name.
* 3) The name matches either perfectly or not (i.e. we do not care about case)
* a directory entry with a filename in the DOS namespace. In this case
* ntfs_lookup_inode_by_name() will return with name set to point to an
* allocated ntfs_dir_lookup_name structure which just tells us that the
* name is in the DOS namespace. We read the inode and find the filename in
* the WIN32 namespace corresponding to the matched DOS name. We then
* convert the name to decomposed UTF-8 and use that name to update the
* vnode identity with.
*/staticintntfs_vnop_lookup(struct vnop_lookup_args *a)
{
MFT_REF mref;
ino64_t mft_no;
unsignedlong op;
struct componentname *name_cn, *cn;
ntfs_inode *ni, *dir_ni = NTFS_I(a->a_dvp);
vnode_t vn;
ntfs_volume *vol;
ntfschar *ntfs_name;
ntfs_dir_lookup_name *name = NULL;
u8 *utf8_name = NULL;
size_t ntfs_name_size, utf8_size;
signed ntfs_name_len;
int err;
/*
* This is rather gross but several other file systems do it so perhaps
* the large stack (16kiB I believe) in the OS X kernel is big enough.
* If we do not want to do the static allocation then simply set
* ntfs_name to NULL and utf8_to_ntfs() will allocate the memory for
* us. (We then have to free it, see utf8_to_ntfs() description for
* details.)
*/
ntfschar ntfs_name_buf[NTFS_MAX_NAME_LEN];
struct componentname cn_buf;
#ifdefDEBUGstaticconstchar *ops[4] = { "LOOKUP", "CREATE", "DELETE", "RENAME" };
#endifif (!dir_ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = dir_ni->vol;
name_cn = cn = a->a_cnp;
op = cn->cn_nameiop;
ntfs_debug("Looking up %.*s in directory inode 0x%llx for %s, flags ""0x%lx.", (int)cn->cn_namelen, cn->cn_nameptr,
(unsignedlonglong)dir_ni->mft_no,
op < 4 ? ops[op] : "UNKNOWN",
(unsignedlong)cn->cn_flags);
/*
* Ensure we are being called for a directory in case we are not being
* called from the VFS.
*/if (!S_ISDIR(dir_ni->mode)) {
ntfs_error(vol->mp, "Not a directory.");
return ENOTDIR;
}
lck_rw_lock_shared(&dir_ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(dir_ni)) {
/* Remove the inode from the name cache. */
cache_purge(dir_ni->vn);
lck_rw_unlock_shared(&dir_ni->lock);
ntfs_debug("Parent directory is deleted.");
return ENOENT;
}
/*
* First, look for the name in the name cache. cache_lookup() returns
* -1 if found and @vn is set to the vnode, ENOENT if found and it is a
* negative entry thus @vn is not set to anything, or 0 if the lookup
* failed in which case we need to do a file system based lookup.
*
* Note that if @op is CREATE and there is a negative entry in the name
* cache cache_lookup() will discard that name and return 0, i.e. the
* lookup failed. In this case we will automatically fall through and
* do the right thing during the real lookup.
*/
err = cache_lookup(dir_ni->vn, &vn, cn);
if (err) {
if (err == -1) {
ni = NTFS_I(vn);
lck_rw_lock_shared(&ni->lock);
/*
* Do not allow messing with the inode once it has been
* deleted.
*/if (!NInoDeleted(ni)) {
lck_rw_unlock_shared(&ni->lock);
lck_rw_unlock_shared(&dir_ni->lock);
*a->a_vpp = vn;
ntfs_debug("Done (cached).");
return 0;
}
lck_rw_unlock_shared(&ni->lock);
/* Remove the inode from the name cache. */
cache_purge(vn);
vnode_put(vn);
ntfs_warning(vol->mp, "Cached but deleted vnode ""found, purged from cache and doing ""real lookup.");
} else {
lck_rw_unlock_shared(&dir_ni->lock);
if (err == ENOENT) {
ntfs_debug("Done (cached, negative).");
return err;
}
ntfs_error(vol->mp, "cache_lookup() failed (error ""%d).", err);
return err;
}
}
/* We special case "." and ".." as they are emulated on NTFS. */if (cn->cn_namelen == 1 && cn->cn_nameptr[0] == '.') {
/* "." is not cached. */
cn->cn_flags &= ~MAKEENTRY;
if (op == RENAME) {
lck_rw_unlock_shared(&dir_ni->lock);
ntfs_debug("Op is RENAME but name is \".\", returning ""EISDIR.");
return EISDIR;
}
err = vnode_get(dir_ni->vn);
lck_rw_unlock_shared(&dir_ni->lock);
if (err) {
ntfs_error(vol->mp, "Failed to get iocount reference ""on current directory (error %d).",
err);
return err;
}
ntfs_debug("Got \".\" directory 0x%llx.",
(unsignedlonglong)dir_ni->mft_no);
*a->a_vpp = dir_ni->vn;
return 0;
} elseif (cn->cn_flags & ISDOTDOT) {
/* ".." is not cached. */
cn->cn_flags &= ~MAKEENTRY;
vn = vnode_getparent(dir_ni->vn);
if (vn) {
lck_rw_unlock_shared(&dir_ni->lock);
ntfs_debug("Got \"..\" directory 0x%llx of directory ""0x%llx.",
(unsignedlonglong)NTFS_I(vn)->mft_no,
(unsignedlonglong)dir_ni->mft_no);
*a->a_vpp = vn;
return 0;
}
/*
* Look up a filename attribute in the mft record of the
* directory @dir_ni and use its parent mft reference to run an
* ntfs_inode_get() on it to obtain an inode for "..".
*/
err = ntfs_inode_get_name_and_parent_mref(dir_ni, FALSE, &mref,
NULL);
lck_rw_unlock_shared(&dir_ni->lock);
if (err) {
ntfs_error(vol->mp, "Failed to obtain parent mft ""reference for directory 0x%llx ""(error %d).",
(unsignedlonglong)dir_ni->mft_no,
err);
return err;
}
mft_no = MREF(mref);
err = ntfs_inode_get(vol, mft_no, FALSE, LCK_RW_TYPE_SHARED,
&ni, NULL, NULL);
if (err) {
ntfs_error(vol->mp, "Failed to obtain parent inode ""0x%llx for directory 0x%llx (error ""%d).", (unsignedlonglong)mft_no,
(unsignedlonglong)dir_ni->mft_no,
err);
return err;
}
/* Consistency check. */if (MSEQNO(mref) != ni->seq_no) {
lck_rw_unlock_shared(&ni->lock);
(void)vnode_put(ni->vn);
ntfs_error(vol->mp, "Found stale parent mft reference ""in filename of directory 0x%llx. ""Volume is corrupt. Run chkdsk.",
(unsignedlonglong)dir_ni->mft_no);
return EIO;
}
if (!S_ISDIR(ni->mode)) {
lck_rw_unlock_shared(&ni->lock);
(void)vnode_put(ni->vn);
ntfs_error(vol->mp, "Found non-directory parent for ""filename of directory 0x%llx. ""Volume is corrupt. Run chkdsk.",
(unsignedlonglong)dir_ni->mft_no);
return EIO;
}
ntfs_debug("Got \"..\" directory 0x%llx of directory 0x%llx.",
(unsignedlonglong)mft_no,
(unsignedlonglong)dir_ni->mft_no);
*a->a_vpp = ni->vn;
lck_rw_unlock_shared(&ni->lock);
return 0;
}
/* Convert the name from utf8 to Unicode. */
ntfs_name = ntfs_name_buf;
ntfs_name_size = sizeof(ntfs_name_buf);
ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen,
&ntfs_name, &ntfs_name_size);
if (ntfs_name_len < 0) {
lck_rw_unlock_shared(&dir_ni->lock);
err = -ntfs_name_len;
if (err == ENAMETOOLONG)
ntfs_debug("Failed (name is too long).");
else
ntfs_error(vol->mp, "Failed to convert name to ""Unicode (error %d).", err);
return err;
}
/* Look up the converted name in the directory index. */
err = ntfs_lookup_inode_by_name(dir_ni, ntfs_name, ntfs_name_len,
&mref, &name);
if (err) {
lck_rw_unlock_shared(&dir_ni->lock);
if (err != ENOENT) {
ntfs_error(vol->mp, "Failed to find name in directory ""(error %d).", err);
return err;
}
not_found:
/*
* The name does not exist in the directory @dir_ni.
*
* If creating (or renaming and the name is the destination
* name) and we are at the end of a pathname we can consider
* allowing the file to be created so return EJUSTRETURN
* instead of ENOENT.
*/if (cn->cn_flags & ISLASTCN && (op == CREATE || op == RENAME)) {
ntfs_debug("Done (not found but for CREATE or RENAME, ""returning EJUSTRETURN).");
return EJUSTRETURN;
}
/*
* Insert a negative entry into the name cache if caching of
* this name is desired unless this is a create operation in
* which case we do not want to do that.
*/if (cn->cn_flags & MAKEENTRY && op != CREATE)
cache_enter(dir_ni->vn, NULL, cn);
/*
* Prevent the caller from trying to add the name to the cache
* as well.
*/
cn->cn_flags &= ~MAKEENTRY;
ntfs_debug("Done (not found%s).", cn->cn_flags & MAKEENTRY ?
"adding negative name cache entry" : "");
return err;
}
/* The lookup succeeded. */
mft_no = MREF(mref);
ntfs_debug("Name matches inode number 0x%llx.",
(unsignedlonglong)mft_no);
/*
* Remove all NTFS core system files from the name space so we do not
* need to worry about users damaging a volume by writing to them or
* deleting/renaming them and so that we can return fsRtParID (1) as
* the inode number of the parent of the volume root directory and
* fsRtDirID (2) as the inode number of the volume root directory which
* are both expected by Carbon and various applications.
*/if (mft_no < FILE_first_user) {
lck_rw_unlock_shared(&dir_ni->lock);
if (name)
OSFree(name, sizeof(*name), ntfs_malloc_tag);
ntfs_debug("Removing core NTFS system file (mft_no 0x%x) ""from name space.", (unsigned)mft_no);
err = ENOENT;
gotonot_found;
}
/*
* If the name is at the end of a pathname and is about to be deleted
* either directly or as a consequence of a rename with the name as the
* target, do not cache it.
*/if (cn->cn_flags & ISLASTCN && (op == DELETE || op == RENAME))
cn->cn_flags &= ~MAKEENTRY;
/*
* If a name was returned from the lookup and it is in the POSIX or
* WIN32 namespaces we need to convert it into a componentname so we
* can use it instead of the existing componentname @cn when getting
* the inode.
*
* If the returned name is in the DOS namespace we have to get the
* inode without a name as we need the inode in order to be able to
* find the WIN32 name corresponding to the DOS name. Once we have the
* name we will update the vnode identity with it.
*
* If no name was returned, the match was perfect and we just use the
* componentname that was passed in by the caller.
*/if (name) {
if (name->type == FILENAME_DOS) {
name_cn = NULL;
/*
* We do not need @name any more but do not set it to
* NULL because we use that fact to distinguish between
* the DOS and WIN32/POSIX cases.
*/
OSFree(name, sizeof(*name), ntfs_malloc_tag);
} else {
signed res_size;
res_size = ntfs_to_utf8(vol, name->name, name->len <<
NTFSCHAR_SIZE_SHIFT, &utf8_name,
&utf8_size);
OSFree(name, sizeof(*name), ntfs_malloc_tag);
if (res_size < 0) {
lck_rw_unlock_shared(&dir_ni->lock);
/* Failed to convert name. */
err = -res_size;
ntfs_error(vol->mp, "Failed to convert inode ""name to decomposed UTF-8 ""(error %d).", err);
return err;
}
name = NULL;
cn_buf = (struct componentname) {
.cn_flags = cn->cn_flags,
.cn_nameptr = (char*)utf8_name,
.cn_namelen = res_size,
};
name_cn = &cn_buf;
}
}
/*
* @name_cn now contains the correct name of the inode or is NULL.
*
* If @name_cn is not NULL and its cn_flags indicate that the name is
* to be entered into the name cache, ntfs_inode_get() will do this and
* clear the MAKEENTRY bit in the cn_flags.
*
* Note we only drop the directory lock after obtaining the inode
* otherwise someone could delete it under our feet.
*/
err = ntfs_inode_get(vol, mft_no, FALSE, LCK_RW_TYPE_SHARED, &ni,
dir_ni->vn, name_cn);
lck_rw_unlock_shared(&dir_ni->lock);
if (name_cn == &cn_buf) {
/* Pick up any modifications to the cn_flags. */
cn->cn_flags = cn_buf.cn_flags;
OSFree(utf8_name, utf8_size, ntfs_malloc_tag);
}
if (!err) {
/* Consistency check. */// FIXME: I cannot remember why we need the "mft_no !=
// FILE_MFT" test...
if (MSEQNO(mref) != ni->seq_no && mft_no != FILE_MFT) {
lck_rw_unlock_shared(&ni->lock);
(void)vnode_put(ni->vn);
ntfs_debug("Inode was deleted and reused under our ""feet.");
err = ENOENT;
gotonot_found;
}
/*
* We found it. Before we can return it, we have to check if
* returning this inode is a valid response to the requested
* lookup. To be more specific, if the lookup was for an
* intermediate path component and the inode is not a directory
* or symbolic link, it is not a valid response because it
* cannot be part of an intermediate path component. In that
* case return an error.
*/if (cn->cn_flags & ISLASTCN || S_ISDIR(ni->mode) ||
S_ISLNK(ni->mode)) {
/*
* Perfect WIN32/POSIX match or wrong case WIN32/POSIX
* match, i.e. cases 1 and 2, respectively.
*/if (!name) {
*a->a_vpp = ni->vn;
ntfs_debug("Done (case %d).",
name_cn == &cn_buf ? 2 : 1);
lck_rw_unlock_shared(&ni->lock);
return 0;
}
/*
* We are too indented. Handle DOS matches further
* below.
*/gotohandle_dos_name;
}
lck_rw_unlock_shared(&ni->lock);
(void)vnode_put(ni->vn);
ntfs_debug("Done (intermediate path component requested but ""found inode is not a directory or symbolic ""link, returning ENOTDIR).");
err = ENOTDIR;
} else {
if (err == ENOENT) {
ntfs_debug("Inode was deleted under our feet.");
gotonot_found;
}
ntfs_error(vol->mp, "Failed to get inode 0x%llx (error %d).",
(unsignedlonglong)mft_no, err);
}
return err;
// TODO: Consider moving this lot to a separate function.
handle_dos_name:
{
MFT_RECORD *m;
ntfs_attr_search_ctx *ctx;
FILENAME_ATTR *fn;
constchar *old_name;
signed res_size;
vn = ni->vn;
/*
* DOS match. -- Case 3.
*
* Find the WIN32 name corresponding to the matched DOS name.
*
* At present @ni is guaranteed to be a base inode.
*/
err = ntfs_mft_record_map(ni, &m);
if (err) {
ntfs_error(vol->mp, "Failed to map mft record (error %d).",
err);
gotoerr;
}
ctx = ntfs_attr_search_ctx_get(ni, m);
if (!ctx) {
ntfs_error(vol->mp, "Failed to allocate search context.");
err = ENOMEM;
gotounm_err;
}
do {
ATTR_RECORD *attr;
u32 val_len;
u16 val_ofs;
err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0,
ctx);
if (err) {
if (err == ENOENT) {
ntfs_error(vol->mp, "WIN32 namespace name is ""missing from inode. Run ""chkdsk.");
err = EIO;
} else
ntfs_error(vol->mp, "Failed to find WIN32 ""namespace name in inode ""(error %d).", err);
gotoput_err;
}
/* Consistency checks. */
attr = ctx->a;
if (attr->non_resident || attr->flags)
gotoattr_err;
val_len = le32_to_cpu(attr->value_length);
val_ofs = le16_to_cpu(attr->value_offset);
if (val_ofs + val_len > le32_to_cpu(attr->length))
gotoattr_err;
fn = (FILENAME_ATTR*)((u8*)attr + val_ofs);
if ((u32)(sizeof(FILENAME_ATTR) + (fn->filename_length <<
NTFSCHAR_SIZE_SHIFT)) > val_len)
gotoattr_err;
} while (fn->filename_type != FILENAME_WIN32);
/* Convert the name to decomposed UTF-8. */
res_size = ntfs_to_utf8(vol, fn->filename, fn->filename_length <<
NTFSCHAR_SIZE_SHIFT, &utf8_name, &utf8_size);
ntfs_attr_search_ctx_put(ctx);
ntfs_mft_record_unmap(ni);
if (res_size < 0) {
/* Failed to convert name. */
err = -res_size;
ntfs_error(vol->mp, "Failed to convert inode name to ""decomposed UTF-8 (error %d).", err);
gotoerr;
}
/* Update the vnode with the new name if it differs from the old one. */
old_name = vnode_getname(vn);
if (!old_name || (ni->link_count > 1 && ((long)strlen(old_name) !=
res_size || bcmp(old_name, utf8_name, res_size)))) {
vnode_update_identity(vn, NULL, (char*)utf8_name, res_size, 0,
VNODE_UPDATE_NAME | VNODE_UPDATE_CACHE);
}
if (old_name)
vnode_putname(old_name);
/*
* Enter the name into the cache (if it is already there this is a
* no-op) and prevent the caller from trying to add the name to the
* cache as well.
*/
cn_buf = (struct componentname) {
.cn_flags = cn->cn_flags,
.cn_nameptr = (char*)utf8_name,
.cn_namelen = res_size,
};
cache_enter(dir_ni->vn, vn, &cn_buf);
cn->cn_flags &= ~MAKEENTRY;
OSFree(utf8_name, utf8_size, ntfs_malloc_tag);
*a->a_vpp = ni->vn;
lck_rw_unlock_shared(&ni->lock);
ntfs_debug("Done (case 3).");
return 0;
attr_err:
ntfs_error(vol->mp, "Filename attribute is corrupt. Run chkdsk.");
err = EIO;
put_err:
ntfs_attr_search_ctx_put(ctx);
unm_err:
ntfs_mft_record_unmap(ni);
err:
lck_rw_unlock_shared(&ni->lock);
(void)vnode_put(vn);
return err;
}
}
// TODO: Rename to ntfs_inode_create and move to ntfs_inode.[hc]?
/**
 * ntfs_create - create an inode on an ntfs volume
 * @dir_vn:	vnode of directory in which to create the new inode
 * @vn:		destination pointer for the vnode of the created inode
 * @cn:		componentname specifying name of the inode to create
 * @va:		vnode attributes to assign to the new inode
 * @lock:	if true the ntfs inode of the returned vnode *@vn is locked
 *
 * Create an inode with name as specified in @cn in the directory specified by
 * the vnode @dir_vn.  Assign the attributes @va to the created inode.  Finally
 * return the vnode of the created inode in *@vn.
 *
 * @va is used to determine which type of inode is to be created, i.e. if
 * @va->va_type is VDIR create a directory, etc.  If @va->va_type is VNON it
 * is defaulted to VREG.
 *
 * If @lock is true the ntfs inode of the returned vnode is locked for writing
 * (NTFS_I(@vn)->lock).
 *
 * Called by the various inode creation ntfs functions (ntfs_vnop_create(),
 * ntfs_vnop_mkdir(), ntfs_vnop_symlink(), ntfs_vnop_mknod(), etc) which are
 * called by the VFS.
 *
 * Return 0 on success and errno on error.  In particular:
 *	EINVAL	- @dir_vn has no ntfs inode attached or a device special
 *		  file was requested without va_rdev being specified,
 *	ENOTDIR	- the parent inode is not a directory,
 *	ENOTSUP	- the parent directory is a reparse point or the requested
 *		  vnode type is not supported,
 *	ENOMEM	- the temporary filename attribute could not be allocated,
 *	ENOENT	- the parent directory has been deleted,
 *	EEXIST	- the name already exists (see the comments in the EEXIST
 *		  handling below for when an existing vnode is returned
 *		  instead).
 *
 * Note we always create inode names in the POSIX namespace.
 */
static errno_t ntfs_create(vnode_t dir_vn, vnode_t *vn,
		struct componentname *cn, struct vnode_attr *va,
		const BOOL lock)
{
	ntfs_inode *ni, *dir_ni = NTFS_I(dir_vn);
	ntfs_volume *vol;
	FILENAME_ATTR *fn;
	ntfschar *ntfs_name;
	MFT_RECORD *m;
	ATTR_RECORD *a;
	size_t ntfs_name_size;
	signed ntfs_name_len;
	unsigned fn_alloc, fn_size;
	errno_t err, err2;

	if (!dir_ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	vol = dir_ni->vol;
	if (!S_ISDIR(dir_ni->mode)) {
		ntfs_debug("Parent inode is not a directory, returning "
				"ENOTDIR.");
		return ENOTDIR;
	}
	if (dir_ni->file_attributes & FILE_ATTR_REPARSE_POINT) {
		/*
		 * The parent is a directory (checked above) but it is a
		 * reparse point, so report "not supported" as the message
		 * says rather than the misleading ENOTDIR.
		 */
		ntfs_error(vol->mp, "Parent inode is a reparse point and not "
				"a regular directory, returning ENOTSUP.");
		return ENOTSUP;
	}
	/*
	 * Create a temporary copy of the filename attribute so we can release
	 * the mft record before we add the directory entry.  This is needed
	 * because when we hold the mft record for the newly created inode and
	 * we call ntfs_dir_entry_add() this would cause the mft record for the
	 * directory to be mapped which would result in a deadlock in the event
	 * that both mft records are in the same page.
	 */
	fn_alloc = sizeof(FILENAME_ATTR) + NTFS_MAX_NAME_LEN * sizeof(ntfschar);
	fn = OSMalloc(fn_alloc, ntfs_malloc_tag);
	if (!fn) {
		ntfs_error(vol->mp, "Failed to allocate memory for temporary "
				"filename attribute.");
		return ENOMEM;
	}
	bzero(fn, fn_alloc);
	/* Begin setting up the temporary filename attribute. */
	fn->parent_directory = MK_LE_MREF(dir_ni->mft_no, dir_ni->seq_no);
	/* FILENAME_POSIX is zero and the attribute is already zeroed. */
	/* fn->filename_type = FILENAME_POSIX; */
	/* Convert the name from utf8 to Unicode. */
	ntfs_name = fn->filename;
	ntfs_name_size = NTFS_MAX_NAME_LEN * sizeof(ntfschar);
	ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen,
			&ntfs_name, &ntfs_name_size);
	if (ntfs_name_len < 0) {
		err = -ntfs_name_len;
		if (err == ENAMETOOLONG)
			ntfs_debug("Failed (name is too long).");
		else
			ntfs_error(vol->mp, "Failed to convert name to "
					"Unicode (error %d).", err);
		goto err;
	}
	/* Set the filename length in the temporary filename attribute. */
	fn->filename_length = ntfs_name_len;
	fn_size = sizeof(FILENAME_ATTR) + ntfs_name_len * sizeof(ntfschar);
	/* If no vnode type is specified default to VREG, i.e. regular file. */
	if (va->va_type == VNON)
		va->va_type = VREG;
	/*
	 * We support regular files, directories, symbolic links, sockets,
	 * fifos, and block and character device special files.
	 */
	switch (va->va_type) {
	case VBLK:
	case VCHR:
		if (!VATTR_IS_ACTIVE(va, va_rdev)) {
			ntfs_error(vol->mp, "va_type is %s but va_rdev is not "
					"specified!", va->va_type == VBLK ?
					"VBLK" : "VCHR");
			err = EINVAL;
			goto err;
		}
		/* Fall through: device special files are supported. */
	case VREG:
	case VDIR:
	case VLNK:
	case VSOCK:
	case VFIFO:
		break;
	default:
		ntfs_error(vol->mp, "Tried to create inode of type 0x%x which "
				"is not supported at present.", va->va_type);
		err = ENOTSUP;
		goto err;
	}
	va->va_mode |= VTTOIF(va->va_type);
	/* If no create time is supplied default it to the current time. */
	if (!VATTR_IS_ACTIVE(va, va_create_time))
		nanotime(&va->va_create_time);
	/*
	 * Round the time down to the nearest 100-nano-second interval as
	 * needed for NTFS.
	 */
	va->va_create_time.tv_nsec -= va->va_create_time.tv_nsec % 100;
	/* Set the times in the temporary filename attribute. */
	fn->last_access_time = fn->last_mft_change_time =
			fn->last_data_change_time = fn->creation_time =
			utc2ntfs(va->va_create_time);
	/* Set the bits for all the supported fields at once. */
	va->va_supported |=
			VNODE_ATTR_BIT(va_mode) |
			VNODE_ATTR_BIT(va_flags) |
			VNODE_ATTR_BIT(va_create_time) |
			VNODE_ATTR_BIT(va_type);
again:
	/* Lock the target directory and check that it has not been deleted. */
	lck_rw_lock_exclusive(&dir_ni->lock);
	if (!dir_ni->link_count) {
		/* Remove the target directory from the name cache. */
		cache_purge(dir_vn);
		err = ENOENT;
		goto unl_err;
	}
	/* Allocate and map a new mft record. */
	err = ntfs_mft_record_alloc(vol, va, cn, dir_ni, &ni, &m, &a);
	if (err) {
		if (err != ENOSPC)
			ntfs_error(vol->mp, "Failed to allocate a new on-disk "
					"inode (error %d).", err);
		goto unl_err;
	}
	/*
	 * If requested by the caller, take the ntfs inode lock on the
	 * allocated ntfs inode for writing so no-one can start using it before
	 * it is ready.  For example if it is a symbolic link we cannot allow
	 * anyone to look at it until we have set the data size to the symbolic
	 * link target size otherwise a concurrent ntfs_vnop_readlink() would
	 * return EINVAL as it would see a target size of zero.
	 *
	 * Also, if the inode is a symbolic link we need to take the lock so
	 * that we can create the AFP_AfpInfo attribute when we have finished
	 * setting up the inode.
	 */
	if (lock || S_ISLNK(ni->mode))
		lck_rw_lock_exclusive(&ni->lock);
	/*
	 * @a now points to the location in the allocated mft record at which
	 * we need to insert the filename attribute so we can insert it without
	 * having to do a lookup first.
	 *
	 * Insert the filename attribute and initialize the value to zero.
	 * This cannot fail as we are dealing with a newly allocated mft record
	 * so there must be enough space for a filename attribute even if the
	 * filename is of the maximum allowed length.
	 */
	err = ntfs_resident_attr_record_insert_internal(m, a, AT_FILENAME,
			NULL, 0, fn_size);
	if (err)
		panic("%s(): err\n", __FUNCTION__);
	/* Finish setting up the filename attribute value. */
	fn->file_attributes = ni->file_attributes;
	/*
	 * Directories need the FILE_ATTR_DUP_FILENAME_INDEX_PRESENT flag set
	 * in their filename attributes both in their mft records and in the
	 * index entries pointing to them but not in the standard information
	 * attribute which is why it is not set in @ni->file_attributes.
	 */
	if (va->va_type == VDIR)
		fn->file_attributes |= FILE_ATTR_DUP_FILENAME_INDEX_PRESENT;
	/*
	 * Update the data_size in the temporary filename attribute from the
	 * created ntfs inode.  This will not be zero for fifos and block and
	 * character device special files for example.
	 */
	fn->data_size = ni->data_size;
	/*
	 * Copy the created filename attribute into place in the attribute
	 * record.
	 */
	memcpy((u8*)a + le16_to_cpu(a->value_offset), fn, fn_size);
	/*
	 * Set the link count to one to indicate there is one filename
	 * attribute inside the mft record.
	 */
	m->link_count = const_cpu_to_le16(1);
	ni->link_count = 1;
	/*
	 * Ensure the mft record is written to disk.
	 *
	 * Note we do not set any of the NInoDirty*() flags because we have
	 * just created the inode thus all the fields are in sync between the
	 * ntfs_inode @ni and its mft record @m.
	 */
	NInoSetMrecNeedsDirtying(ni);
	/*
	 * Release the mft record.  It is safe to do so even though the
	 * directory entry has not been added yet because the inode is still
	 * locked and marked new thus it is not a candidate for syncing yet.
	 */
	ntfs_mft_record_unmap(ni);
	/*
	 * If the inode is a symbolic link now create the AFP_AfpInfo attribute
	 * with the Finder Info specifying that this is a symbolic link.
	 */
	if (S_ISLNK(ni->mode)) {
		err = ntfs_inode_afpinfo_write(ni);
		/*
		 * If the caller has not requested that the inode be returned
		 * locked unlock it now.
		 */
		if (!lock)
			lck_rw_unlock_exclusive(&ni->lock);
		if (err) {
			ntfs_error(vol->mp, "Failed to create AFP_AfpInfo "
					"attribute in allocated inode 0x%llx "
					"(error %d).",
					(unsigned long long)ni->mft_no, err);
			goto rm_err;
		}
	}
	/* Add the created filename attribute to the parent directory index. */
	err = ntfs_dir_entry_add(dir_ni, fn, fn_size,
			MK_LE_MREF(ni->mft_no, ni->seq_no));
	if (!err) {
		/* Free the temporary filename attribute. */
		OSFree(fn, fn_alloc, ntfs_malloc_tag);
		/*
		 * Invalidate negative cache entries in the directory.  We need
		 * to do this because there may be negative cache entries
		 * which would match the name of the just created inode but in
		 * a different case.  Such negative cache entries would now be
		 * incorrect thus we need to throw away all negative cache
		 * entries to ensure there cannot be any incorrectly negative
		 * entries in the name cache.
		 */
		cache_purge_negatives(dir_vn);
		/*
		 * Add the inode to the name cache.  Note that
		 * ntfs_vnop_lookup() will have caused the name to not be
		 * cached because it will have cleared the MAKEENTRY flag.
		 */
		cache_enter(dir_ni->vn, ni->vn, cn);
		/* We are done with the directory so unlock it. */
		lck_rw_unlock_exclusive(&dir_ni->lock);
		/*
		 * We can finally unlock and unmark as new the new ntfs inode
		 * thus rendering the inode a full member of society.
		 */
		ntfs_inode_unlock_alloc(ni);
		ntfs_debug("Done (new mft_no 0x%llx).",
				(unsigned long long)ni->mft_no);
		*vn = ni->vn;
		return 0;
	}
	/*
	 * We failed to add the directory entry thus we have to effectively
	 * delete the created inode again.  To do this we need to map the mft
	 * record and mark it as no longer in use.
	 *
	 * We then also need to set the link count in the ntfs inode to zero to
	 * reflect that it is deleted and to ensure that the subsequent
	 * vnode_put() results in ntfs_delete_inode() being called (via
	 * VNOP_INACTIVE() and ntfs_vnop_inactive() respectively).
	 *
	 * But first, unlock the allocated ntfs inode if we locked it above.
	 * No-one can get to it now as it does not have a directory entry
	 * pointing to it.
	 */
rm_err:
	if (lock)
		lck_rw_unlock_exclusive(&ni->lock);
	err2 = ntfs_mft_record_map(ni, &m);
	if (err2) {
		ntfs_error(vol->mp, "Failed to map mft record in error code "
				"path (error %d).  Run chkdsk to recover the "
				"lost mft record.", err2);
		NVolSetErrors(vol);
	} else {
		m->flags &= ~MFT_RECORD_IN_USE;
		NInoSetMrecNeedsDirtying(ni);
		ntfs_mft_record_unmap(ni);
	}
	ni->link_count = 0;
	lck_rw_unlock_exclusive(&dir_ni->lock);
	ntfs_inode_unlock_alloc(ni);
	cache_purge(ni->vn);
	(void)vnode_put(ni->vn);
	if (err == EEXIST) {
		/*
		 * There are two possible reasons why the directory entry
		 * already exists.  Either someone created it under our feet in
		 * which case we try to look up the existing vnode and return
		 * that instead and failing that we try to create the inode
		 * again or the name really does exist but we have removed it
		 * from the name space thus ntfs_vnop_lookup() will always
		 * return ENOENT/EJUSTRETURN for it.  This is the case for the
		 * core system files for example.  This would cause an infinite
		 * loop thus we need to check for this case by checking that
		 * the name being created does not match one of the core system
		 * filenames and if it does we return EEXIST.
		 */
		if (dir_ni == vol->root_ni) {
			/* Catch the "." entry. */
			if (cn->cn_namelen == 1 && cn->cn_nameptr[0] == '.')
				goto is_system;
			/*
			 * Catch the core system files which all start with the
			 * '$' character.  Note @l is the full name length
			 * including the '$' whilst @n points past the '$'.
			 */
			if (cn->cn_nameptr[0] == '$') {
				char *n = (char*)cn->cn_nameptr + 1;
				int l = cn->cn_namelen;

				if ((l == 4 && !strncmp(n, "MFT", 3)) ||
						(l == 5 && !strncmp(n, "Boot",
						4)) ||
						(l == 6 && !strncmp(n, "Quota",
						5)) ||
						(l == 7 && (
						!strncmp(n, "Volume", 6) ||
						!strncmp(n, "Bitmap", 6) ||
						!strncmp(n, "Secure", 6) ||
						!strncmp(n, "UpCase", 6) ||
						!strncmp(n, "Extend", 6))) ||
						(l == 8 && (
						!strncmp(n, "MFTMirr", 7) ||
						!strncmp(n, "LogFile", 7) ||
						!strncmp(n, "AttrDef", 7) ||
						!strncmp(n, "BadClus", 7))))
					goto is_system;
			}
		}
		ntfs_debug("Inode was created under our feet.");
		/*
		 * If the inode was created under our feet, we are creating a
		 * regular file, and the caller did not want an exclusive
		 * create, simply look up the inode and return that.
		 */
		if (va->va_type == VREG && !(va->va_vaflags & VA_EXCLUSIVE)) {
			struct vnop_lookup_args la;

			cn->cn_nameiop = LOOKUP;
			la = (struct vnop_lookup_args) {
				.a_desc = &vnop_lookup_desc,
				.a_dvp = dir_vn,
				.a_vpp = vn,
				.a_cnp = cn,
			};
			err = ntfs_vnop_lookup(&la);
			cn->cn_nameiop = CREATE;
			/*
			 * If the inode that was created under our feet was
			 * also deleted under our feet, repeat the whole
			 * process.
			 */
			if (err == ENOENT || err == EJUSTRETURN) {
				*vn = NULL;
				goto again;
			}
			/*
			 * Make sure the vnode we looked up is a regular file
			 * as we would not want to return a directory instead
			 * of a file for example.
			 */
			if (!err && vnode_vtype(*vn) != VREG) {
				(void)vnode_put(*vn);
				*vn = NULL;
				err = EEXIST;
			}
		}
	} else
		ntfs_error(vol->mp, "Failed to add directory entry (error "
				"%d).", err);
err:
	/* Common exit: release the temporary filename attribute. */
	OSFree(fn, fn_alloc, ntfs_malloc_tag);
	return err;
unl_err:
	lck_rw_unlock_exclusive(&dir_ni->lock);
	goto err;
is_system:
	ntfs_error(vol->mp, "Cannot create inode with name %.*s in the volume "
			"root directory as the name clashes with the name of "
			"a core system file.  Returning EEXIST.",
			(int)cn->cn_namelen, cn->cn_nameptr);
	err = EEXIST;
	*vn = NULL;
	goto err;
}
/**
* ntfs_vnop_create - create a regular file
* @a: arguments to create function
*
* @a contains:
* vnode_t a_dvp; directory in which to create the file
* vnode_t *a_vpp; destination pointer for the created file
* struct componentname *a_cnp; name of the file to create
* struct vnode_attr *a_vap; attributes to set on the created file
* vfs_context_t a_context;
*
* Create a regular file with name as specified in @a->a_cnp in the directory
* specified by the vnode @a->a_dvp. Assign the attributes @a->a_vap to the
* created file. Finally return the vnode of the created file in *@a->a_vpp.
*
* Return 0 on success and errno on error.
*
* Note we always create filenames in the POSIX namespace.
*/staticintntfs_vnop_create(struct vnop_create_args *a)
{
errno_t err;
#ifdefDEBUG
ntfs_inode *ni = NTFS_I(a->a_dvp);
if (ni)
ntfs_debug("Creating a file named %.*s in directory mft_no ""0x%llx.", (int)a->a_cnp->cn_namelen,
a->a_cnp->cn_nameptr,
(unsignedlonglong)ni->mft_no);
#endif
err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
* ntfs_vnop_mknod - create a special file node
* @a: arguments to mknod function
*
* @a contains:
* vnode_t a_dvp; directory in which to create the file
* vnode_t *a_vpp; destination pointer for the created file
* struct componentname *a_cnp; name of the file to create
* struct vnode_attr *a_vap; attributes to set on the created file
* vfs_context_t a_context;
*
* Create a special file node with name as specified in @a->a_cnp in the
* directory specified by the vnode @a->a_dvp. Assign the attributes @a->a_vap
* to the created node. Finally return the vnode of the created file in
* *@a->a_vpp.
*
* The type of special file node to create is specified by the caller in
* @a->a_vap->va_type and can be one of:
* VSOCK - create a socket
* VFIFO - create a fifo
* VBLK - create a block special device
* VCHR - create a character special device
*
* Return 0 on success and errno on error.
*
* Note we always create filenames in the POSIX namespace.
*/staticintntfs_vnop_mknod(struct vnop_mknod_args *a)
{
errno_t err;
#ifdefDEBUG
ntfs_inode *ni = NTFS_I(a->a_dvp);
if (ni)
ntfs_debug("Creating a special inode of type 0x%x named %.*s ""in directory mft_no 0x%llx.",
a->a_vap->va_type, (int)a->a_cnp->cn_namelen,
a->a_cnp->cn_nameptr,
(unsignedlonglong)ni->mft_no);
#endif
err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
 * ntfs_vnop_open - open a vnode
 * @a:		arguments to open function
 *
 * @a contains:
 *	vnode_t a_vp;		vnode to open
 *	int a_mode;		mode to open the file with
 *	vfs_context_t a_context;
 *
 * Open the vnode @a->a_vp with mode @a->a_mode.
 *
 * Note the VFS does a lot of checking before ntfs_vnop_open() is called
 * including permissions and checking for a read-only file system thus we do
 * not need to worry about the case where the driver is compiled read-only as
 * the volume is then mounted read-only so the vfs catches all write accesses
 * very early on and denies them.
 *
 * Return 0 on success and errno on error: EINVAL if the vnode has no ntfs
 * inode attached, ENOENT if the inode has been deleted, and EACCES if the
 * inode is encrypted.
 */
static int ntfs_vnop_open(struct vnop_open_args *a)
{
	ntfs_inode *ni = NTFS_I(a->a_vp);
	ntfs_inode *base_ni;
	errno_t err = 0;
	int is_deleted, is_encrypted;

	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	ntfs_debug("Entering for mft_no 0x%llx, mode 0x%x.",
			(unsigned long long)ni->mft_no, (unsigned)a->a_mode);
	/* Opens of attribute inodes are accounted against the base inode. */
	base_ni = NInoAttr(ni) ? ni->base_ni : ni;
	/*
	 * All the core system files cannot possibly be opened because they are
	 * removed from the name space thus it is impossible for a process to
	 * obtain a vnode to them thus VNOP_OPEN() can never be called for
	 * them.  The only exception is the root directory which we of course
	 * allow access to.
	 */
	if (ni->mft_no < FILE_first_user && ni != ni->vol->root_ni)
		panic("%s(): Called for a system inode.  This is not "
				"possible.\n", __FUNCTION__);
	/*
	 * Sample the inode state under the shared lock.  The encrypted state
	 * is only looked at when the inode has not been deleted.
	 */
	lck_rw_lock_shared(&ni->lock);
	is_deleted = NInoDeleted(ni) ? 1 : 0;
	is_encrypted = (!is_deleted && NInoEncrypted(ni)) ? 1 : 0;
	lck_rw_unlock_shared(&ni->lock);
	/* Do not allow messing with the inode once it has been deleted. */
	if (is_deleted) {
		/* Remove the inode from the name cache. */
		cache_purge(ni->vn);
		ntfs_debug("Cannot open deleted mft_no 0x%llx, returning "
				"ENOENT.", (unsigned long long)ni->mft_no);
		return ENOENT;
	}
	/*
	 * Do not allow opening encrypted files as we do not support reading,
	 * writing, nor mmap()ing them.
	 */
	if (is_encrypted) {
		ntfs_debug("Cannot open encrypted mft_no 0x%llx, returning "
				"EACCES.", (unsigned long long)ni->mft_no);
		return EACCES;
	}
	/*
	 * We keep track of how many times the base vnode has been opened and
	 * we count other vnodes towards the base vnode open count to ensure
	 * we do the right thing in ntfs_unlink().
	 */
	OSIncrementAtomic(&base_ni->nr_opens);
	ntfs_debug("Done (error %d).", (int)err);
	return err;
}
/**
* ntfs_vnop_close - close a vnode
* @a: arguments to close function
*
* @a contains:
* vnode_t a_vp; vnode to close
* int a_fflag; close flags (FREAD and/or FWRITE for example)
* vfs_context_t a_context;
*
* Close the vnode @a->a_vp with flags @a->a_fflag.
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_close(struct vnop_close_args *a)
{
vnode_t vn = a->a_vp;
ntfs_inode *base_ni, *ni = NTFS_I(vn);
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return 0;
}
ntfs_debug("Entering for mft_no 0x%llx, fflag 0x%x.",
(unsignedlonglong)ni->mft_no, a->a_fflag);
base_ni = ni;
if (NInoAttr(ni))
base_ni = ni->base_ni;
/*
* We keep track of how many times the base vnode has been opened and
* we count other vnodes towards the base vnode open count to ensure
* we do the right thing in ntfs_unlink().
*/
OSDecrementAtomic(&base_ni->nr_opens);
/*
* If the vnode is still in use release any expired directory hints.
*
* If the vnode is no longer in use release all directory hints.
*
* Note we check for presence of directory hints outside the locks as
* an optimization. It is not a disaster if we miss any as all will be
* released in ntfs_inode_free() before the inode is thrown away at the
* latest.
*/if (ni != base_ni && ni->type == AT_INDEX_ALLOCATION &&
ni->nr_dirhints) {
int busy;
busy = vnode_isinuse(vn, ni->nr_refs + 1);
lck_rw_lock_exclusive(&ni->lock);
ntfs_dirhints_put(ni, busy);
lck_rw_unlock_exclusive(&ni->lock);
}
ntfs_debug("Done.");
return 0;
}
/**
* ntfs_vnop_access -
*
*/staticintntfs_vnop_access(struct vnop_access_args *a)
{
errno_t err;
ntfs_debug("Entering.");
// TODO:
err = ENOTSUP;
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
* ntfs_vnop_getattr - get attributes about a vnode or about the mounted volume
* @a: arguments to getattr function
*
* @a contains:
* vnode_t a_vp; vnode for which to return attributes
* struct vnode_attr *a_vap; attributes to return and destination
* vfs_context_t a_context;
*
* Return the attributes described in @a_vap about the vnode @a_vp. Some
* attributes are intercepted by the VFS in getattrlist() and getvolattrlist()
* so we do not bother with them.
*
* At present we do not support all attributes. We declare what we support to
* the world in our VFS_GETATTR() function (ntfs_vfsops.c::ntfs_getattr()) so
* do not forget to update that when support for further attributes is added
* here.
*
* Return 0 on success and errno on error.
*
* TODO: Implement more attributes.
*/staticintntfs_vnop_getattr(struct vnop_getattr_args *a)
{
MFT_REF parent_mref;
ino64_t mft_no;
s64 on_disk_size;
struct vnode_attr *va = a->a_vap;
ntfs_inode *ni, *base_ni;
ntfs_volume *vol;
constchar *name;
FILE_ATTR_FLAGS file_attributes;
unsigned flags;
errno_t err;
lck_rw_type_t lock;
BOOL is_root, name_is_done, have_parent;
ni = NTFS_I(a->a_vp);
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return 0;
}
vol = ni->vol;
mft_no = ni->mft_no;
have_parent = name_is_done = is_root = FALSE;
ntfs_debug("Entering for mft_no 0x%llx.", (unsignedlonglong)mft_no);
base_ni = ni;
if (NInoAttr(ni)) {
base_ni = ni->base_ni;
lck_rw_lock_shared(&base_ni->lock);
}
lck_rw_lock_shared(&ni->lock);
lock = LCK_RW_TYPE_SHARED;
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
err = ENOENT;
gotoerr;
}
/*
* If this is the root directory, leave it to the VFS to get the name
* from the mountpoint (see below).
*/if (base_ni == vol->root_ni)
name_is_done = is_root = TRUE;
/* For directories always return a link count of 1. */
va->va_nlink = 1;
if (!S_ISDIR(ni->mode))
va->va_nlink = ni->link_count;
va->va_rdev = (dev_t)0;
switch (ni->mode & S_IFMT) {
caseS_IFBLK:
caseS_IFCHR:
/*
* For block and character device special inodes return the
* device.
*/
va->va_rdev = ni->rdev;
caseS_IFIFO:
caseS_IFSOCK:
/*
* For fifos, sockets, block and character device special files
* return all sizes set to zero.
*/
va->va_total_alloc = va->va_data_alloc = va->va_total_size =
va->va_data_size = 0;
break;
default:
lck_spin_lock(&ni->size_lock);
/*
* We cheat for both the total size and the total allocated
* size and just return the attribute size rather than looping
* over all ($DATA?) attributes and adding up their sizes.
*/
va->va_total_size = va->va_data_size = ni->data_size;
/*
* Resident attributes reside inside the on-disk inode and thus
* have no on-disk allocation because the on-disk inode itself
* is already accounted for in the allocated size of the $MFT
* system file which contains the table of on-disk inodes.
* Perhaps more importantly, if we delete a resident file no
* space would be freed up on the volume, thus we definitely
* need to return zero for the allocated size of such resident
* files.
*/
on_disk_size = 0;
if (NInoNonResident(ni)) {
if (ni->type == AT_DATA && (NInoCompressed(ni) ||
NInoSparse(ni)))
on_disk_size = ni->compressed_size;
else
on_disk_size = ni->allocated_size;
}
va->va_total_alloc = va->va_data_alloc = on_disk_size;
lck_spin_unlock(&ni->size_lock);
}
va->va_iosize = ubc_upl_maxbufsize();
va->va_uid = ni->uid;
va->va_gid = ni->gid;
va->va_mode = ni->mode;
file_attributes = base_ni->file_attributes;
/*
* Do not allow the volume root directory to be read-only or hidden and
* do not allow directories in general to be read-only as Windows uses
* the read-only bit on directories for completely different purposes
* like customized/specialized folder views which are lost when you
* clear the read-only bit.
*/if (S_ISDIR(base_ni->mode)) {
file_attributes &= ~FILE_ATTR_READONLY;
if (is_root)
file_attributes &= ~FILE_ATTR_HIDDEN;
}
flags = 0;
/*
* if (NInoCompressed(ni))
* flags |= SF_COMPRESSED;
*/if (file_attributes & FILE_ATTR_READONLY)
flags |= UF_IMMUTABLE;
if (file_attributes & FILE_ATTR_HIDDEN)
flags |= UF_HIDDEN;
/*
* Windows does not set the "needs archiving" bit on directories
* except for encrypted directories where it does set the bit.
*/if ((!S_ISDIR(base_ni->mode) ||
file_attributes & FILE_ATTR_ENCRYPTED) &&
!(file_attributes & FILE_ATTR_ARCHIVE))
flags |= SF_ARCHIVED;
va->va_flags = flags;
va->va_create_time = base_ni->creation_time;
va->va_access_time = base_ni->last_access_time;
va->va_modify_time = base_ni->last_data_change_time;
va->va_change_time = base_ni->last_mft_change_time;
/*
* NTFS does not distinguish between the inode and its hard links.
*
* We have to remap the root directory inode to inode number 2, i.e.
* fsRtDirID, for compatibility with Carbon.
*/if (!is_root)
va->va_fileid = mft_no;
else
va->va_fileid = 2;
va->va_fsid = vol->dev;
/* FIXME: What is the difference between the below two? */
va->va_filerev = base_ni->seq_no;
va->va_gen = base_ni->seq_no;
va->va_encoding = 0x7e; /* = kTextEncodingMacUnicode */
va->va_supported |=
VNODE_ATTR_BIT(va_rdev) |
VNODE_ATTR_BIT(va_nlink) |
VNODE_ATTR_BIT(va_total_size) |
VNODE_ATTR_BIT(va_total_alloc) |
VNODE_ATTR_BIT(va_data_size) |
VNODE_ATTR_BIT(va_data_alloc) |
VNODE_ATTR_BIT(va_iosize) |
VNODE_ATTR_BIT(va_uid) |
VNODE_ATTR_BIT(va_gid) |
VNODE_ATTR_BIT(va_mode) |
VNODE_ATTR_BIT(va_flags) |
VNODE_ATTR_BIT(va_create_time) |
VNODE_ATTR_BIT(va_access_time) |
VNODE_ATTR_BIT(va_modify_time) |
VNODE_ATTR_BIT(va_change_time) |
VNODE_ATTR_BIT(va_fileid) |
VNODE_ATTR_BIT(va_fsid) |
VNODE_ATTR_BIT(va_filerev) |
VNODE_ATTR_BIT(va_gen) |
VNODE_ATTR_BIT(va_encoding) |
0;
/*
* Return va_parentid, i.e. the mft record number of the parent of the
* inode, if it was requested.
*
* We have to return 1, i.e. fsRtParID, for the parent inode number of
* the root directory inode for compatibility with Carbon. Simillarly
* we have to return 2, i.e. fsRtDirID, if the parent inode is the root
* directory inode.
*
* For all other inodes we try to get the parent from the vnode and if
* it does not have the vnode cached then if the inode is an attribute
* inode we return the inode number of the base inode (in line with how
* named streams work on Mac OS X) and otherwise we obtain the parent
* mft reference by looking up a filename attribute record in the mft
* record of the inode and obtaining the parent mft record reference
* from there.
*
* There is one pitfall with this approach for files and that is that a
* file may have multiple parents and we are returning a random one but
* that is the best we can do.
*
* To make this a little better we get the name at the same time as we
* get the parent mft reference so we can at least return a parent id
* and name that match, i.e. the name is present in the parent id.
*
* And to make this even better, when the parent is requested and a
* name is cached in the vnode, we use the name in the vnode to find
* the parent that matches that name if it exists. If it does not
* exist we revert to finding a random parent.
*/if (VATTR_IS_ACTIVE(va, va_parentid)) {
ino64_t parent_mft_no;
vnode_t parent_vn;
if (is_root && base_ni == ni)
VATTR_RETURN(va, va_parentid, 1);
elseif ((parent_vn = vnode_getparent(ni->vn))) {
parent_mft_no = NTFS_I(parent_vn)->mft_no;
(void)vnode_put(parent_vn);
have_parent = TRUE;
if (parent_mft_no == FILE_root)
parent_mft_no = 2;
VATTR_RETURN(va, va_parentid, parent_mft_no);
} elseif (ni != base_ni) {
parent_mft_no = base_ni->mft_no;
if (parent_mft_no == FILE_root)
parent_mft_no = 2;
VATTR_RETURN(va, va_parentid, parent_mft_no);
} else/* if (ni == base_ni) */ {
name_is_done = TRUE;
name = NULL;
if (VATTR_IS_ACTIVE(va, va_name))
name = va->va_name;
err = ntfs_inode_get_name_and_parent_mref(base_ni,
FALSE, &parent_mref, name);
if (err) {
ntfs_error(base_ni->vol->mp, "Failed to obtain ""parent mft reference for ""mft_no 0x%llx (error %d).",
(unsignedlonglong)
base_ni->mft_no, err);
gotoerr;
}
parent_mft_no = MREF(parent_mref);
if (parent_mft_no == FILE_root)
parent_mft_no = 2;
va->va_parentid = parent_mft_no;
va->va_supported |= VNODE_ATTR_BIT(va_parentid) |
(name ? VNODE_ATTR_BIT(va_name) : 0);
}
}
/*
* Return va_name, i.e. the name of the inode, if it was requested.
*
* If this is the root directory of the volume, leave it to the VFS to
* find the mounted-on name, which is different from the real volume
* root directory name of "." (this is ensured by the fact that
* @name_is_done was set to TRUE for the root directory earlier).
*
* For all other inodes we try to get the name from the vnode and if it
* does not have the name cached we obtain the name by looking up a
* filename attribute record in the mft record of the inode and using
* that.
*
* Note we do not need to do anything if we dealt with the name as part
* of dealing with va_parentid above. In this case @name_is_done will
* be set to true.
*
* Also we do not need to do anything if we tried to deal with
* va_parentid above and failed as we would only fail again here. This
* means that if @err is not zero we skip the call to
* ntfs_inode_get_name_and_parent_mref().
*
* TODO: What do we return for attribute inodes? Shall we exclude them
* from VNOP_GETATTR() altogether? For now we simply do not return a
* name for them.
*/if (!name_is_done && VATTR_IS_ACTIVE(va, va_name) && ni == base_ni) {
name = vnode_getname(base_ni->vn);
if (name) {
(void)strlcpy(va->va_name, name, MAXPATHLEN - 1);
VATTR_SET_SUPPORTED(va, va_name);
(void)vnode_putname(name);
} else {
err = ntfs_inode_get_name_and_parent_mref(base_ni,
have_parent, &parent_mref, va->va_name);
if (err) {
ntfs_error(base_ni->vol->mp, "Failed to obtain ""parent mft reference for ""mft_no 0x%llx (error %d).",
(unsignedlonglong)
base_ni->mft_no, err);
gotoerr;
}
/*
* We forcibly overwrite the parent id with the
* possibly new parent id here to be consistent with
* the name, i.e. we want the name we return to
* actually exist in the returned parent.
*
* If we already had the parent id from before then
* ntfs_inode_get_name_and_parent_mref() will have
* found the name matching this parent id thus our
* setting of the parent id here will be a no-op.
*/
va->va_parentid = MREF(parent_mref);
if (va->va_parentid == FILE_root)
va->va_parentid = 2;
va->va_supported |= VNODE_ATTR_BIT(va_parentid) |
VNODE_ATTR_BIT(va_name);
}
}
/*
* Unlock the attribute inode as we do not need it any more and so we
* cannot deadlock with converting the lock on the base inode to
* exclusive and with the call to ntfs_inode_afpinfo_read() below.
*/if (ni != base_ni)
lck_rw_unlock_shared(&ni->lock);
if (VATTR_IS_ACTIVE(va, va_backup_time)) {
if (!NInoValidBackupTime(base_ni)) {
if (!lck_rw_lock_shared_to_exclusive(&base_ni->lock)) {
lck_rw_lock_exclusive(&base_ni->lock);
if (NInoDeleted(base_ni)) {
cache_purge(base_ni->vn);
lck_rw_unlock_exclusive(&base_ni->lock);
return ENOENT;
}
}
lock = LCK_RW_TYPE_EXCLUSIVE;
/*
* Load the AFP_AfpInfo stream and initialize the
* backup time and Finder Info (if they are not already
* valid).
*/
err = ntfs_inode_afpinfo_read(base_ni);
if (err) {
ntfs_error(base_ni->vol->mp, "Failed to ""read AFP_AfpInfo attribute ""from inode 0x%llx (error ""%d).", (unsignedlonglong)
base_ni->mft_no, err);
lck_rw_unlock_exclusive(&base_ni->lock);
return err;
}
if (!NInoValidBackupTime(base_ni))
panic("%s(): !NInoValidBackupTime(base_ni)\n",
__FUNCTION__);
}
VATTR_RETURN(va, va_backup_time, base_ni->backup_time);
}
if (lock == LCK_RW_TYPE_SHARED)
lck_rw_unlock_shared(&base_ni->lock);
else
lck_rw_unlock_exclusive(&base_ni->lock);
ntfs_debug("Done.");
return 0;
err:
lck_rw_unlock_shared(&ni->lock);
if (ni != base_ni)
lck_rw_unlock_shared(&base_ni->lock);
return err;
}
/**
* ntfs_vnop_setattr - set attributes of a vnode or of the mounted volume
* @a: arguments to setattr function
*
* @a contains:
* vnode_t a_vp; vnode of which to set attributes
* struct vnode_attr *a_vap; attributes to set and source
* vfs_context_t a_context;
*
* Set the attributes described by @a_vap in the vnode @a_vp. Some attributes
* are intercepted by the VFS in setattrlist() and setvolattrlist() so we do
* not bother with them.
*
* At present we do not support all attributes. We declare what we support to
* the world in our VFS_GETATTR() function (ntfs_vfsops.c::ntfs_getattr()) so
* do not forget to update that when support for further attributes is added
* here.
*
* Return 0 on success and errno on error.
*
* TODO: Implement more attributes.
*/staticintntfs_vnop_setattr(struct vnop_setattr_args *a)
{
ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp);
ntfs_volume *vol;
struct vnode_attr *va = a->a_vap;
errno_t err = 0;
BOOL dirty_times = FALSE;
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = ni->vol;
ntfs_debug("Entering for mft_no 0x%llx.",
(unsignedlonglong)ni->mft_no);
base_ni = ni;
if (NInoAttr(ni)) {
base_ni = ni->base_ni;
lck_rw_lock_exclusive(&base_ni->lock);
}
lck_rw_lock_exclusive(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
err = ENOENT;
gotounl_err;
}
if (VATTR_IS_ACTIVE(va, va_data_size)) {
ntfs_debug("Changing size for mft_no 0x%llx to 0x%llx.",
(unsignedlonglong)ni->mft_no,
(unsignedlonglong)va->va_data_size);
#if 1 // TODO: Removethiswhensparsesupportisdone...
if (NInoSparse(ni)) {
err = ENOTSUP;
gotounl_err;
}
#endif/*
* Do not allow calling for $MFT/$DATA as it would destroy the
* volume.
*
* Also only allow setting the size of VREG vnodes as that
* covers both regular files and named streams whilst excluding
* symbolic links for example.
*/if (vnode_vtype(ni->vn) != VREG ||
(!ni->mft_no && !NInoAttr(ni)))
err = EPERM;
else
err = ntfs_attr_resize(ni, va->va_data_size,
va->va_vaflags & 0xffff, NULL);
if (err) {
ntfs_error(vol->mp, "Failed to set inode size (error ""%d).", err);
gotounl_err;
}
VATTR_SET_SUPPORTED(va, va_data_size);
}
/*
* Unlock the attribute inode as we do not need it any more and so we
* cannot deadlock with the call to ntfs_inode_afpinfo_write() below.
*/if (ni != base_ni)
lck_rw_unlock_exclusive(&ni->lock);
if (VATTR_IS_ACTIVE(va, va_flags)) {
u32 flags = va->va_flags;
BOOL dirty_flags = FALSE;
/*
* Only allow changing of supported flags. There are two
* exceptions and those are the archived flag and read-only bit
* on directories which are not supported on NTFS but we have
* to ignore them or too many things break such as "cp -pr"
* from a more sensible file system.
*/if (flags & ~(SF_ARCHIVED | SF_IMMUTABLE | UF_IMMUTABLE |
UF_HIDDEN /* | SF_COMPRESSED */)) {
ntfs_error(vol->mp, "Cannot set unsupported flags ""0x%x.",
(unsigned)(flags & ~(SF_ARCHIVED |
SF_IMMUTABLE | UF_IMMUTABLE |
UF_HIDDEN)));
err = EINVAL;
gotoerr;
}
/*
* We do not allow modification for any of the core NTFS
* system files which we want to remain as they are except that
* we silently ignore changes to the root directory.
*/if (base_ni->mft_no < FILE_first_user &&
base_ni != vol->root_ni) {
ntfs_error(vol->mp, "Refusing to change flags on core ""NTFS system file (mft_no 0x%llx).",
(unsignedlonglong)base_ni->mft_no);
err = EPERM;
gotoerr;
}
/*
* We currently do not support changing the compression state
* of a vnode.
*
* Further, only the base inode may be compressed.
*//*
* if (((flags & SF_COMPRESSED) && !NInoCompressed(ni)) ||
* (!(flags & SF_COMPRESSED) &&
* NInoCompressed(ni))) {
* if (ni != base_ni) {
* ntfs_error(vol->mp, "Only regular files and "
* "directories may be "
* "compressed, aborting.");
* err = EINVAL;
* goto err;
* }
* ntfs_warning(vol->mp, "Changing the compression state "
* "is not supported at present, "
* "returning ENOTSUP.");
* err = ENOTSUP;
* goto err;
* }
*//*
* The root directory of a volume always has the hidden bit set
* but we pretend that it is not hidden to OS X and we do not
* allow this bit to be modified for the root directory.
*/if (base_ni != vol->root_ni) {
/*
* If the Finder info is valid need to update it as
* well. Note setting or clearing the hidden flag in
* the Finder info does not cause the Finder info to
* become dirty as the hidden bit is not stored on disk
* in the Finder info.
*/if (flags & UF_HIDDEN) {
base_ni->file_attributes |= FILE_ATTR_HIDDEN;
if (NInoValidFinderInfo(base_ni))
base_ni->finder_info.attrs |=
FINDER_ATTR_IS_HIDDEN;
} else {
base_ni->file_attributes &= ~FILE_ATTR_HIDDEN;
if (NInoValidFinderInfo(base_ni))
base_ni->finder_info.attrs &=
~FINDER_ATTR_IS_HIDDEN;
}
dirty_flags = TRUE;
}
/*
* Windows does not allow users to set/clear the read-only bit
* on directories. In fact Windows uses the read-only bit on a
* directory to signify that a customized or specialized folder
* view is in effect thus we do not allow setting/clearing the
* read-only bit on directories from OS X.
*
* Windows does not set the "needs archiving" bit on
* directories.
*
* The only exception are encrypted directories which do have
* the "needs archiving" bit set but we do not want to allow
* this bit to be cleared so ignore them, too.
*/if (!S_ISDIR(base_ni->mode)) {
if (flags & (SF_IMMUTABLE | UF_IMMUTABLE))
base_ni->file_attributes |= FILE_ATTR_READONLY;
else
base_ni->file_attributes &= ~FILE_ATTR_READONLY;
if (flags & SF_ARCHIVED)
base_ni->file_attributes &= ~FILE_ATTR_ARCHIVE;
else
base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
dirty_flags = TRUE;
}
if (dirty_flags)
NInoSetDirtyFileAttributes(base_ni);
VATTR_SET_SUPPORTED(va, va_flags);
}
if (VATTR_IS_ACTIVE(va, va_create_time)) {
base_ni->creation_time = va->va_create_time;
VATTR_SET_SUPPORTED(va, va_create_time);
dirty_times = TRUE;
}
if (VATTR_IS_ACTIVE(va, va_modify_time)) {
base_ni->last_data_change_time = va->va_modify_time;
VATTR_SET_SUPPORTED(va, va_modify_time);
dirty_times = TRUE;
/*
* The following comment came from the HFS code:
*
* <quote>The utimes system call can reset the modification
* time but it doesn't know about HFS create times. So we need
* to ensure that the creation time is always at least as old
* as the modification time.</quote>
*
* SMB also follows this behaviour and it also adds the
* following comment:
*
* <quote>The HFS code also checks to make sure it was not the
* root vnode. Don Brady said that the SMB code should not use
* that part of the check.</quote>
*
* I assume the root vnode check is there in HFS as it does not
* support times on the root vnode at all so the check is
* needed for HFS only.
*
* The same applies for NTFS so follow the HFS/SMB behaviour.
*
* One salient point is that we only do the above if the
* creation time is not being explicitly set already.
*/if (!VATTR_IS_ACTIVE(va, va_create_time) &&
(va->va_modify_time.tv_sec <
base_ni->creation_time.tv_sec ||
(va->va_modify_time.tv_sec ==
base_ni->creation_time.tv_sec &&
va->va_modify_time.tv_nsec <
base_ni->creation_time.tv_nsec)))
base_ni->creation_time = va->va_modify_time;
}
if (VATTR_IS_ACTIVE(va, va_change_time)) {
base_ni->last_mft_change_time = va->va_change_time;
VATTR_SET_SUPPORTED(va, va_change_time);
dirty_times = TRUE;
}
if (VATTR_IS_ACTIVE(va, va_access_time)) {
base_ni->last_access_time = va->va_access_time;
VATTR_SET_SUPPORTED(va, va_access_time);
dirty_times = TRUE;
}
if (dirty_times)
NInoSetDirtyTimes(base_ni);
if (VATTR_IS_ACTIVE(va, va_backup_time)) {
base_ni->backup_time = va->va_backup_time;
NInoSetValidBackupTime(base_ni);
NInoSetDirtyBackupTime(base_ni);
/*
* Now write (if needed creating) the AFP_AfpInfo attribute
* with the specified backup time.
*/
err = ntfs_inode_afpinfo_write(base_ni);
if (err) {
ntfs_error(vol->mp, "Failed to write/create ""AFP_AfpInfo attribute in inode ""0x%llx (error %d).",
(unsignedlonglong)base_ni->mft_no,
err);
gotoerr;
}
VATTR_SET_SUPPORTED(va, va_backup_time);
}
ntfs_debug("Done.");
err:
lck_rw_unlock_exclusive(&base_ni->lock);
return err;
unl_err:
if (ni != base_ni)
lck_rw_unlock_exclusive(&ni->lock);
gotoerr;
}
/*
 * Limit the internal i/o size so we can represent it in a 32-bit int.  The
 * value is 256MiB.
 */
#define NTFS_MAX_IO_REQUEST_SIZE	(1024 * 1024 * 256)
/**
 * ntfs_vnop_read_compressed - read from a compressed attribute
 * @ni:		ntfs inode describing the compressed attribute to read
 * @uio:	destination in which to return the read data
 * @data_size:	data size of the compressed attribute
 * @ioflags:	flags further describing the read request (see ntfs_vnop_read())
 *
 * This is a helper function for ntfs_vnop_read() (see below).  It is called
 * when a read request for a compressed attribute is received by
 * ntfs_vnop_read().
 *
 * This function is somewhat similar to cluster_read() or to be more precise to
 * cluster_read_copy() in that it breaks up large i/os into smaller manageable
 * chunks, and for each chunk tries to get the data from the vm page cache and
 * return it in the destination buffer described by @uio and failing that, it
 * creates and maps a upl and causes it to be filled with data by calling
 * ntfs_read_compressed() which reads the compressed data via the raw inode and
 * decompresses it into our mapped upl and once that is done we now have the
 * data in the vm page cache and copy it into the destination buffer described
 * by @uio.
 *
 * The raw inode obtained via ntfs_raw_inode_get() is held with its lock taken
 * shared for the duration of the call and is released again (lock dropped and
 * vnode_put() called) on all return paths after it was obtained.
 *
 * Return 0 on success and errno on error.  EINVAL is returned for a negative
 * starting offset and EIO if the page list cannot be mapped or unmapped.
 */
static inline int ntfs_vnop_read_compressed(ntfs_inode *ni, uio_t uio,
		const s64 data_size, int ioflags)
{
	s64 size;
	user_ssize_t start_count;
	off_t ofs;
	vnode_t vn = ni->vn;
	ntfs_inode *raw_ni;
	upl_t upl;
	upl_page_info_t *pl;
	kern_return_t kerr;
	int count, err, align_mask, cur_pg, last_pg;
	int max_upl_size = ubc_upl_maxbufsize();

	ofs = uio_offset(uio);
	start_count = uio_resid(uio);
	ntfs_debug("Entering for compressed file inode 0x%llx, offset 0x%llx, "
			"count 0x%llx, ioflags 0x%x.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)ofs,
			(unsigned long long)start_count, ioflags);
	/*
	 * We can only read from regular files and named streams that are
	 * compressed and non-resident.  We should never be called for anything
	 * else.
	 */
	if (ni->type != AT_DATA || !NInoCompressed(ni) ||
			!NInoNonResident(ni) || NInoEncrypted(ni) ||
			NInoRaw(ni))
		panic("%s(): Called for inappropriate inode.\n", __FUNCTION__);
	/*
	 * Get the raw inode.  We take the inode lock shared to protect against
	 * concurrent writers as the compressed data is invalid whilst a write
	 * is in progress.
	 */
	err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
	if (err) {
		ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
				err);
		return err;
	}
	if (!NInoRaw(raw_ni))
		panic("%s(): Requested raw inode but got non-raw one.\n",
				__FUNCTION__);
	lck_spin_lock(&raw_ni->size_lock);
	size = ubc_getsize(raw_ni->vn);
	if (size != raw_ni->data_size)
		panic("%s(): size != raw_ni->data_size\n", __FUNCTION__);
	lck_spin_unlock(&raw_ni->size_lock);
	/*
	 * If nothing was requested or the request starts at or beyond the end
	 * of the attribute, we do not need to do anything.
	 */
	if (!start_count || ofs >= data_size) {
		err = 0;
		goto err;
	}
	/* Cannot read from a negative offset. */
	if (ofs < 0) {
		err = EINVAL;
		goto err;
	}
	if (vnode_isnocache(vn) || vnode_isnocache(raw_ni->vn))
		ioflags |= IO_NOCACHE;
	if (vnode_isnoreadahead(vn) || vnode_isnoreadahead(raw_ni->vn))
		ioflags |= IO_RAOFF;
	/*
	 * Each i/o is aligned to the larger of the compression block size and
	 * the page size.
	 */
	align_mask = ni->compression_block_size - 1;
	if (align_mask < PAGE_MASK)
		align_mask = PAGE_MASK;
	/*
	 * Loop until we have finished the whole request or reached the end of
	 * the attribute.
	 *
	 * FIXME: We do not bother with read-ahead on the uncompressed vnode
	 * for now except to the extent that we always decompress full
	 * compression blocks which may be larger than the current i/o request
	 * so the next i/o request will find the whole compression block
	 * decompressed in the vm page cache thus small reads will in effect
	 * experience a certain amount of read-ahead in this way.
	 */
	do {
		u8 *kaddr;
		int delta, next_pg, orig_count;

		size = data_size - ofs;
		if (size > start_count)
			size = start_count;
		count = size;
		/*
		 * Break up the i/o in chunks that fit into a 32-bit int so
		 * we can call cluster_copy_ubc_data(), etc.
		 */
		if (size > NTFS_MAX_IO_REQUEST_SIZE)
			count = NTFS_MAX_IO_REQUEST_SIZE;
		/*
		 * First of all, try to copy the data from the vm page cache.
		 * This will work on the second and all later reads so this is
		 * the hot path.  If the attribute has not been accessed at all
		 * before or its cached pages were dropped due to vm pressure
		 * this will fail to copy any data due to the lack of a valid
		 * page and we will drop into the slow path.
		 */
		if (!(ioflags & IO_NOCACHE)) {
			err = cluster_copy_ubc_data(vn, uio, &count, 0);
			if (err) {
				/*
				 * The copying (uiomove()) failed with an
				 * error, abort.
				 */
				ntfs_error(ni->vol->mp,
						"cluster_copy_ubc_data() "
						"failed (error %d).", err);
				goto err;
			}
			/*
			 * @count is now set to the number of bytes remaining
			 * to be transferred.  If it is zero, it means all the
			 * pages were in the vm page cache so we can skip onto
			 * the next part of the i/o.
			 *
			 * Note the continue jumps to the loop condition which
			 * refreshes @start_count and @ofs from @uio.
			 */
			if (!count)
				continue;
			ofs = uio_offset(uio);
		}
		/*
		 * Only some or none of the pages were in the vm page cache or
		 * this is not a cached i/o.  First align this i/o request to
		 * compression block boundaries and to PAGE_SIZE boundaries and
		 * truncate it to the maximum upl size then create and map a
		 * page list so we can fill it with the data.
		 */
		delta = ofs & align_mask;
		ofs -= delta;
		orig_count = count;
		count += delta;
		count = (count + align_mask) & ~(off_t)align_mask;
		if (count > max_upl_size)
			count = max_upl_size;
		/*
		 * Do not exceed the attribute size except for a final partial
		 * page.
		 */
		size = (data_size - ofs + PAGE_MASK) & ~PAGE_MASK_64;
		if (count > size)
			count = size;
		/*
		 * From here until the end of this iteration @start_count holds
		 * the size in bytes of the page list.  The commit code below
		 * and the abort_err error path rely on this.
		 */
		start_count = count;
		kerr = ubc_create_upl(vn, ofs, count, &upl, &pl, UPL_SET_LITE);
		if (kerr != KERN_SUCCESS)
			panic("%s(): Failed to get page list (error %d).\n",
					__FUNCTION__, (int)kerr);
		kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
		if (kerr != KERN_SUCCESS) {
			ntfs_error(ni->vol->mp, "Failed to map page list "
					"(error %d).", (int)kerr);
			err = EIO;
			goto abort_err;
		}
		/*
		 * We know @ofs starts on both a compression block and a page
		 * boundary.  We read from the compressed raw vnode
		 * decompressing the data into our mapped page list.  Any
		 * already valid pages are automatically skipped.
		 */
		err = ntfs_read_compressed(ni, raw_ni, ofs, count, kaddr, pl,
				ioflags);
		if (err) {
			ntfs_error(ni->vol->mp, "Failed to decompress data "
					"(error %d).", err);
			goto unm_err;
		}
		/*
		 * We now have the entire page list filled with valid pages,
		 * thus we can now copy from the mapped page list into the
		 * destination buffer using uiomove().  We just need to make
		 * sure not to copy past the end of the attribute.
		 */
		ofs += delta;
		count -= delta;
		if (count > orig_count)
			count = orig_count;
		if (ofs + count > data_size)
			count = data_size - ofs;
		err = uiomove((caddr_t)(kaddr + delta), count, uio);
		if (err) {
			ntfs_error(ni->vol->mp, "uiomove() failed (error %d).",
					err);
			goto unm_err;
		}
		kerr = ubc_upl_unmap(upl);
		if (kerr != KERN_SUCCESS) {
			ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed "
					"(error %d).", (int)kerr);
			err = EIO;
			goto abort_err;
		}
		/*
		 * We are done with the page list, commit and/or abort the
		 * pages.
		 */
		next_pg = 0;
		last_pg = start_count >> PAGE_SHIFT;
		do {
			int commit_flags;
			BOOL was_valid, was_dirty;

			cur_pg = next_pg;
			/* Determine the state of the current first page. */
			was_valid = upl_valid_page(pl, cur_pg);
			was_dirty = (was_valid && upl_dirty_page(pl, cur_pg));
			/* Find sequential pages of the same state. */
			for (next_pg = cur_pg + 1; next_pg < last_pg;
					next_pg++) {
				if (was_valid != upl_valid_page(pl, next_pg))
					break;
				if (was_valid) {
					if (was_dirty != upl_dirty_page(pl,
							next_pg))
						break;
				}
			}
			count = (next_pg - cur_pg) << PAGE_SHIFT;
			/*
			 * For a set of pages that were invalid and hence we
			 * just filled them with data we commit and clean them
			 * unless no caching is requested in which case we dump
			 * them.
			 *
			 * For a set of pages that were already valid and hence
			 * we did not touch we commit them taking care to
			 * preserve any dirty state unless the pages were clean
			 * and no caching is requested in which case we dump
			 * them.
			 */
			if (ioflags & IO_NOCACHE && !was_dirty) {
				ubc_upl_abort_range(upl, cur_pg << PAGE_SHIFT,
						count, UPL_ABORT_DUMP_PAGES |
						UPL_ABORT_FREE_ON_EMPTY);
				continue;
			}
			commit_flags = UPL_COMMIT_FREE_ON_EMPTY |
					UPL_COMMIT_INACTIVATE;
			if (!was_valid)
				commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
			else if (was_dirty)
				commit_flags |= UPL_COMMIT_SET_DIRTY;
			ubc_upl_commit_range(upl, cur_pg << PAGE_SHIFT, count,
					commit_flags);
		} while (next_pg < last_pg);
	} while ((start_count = uio_resid(uio)) &&
			(ofs = uio_offset(uio)) < data_size);
	ntfs_debug("Done.");
err:
	/* Release the raw inode obtained via ntfs_raw_inode_get() above. */
	lck_rw_unlock_shared(&raw_ni->lock);
	(void)vnode_put(raw_ni->vn);
	return err;
unm_err:
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS)
		ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
abort_err:
	/*
	 * We handle each page independently for simplicity.  We do not care
	 * for performance given this is an error code path.
	 *
	 * For a page that was not valid, we dump it as it still does not
	 * contain valid data.  For a page that was valid, we release it
	 * without modification as we have not touched it unless no caching is
	 * requested and the page was clean in which case we dump it.
	 */
	last_pg = start_count >> PAGE_SHIFT;
	for (cur_pg = 0; cur_pg < last_pg; cur_pg++) {
		int abort_flags;

		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
		if (!upl_valid_page(pl, cur_pg) || (ioflags & IO_NOCACHE &&
				!upl_dirty_page(pl, cur_pg)))
			abort_flags |= UPL_ABORT_DUMP_PAGES;
		ubc_upl_abort_range(upl, cur_pg << PAGE_SHIFT, PAGE_SIZE,
				abort_flags);
	}
	goto err;
}
// TODO: Rename to ntfs_inode_read and move to ntfs_inode.[hc]?
/**
 * ntfs_read - read a number of bytes from an inode into memory
 * @ni:		ntfs inode whose data to read into memory
 * @uio:	destination in which to return the read data
 * @ioflags:	flags further describing the read request
 * @locked:	if true the ntfs inode lock is already taken for reading
 *
 * Read uio_resid(@uio) bytes from the ntfs inode @ni, starting at byte offset
 * uio_offset(@uio) into the inode into the destination buffer pointed to by
 * @uio.
 *
 * The flags in @ioflags further describe the read request.  The following
 * ioflags are currently defined in OS X kernel (a lot of them are not
 * applicable to VNOP_READ() however):
 *	IO_UNIT		- Do i/o as atomic unit.
 *	IO_APPEND	- Append write to end.
 *	IO_SYNC		- Do i/o synchronously.
 *	IO_NODELOCKED	- Underlying node already locked.
 *	IO_NDELAY	- FNDELAY flag set in file table.
 *	IO_NOZEROFILL	- F_SETSIZE fcntl uses this to prevent zero filling.
 *	IO_TAILZEROFILL	- Zero fills at the tail of write.
 *	IO_HEADZEROFILL	- Zero fills at the head of write.
 *	IO_NOZEROVALID	- Do not zero fill if valid page.
 *	IO_NOZERODIRTY	- Do not zero fill if page is dirty.
 *	IO_CLOSE	- The i/o was issued from close path.
 *	IO_NOCACHE	- Same effect as VNOCACHE_DATA, but only for this i/o.
 *	IO_RAOFF	- Same effect as VRAOFF, but only for this i/o.
 *	IO_DEFWRITE	- Defer write if vfs.defwrite is set.
 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
 *
 * For encrypted attributes we abort for now as we do not support them yet.
 *
 * For non-resident attributes we use cluster_read_ext() which deals with both
 * normal and multi sector transfer protected attributes and
 * ntfs_vnop_read_compressed() which deals with compressed attributes.
 *
 * For resident attributes we read the data from the vm page cache and if it is
 * not there we cause the vm page cache to be populated by reading the buffer
 * at offset 0 in the attribute.
 *
 * If anything was read, the last access time of the base inode is updated
 * unless the volume is read-only or mounted noatime, the inode is a symbolic
 * link, or the inode is a core system file other than the root directory.
 *
 * Return 0 on success and errno on error.  ENOENT is returned if the inode has
 * been deleted, EINVAL for a negative starting offset, and EACCES for an
 * encrypted attribute.
 *
 * Note it is up to the caller to verify that reading from the inode @ni makes
 * sense.  We cannot do the verification inside ntfs_read() as it is called
 * from various VNOPs which all have different requirements.  For example
 * VNOP_READLINK(), i.e. ntfs_vnop_readlink(), needs to only allow S_ISLNK()
 * inodes whilst VNOP_READ(), i.e. ntfs_vnop_read(), needs to not allow
 * S_ISLNK() but needs to allow S_IFREG() instead but only if it is not a
 * system file.
 */
static errno_t ntfs_read(ntfs_inode *ni, uio_t uio, const int ioflags,
		const BOOL locked)
{
	s64 size;
	user_ssize_t start_count;
	off_t ofs;
	vnode_t vn = ni->vn;
	ntfs_inode *base_ni;
	upl_t upl;
	upl_page_info_array_t pl;
	u8 *kaddr;
	int err, count;

	ofs = uio_offset(uio);
	start_count = uio_resid(uio);
	base_ni = ni;
	if (NInoAttr(ni))
		base_ni = ni->base_ni;
	ntfs_debug("Entering for file inode 0x%llx, offset 0x%llx, count "
			"0x%llx, ioflags 0x%x, locked is %s.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)ofs,
			(unsigned long long)start_count, ioflags,
			locked ? "true" : "false");
	/*
	 * Protect against changes in initialized_size and thus against
	 * truncation also.
	 */
	if (!locked)
		lck_rw_lock_shared(&ni->lock);
	/* Do not allow messing with the inode once it has been deleted. */
	if (NInoDeleted(ni)) {
		if (!locked)
			lck_rw_unlock_shared(&ni->lock);
		/* Remove the inode from the name cache. */
		cache_purge(ni->vn);
		return ENOENT;
	}
	/*
	 * TODO: This check may no longer be necessary now that we lock against
	 * changes in initialized size and thus truncation...  Revisit this
	 * issue when the write code has been written and remove the check if
	 * appropriate simply using ubc_getsize(vn); without the size_lock.
	 */
	lck_spin_lock(&ni->size_lock);
	size = ubc_getsize(vn);
	if (size > ni->data_size)
		size = ni->data_size;
	lck_spin_unlock(&ni->size_lock);
	/*
	 * If nothing was requested or the request starts at or beyond the end
	 * of the attribute, we do not need to do anything.
	 */
	if (!start_count || ofs >= size) {
		err = 0;
		goto err;
	}
	/* Cannot read from a negative offset. */
	if (ofs < 0) {
		err = EINVAL;
		goto err;
	}
	/* TODO: Deny access to encrypted attributes, just like NT4. */
	if (NInoEncrypted(ni)) {
		ntfs_warning(ni->vol->mp, "Denying access to encrypted "
				"attribute (EACCES).");
		err = EACCES;
		goto err;
	}
	if (NInoNonResident(ni)) {
		int (*callback)(buf_t, void *);

		if (NInoCompressed(ni) && !NInoRaw(ni)) {
			err = ntfs_vnop_read_compressed(ni, uio, size, ioflags);
			if (!err)
				ntfs_debug("Done (ntfs_vnop_read_compressed()"
						").");
			else
				ntfs_error(ni->vol->mp, "Failed ("
						"ntfs_vnop_read_compressed(), "
						"error %d).", err);
			goto err;
		}
		callback = NULL;
		if (NInoMstProtected(ni) || NInoEncrypted(ni))
			callback = ntfs_cluster_iodone;
		err = cluster_read_ext(vn, uio, size, ioflags, callback, NULL);
		if (!err)
			ntfs_debug("Done (cluster_read_ext()).");
		else
			ntfs_error(ni->vol->mp, "Failed for file inode "
					"0x%llx, start offset 0x%llx, start "
					"count 0x%llx, now offset 0x%llx, "
					"now count 0x%llx, ioflags 0x%x "
					"(cluster_read_ext(), error %d).",
					(unsigned long long)ni->mft_no,
					(unsigned long long)ofs,
					(unsigned long long)start_count,
					(unsigned long long)uio_offset(uio),
					(unsigned long long)uio_resid(uio),
					ioflags, err);
		goto err;
	} /* else if (!NInoNonResident(ni)) */
	/*
	 * That attribute is resident thus we have to deal with it by
	 * ourselves.  First of all, try to copy the data from the vm page
	 * cache.  This will work on the second and all later reads so this is
	 * the hot path.  If the attribute has not been accessed at all before
	 * or its cached pages were dropped due to vm pressure this will fail
	 * to copy any data due to the lack of a valid page and we will drop
	 * into the slow path.
	 */
	size -= ofs;
	if (size > start_count)
		size = start_count;
	if (size > PAGE_SIZE) {
		ntfs_warning(ni->vol->mp, "Unexpected count 0x%llx > PAGE_SIZE "
				"0x%x, overriding it to PAGE_SIZE.",
				(unsigned long long)size, PAGE_SIZE);
		size = PAGE_SIZE;
	}
	count = size;
	err = cluster_copy_ubc_data(vn, uio, &count, 0);
	if (err) {
		/* The copying (uiomove()) failed with an error, abort. */
		ntfs_error(ni->vol->mp, "cluster_copy_ubc_data() failed "
				"(error %d).", err);
		goto err;
	}
	/*
	 * @count is now set to the number of bytes remaining to be
	 * transferred.  If it is zero, it means we are done.  Note it is
	 * possible that there is more data requested, i.e. uio_resid(uio) > 0,
	 * but that just means the request goes beyond the end of the
	 * attribute.
	 */
	if (!count) {
		ntfs_debug("Done (resident, cached, returned 0x%llx bytes).",
				(unsigned long long)size);
		goto err;
	}
	/*
	 * We failed to transfer everything.  That really means we failed to
	 * transfer anything at all as we are guaranteed that a resident
	 * attribute is smaller than a page thus either the page is there and
	 * valid and we transfer everything or it is not and we transfer
	 * nothing.
	 */
	if (count != size) {
		ntfs_warning(ni->vol->mp, "Unexpected partial transfer from "
				"cached page (size 0x%llx, count 0x%x).",
				(unsigned long long)size, count);
		ofs = uio_offset(uio);
	}
	/*
	 * The page is not in cache or is not valid.  We need to bring it into
	 * cache and make it valid so we can then copy the data out.  The
	 * easiest way to do this is to just map the page which will take care
	 * of everything for us.  We can than uiomove() straight out of the
	 * page into the @uio and then unmap the page again.
	 *
	 * Note this will take the inode lock again but this is ok as in both
	 * cases the lock is taken shared.
	 */
	err = ntfs_page_map(ni, 0, &upl, &pl, &kaddr, FALSE);
	if (err) {
		ntfs_error(ni->vol->mp, "Failed to map page (error %d).", err);
		goto err;
	}
	err = uiomove((caddr_t)(kaddr + ofs), count, uio);
	ntfs_page_unmap(ni, upl, pl, FALSE);
	/*
	 * The number of bytes returned is what was originally requested minus
	 * what is still outstanding in @uio.  We must not use @size for this
	 * as the request may extend beyond the end of the attribute in which
	 * case uio_resid(uio) remains non-zero after a complete transfer.
	 */
	if (!err)
		ntfs_debug("Done (resident, not cached, returned 0x%llx "
				"bytes).", (unsigned long long)(start_count -
				uio_resid(uio)));
	else
		ntfs_error(ni->vol->mp, "uiomove() failed (error %d).", err);
err:
	/*
	 * Update the last_access_time (atime) if something was read and this
	 * is the base ntfs inode or it is a named stream (this is what HFS+
	 * does, too).
	 *
	 * Skip the update if atime updates are disabled via the noatime mount
	 * option or the volume is read only or this is a symbolic link.
	 *
	 * Also, skip the core system files except for the root directory.
	 */
	if (uio_resid(uio) < start_count && !NVolReadOnly(ni->vol) &&
			!(vfs_flags(ni->vol->mp) & MNT_NOATIME) &&
			!S_ISLNK(base_ni->mode) &&
			(ni == base_ni || ni->type == AT_DATA)) {
		BOOL need_update_time;

		need_update_time = TRUE;
		/*
		 * Which mft records count as core system files depends on the
		 * major version of the volume.
		 */
		if (ni->vol->major_ver > 1) {
			if (base_ni->mft_no <= FILE_Extend &&
					base_ni != ni->vol->root_ni)
				need_update_time = FALSE;
		} else {
			if (base_ni->mft_no <= FILE_UpCase &&
					base_ni != ni->vol->root_ni)
				need_update_time = FALSE;
		}
		if (need_update_time) {
			base_ni->last_access_time = ntfs_utc_current_time();
			NInoSetDirtyTimes(base_ni);
		}
	}
	if (!locked)
		lck_rw_unlock_shared(&ni->lock);
	return err;
}
/**
 * ntfs_vnop_read - read a number of bytes from a file into memory
 * @a:		arguments to read function
 *
 * @a contains:
 *	vnode_t a_vp;		vnode of file whose data to read into memory
 *	uio_t a_uio;		destination in which to return the read data
 *	int a_ioflag;		flags further describing the read request
 *	vfs_context_t a_context;
 *
 * Read uio_resid(@a->a_uio) bytes from the vnode @a->a_vp, starting at byte
 * offset uio_offset(@a->a_uio) into the vnode into the destination buffer
 * pointed to by @a->a_uio.
 *
 * The flags in @a->a_ioflag further describe the read request.  The following
 * ioflags are currently defined in OS X kernel (a lot of them are not
 * applicable to VNOP_READ() however):
 *	IO_UNIT		- Do i/o as atomic unit.
 *	IO_APPEND	- Append write to end.
 *	IO_SYNC		- Do i/o synchronously.
 *	IO_NODELOCKED	- Underlying node already locked.
 *	IO_NDELAY	- FNDELAY flag set in file table.
 *	IO_NOZEROFILL	- F_SETSIZE fcntl uses this to prevent zero filling.
 *	IO_TAILZEROFILL	- Zero fills at the tail of write.
 *	IO_HEADZEROFILL	- Zero fills at the head of write.
 *	IO_NOZEROVALID	- Do not zero fill if valid page.
 *	IO_NOZERODIRTY	- Do not zero fill if page is dirty.
 *	IO_CLOSE	- The i/o was issued from close path.
 *	IO_NOCACHE	- Same effect as VNOCACHE_DATA, but only for this i/o.
 *	IO_RAOFF	- Same effect as VRAOFF, but only for this i/o.
 *	IO_DEFWRITE	- Defer write if vfs.defwrite is set.
 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
 *
 * For encrypted attributes we abort for now as we do not support them yet.
 *
 * For non-resident attributes we use cluster_read_ext() which deals with both
 * normal and multi sector transfer protected attributes and
 * ntfs_vnop_read_compressed() which deals with compressed attributes.
 *
 * For resident attributes we read the data from the vm page cache and if it is
 * not there we cause the vm page cache to be populated by reading the buffer
 * at offset 0 in the attribute.
 *
 * This function only validates the vnode and then hands over to ntfs_read()
 * which does the actual work and takes the inode lock itself.
 *
 * Return 0 on success and errno on error.  EINVAL is returned if the vnode has
 * no ntfs inode, EISDIR if it is a directory, and EPERM if it is otherwise
 * not something that may be read via VNOP_READ().
 */
static int ntfs_vnop_read(struct vnop_read_args *a)
{
	vnode_t vn = a->a_vp;
	ntfs_inode *ni = NTFS_I(vn);

	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	/*
	 * We can only read from regular files and named streams.
	 *
	 * Also, do not allow reading from system files or mst protected
	 * attributes.
	 */
	if (vnode_issystem(vn) || NInoMstProtected(ni) ||
			(!S_ISREG(ni->mode) && !(NInoAttr(ni) &&
			ni->type == AT_DATA))) {
		if (S_ISDIR(ni->mode))
			return EISDIR;
		return EPERM;
	}
	return (int)ntfs_read(ni, a->a_uio, a->a_ioflag, FALSE);
}
// TODO: Rename to ntfs_inode_write and move to ntfs_inode.[hc]?
/**
* ntfs_write - write a number of bytes from a memory buffer into a file
* @ni: ntfs inode to write to
* @uio: source containing the data to write
* @ioflags: flags further describing the write request
* @write_locked: if true the ntfs inode lock is already taken for writing
*
* Write uio_resid(@uio) bytes from the source buffer specified by @uio to the
* ntfs inode @ni, starting at byte offset uio_offset(@uio) into the inode.
*
* The flags in @ioflags further describe the write request. The following
* ioflags are currently defined in OS X kernel (not all of them are applicable
* to VNOP_WRITE() however):
* IO_UNIT - Do i/o as atomic unit.
* IO_APPEND - Append write to end.
* IO_SYNC - Do i/o synchronously.
* IO_NODELOCKED - Underlying node already locked.
* IO_NDELAY - FNDELAY flag set in file table.
* IO_NOZEROFILL - F_SETSIZE fcntl uses this to prevent zero filling.
* IO_TAILZEROFILL - Zero fills at the tail of write.
* IO_HEADZEROFILL - Zero fills at the head of write.
* IO_NOZEROVALID - Do not zero fill if valid page.
* IO_NOZERODIRTY - Do not zero fill if page is dirty.
* IO_CLOSE - The i/o was issued from close path.
* IO_NOCACHE - Same effect as VNOCACHE_DATA, but only for this i/o.
* IO_RAOFF - Same effect as VRAOFF, but only for this i/o.
* IO_DEFWRITE - Defer write if vfs.defwrite is set.
* IO_PASSIVE - This is background i/o so do not throttle other i/o.
*
* For compressed and encrypted attributes we abort for now as we do not
* support them yet.
*
* For non-resident attributes we use cluster_write_ext() which deals with
* normal attributes.
*
* Return 0 on success and errno on error.
*
* Note it is up to the caller to verify that writing to the inode @ni makes
* sense. We cannot do the verification inside ntfs_write() as it is called
* from various VNOPs which all have different requirements. For example
* VNOP_SYMLINK(), i.e. ntfs_vnop_symlink(), needs to write to S_ISLNK() inodes
* whilst VNOP_WRITE(), i.e. ntfs_vnop_write(), needs to not allow S_ISLNK()
* but needs to allow S_IFREG() instead but only if it is not a system file.
*/
/*
 * Implementation summary for ntfs_write():
 *
 * @ni:			ntfs inode to write to
 * @uio:		source of the data; its offset and residual count are
 *			updated to reflect what was actually written
 * @ioflags:		IO_* flags describing the write request
 * @write_locked:	if true the caller states that @ni->lock is already held
 *			for writing; the lock is then neither taken nor dropped
 *			here and no ntfs_inode_sync() is done for IO_SYNC
 *
 * Early exits: EROFS for a read-only volume, 0 for a zero length write, EINVAL
 * for a negative offset, EACCES for encrypted inodes, ENOTSUP for compressed
 * and sparse inodes, and ENOENT if the inode has been deleted.
 *
 * On failure (and for failed IO_UNIT writes) the offset and residual count of
 * @uio are reset to the values they had on entry.  On a partial write the
 * residual count of @uio is the number of bytes that were not written.
 */
static errno_t ntfs_write(ntfs_inode *ni, uio_t uio, int ioflags,
		BOOL write_locked)
{
	s64 old_size, size, end, nr_truncated;
	user_ssize_t old_count, count;
	off_t old_ofs, ofs;
	vnode_t vn = ni->vn;
	ntfs_inode *base_ni;
	upl_t upl;
	upl_page_info_array_t pl;
	u8 *kaddr;
	int cnt;
	errno_t err;
	BOOL was_locked, need_uptodate;

	/* Do not allow writing if mounted read-only. */
	if (NVolReadOnly(ni->vol))
		return EROFS;
	nr_truncated = 0;
	ofs = old_ofs = uio_offset(uio);
	count = old_count = uio_resid(uio);
	ntfs_debug("Entering for file inode 0x%llx, offset 0x%llx, count "
			"0x%llx, ioflags 0x%x, write_locked is %s.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)ofs,
			(unsigned long long)count, ioflags,
			write_locked ? "true" : "false");
	/* If nothing to do return success. */
	if (!count)
		return 0;
	/* Cannot write to a negative offset. */
	if (ofs < 0)
		return EINVAL;
	/* TODO: Deny access to encrypted attributes, just like NT4. */
	if (NInoEncrypted(ni)) {
		ntfs_warning(ni->vol->mp, "Denying write to encrypted "
				"attribute (EACCES).");
		return EACCES;
	}
	/* TODO: We do not support writing to compressed files. */
	if (NInoCompressed(ni)) {
		ntfs_error(ni->vol->mp, "Writing to compressed files is not "
				"implemented yet.  Sorry.");
		return ENOTSUP;
	}
#if 1	// TODO: Remove this when sparse support is done...
	if (NInoSparse(ni))
		return ENOTSUP;
#endif
	base_ni = ni;
	if (NInoAttr(ni))
		base_ni = ni->base_ni;
	/* The first byte after the write. */
	end = ofs + count;
	/*
	 * If we are going to extend the initialized size take the inode lock
	 * for writing and take it for reading otherwise.
	 *
	 * Appending will always cause the initialized size to be extended thus
	 * always take the lock for writing.
	 *
	 * Writing into holes requires us to take the lock for writing thus if
	 * this is a sparse file take the lock for writing just in case.
	 */
	was_locked = write_locked;
	if (ioflags & IO_APPEND) {
		if (!was_locked) {
			lck_rw_lock_exclusive(&ni->lock);
			write_locked = TRUE;
		}
		/*
		 * Do not allow messing with the inode once it has been
		 * deleted.
		 */
		if (NInoDeleted(ni)) {
			if (!was_locked)
				lck_rw_unlock_exclusive(&ni->lock);
			/* Remove the inode from the name cache. */
			cache_purge(ni->vn);
			return ENOENT;
		}
		lck_spin_lock(&ni->size_lock);
		ofs = ni->data_size;
		lck_spin_unlock(&ni->size_lock);
		uio_setoffset(uio, ofs);
		ntfs_debug("Write to mft_no 0x%llx, IO_APPEND flag is set, "
				"setting uio_offset() to file size 0x%llx.",
				(unsigned long long)ni->mft_no,
				(unsigned long long)ofs);
		/* Update the first byte after the write with the new offset. */
		end = ofs + count;
	} else {
		if (!was_locked) {
			if (NInoSparse(ni)) {
				lck_rw_lock_exclusive(&ni->lock);
				write_locked = TRUE;
			} else {
				lck_rw_lock_shared(&ni->lock);
				write_locked = FALSE;
			}
		}
recheck_deleted:
		/*
		 * Do not allow messing with the inode once it has been
		 * deleted.
		 */
		if (NInoDeleted(ni)) {
			if (!was_locked) {
				if (write_locked)
					lck_rw_unlock_exclusive(&ni->lock);
				else
					lck_rw_unlock_shared(&ni->lock);
			}
			/* Remove the inode from the name cache. */
			cache_purge(ni->vn);
			return ENOENT;
		}
		lck_spin_lock(&ni->size_lock);
		size = ni->initialized_size;
		lck_spin_unlock(&ni->size_lock);
		if (!write_locked && end > size) {
			/* If we fail to convert the lock, take it. */
			if (!lck_rw_lock_shared_to_exclusive(&ni->lock))
				lck_rw_lock_exclusive(&ni->lock);
			write_locked = TRUE;
			/*
			 * The lock may have been dropped during the upgrade
			 * thus the deleted state and the sizes need to be
			 * looked at again.
			 */
			goto recheck_deleted;
		}
		ntfs_debug("Mft_no 0x%llx, inode lock taken for %s.",
				(unsigned long long)ni->mft_no,
				write_locked ? "writing" : "reading");
	}
	/*
	 * We do not want any form of zero filling to happen at the starting
	 * offset of the write as we sort this out ourselves.
	 *
	 * Further, we never want to zero fill at the end of the write as this
	 * is pointless.  We automatically get zero filling at the end of the
	 * page when a page is read in and when the initialized size is
	 * extended.
	 */
	ioflags &= ~(IO_HEADZEROFILL | IO_TAILZEROFILL);
	/*
	 * We do not want to zero any valid/dirty pages as they could already
	 * have new data written via mmap() for example and we do not want to
	 * lose that.
	 */
	ioflags |= IO_NOZEROVALID | IO_NOZERODIRTY;
	lck_spin_lock(&ni->size_lock);
	old_size = ni->data_size;
	size = ni->allocated_size;
	lck_spin_unlock(&ni->size_lock);
	/*
	 * If this is a sparse attribute and the write overlaps the existing
	 * allocated size we need to fill any holes overlapping the write.  We
	 * can skip resident attributes as they cannot have sparse regions.
	 *
	 * As allocated size goes in units of clusters we need to round down
	 * the start offset to the nearest cluster boundary and we need to
	 * round up the end offset to the next cluster boundary.
	 */
	if (NInoSparse(ni) && NInoNonResident(ni) &&
			(ofs & ~ni->vol->cluster_size_mask) < size) {
		s64 aligned_end, new_end;

		if (!write_locked)
			panic("%s(): !write_locked\n", __FUNCTION__);
		aligned_end = (end + ni->vol->cluster_size_mask) &
				~ni->vol->cluster_size_mask;
		/*
		 * Only need to instantiate holes up to the allocated size
		 * itself.  Everything else is an extension and will be dealt
		 * with by ntfs_attr_extend_allocation() below.
		 */
		if (aligned_end > size)
			aligned_end = size;
		err = ntfs_attr_instantiate_holes(ni,
				ofs & ~ni->vol->cluster_size_mask, aligned_end,
				&new_end, ioflags & IO_UNIT);
		if (err) {
			ntfs_error(ni->vol->mp, "Cannot perform write to "
					"mft_no 0x%llx because instantiation "
					"of sparse regions failed (error %d).",
					(unsigned long long)ni->mft_no, err);
			uio_setoffset(uio, old_ofs);
			uio_setresid(uio, old_count);
			if (!was_locked)
				lck_rw_unlock_exclusive(&ni->lock);
			return err;
		}
		/* If the instantiation was partial, truncate the write. */
		if (new_end < aligned_end) {
			s64 new_count;

			if (ioflags & IO_UNIT)
				panic("%s(): new_end < aligned_end && "
						"ioflags & IO_UNIT\n",
						__FUNCTION__);
			ntfs_debug("Truncating write to mft_no 0x%llx because "
					"instantiation of sparse regions was "
					"only partially completed.",
					(unsigned long long)ni->mft_no);
			if (new_end > end)
				panic("%s(): new_end > end\n", __FUNCTION__);
			end = new_end;
			new_count = new_end - ofs;
			if (new_count >= count)
				panic("%s(): new_count >= count\n",
						__FUNCTION__);
			nr_truncated += count - new_count;
			count = new_count;
			uio_setresid(uio, new_count);
		}
	}
	/*
	 * If the write goes beyond the allocated size, extend the allocation
	 * to cover the whole of the write, rounded up to the nearest cluster.
	 */
	if (end > size) {
		if (!write_locked)
			panic("%s(): !write_locked\n", __FUNCTION__);
		/* Extend the allocation without changing the data size. */
		err = ntfs_attr_extend_allocation(ni, end, -1, ofs, NULL,
				&size, ioflags & IO_UNIT);
		if (!err) {
			if (ofs >= size)
				panic("%s(): ofs >= size\n", __FUNCTION__);
			/* If the extension was partial truncate the write. */
			if (end > size) {
				s64 new_count;

				if (ioflags & IO_UNIT)
					panic("%s(): end > size && "
							"ioflags & IO_UNIT\n",
							__FUNCTION__);
				ntfs_debug("Truncating write to mft_no 0x%llx "
						"because the allocation was "
						"only partially extended.",
						(unsigned long long)ni->mft_no);
				end = size;
				new_count = size - ofs;
				if (new_count >= count)
					panic("%s(): new_count >= count\n",
							__FUNCTION__);
				nr_truncated += count - new_count;
				count = new_count;
				uio_setresid(uio, new_count);
			}
		} else /* if (err) */ {
			lck_spin_lock(&ni->size_lock);
			size = ni->allocated_size;
			lck_spin_unlock(&ni->size_lock);
			/* Perform a partial write if possible or fail. */
			if (ofs < size && !(ioflags & IO_UNIT)) {
				s64 new_count;

				ntfs_debug("Truncating write to mft_no 0x%llx "
						"because extending the "
						"allocation failed (error %d).",
						(unsigned long long)ni->mft_no,
						err);
				end = size;
				new_count = size - ofs;
				if (new_count >= count)
					panic("%s(): new_count >= count\n",
							__FUNCTION__);
				nr_truncated += count - new_count;
				count = new_count;
				uio_setresid(uio, new_count);
			} else {
				ntfs_error(ni->vol->mp, "Cannot perform write "
						"to mft_no 0x%llx because "
						"extending the allocation "
						"failed (error %d).",
						(unsigned long long)ni->mft_no,
						err);
				goto abort;
			}
		}
	}
	/*
	 * If the write starts beyond the initialized size, extend it up to the
	 * beginning of the write and initialize all non-sparse space between
	 * the old initialized size and the new one.  This automatically also
	 * increments the data size as well as the ubc size to keep it above or
	 * equal to the initialized size.
	 */
	lck_spin_lock(&ni->size_lock);
	size = ni->initialized_size;
	lck_spin_unlock(&ni->size_lock);
	if (ofs > size) {
		if (!write_locked)
			panic("%s(): !write_locked 2\n", __FUNCTION__);
		err = ntfs_attr_extend_initialized(ni, ofs);
		if (err) {
			ntfs_error(ni->vol->mp, "Cannot perform write to "
					"mft_no 0x%llx because extending the "
					"initialized size failed (error %d).",
					(unsigned long long)ni->mft_no, err);
			goto abort;
		}
		size = ofs;
	}
	if (NInoNonResident(ni)) {
		int (*callback)(buf_t, void *);

		if (NInoCompressed(ni) && !NInoRaw(ni)) {
#if 0
			err = ntfs_vnop_write_compressed(ni, uio, size,
					ioflags);
			if (!err)
				ntfs_debug("Done (ntfs_vnop_write_compressed()"
						").");
			else
				ntfs_error(ni->vol->mp, "Failed ("
						"ntfs_vnop_write_compressed(), "
						"error %d).", err);
#endif
			/*
			 * TODO: At present we should never get here for
			 * compressed files as this case is aborted at the
			 * start of the function.
			 */
			panic("%s(): NInoCompressed(ni) && !NInoRaw(ni)\n",
					__FUNCTION__);
		}
		callback = NULL;
		if (NInoEncrypted(ni)) {
			callback = ntfs_cluster_iodone;
			/*
			 * TODO: At present we should never get here for
			 * encrypted files as this case is aborted at the start
			 * of the function.
			 */
			panic("%s(): NInoEncrypted(ni)\n", __FUNCTION__);
		}
		/* Determine the new file size. */
		size = ubc_getsize(vn);
		if (end > size)
			size = end;
		/*
		 * Note the first size is the original file size and the second
		 * file size is the new file size when the write is complete.
		 */
		err = cluster_write_ext(vn, uio, ubc_getsize(vn), size, 0, 0,
				ioflags, callback, NULL);
		if (err) {
			/*
			 * There was an error.  We do not know where.  Ensure
			 * everything is set up as if the write never happened.
			 */
			ntfs_error(ni->vol->mp, "Failed (cluster_write_ext(), "
					"error %d).", err);
			goto abort;
		}
		goto done;
	}
	/*
	 * The attribute is resident thus we have to deal with it by ourselves.
	 * First of all, try to copy the data to the vm page cache.  This will
	 * work on the second and all later writes so this is the hot path.  If
	 * the attribute has not been accessed at all before or its cached
	 * pages were dropped due to vm pressure this will fail to copy any
	 * data due to the lack of a valid page and we will drop into the slow
	 * path.
	 */
	if (ofs > PAGE_SIZE)
		panic("%s(): ofs > PAGE_SIZE\n", __FUNCTION__);
	cnt = (int)count;
	if (count > PAGE_SIZE - ofs) {
		cnt = PAGE_SIZE - ofs;
		ntfs_warning(ni->vol->mp, "Unexpected count (0x%llx) > "
				"PAGE_SIZE - ofs (0x%x), overriding it to "
				"PAGE_SIZE - ofs.", (unsigned long long)count,
				cnt);
	}
	/*
	 * Note we pass mark_dirty = 1 (the last parameter) which means the
	 * pages that are written to will be marked dirty.
	 */
	err = cluster_copy_ubc_data(vn, uio, &cnt, 1);
	if (err) {
		/*
		 * The copying (uiomove()) failed with an error.  Ensure
		 * everything is set up as if the write never happened.
		 */
		ntfs_error(ni->vol->mp, "cluster_copy_ubc_data() failed "
				"(error %d).", err);
		goto abort;
	}
	/*
	 * @cnt is now set to the number of bytes remaining to be transferred.
	 * If it is zero, it means we are done.
	 */
	if (!cnt)
		goto done;
	/*
	 * We failed to transfer everything.  That really means we failed to
	 * transfer anything at all as we are guaranteed that a resident
	 * attribute is smaller than a page thus either the page is there and
	 * valid and we transfer everything or it is not and we transfer
	 * nothing.
	 */
	if (cnt != count) {
		ntfs_warning(ni->vol->mp, "Unexpected partial transfer to "
				"cached page (count 0x%llx, cnt 0x%x).",
				(unsigned long long)count, cnt);
		/* Ensure everything is as it was before. */
		uio_setoffset(uio, old_ofs);
		uio_setresid(uio, old_count - nr_truncated);
	}
	/*
	 * The page is not in cache or is not valid.  We need to bring it into
	 * cache and make it valid so we can then copy the data in.  The
	 * easiest way to do this is to just map the page which will take care
	 * of everything for us.  We can then uiomove() straight into the page
	 * from the @uio and then mark the page dirty and unmap it again.
	 *
	 * As an optimization, if the write covers the whole existing attribute
	 * we grab the page without bringing it uptodate if it is not valid
	 * already thus saving a pagein from disk.
	 */
	need_uptodate = (ofs || end < size);
	err = ntfs_page_map_ext(ni, 0, &upl, &pl, &kaddr, need_uptodate, TRUE);
	if (err) {
		ntfs_error(ni->vol->mp, "Failed to map page (error %d).", err);
		goto abort;
	}
	err = uiomove((caddr_t)(kaddr + ofs), cnt, uio);
	if (err) {
		/*
		 * If we just caused the page to exist and did not bring it
		 * up-to-date or caching is disabled on the vnode or for this
		 * i/o, dump the page.  Otherwise release it back to the VM.
		 */
		if (upl_valid_page(pl, 0) || (need_uptodate &&
				!vnode_isnocache(vn) &&
				!(ioflags & IO_NOCACHE)))
			ntfs_page_unmap(ni, upl, pl, FALSE);
		else
			ntfs_page_dump(ni, upl, pl);
		/*
		 * The copying (uiomove()) failed with an error.  Ensure
		 * everything is set up as if the write never happened.
		 */
		ntfs_error(ni->vol->mp, "uiomove() failed (error %d).", err);
		goto abort;
	}
	/*
	 * If the page is not uptodate and we did not bring it up-to-date when
	 * mapping it, zero the remainder of the page now thus bringing it
	 * up-to-date.
	 */
	if (!need_uptodate && !upl_valid_page(pl, 0)) {
		const off_t cur_ofs = uio_offset(uio);

		if (cur_ofs > PAGE_SIZE)
			panic("%s(): cur_ofs > PAGE_SIZE\n", __FUNCTION__);
		bzero(kaddr + cur_ofs, PAGE_SIZE - cur_ofs);
	}
	/*
	 * Unmap the page marking it dirty.
	 *
	 * Note we leave the page cached even if no caching is requested for
	 * simplicity.  That way we do not need to touch the mft record at all
	 * and can instead rely on the next sync to propagate the dirty data
	 * from the page into the mft record and then to disk.  In the sync i/o
	 * case we will call ntfs_inode_sync() at the end of this function.
	 */
	ntfs_page_unmap(ni, upl, pl, TRUE);
done:
	/*
	 * If the write went past the end of the initialized size update it
	 * both in the ntfs inode and in the base attribute record.
	 *
	 * Also update the data size and the ubc size if the write went past
	 * the end of the data size.  Note this is automatically done by
	 * ntfs_attr_set_initialized_size() so we do not need to do it here.
	 */
	size = uio_offset(uio);
	lck_spin_lock(&ni->size_lock);
	if (size > ni->initialized_size) {
		lck_spin_unlock(&ni->size_lock);
		if (!write_locked)
			panic("%s(): !write_locked 3\n", __FUNCTION__);
		err = ntfs_attr_set_initialized_size(ni, size);
		if (err) {
			ntfs_error(ni->vol->mp, "Failed to update the "
					"initialized size of mft_no 0x%llx "
					"(error %d).",
					(unsigned long long)ni->mft_no, err);
			/*
			 * If the write was meant to be atomic, the write
			 * started beyond the end of the initialized size, or
			 * nothing was written ensure everything is set up as
			 * if the write never happened.
			 *
			 * The start of the write is @ofs, not @old_ofs: for
			 * IO_APPEND writes @old_ofs is the offset the caller
			 * passed in which is unrelated to where the data was
			 * actually written.
			 */
			lck_spin_lock(&ni->size_lock);
			size = ni->initialized_size;
			lck_spin_unlock(&ni->size_lock);
			if (ioflags & IO_UNIT || ofs >= size ||
					uio_resid(uio) >= old_count)
				goto abort;
			/*
			 * Something was written before the initialized size
			 * thus turn the error into a partial, successful write
			 * up to the initialized size.
			 *
			 * The residual count is the number of bytes that were
			 * not transferred, i.e. the (possibly truncated) write
			 * length @count minus the @size - @ofs bytes that lie
			 * below the initialized size.  The truncated bytes
			 * @nr_truncated are added back further below.
			 */
			uio_setoffset(uio, size);
			uio_setresid(uio, count - (size - ofs));
			err = 0;
		}
	} else
		lck_spin_unlock(&ni->size_lock);
	// TODO: If we wrote anything at all we have to clear the S_ISUID and
	// S_ISGID bits in the file mode as a precaution against tampering
	// (see xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_write()).
	/*
	 * Update the last_data_change_time (mtime) and last_mft_change_time
	 * (ctime) on the base ntfs inode @base_ni unless this is an attribute
	 * inode update in which case only update the ctime as named stream/
	 * extended attribute semantics expect on OS X.
	 */
	base_ni->last_mft_change_time = ntfs_utc_current_time();
	if (ni == base_ni)
		base_ni->last_data_change_time = base_ni->last_mft_change_time;
	NInoSetDirtyTimes(base_ni);
	/*
	 * If this is not a directory or it is an encrypted directory, set the
	 * needs archiving bit except for the core system files.
	 */
	if (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni)) {
		BOOL need_set_archive_bit = TRUE;

		if (ni->vol->major_ver >= 2) {
			if (ni->mft_no <= FILE_Extend)
				need_set_archive_bit = FALSE;
		} else {
			if (ni->mft_no <= FILE_UpCase)
				need_set_archive_bit = FALSE;
		}
		if (need_set_archive_bit) {
			base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
			NInoSetDirtyFileAttributes(base_ni);
		}
	}
	/*
	 * If we truncated the write add back the number of truncated bytes to
	 * the number of bytes remaining.
	 */
	if (nr_truncated > 0) {
		if (ioflags & IO_UNIT)
			panic("%s(): ioflags & IO_UNIT\n", __FUNCTION__);
		uio_setresid(uio, uio_resid(uio) + nr_truncated);
	}
	/*
	 * If the write was partial we need to trim off any extra allocated
	 * space by truncating the attribute to its old size.  We can only have
	 * extended the allocation if we hold the inode lock for writing so do
	 * not bother going through this code if we only hold the lock for
	 * reading.
	 *
	 * There is one exception and that is that if the write was meant to be
	 * atomic a partial write is not acceptable thus we need to abort the
	 * write completely in this case.
	 */
	size = uio_resid(uio);
	if (write_locked && size > nr_truncated) {
		s64 truncate_size;
		errno_t err2;
		int rflags;

		/*
		 * If the write was meant to be atomic or nothing was written
		 * reset everything as if the write never happened thus
		 * releasing any extra space we may have allocated.
		 */
		if (ioflags & IO_UNIT || size >= old_count) {
			if (size > old_count)
				panic("%s(): size > old_count\n", __FUNCTION__);
abort:
			/*
			 * Error paths jump here from anywhere above once the
			 * inode lock is held.
			 */
			uio_setoffset(uio, old_ofs);
			uio_setresid(uio, old_count);
			if (!write_locked) {
				if (!err)
					panic("%s(): !err\n", __FUNCTION__);
				goto skip_truncate;
			}
			truncate_size = old_size;
		} else /* if (uio_resid(uio) < old_count) */ {
			/*
			 * At least something was written.  Truncate the
			 * attribute to the successfully written size thus
			 * releasing any extra space we allocated but ensure we
			 * do not truncate to less than the old size.
			 */
			truncate_size = uio_offset(uio);
			if (truncate_size < old_size)
				truncate_size = old_size;
		}
		/*
		 * Truncate the attribute to @truncate_size.
		 *
		 * The truncate must be complete or no need to bother at all so
		 * set the IO_UNIT flag.  Also remove unwanted flags.
		 */
		rflags = (ioflags | IO_UNIT) & ~(IO_APPEND | IO_SYNC |
				IO_NOZEROFILL);
		err2 = ntfs_attr_resize(ni, truncate_size, rflags, NULL);
		if (err2) {
			BOOL is_dirty;

			/*
			 * If no other error has occurred failing the truncate
			 * will at worst mean that we have too much allocated
			 * space which is not a disaster so carry on in this
			 * case.
			 *
			 * If another error has occurred any of a number of
			 * things can now be wrong and in particular if the
			 * data size is not equal to @truncate_size this is
			 * very bad news so mark the volume dirty and warn the
			 * user about it.
			 */
			is_dirty = (err);
			if (is_dirty) {
				lck_spin_lock(&ni->size_lock);
				if (truncate_size == ni->data_size)
					is_dirty = FALSE;
				lck_spin_unlock(&ni->size_lock);
			}
			ntfs_error(ni->vol->mp, "Truncate failed (error %d).%s",
					err2, is_dirty ? "  Leaving "
					"inconsistent data on disk.  Unmount "
					"and run chkdsk." : "");
			if (is_dirty)
				NVolSetErrors(ni->vol);
		}
	}
skip_truncate:
	if (!was_locked) {
		if (!write_locked)
			lck_rw_unlock_shared(&ni->lock);
		else
			lck_rw_unlock_exclusive(&ni->lock);
		/*
		 * If the write was successful and synchronous i/o was
		 * requested, sync all changes to the backing store.  We
		 * dropped the inode lock already to be able to call
		 * ntfs_inode_sync() thus if it fails we cannot do anything
		 * about it so we just return the error even though the
		 * operation has otherwise been performed.
		 *
		 * Note we cannot do this if the inode was already locked or
		 * the call to ntfs_inode_sync() would cause a deadlock.
		 */
		if (!err && ioflags & IO_SYNC) {
			/* Mask out undesired @ioflags. */
			ioflags &= ~(IO_UNIT | IO_APPEND | IO_DEFWRITE);
			err = ntfs_inode_sync(ni, ioflags, FALSE);
		}
	}
	return err;
}
/**
* ntfs_vnop_write - write a number of bytes from a memory buffer into a file
* @a: arguments to write function
*
* @a contains:
* vnode_t a_vp; vnode of file to write to
* uio_t a_uio; source containing the data to write
* int a_ioflag; flags further describing the write request
* vfs_context_t a_context;
*
* Write uio_resid(@a->a_uio) bytes from the source buffer specified by
* @a->a_uio to the vnode @a-a_vp, starting at byte offset
* uio_offset(@a->a_uio) into the vnode.
*
* The flags in @a->a_ioflag further describe the write request. The following
* ioflags are currently defined in OS X kernel (not all of them are applicable
* to VNOP_WRITE() however):
* IO_UNIT - Do i/o as atomic unit.
* IO_APPEND - Append write to end.
* IO_SYNC - Do i/o synchronously.
* IO_NODELOCKED - Underlying node already locked.
* IO_NDELAY - FNDELAY flag set in file table.
* IO_NOZEROFILL - F_SETSIZE fcntl uses this to prevent zero filling.
* IO_TAILZEROFILL - Zero fills at the tail of write.
* IO_HEADZEROFILL - Zero fills at the head of write.
* IO_NOZEROVALID - Do not zero fill if valid page.
* IO_NOZERODIRTY - Do not zero fill if page is dirty.
* IO_CLOSE - The i/o was issued from close path.
* IO_NOCACHE - Same effect as VNOCACHE_DATA, but only for this i/o.
* IO_RAOFF - Same effect as VRAOFF, but only for this i/o.
* IO_DEFWRITE - Defer write if vfs.defwrite is set.
* IO_PASSIVE - This is background i/o so do not throttle other i/o.
*
* For compressed and encrypted attributes we abort for now as we do not
* support them yet.
*
* For non-resident attributes we use cluster_write_ext() which deals with
* normal attributes.
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_write(struct vnop_write_args *a)
{
vnode_t vn = a->a_vp;
ntfs_inode *ni = NTFS_I(vn);
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
/*
* We can only write to regular files and named streams.
*
* Also, do not allow writing to system files and mst protected
* attributes.
*/if (vnode_issystem(vn) || NInoMstProtected(ni) ||
(!S_ISREG(ni->mode) && !(NInoAttr(ni) &&
ni->type == AT_DATA))) {
if (S_ISDIR(ni->mode))
return EISDIR;
return EPERM;
}
return (int)ntfs_write(ni, a->a_uio, a->a_ioflag, FALSE);
}
/**
* ntfs_vnop_ioctl -
*
*/staticintntfs_vnop_ioctl(struct vnop_ioctl_args *a)
{
errno_t err;
ntfs_debug("Entering.");
// TODO:
err = ENOTSUP;
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
* ntfs_vnop_select -
*
*/staticintntfs_vnop_select(struct vnop_select_args *a)
{
errno_t err;
ntfs_debug("Entering.");
// TODO:
err = ENOTSUP;
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
* ntfs_vnop_exchange -
*
*/staticintntfs_vnop_exchange(struct vnop_exchange_args *a)
{
errno_t err;
ntfs_debug("Entering.");
// TODO:
err = ENOTSUP;
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
 * ntfs_vnop_mmap - map a file (vnode) into memory
 * @a:		arguments to mmap function
 *
 * @a contains:
 *	vnode_t a_vp;		file vnode which to map into memory
 *	int a_fflags;		mapping flags for the vnode
 *	vfs_context_t a_context;
 *
 * Map the file vnode @a->a_vp into memory applying the mapping flags
 * @a->a_fflags, a combination of one or more of PROT_READ, PROT_WRITE, and
 * PROT_EXEC.
 *
 * Call chain: the mmap() system call invokes ubc_map() when a file is being
 * mapped into memory and ubc_map() in turn invokes VNOP_MMAP() and hence
 * ntfs_vnop_mmap().
 *
 * The mmap() system call does the necessary permission checking and in fact
 * ignores the return value from ubc_map(), relying on things not working
 * later on for error handling.
 *
 * ubc_map() does look at the return value of VNOP_MMAP() but the only error
 * code it cares about is EPERM.  All other errors are ignored and not passed
 * to its caller.  For any return value other than EPERM, ubc_map() takes an
 * extra reference on the vnode and sets the flags UI_ISMAPPED and
 * UI_WASMAPPED in the ubc info of the vnode.  For EPERM it does nothing and
 * just returns EPERM to the caller.
 *
 * In effect neither class of return value (EPERM or not EPERM) has any effect
 * at all so no checking is done here and all checks are deferred to
 * VNOP_PAGEIN() and hence ntfs_vnop_pagein().
 *
 * FIXME: This is a huge problem because it means that anyone can use mmap() on
 * a system file, write rubbish into the mapped memory and then trash the
 * metadata by calling msync() to write the rubbish out into the system file on
 * disk!  This will need to be fixed in the kernel I think, i.e. the mmap()
 * system call must fail if VNOP_MMAP() fails.  This is because we have no way
 * to tell who is causing a page{in,out} at ntfs_vnop_page{in,out}() time and
 * for what reason so we have to always permit page{in,out} to be called.
 *
 * This implementation only emits a debug message in DEBUG builds and always
 * returns 0.
 */
static int ntfs_vnop_mmap(struct vnop_mmap_args *a)
{
#ifdef DEBUG
	ntfs_inode *mapped_ni = NTFS_I(a->a_vp);

	/* The vnode may have no ntfs inode attached in which case stay quiet. */
	if (mapped_ni != NULL) {
		ntfs_debug("Mapping mft_no 0x%llx, type 0x%x, name_len 0x%x, "
				"mapping flags 0x%x.",
				(unsigned long long)mapped_ni->mft_no,
				le32_to_cpu(mapped_ni->type),
				(unsigned)mapped_ni->name_len,
				a->a_fflags);
	}
#endif
	/* Nothing to do. */
	return 0;
}
/**
 * ntfs_vnop_mnomap - unmap a file (vnode) from memory
 * @a:		arguments to mnomap function
 *
 * @a contains:
 *	vnode_t a_vp;		file vnode which to unmap from memory
 *	vfs_context_t a_context;
 *
 * Remove the memory mapping of the file vnode @a->a_vp that was previously
 * established via ntfs_vnop_mmap().
 *
 * Call chain: the munmap() system call invokes ubc_unmap() when a file is
 * being unmapped from memory and ubc_unmap() invokes VNOP_MNOMAP() and hence
 * ntfs_vnop_mnomap(), but only if the previous VNOP_MMAP() call did not
 * return EPERM.
 *
 * ubc_unmap() completely ignores the return value from VNOP_MNOMAP().
 *
 * This implementation only emits a debug message in DEBUG builds.
 *
 * Always return 0 as the return value is always ignored.
 */
static int ntfs_vnop_mnomap(struct vnop_mnomap_args *a)
{
#ifdef DEBUG
	ntfs_inode *unmapped_ni = NTFS_I(a->a_vp);

	/* The vnode may have no ntfs inode attached in which case stay quiet. */
	if (unmapped_ni != NULL) {
		ntfs_debug("Unmapping mft_no 0x%llx, type 0x%x, name_len "
				"0x%x.",
				(unsigned long long)unmapped_ni->mft_no,
				le32_to_cpu(unmapped_ni->type),
				(unsigned)unmapped_ni->name_len);
	}
#endif
	/* Nothing to do. */
	return 0;
}
/**
* ntfs_vnop_fsync - synchronize a vnode's in-core state with that on disk
* @a: arguments to fsync function
*
* @a contains:
* vnode_t a_vp; vnode which to sync
* int a_waitfor; if MNT_WAIT wait for i/o to complete
* vfs_context_t a_context;
*
* Write all dirty cached data belonging/related to the vnode @a->a_vp to disk.
*
* If @a->a_waitfor is MNT_WAIT, wait for all i/o to complete before returning.
*
* Note: When called from reclaim, the vnode has a zero v_iocount and
* v_usecount and vnode_isrecycled() is true.
*
* Return 0 on success and the error code on error.
*/staticintntfs_vnop_fsync(struct vnop_fsync_args *a)
{
vnode_t vn = a->a_vp;
ntfs_inode *ni = NTFS_I(vn);
int sync, err;
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return 0;
}
/* If we are mounted read-only, we do not need to sync anything. */if (NVolReadOnly(ni->vol))
return 0;
sync = (a->a_waitfor == MNT_WAIT) ? IO_SYNC : 0;
ntfs_debug("Entering for inode 0x%llx, waitfor 0x%x, %ssync i/o.",
(unsignedlonglong)ni->mft_no, a->a_waitfor,
(sync == IO_SYNC) ? "a" : "");
/*
* We need to allow ENOENT errors since the unlink system call can call
* VNOP_FSYNC() during vclean().
*/
err = ntfs_inode_sync(ni, sync, FALSE);
if (err == ENOENT)
err = 0;
ntfs_debug("Done (error %d).", err);
return err;
}
/**
* ntfs_unlink_internal - unlink an ntfs inode from its parent directory
* @dir_ni: directory ntfs inode from which to unlink the ntfs inode
* @ni: base ntfs inode to unlink
* @name: Unicode name of the inode to unlink
* @name_len: length of the name in Unicode characters
* @name_type: Namespace the name is in (i.e. FILENAME_{DOS,WIN32,POSIX,etc})
* @is_rename: if true ntfs_unlink_internal() is called for a rename operation
*
* Unlink an inode with the ntfs inode @ni and name @name with length @name_len
* Unicode characters and of namespace @name_type from the directory with ntfs
* inode @dir_ni.
*
* If @is_rename is true the caller was ntfs_vnop_rename() in which case the
* link count of the inode to unlink @ni will be one higher than the link count
* in the mft record.
*
* Return 0 on success and the error code on error.
*
* Note that if the name of the inode to be removed is in the WIN32 or DOS
* namespaces, both the WIN32 and the corresponding DOS names are removed.
*
* Note that for a hard link this function simply removes the name and its
* directory entry and decrements the hard link count whilst for the last name,
* i.e. the last link to an inode, it only removes the directory entry, i.e. it
* does not remove the name, however it does decrement the hard link count to
* zero. This is so that the inode can be undeleted and its original name
* restored. In any case, we do not actually delete the inode here as it may
* still be open and UNIX semantics require an unlinked inode to be still
* accessible through already opened file descriptors. When the last file
* descriptor is closed, we cause the inode to be deleted when the VFS
* notifies us of the last close by calling VNOP_INACTIVE(), i.e.
* ntfs_vnop_inactive().
*/
static errno_t ntfs_unlink_internal(ntfs_inode *dir_ni, ntfs_inode *ni,
		ntfschar *name, signed name_len, FILENAME_TYPE_FLAGS name_type,
		const BOOL is_rename)
{
	ntfs_volume *vol;
	ntfs_inode *objid_o_ni;
	ntfschar *ntfs_name;
	MFT_RECORD *m;
	ntfs_attr_search_ctx *actx;
	ATTR_RECORD *a;
	ntfs_index_context *ictx;
	FILENAME_ATTR *fn, *tfn;
	signed ntfs_name_len;
	unsigned fn_count, tfn_alloc;
	errno_t err;
	BOOL seen_dos;
	FILENAME_TYPE_FLAGS seek_type, fn_type;

	vol = ni->vol;
	objid_o_ni = vol->objid_o_ni;
	ntfs_debug("Unlinking mft_no 0x%llx from directory mft_no 0x%llx, "
			"name type 0x%x.", (unsigned long long)ni->mft_no,
			(unsigned long long)dir_ni->mft_no,
			(unsigned)name_type);
	if (NInoAttr(ni))
		panic("%s(): Target inode is an attribute inode.\n",
				__FUNCTION__);
	/* Start the unlink by evicting the target from the name cache. */
	cache_purge(ni->vn);
	/*
	 * We now need to look up the target name in the target mft record.
	 *
	 * If @name_type is FILENAME_POSIX then @name and @name_len contain the
	 * correctly cased name and length in Unicode characters, respectively
	 * so we simply set @ntfs_name and @ntfs_name_len to @name and
	 * @name_len, respectively.
	 *
	 * If @name_type is anything else, i.e. FILENAME_WIN32, FILENAME_DOS,
	 * or FILENAME_WIN32_AND_DOS we simply need to look for that type of
	 * name in the target mft record as there can only be one filename
	 * attribute of this type thus the name is uniquely identified by type
	 * so the lookup can be optimized that way.
	 */
	seek_type = 0;
	if (name_type == FILENAME_POSIX) {
		ntfs_name = name;
		ntfs_name_len = name_len;
	} else {
		/*
		 * Set @ntfs_name to NULL so we know to do the look up based on
		 * the filename namespace @seek_type instead.
		 */
		ntfs_name = NULL;
		ntfs_name_len = 0;
		seek_type = name_type;
		/*
		 * If the target name is the WIN32 name we first need to delete
		 * the DOS name thus re-set @seek_type accordingly (see below
		 * for details).
		 */
		if (seek_type == FILENAME_WIN32)
			seek_type = FILENAME_DOS;
	}
	/*
	 * We know this is the base inode since we bailed out for attribute
	 * inodes above.
	 */
	err = ntfs_mft_record_map(ni, &m);
	if (err) {
		ntfs_error(vol->mp, "Failed to map mft record 0x%llx (error "
				"%d).", (unsigned long long)ni->mft_no, err);
		goto err;
	}
	/*
	 * Sanity check that the inode link count is in step with the mft
	 * record link count.  For a rename the in-memory count is expected to
	 * be exactly one higher than the on-disk count.
	 */
	if ((!is_rename && ni->link_count != le16_to_cpu(m->link_count)) ||
			(is_rename && ni->link_count !=
			(unsigned)le16_to_cpu(m->link_count) + 1))
		panic("%s(): ni->link_count != le16_to_cpu(m->link_count)\n",
				__FUNCTION__);
	actx = ntfs_attr_search_ctx_get(ni, m);
	if (!actx) {
		err = ENOMEM;
		goto unm_err;
	}
	/*
	 * Find the name in the target mft record.
	 *
	 * If it is a name in the WIN32 or DOS namespace (but not both), we
	 * remove the DOS name from both the directory index it is in and from
	 * the mft record and we decrement the link count both in the base mft
	 * record and in the ntfs inode.  In the case of a WIN32 name, we find
	 * the corresponding DOS name first and proceed as described.
	 *
	 * If the removal of the DOS name from the directory index is
	 * successful, we change the namespace of the remaining WIN32 name to
	 * the POSIX namespace, thus if we fail to remove the remaining name
	 * after successfully removing the DOS name, we still have a consistent
	 * file system.  This also has the side effect of allowing undelete to
	 * work properly as otherwise the undelete would restore a WIN32 name
	 * without a corresponding DOS name which would result in an illegal
	 * inode.
	 *
	 * We thus reduce the problem to a normal single name unlink and we can
	 * now determine whether this unlink is just a hard link removal or the
	 * final name removal, i.e. the inode is being deleted.
	 */
	seen_dos = FALSE;
restart_name:
	/*
	 * Before looking for the last name and removing it from its directory
	 * index entry, i.e. before unlinking the inode and targeting it for
	 * deletion, we need to check if the inode has an object id and if so
	 * we need to remove it from the object id index on the volume (present
	 * in $O index of $Extend/$ObjId system file), so that the inode cannot
	 * be found via its object id any more either.  Also, when the deleted
	 * inode gets reused for different purposes, we do not want the old
	 * object id to still point at it.
	 *
	 * If the volume is pre-NTFS 3.0, i.e. it does not support object ids,
	 * @vol->objid_o_ni will be NULL.  It will also be NULL if the volume
	 * is NTFS 3.0+ but no object ids are present on the volume, thus we
	 * can make the check conditional on @objid_o_ni not being NULL.
	 *
	 * We do this before deleting the last directory entry so that we can
	 * abort the unlink if we fail to remove the object id from the index
	 * to ensure the volume does not become inconsistent.
	 */
	if (objid_o_ni && ni->link_count <= 1) {
		err = ntfs_attr_lookup(AT_OBJECT_ID, AT_UNNAMED, 0, 0, NULL, 0,
				actx);
		if (err) {
			if (err != ENOENT) {
				ntfs_error(vol->mp, "Failed to look up object "
						"id in mft_no 0x%llx (error "
						"%d).",
						(unsigned long long)ni->mft_no,
						err);
				goto put_err;
			}
			/*
			 * The object id was not found which is fine.  The
			 * inode simply does not have an object id assigned to
			 * it so there is nothing for us to do.
			 */
			ntfs_debug("Target mft_no 0x%llx does not have an "
					"object id assigned to it.",
					(unsigned long long)ni->mft_no);
		} else /* if (!err) */ {
			GUID object_id;

			/* The inode has an object id assigned to it. */
			ntfs_debug("Deleting object id from target mft_no "
					"0x%llx.",
					(unsigned long long)ni->mft_no);
			a = actx->a;
			/*
			 * We need to make a copy of the object id and release
			 * the mft record before looking up the object id in
			 * the $ObjID/$O index otherwise we could deadlock if
			 * the currently mapped mft record is in the same page
			 * as one of the mft records of $ObjId.
			 */
			memcpy(&object_id, &((OBJECT_ID_ATTR*)((u8*)a +
					le16_to_cpu(a->value_offset)))->
					object_id, sizeof(object_id));
			ntfs_attr_search_ctx_put(actx);
			ntfs_mft_record_unmap(ni);
			err = vnode_get(objid_o_ni->vn);
			if (err) {
				ntfs_error(vol->mp, "Failed to get index "
						"vnode for $ObjId/$O.");
				goto err;
			}
			lck_rw_lock_exclusive(&objid_o_ni->lock);
			ictx = ntfs_index_ctx_get(objid_o_ni);
			if (!ictx) {
				ntfs_error(vol->mp, "Failed to get index "
						"context.");
				err = ENOMEM;
				goto iput_err;
			}
restart_ictx:
			/* Get the index entry matching the object id. */
			err = ntfs_index_lookup(&object_id, sizeof(object_id),
					&ictx);
			if (err) {
				if (err == ENOENT) {
					ntfs_error(vol->mp, "Failed to delete "
							"object id of target "
							"inode 0x%llx from "
							"object id index "
							"because the object "
							"id was not found in "
							"the object id "
							"index. Volume is "
							"corrupt. Run "
							"chkdsk.",
							(unsigned long long)
							ni->mft_no);
					NVolSetErrors(vol);
					err = EIO;
				} else
					ntfs_error(vol->mp, "Failed to delete "
							"object id of target "
							"inode 0x%llx from "
							"object id index "
							"because looking up "
							"the object id in the "
							"object id index "
							"failed (error %d).",
							(unsigned long long)
							ni->mft_no, err);
				goto iput_err;
			}
			/* We now have the index entry, delete it. */
			err = ntfs_index_entry_delete(ictx);
			if (err) {
				/*
				 * Note the negative value: -EAGAIN is tested
				 * for as a "retry the lookup" indication, as
				 * opposed to a regular (positive) errno.
				 */
				if (err == -EAGAIN) {
					ntfs_debug("Restarting object id "
							"delete as tree was "
							"rearranged.");
					ntfs_index_ctx_reinit(ictx, objid_o_ni);
					goto restart_ictx;
				}
				ntfs_error(vol->mp, "Failed to delete object "
						"id of target inode 0x%llx "
						"from object id index (error "
						"%d).",
						(unsigned long long)ni->mft_no,
						err);
				goto iput_err;
			}
			ntfs_index_ctx_put(ictx);
			lck_rw_unlock_exclusive(&objid_o_ni->lock);
			(void)vnode_put(objid_o_ni->vn);
			/*
			 * Now get back the mft record so we can re-look up the
			 * object id attribute so we can delete it.
			 *
			 * This means we do not need to worry about
			 * inconsistencies to do with the object id in our
			 * error handling code paths later on.
			 */
			err = ntfs_mft_record_map(ni, &m);
			if (err) {
				ntfs_error(vol->mp, "Failed to re-map mft "
						"record 0x%llx (error %d). "
						"Leaving inconstent "
						"metadata. Run chkdsk.",
						(unsigned long long)ni->mft_no,
						err);
				NVolSetErrors(vol);
				goto err;
			}
			actx = ntfs_attr_search_ctx_get(ni, m);
			if (!actx) {
				/*
				 * Set the error code before logging it so the
				 * message reports the code we return rather
				 * than a stale zero.
				 */
				err = ENOMEM;
				ntfs_error(vol->mp, "Failed to re-get "
						"attribute search context for "
						"mft record 0x%llx (error "
						"%d). Leaving inconstent "
						"metadata. Run chkdsk.",
						(unsigned long long)ni->mft_no,
						err);
				NVolSetErrors(vol);
				goto unm_err;
			}
			err = ntfs_attr_lookup(AT_OBJECT_ID, AT_UNNAMED, 0, 0,
					NULL, 0, actx);
			if (err) {
				ntfs_error(vol->mp, "Failed to re-look up "
						"object id in mft_no 0x%llx "
						"(error %d). Leaving "
						"inconsistent metadata. Run "
						"chkdsk.",
						(unsigned long long)ni->mft_no,
						err);
				NVolSetErrors(vol);
				err = EIO;
				goto put_err;
			}
			/*
			 * Remove the object id attribute from the mft record
			 * and mark the mft record dirty.
			 */
			err = ntfs_attr_record_delete(ni, actx);
			if (err) {
				ntfs_error(vol->mp, "Failed to delete object "
						"id in mft_no 0x%llx (error "
						"%d). Leaving inconsistent "
						"metadata. Run chkdsk.",
						(unsigned long long)ni->mft_no,
						err);
				/*
				 * The index entry is gone but the attribute
				 * is still there, so flag the volume like the
				 * other inconsistent-metadata paths do.
				 */
				NVolSetErrors(vol);
				goto put_err;
			}
		}
		/* Reinit the search context for the AT_FILENAME lookup. */
		ntfs_attr_search_ctx_reinit(actx);
	}
	/* Use label and goto instead of a loop to reduce indentation. */
	fn_count = 0;
next_name:
	/*
	 * Increment the filename attribute counter.  It records how many
	 * AT_FILENAME lookups were needed to reach the target so the same
	 * attribute can be found again after the mft record is re-mapped.
	 */
	fn_count++;
	err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, NULL, 0, actx);
	if (err) {
		if (err == ENOENT) {
			/*
			 * If the name we are looking for is not found there is
			 * either some corruption or a bug given that a call to
			 * ntfs_lookup_inode_by_name() just found the name in
			 * the directory index.
			 */
			ntfs_error(vol->mp, "The target filename was not "
					"found in the mft record 0x%llx. "
					"This is not possible. This is "
					"either due to corruption or due to a "
					"driver bug. Run chkdsk.",
					(unsigned long long)ni->mft_no);
			NVolSetErrors(vol);
			err = EIO;
		} else
			ntfs_error(vol->mp, "Failed to look up target "
					"filename in the mft record 0x%llx "
					"(error %d).",
					(unsigned long long)ni->mft_no, err);
		goto put_err;
	}
	a = actx->a;
	fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset));
	fn_type = fn->filename_type;
	/*
	 * If this is a specific DOS or WIN32 or combined name lookup, no need
	 * to compare the actual name as there can only be one DOS and one
	 * WIN32 name or only one combined name in an inode.
	 */
	if (seek_type && seek_type != FILENAME_POSIX) {
		/*
		 * If this filename attribute does not match the target name
		 * try the next one.
		 */
		if (seek_type != fn_type)
			goto next_name;
		/* We found the filename attribute matching the target name. */
		if (fn_type == FILENAME_WIN32) {
			/*
			 * We were looking for the WIN32 name so we can remove
			 * it after having removed the DOS name.  We now found
			 * it, so switch it to the POSIX namespace as described
			 * above and then go ahead and delete it.
			 */
			ntfs_debug("Switching namespace of filename attribute "
					"from WIN32 to POSIX.");
			fn_type = fn->filename_type = FILENAME_POSIX;
			NInoSetMrecNeedsDirtying(actx->ni);
		}
		goto found_name;
	}
	/* If this is the DOS name, note that we have seen it. */
	if (fn_type == FILENAME_DOS)
		seen_dos = TRUE;
	/* If the names do not match, continue searching. */
	if (fn->filename_length != ntfs_name_len)
		goto next_name;
	if (MREF_LE(fn->parent_directory) != dir_ni->mft_no)
		goto next_name;
	if (bcmp(fn->filename, ntfs_name, ntfs_name_len * sizeof(ntfschar)))
		goto next_name;
	/* Found the matching name. */
	if (fn_type == FILENAME_WIN32) {
		/*
		 * Pure WIN32 name.  Repeat the lookup but for the DOS name
		 * this time so we can remove that first.
		 */
		seek_type = FILENAME_DOS;
		/*
		 * If @seen_dos is true, then restart the lookup from the
		 * beginning and if not then continue the lookup where we left
		 * off.
		 */
		if (seen_dos) {
			ntfs_attr_search_ctx_reinit(actx);
			fn_count = 0;
		}
		goto next_name;
	}
	if (fn_type == FILENAME_DOS) {
		/*
		 * This cannot happen as ntfs_lookup_inode_by_name() always
		 * returns @name for pure DOS names and hence we would have
		 * @seek_type == FILENAME_DOS and thus would have picked this
		 * filename attribute up above without ever doing a name based
		 * match.
		 */
		ntfs_error(vol->mp, "Filename is in DOS namespace. This is "
				"not possible. This is either due to "
				"corruption or due to a driver bug. Run "
				"chkdsk.");
		NVolSetErrors(vol);
		err = EIO;
		goto put_err;
	}
found_name:
	/*
	 * We found the target filename attribute and can now remove it from
	 * the directory index.  But before we can do that we need to make a
	 * copy of the filename attribute value so we can release the mft
	 * record before we delete the directory index entry.  This is needed
	 * because when we hold the target mft record and we call
	 * ntfs_dir_entry_delete() this would cause the mft record for the
	 * directory to be mapped which could result in a deadlock in the event
	 * that both mft records are in the same page.
	 */
	tfn_alloc = le32_to_cpu(a->value_length);
	tfn = OSMalloc(tfn_alloc, ntfs_malloc_tag);
	if (!tfn) {
		/*
		 * TODO: If @seek_type == FILENAME_WIN32 &&
		 * @fn->filename_type == FILENAME_POSIX we need to update the
		 * directory entry filename_type to FILENAME_POSIX.  See below
		 * for how this is done for the error case in
		 * ntfs_dir_entry_delete().  Given a memory allocation just
		 * failed it is highly unlikely we would succeed in trying to
		 * look up the directory entry so that we could change the
		 * filename_type in it so at least for now just set the volume
		 * has errors flag instead.
		 */
		ntfs_error(vol->mp, "Failed to allocate memory for temporary "
				"filename attribute. Leaving inconsistent "
				"metadata. Run chkdsk.");
		NVolSetErrors(vol);
		err = EIO;
		goto put_err;
	}
	memcpy(tfn, fn, tfn_alloc);
	ntfs_attr_search_ctx_put(actx);
	ntfs_mft_record_unmap(ni);
	/*
	 * We copied the name and can now remove it from the directory index.
	 * If the name is in the POSIX namespace, we may have converted it from
	 * a pure WIN32 name after removing the corresponding DOS name, in
	 * which case we need to update the index entry to reflect the
	 * conversion should we fail to remove it from the directory index.
	 * ntfs_dir_entry_delete() takes care of this for us.
	 */
	err = ntfs_dir_entry_delete(dir_ni, ni, tfn, tfn_alloc);
	/*
	 * The temporary copy is not referenced again below, so release it
	 * now, whether or not the delete succeeded, to avoid leaking it.
	 */
	OSFree(tfn, tfn_alloc, ntfs_malloc_tag);
	if (err) {
		ntfs_error(vol->mp, "Failed to delete directory index entry "
				"(error %d).", err);
		goto err;
	}
	/*
	 * Now get back the mft record.
	 *
	 * If getting back the mft record fails there is nothing we can do to
	 * recover and must bail out completely leaving inconsistent metadata.
	 *
	 * TODO: We could try to add the dir entry back again in an attempt to
	 * recover but as above we likely fail a memory allocation it is highly
	 * unlikely we would succeed in trying to do the lookup and addition of
	 * the directory entry.
	 */
	err = ntfs_mft_record_map(ni, &m);
	if (err) {
		ntfs_error(vol->mp, "Failed to re-map mft record 0x%llx "
				"(error %d). Leaving inconsistent metadata. "
				"Run chkdsk.", (unsigned long long)ni->mft_no,
				err);
		NVolSetErrors(vol);
		goto err;
	}
	actx = ntfs_attr_search_ctx_get(ni, m);
	if (!actx) {
		/*
		 * Set the error code before logging it so the message reports
		 * the code we return rather than a stale zero.
		 */
		err = EIO;
		ntfs_error(vol->mp, "Failed to re-get attribute search "
				"context for mft record 0x%llx (error %d). "
				"Leaving inconsitent metadata. Run chkdsk.",
				(unsigned long long)ni->mft_no, err);
		NVolSetErrors(vol);
		goto unm_err;
	}
	/*
	 * If the name is in the DOS namespace or this is not the last name we
	 * also need to remove the name from the mft record it is in and
	 * decrement the link count in the base mft record.
	 */
	if (fn_type == FILENAME_DOS || ni->link_count > 1) {
		/*
		 * Now need to re-lookup the target filename attribute by
		 * repeating the same number of lookups that found it above.
		 */
		while (fn_count > 0) {
			fn_count--;
			err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0,
					NULL, 0, actx);
			if (!err)
				continue;
			ntfs_error(vol->mp, "Failed to re-look up target "
					"filename in mft_no 0x%llx (error %d).",
					(unsigned long long)ni->mft_no, err);
			NVolSetErrors(vol);
			err = EIO;
			goto put_err;
		}
		a = actx->a;
		if (a->type != AT_FILENAME)
			panic("%s(): a->type (0x%x) != AT_FILENAME (0x30)\n",
					__FUNCTION__, le32_to_cpu(a->type));
		fn = (FILENAME_ATTR*)((u8*)a + le16_to_cpu(a->value_offset));
		if (fn_type != fn->filename_type)
			panic("%s(): fn_type != fn->filename_type\n",
					__FUNCTION__);
		/* Remove the filename from the mft record, too. */
		err = ntfs_attr_record_delete(ni, actx);
		if (err) {
			ntfs_error(vol->mp, "Failed to delete filename "
					"attribute from mft_no 0x%llx (error "
					"%d).", (unsigned long long)ni->mft_no,
					err);
			NVolSetErrors(vol);
			err = EIO;
			goto put_err;
		}
		/*
		 * Update the hard link count in the base mft record.  Note we
		 * subtract one from the inode link count if this is a rename
		 * as the link count has been elevated by one by the caller.
		 */
		m->link_count = cpu_to_le16(ni->link_count - 1 -
				(is_rename ? 1 : 0));
	} else /* if (fn_type != FILENAME_DOS && ni->link_count <= 1) */ {
		/*
		 * This is the last name, so we need to mark the mft record as
		 * unused in the mft record flags so no-one can open it by
		 * accident and so that, in case of a crash between now and the
		 * deletion of the inode, ntfsck will know that we meant to
		 * delete the inode rather than that we were in the process of
		 * allocating or renaming it so it will do the Right Thing(TM)
		 * and complete the deletion process.
		 */
		m->flags &= ~MFT_RECORD_IN_USE;
		/* Ensure the base mft record gets written out. */
		NInoSetMrecNeedsDirtying(ni);
	}
	/*
	 * We have either deleted the filename completely or we only removed
	 * the directory index entry if this is the last name.
	 *
	 * In either case, we need to update the hard link count and the ctime
	 * in the ntfs inode (the ctime is the last_mft_change_time on NTFS).
	 */
	ni->link_count--;
	ni->last_mft_change_time = dir_ni->last_mft_change_time;
	NInoSetDirtyTimes(ni);
	/*
	 * If this is the DOS name, we now need to find the WIN32 name, so it
	 * can be deleted, too.  Otherwise we are done.
	 */
	if (fn_type == FILENAME_DOS) {
		seek_type = FILENAME_WIN32;
		/*
		 * We looked up the DOS name above thus we need to reinitialize
		 * the search context for the WIN32 name lookup.
		 */
		ntfs_attr_search_ctx_reinit(actx);
		fn_count = 0;
		goto restart_name;
	}
	/*
	 * If we removed a hard link but the inode is not deleted yet we need
	 * to remove the parent vnode from the vnode as this association may no
	 * longer exist.
	 *
	 * The same is true for the vnode name as we have just unlinked it.
	 *
	 * Note we skip this for the rename case because the subsequent call to
	 * ntfs_link_internal() is going to update the vnode identity with the
	 * new name and parent so no need to wipe them here.
	 */
	if (ni->link_count > 0 && !is_rename)
		vnode_update_identity(ni->vn, NULL, NULL, 0, 0,
				VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);
	ntfs_debug("Done.");
	/* Success falls through the cleanup labels below with @err == 0. */
put_err:
	ntfs_attr_search_ctx_put(actx);
unm_err:
	ntfs_mft_record_unmap(ni);
err:
	return err;
iput_err:
	/*
	 * Error exit for the object id index path: the target mft record is
	 * not mapped here, only the $ObjId/$O index resources are held.
	 */
	if (ictx)
		ntfs_index_ctx_put(ictx);
	lck_rw_unlock_exclusive(&objid_o_ni->lock);
	(void)vnode_put(objid_o_ni->vn);
	return err;
}
/**
 * ntfs_unlink - unlink an ntfs inode from its parent directory
 * @dir_ni:	directory ntfs inode from which to unlink the ntfs inode
 * @ni:		base ntfs inode to unlink
 * @cn:		name of the inode to unlink
 * @flags:	flags describing the unlink request
 * @is_rmdir:	true if called from VNOP_RMDIR() and hence ntfs_vnop_rmdir()
 *
 * Unlink an inode with the ntfs inode @ni and name as specified in @cn from
 * the directory with ntfs inode @dir_ni.
 *
 * The flags in @flags further describe the unlink request.  The following
 * flags are currently defined in OS X kernel:
 *	VNODE_REMOVE_NODELETEBUSY	- Do not delete busy files, i.e. use
 *					  Carbon delete semantics).
 *
 * If @is_rmdir is true the caller is VNOP_RMDIR() and hence ntfs_vnop_rmdir()
 * and if @is_rmdir is false the caller is VNOP_REMOVE() and hence
 * ntfs_vnop_remove().  Note @flags is always zero if @is_rmdir is true.
 *
 * Both @dir_ni and @ni are locked exclusively for the duration of the unlink
 * and are unlocked again before returning.
 *
 * Return 0 on success and the error code on error.
 *
 * Note that if the name of the inode to be removed is in the WIN32 or DOS
 * namespaces, both the WIN32 and the corresponding DOS names are removed.
 *
 * Note that for a hard link this function simply removes the name and its
 * directory entry and decrements the hard link count whilst for the last name,
 * i.e. the last link to an inode, it only removes the directory entry, i.e. it
 * does not remove the name, however it does decrement the hard link count to
 * zero.  This is so that the inode can be undeleted and its original name
 * restored.  In any case, we do not actually delete the inode here as it may
 * still be open and UNIX semantics require an unlinked inode to be still
 * accessible through already opened file descriptors.  When the last file
 * descriptor is closed, we cause the inode to be deleted when the VFS
 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
 * ntfs_vnop_inactive().
 */
static errno_t ntfs_unlink(ntfs_inode *dir_ni, ntfs_inode *ni,
		struct componentname *cn, const int flags, const BOOL is_rmdir)
{
	MFT_REF mref;
	ntfs_volume *vol;
	ntfschar *ntfs_name;
	ntfs_dir_lookup_name *name = NULL;
	size_t ntfs_name_size;
	signed ntfs_name_len;
	errno_t err;
	FILENAME_TYPE_FLAGS ntfs_name_type;
	ntfschar ntfs_name_buf[NTFS_MAX_NAME_LEN];

	vol = ni->vol;
	ntfs_debug("Unlinking %s%.*s with mft_no 0x%llx from directory "
			"mft_no 0x%llx, flags 0x%x.",
			is_rmdir ? "directory " : "", (int)cn->cn_namelen,
			cn->cn_nameptr, (unsigned long long)ni->mft_no,
			(unsigned long long)dir_ni->mft_no, flags);
	/*
	 * Do not allow attribute inodes or raw inodes to be deleted.  Note
	 * raw inodes are always attribute inodes, too.
	 */
	if (NInoAttr(ni)) {
		ntfs_debug("Target %.*s, mft_no 0x%llx is a%s inode, "
				"returning EPERM.", (int)cn->cn_namelen,
				cn->cn_nameptr, (unsigned long long)ni->mft_no,
				NInoAttr(ni) ? "n attribute" : " raw");
		return EPERM;
	}
	/* The parent inode must be a directory. */
	if (!S_ISDIR(dir_ni->mode)) {
		ntfs_debug("Parent mft_no 0x%llx is not a directory, "
				"returning ENOTDIR.",
				(unsigned long long)dir_ni->mft_no);
		return ENOTDIR;
	}
	/* Check for "." removal. */
	if (ni == dir_ni) {
		ntfs_debug("Target %.*s, mft_no 0x%llx is the same as its "
				"parent directory, returning EINVAL.",
				(int)cn->cn_namelen, cn->cn_nameptr,
				(unsigned long long)ni->mft_no);
		return EINVAL;
	}
	/*
	 * Lock both the parent directory and the target inode for writing.
	 * From here on every exit has to go through the err label so the
	 * locks are dropped again.
	 */
	lck_rw_lock_exclusive(&dir_ni->lock);
	lck_rw_lock_exclusive(&ni->lock);
	/* Ensure the parent directory has not been deleted. */
	if (!dir_ni->link_count) {
		ntfs_debug("Parent directory mft_no 0x%llx has been deleted, "
				"returning ENOENT.",
				(unsigned long long)dir_ni->mft_no);
		/*
		 * If the directory is somehow still in the name cache remove
		 * it now.
		 */
		cache_purge(dir_ni->vn);
		err = ENOENT;
		goto err;
	}
	/* Ensure the target has not been deleted by someone else already. */
	if (!ni->link_count) {
		ntfs_debug("Target %.*s, mft_no 0x%llx has been deleted, "
				"returning ENOENT.", (int)cn->cn_namelen,
				cn->cn_nameptr, (unsigned long long)ni->mft_no);
		/*
		 * If the target is somehow still in the name cache remove it
		 * now.
		 */
		cache_purge(ni->vn);
		err = ENOENT;
		goto err;
	}
	/*
	 * If this is a directory removal, i.e. rmdir, need to check that the
	 * directory is empty.
	 *
	 * Note we already checked for "." removal and we do not need to check
	 * for ".." removal because that would fail the directory is empty
	 * check as the parent directory would at least have one entry and that
	 * is the current directory.
	 */
	if (is_rmdir) {
		err = ntfs_dir_is_empty(ni);
		if (err) {
			if (err == ENOTEMPTY)
				ntfs_debug("Target directory %.*s, mft_no "
						"0x%llx is not empty, "
						"returning ENOTEMPTY.",
						(int)cn->cn_namelen,
						cn->cn_nameptr,
						(unsigned long long)ni->mft_no);
			else
				ntfs_error(vol->mp, "Failed to determine if "
						"target directory %.*s, "
						"mft_no 0x%llx is empty "
						"(error %d).",
						(int)cn->cn_namelen,
						cn->cn_nameptr,
						(unsigned long long)ni->mft_no,
						err);
			goto err;
		}
	} else {
		/* Do not allow directories to be unlinked. */
		if (S_ISDIR(ni->mode)) {
			ntfs_debug("Target %.*s, mft_no 0x%llx is a "
					"directory, returning EPERM.",
					(int)cn->cn_namelen, cn->cn_nameptr,
					(unsigned long long)ni->mft_no);
			err = EPERM;
			goto err;
		}
	}
	/*
	 * Do not allow any of the system files to be deleted.
	 *
	 * For NTFS 3.0+ volumes do not allow any of the extended system files
	 * to be deleted, either.
	 *
	 * Note we specifically blacklist all system files that we make use of
	 * except for the transaction log $UsnJrnl as that is allowed to be
	 * deleted and its deletion means that transaction logging is disabled.
	 *
	 * Note that if the transaction log is present it will be held busy by
	 * the NTFS driver thus unlinking the $UsnJrnl will not actually delete
	 * it until the driver is unmounted.  FIXME: Should we leave it like
	 * this or should we detach the $UsnJrnl vnodes from the volume and
	 * release them so they can be deleted immediately?
	 *
	 * TODO: What about all the new metadata files introduced with Windows
	 * Vista?  We are currently ignoring them and allowing them to be
	 * deleted...
	 */
	if (ni->file_attributes & FILE_ATTR_SYSTEM) {
		BOOL is_system = FALSE;

		if (vol->major_ver <= 1) {
			if (ni->mft_no < FILE_Extend)
				is_system = TRUE;
		} else {
			if (ni->mft_no <= FILE_Extend)
				is_system = TRUE;
			if (dir_ni == vol->extend_ni) {
				if (ni == vol->objid_ni ||
						ni == vol->quota_ni)
					is_system = TRUE;
			}
		}
		if (is_system) {
			ntfs_debug("Target %.*s, mft_no 0x%llx is a%s system "
					"file, returning EPERM.",
					(int)cn->cn_namelen, cn->cn_nameptr,
					(unsigned long long)ni->mft_no,
					(dir_ni == vol->extend_ni) ?
					"n extended" : "");
			err = EPERM;
			goto err;
		}
	}
	/*
	 * Ensure the file is not read-only (the read-only bit is ignored for
	 * directories).
	 */
	if (!S_ISDIR(ni->mode) &&
			(ni->file_attributes & FILE_ATTR_READONLY)) {
		ntfs_debug("Target %.*s, mft_no 0x%llx is marked read-only, "
				"returning EPERM.", (int)cn->cn_namelen,
				cn->cn_nameptr,
				(unsigned long long)ni->mft_no);
		err = EPERM;
		goto err;
	}
	/*
	 * If the inode is a reparse point or if the inode is offline we cannot
	 * remove a name from it yet.  TODO: Implement this.
	 */
	if (ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
			FILE_ATTR_OFFLINE)) {
		ntfs_error(vol->mp, "Target %.*s, mft_no 0x%llx is %s. "
				"Deleting names from such inodes is not "
				"supported yet, returning ENOTSUP.",
				(int)cn->cn_namelen, cn->cn_nameptr,
				(unsigned long long)ni->mft_no,
				ni->file_attributes & FILE_ATTR_REPARSE_POINT ?
				"a reparse point" : "offline");
		err = ENOTSUP;
		goto err;
	}
	/*
	 * If Carbon delete semantics are requested, do not allow busy files to
	 * be unlinked.  Note we do not use vnode_isinuse() as that accounts
	 * for open named streams/extended attributes as well which we do not
	 * care about.  We only care for actually opened files thus we keep
	 * track of them ourselves.
	 */
	if ((flags & VNODE_REMOVE_NODELETEBUSY) && ni->nr_opens) {
		ntfs_debug("Target %.*s, mft_no 0x%llx is busy (nr_opens "
				"0x%x) and Carbon delete semantics were "
				"requested, returning EBUSY.",
				(int)cn->cn_namelen, cn->cn_nameptr,
				(unsigned long long)ni->mft_no,
				(unsigned)ni->nr_opens);
		err = EBUSY;
		goto err;
	}
	/*
	 * We need to make sure the target still has the name specified in @cn
	 * that is being unlinked.  It could have been unlinked or renamed
	 * before we took the locks on the parent directory and the target.
	 *
	 * To do this, first convert the name of the target from utf8 to
	 * Unicode then look up the converted name in the directory index.
	 */
	ntfs_name = ntfs_name_buf;
	ntfs_name_size = sizeof(ntfs_name_buf);
	ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr, cn->cn_namelen,
			&ntfs_name, &ntfs_name_size);
	if (ntfs_name_len < 0) {
		/* A negative return value is the negated error code. */
		err = -ntfs_name_len;
		if (err == ENAMETOOLONG)
			ntfs_debug("Failed (name is too long).");
		else
			ntfs_error(vol->mp, "Failed to convert name to "
					"Unicode (error %d).", err);
		goto err;
	}
	err = ntfs_lookup_inode_by_name(dir_ni, ntfs_name, ntfs_name_len,
			&mref, &name);
	if (err) {
		if (err != ENOENT) {
			ntfs_error(vol->mp, "Failed to find name in directory "
					"(error %d).", err);
			goto err;
		}
enoent:
		/*
		 * The name does not exist in the directory @dir_ni.
		 *
		 * This means someone renamed or deleted the name from the
		 * directory before we managed to take the locks.
		 *
		 * This label is also jumped to from below when the name was
		 * found but refers to a different inode.
		 */
		ntfs_debug("Target %.*s, mft_no 0x%llx has been renamed or "
				"deleted already, returning ENOENT.",
				(int)cn->cn_namelen, cn->cn_nameptr,
				(unsigned long long)ni->mft_no);
		/*
		 * If the target is somehow still in the name cache remove it
		 * now.
		 */
		cache_purge(ni->vn);
		err = ENOENT;
		goto err;
	}
	/*
	 * We found the target name in the directory index but does it still
	 * point to the same mft record?  The sequence number check ensures the
	 * inode was not deleted and recreated with the same name and the same
	 * mft record number.
	 */
	if (mref != MK_MREF(ni->mft_no, ni->seq_no))
		goto enoent;
	/*
	 * We are going to go ahead with unlinking the target.
	 *
	 * There are several different types of outcome from the above lookup
	 * that need to be handled.
	 *
	 * If @name is NULL @ntfs_name contains the correctly cased name thus
	 * we can simply look for that.  In this case we set the name type to 0
	 * as we do not know which namespace the name is in.
	 *
	 * If @name is not NULL the correctly cased name is in @name->name thus
	 * we look for that.  In this case we do know which namespace the name
	 * is in as it is @name->type.
	 */
	ntfs_name_type = 0;
	if (name) {
		ntfs_name = name->name;
		ntfs_name_len = name->len;
		ntfs_name_type = name->type;
	}
	/* Now we can perform the actual unlink. */
	err = ntfs_unlink_internal(dir_ni, ni, ntfs_name, ntfs_name_len,
			ntfs_name_type, FALSE);
	if (err)
		ntfs_error(vol->mp, "Failed to unlink %.*s with mft_no 0x%llx "
				"from directory mft_no 0x%llx (error %d).",
				(int)cn->cn_namelen, cn->cn_nameptr,
				(unsigned long long)ni->mft_no,
				(unsigned long long)dir_ni->mft_no, err);
	else
		ntfs_debug("Done.");
err:
	/* Release the lookup result (if any) and drop both inode locks. */
	if (name)
		OSFree(name, sizeof(*name), ntfs_malloc_tag);
	lck_rw_unlock_exclusive(&ni->lock);
	lck_rw_unlock_exclusive(&dir_ni->lock);
	return err;
}
/**
 * ntfs_vnop_remove - unlink a file
 * @a:		arguments to remove function
 *
 * @a contains:
 *	vnode_t a_dvp;			directory from which to unlink the file
 *	vnode_t a_vp;			file to unlink
 *	struct componentname *a_cnp;	name of the file to unlink
 *	int a_flags;			flags describing the unlink request
 *	vfs_context_t a_context;
 *
 * Unlink a file with vnode @a->a_vp and name as specified in @a->a_cnp from
 * the directory with vnode @a->a_dvp.
 *
 * The flags in @a->a_flags further describe the unlink request.  The following
 * flags are currently defined in OS X kernel:
 *	VNODE_REMOVE_NODELETEBUSY	- Do not delete busy files, i.e. use
 *					  Carbon delete semantics).
 *
 * Return 0 on success and errno on error.  In particular EINVAL is returned
 * if either vnode has no ntfs inode attached to it.
 *
 * Note that if the name of the inode to be removed is in the WIN32 or DOS
 * namespaces, both the WIN32 and the corresponding DOS names are removed.
 *
 * Note that for a hard link this function simply removes the name and its
 * directory entry and decrements the hard link count whilst for the last name,
 * i.e. the last link to an inode, it only removes the directory entry, i.e. it
 * does not remove the name, however it does decrement the hard link count to
 * zero.  This is so that the inode can be undeleted and its original name
 * restored.  In any case, we do not actually delete the inode here as it may
 * still be open and UNIX semantics require an unlinked inode to be still
 * accessible through already opened file descriptors.  When the last file
 * descriptor is closed, we cause the inode to be deleted when the VFS
 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
 * ntfs_vnop_inactive().
 */
static int ntfs_vnop_remove(struct vnop_remove_args *a)
{
	ntfs_inode *dir_ni = NTFS_I(a->a_dvp);
	ntfs_inode *ni = NTFS_I(a->a_vp);
	errno_t err;

	if (!dir_ni || !ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		return EINVAL;
	}
	ntfs_debug("Entering.");
	/*
	 * Hand on the inode pointers validated above rather than looking them
	 * up a second time.  This is never an rmdir, hence the FALSE.
	 */
	err = ntfs_unlink(dir_ni, ni, a->a_cnp, a->a_flags, FALSE);
	ntfs_debug("Done (error %d).", (int)err);
	return err;
}
/**
* ntfs_link_internal - create a hard link to an inode
* @ni: base ntfs inode to create hard link to
* @dir_ni: directory ntfs inode in which to create the hard link
* @cn: componentname specifying name of the hard link to create
* @is_rename: if true ntfs_link_internal() is called for a rename
* @name: Unicode name of the hard link to create (only valid if @is_rename)
* @name_len: length of the name in Unicode characters
*
* Create a hard link to the ntfs inode @ni with name as specified in @cn in
* the directory ntfs inode @dir_ni.
*
* If @is_rename is true the caller was ntfs_vnop_rename() in which case the
* link count of the inode to link to will be one higher than the link count in
* the mft record and @name and @name_len specify the Unicode name and length
* in Unicode characters corresponding to @cn, respectively so we do not have
* to convert @cn to Unicode in this case.
*
* If @is_rename is false then @name and @name_len are undefined.
*
* Return 0 on success and errno on error.
*
* Note we always create filenames in the POSIX namespace.
*/static errno_t ntfs_link_internal(ntfs_inode *ni, ntfs_inode *dir_ni,
struct componentname *cn, const BOOL is_rename,
const ntfschar *name, constsigned name_len)
{
ntfs_volume *vol;
FILENAME_ATTR *fn;
ntfschar *ntfs_name;
MFT_RECORD *m;
ntfs_attr_search_ctx *ctx;
size_t ntfs_name_size;
signed ntfs_name_len;
unsigned fn_alloc, fn_size;
errno_t err, err2;
BOOL is_dir;
vol = ni->vol;
ntfs_debug("Creating a hard link to mft_no 0x%llx, named %.*s in ""directory mft_no 0x%llx.",
(unsignedlonglong)ni->mft_no, (int)cn->cn_namelen,
cn->cn_nameptr, (unsignedlonglong)dir_ni->mft_no);
if (NInoAttr(ni))
panic("%s(): Inode to link to is an attribute/raw inode.\n",
__FUNCTION__);
is_dir = S_ISDIR(ni->mode);
/*
* Create a temporary filename attribute so we can find the correct
* place to insert it into. We also need a temporary copy so we can
* release the mft record before we add the directory entry. This is
* needed because when we hold the mft record for the inode and we call
* ntfs_dir_entry_add() this would cause the mft record for the
* directory to be mapped which would result in a deadlock in the event
* that both mft records are in the same page.
*/
fn_alloc = sizeof(FILENAME_ATTR) + NTFS_MAX_NAME_LEN * sizeof(ntfschar);
fn = OSMalloc(fn_alloc, ntfs_malloc_tag);
if (!fn) {
ntfs_error(vol->mp, "Failed to allocate memory for temporary ""filename attribute.");
err = ENOMEM;
gotoerr;
}
bzero(fn, fn_alloc);
/* Begin setting up the temporary filename attribute. */
fn->parent_directory = MK_LE_MREF(dir_ni->mft_no, dir_ni->seq_no);
/* FILENAME_POSIX is zero and the attribute is already zeroed. *//* fn->filename_type = FILENAME_POSIX; *//*
* If this is not a rename then convert the name from utf8 to Unicode.
* If this is a rename on the other hand then we have the name in
* Unicode already so just copy that over.
*/
ntfs_name = fn->filename;
ntfs_name_size = NTFS_MAX_NAME_LEN * sizeof(ntfschar);
if (!is_rename) {
ntfs_name_len = utf8_to_ntfs(vol, (u8*)cn->cn_nameptr,
cn->cn_namelen, &ntfs_name, &ntfs_name_size);
if (ntfs_name_len < 0) {
err = -ntfs_name_len;
if (err == ENAMETOOLONG)
ntfs_debug("Failed (name is too long).");
else
ntfs_error(vol->mp, "Failed to convert name to ""Unicode (error %d).", err);
gotoerr;
}
} else {
memcpy(ntfs_name, name, name_len * sizeof(ntfschar));
ntfs_name_len = name_len;
}
/* Set the filename length in the temporary filename attribute. */
fn->filename_length = ntfs_name_len;
fn_size = sizeof(FILENAME_ATTR) + ntfs_name_len * sizeof(ntfschar);
/*
* Copy the times from the standard information attribute which we have
* cached in the ntfs inode.
*/
fn->creation_time = utc2ntfs(ni->creation_time);
fn->last_data_change_time = utc2ntfs(ni->last_data_change_time);
fn->last_mft_change_time = utc2ntfs(ni->last_mft_change_time);
fn->last_access_time = utc2ntfs(ni->last_access_time);
if (!is_dir) {
lck_spin_lock(&ni->size_lock);
fn->allocated_size = cpu_to_sle64(NInoNonResident(ni) &&
(NInoSparse(ni) || NInoCompressed(ni)) ?
ni->compressed_size : ni->allocated_size);
fn->data_size = cpu_to_sle64(ni->data_size);
lck_spin_unlock(&ni->size_lock);
} else {
/*
* Directories use 0 for the sizes in the filename attribute
* and the attribute is already zeroed.
*//* fn->data_size = fn->allocated_size = 0; */
}
/*
* If this is not a directory or it is an encrypted directory, set the
* needs archiving bit except for the core system files.
*/
fn->file_attributes = ni->file_attributes;
if (!is_dir || NInoEncrypted(ni)) {
BOOL need_set_archive_bit = TRUE;
if (vol->major_ver >= 2) {
if (ni->mft_no <= FILE_Extend)
need_set_archive_bit = FALSE;
} else {
if (ni->mft_no <= FILE_UpCase)
need_set_archive_bit = FALSE;
}
if (need_set_archive_bit) {
ni->file_attributes |= FILE_ATTR_ARCHIVE;
fn->file_attributes = ni->file_attributes;
NInoSetDirtyFileAttributes(ni);
}
}
/*
* Directories need the FILE_ATTR_DUP_FILENAME_INDEX_PRESENT flag set
* in their filename attributes both in their mft records and in the
* index entries pointing to them but not in the standard information
* attribute which is why it is not set in @ni->file_attributes.
*/if (is_dir)
fn->file_attributes |= FILE_ATTR_DUP_FILENAME_INDEX_PRESENT;
/*
* TODO: We need to find out whether it is true that ea_length takes
* precedence over reparse_tag, i.e. we need to check that if both EAs
* are present and this is a reparse point, we need to set the
* ea_length rather than the reparse_tag. So far I have not been able
* to create EAs on a reparse point and vice versa so perhaps the two
* are mutually exclusive in which case we are fine...
*
* The attribute is already zeroed so no need to set anything to zero.
*/
#if 0
if (ni->ea_length) {
fn->ea_length = cpu_to_le16(ni->ea_length);
/* fn->reserved = 0; */
} elseif (ni->file_attributes & FILE_ATTR_REPARSE_POINT) {
// TODO: Instead of zero use actual value if/when we enable
// creating hard links to reparse points...
/* fn->reparse_tag = 0; */
} else {
/*
* We need to initialize the unused field to zero but as we
* have already zeroed the attribute we do not need to do
* anything now.
*//* fn->reparse_tag = 0; */
}
#endif/*
* Add the created filename attribute to the parent directory index.
*
* We know @ni is the base inode since we bailed out for attribute
* inodes above so we can use it to generate the mft reference.
*/
err = ntfs_dir_entry_add(dir_ni, fn, fn_size,
MK_LE_MREF(ni->mft_no, ni->seq_no));
if (err)
gotoerr;
/*
* The ea_length and reparse_tag are only set in the directory index
* entries and not in filename attributes in the mft record so zero
* them here, before adding the filename attribute to the mft record.
*/
fn->reparse_tag = 0;
/*
* Add the created filename attribute to the mft record as well.
*
* Again, we know @ni is the base inode.
*/
err = ntfs_mft_record_map(ni, &m);
if (err) {
ntfs_error(vol->mp, "Failed to map mft record 0x%llx (error ""%d).", (unsignedlonglong)ni->mft_no, err);
gotorm_err;
}
ctx = ntfs_attr_search_ctx_get(ni, m);
if (!ctx) {
err = ENOMEM;
gotounm_err;
}
err = ntfs_attr_lookup(AT_FILENAME, AT_UNNAMED, 0, 0, fn, fn_size, ctx);
if (err != ENOENT) {
if (!err) {
ntfs_debug("Failed (filename already present in ""inode.");
err = EEXIST;
} else
ntfs_error(vol->mp, "Failed to add filename to mft_no ""0x%llx because looking up the ""filename in the mft record failed ""(error %d).",
(unsignedlonglong)ni->mft_no, err);
gotoput_err;
}
/*
* The current implementation of ntfs_attr_lookup() will always return
* pointing into the base mft record when an attribute was not found.
*/if (ni != ctx->ni)
panic("%s(): ni != ctx->ni\n", __FUNCTION__);
if (m != ctx->m)
panic("%s(): m != ctx->m\n", __FUNCTION__);
/*
* @ctx->a now points to the location in the mft record at which we
* need to insert the filename attribute, so insert it now.
*
* Note we ignore the case where @ctx->is_error is true because we do
* not need the attribute any more for anything after it has been
* inserted so we do not care that we failed to map its mft record.
*/
err = ntfs_resident_attr_record_insert(ni, ctx, AT_FILENAME, NULL, 0,
fn, fn_size);
if (err) {
ntfs_error(vol->mp, "Failed to add filename to mft_no 0x%llx ""because inserting the filename attribute ""failed (error %d).",
(unsignedlonglong)ni->mft_no, err);
gotoput_err;
}
/*
* Update the hard link count in the mft record. Note we subtract one
* from the inode link count if this is a rename as the link count has
* been elevated by one by the caller.
*/
ni->link_count++;
m->link_count = cpu_to_le16(ni->link_count - (is_rename ? 1 : 0));
/*
* Update the ctime in the inode by copying it from the target
* directory inode where it will have been updated by the above call to
* ntfs_dir_entry_add().
*/
ni->last_mft_change_time = dir_ni->last_mft_change_time;
NInoSetDirtyTimes(ni);
/*
* Invalidate negative cache entries in the directory. We need to do
* this because there may be negative cache entries which would match
* the name of the just created inode but in a different case. Such
* negative cache entries would now be incorrect thus we need to throw
* away all negative cache entries to ensure there cannot be any
* incorrectly negative entries in the name cache.
*/
cache_purge_negatives(dir_ni->vn);
/*
* We should add the new hard link to the name cache. Problem is that
* this is likely not to be a useful thing to do as the original name
* is likely in the name cache already and the OS X name cache only
* allows one name per vnode and cache_enter() simply returns without
* doing anything if a name is already present in the name cache for
* the vnode. Thus we could use vnode_update_identity() instead to
* switch the cached name from the original name to the new hard link.
*
* FIXME: The question is whether this is a useful thing to do. On the
* one hand people creating a hard link are likely to want to then
* access the inode via the new name but on the other hand hard links
* are often used in applications for locking purposes and in this case
* after the hard link is created the application is likely to unlink
* the original name thus it would be beneficial if that remains in the
* cache until this happens which will automatically remove the name
* from the name cache and the next lookup of the new name will insert
* the new one. Thus it is best if we do nothing at all now. If OS X
* ever allows multiple name links per vnode we can uncomment the below
* cache_enter() call.
*
* For the rename case we have just removed the original name, thus it
* makes sense to add the new name now and whilst at it also update the
* vnode identity with the new name and parent as the old ones are no
* longer valid.
*/if (is_rename) {
vnode_update_identity(ni->vn, dir_ni->vn, cn->cn_nameptr,
cn->cn_namelen, cn->cn_hash,
VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);
cache_enter(dir_ni->vn, ni->vn, cn);
cn->cn_flags &= ~MAKEENTRY;
}
/*
* Ensure the base mft record is written to disk.
*
* Note we do not set any of the NInoDirty*() flags because we have
* just created the inode thus all the fields are in sync between the
* ntfs_inode @ni and its mft record @m.
*
* Also note we defer the unmapping of the mft record to here so that
* we do not get racing time updates, etc during concurrent runs of
* link(2) and rename(2) where the source inode for the rename is the
* inode that has a new hardlink created to it at the same time. This
* case can happen because we do not lock the source inode in
* ntfs_vnop_rename().
*/
NInoSetMrecNeedsDirtying(ni);
/* We are done with the mft record. */
ntfs_attr_search_ctx_put(ctx);
ntfs_mft_record_unmap(ni);
/* Free the temporary filename attribute. */
OSFree(fn, fn_alloc, ntfs_malloc_tag);
ntfs_debug("Done.");
return 0;
put_err:
ntfs_attr_search_ctx_put(ctx);
unm_err:
ntfs_mft_record_unmap(ni);
rm_err:
#if 0
if (ni->ea_length) {
fn->ea_length = cpu_to_le16(ni->ea_length);
/* fn->reserved = 0; */
} elseif (ni->file_attributes & FILE_ATTR_REPARSE_POINT) {
// TODO: Instead of zero use actual value if/when we enable
// creating hard links to reparse points...
/* fn->reparse_tag = 0; */
} else {
/*
* We need to initialize the unused field to zero but as we
* have already zeroed the attribute we do not need to do
* anything now.
*//* fn->reparse_tag = 0; */
}
#endif
err2 = ntfs_dir_entry_delete(dir_ni, ni, fn, fn_size);
if (err2) {
ntfs_error(vol->mp, "Failed to rollback index entry creation ""in error handling code path (error %d). ""Leaving inconsistent metadata. Run chkdsk.",
err2);
NVolSetErrors(vol);
}
err:
if (fn)
OSFree(fn, fn_alloc, ntfs_malloc_tag);
if (err != EEXIST)
ntfs_error(vol->mp, "Failed (error %d).", err);
else
ntfs_debug("Failed (error EEXIST).");
return err;
}
/**
* ntfs_vnop_link - create a hard link to an inode
* @a: arguments to link function
*
* @a contains:
* vnode_t a_vp; vnode to create hard link to
* vnode_t a_tdvp; destination directory for the hard link
* struct componentname *a_cnp; name of the hard link to create
* vfs_context_t a_context;
*
* Create a hard link to the inode specified by the vnode @a->a_vp with name as
* specified in @a->a_cnp in the directory specified by the vnode @a->a_tdvp.
*
* Return 0 on success and errno on error.
*
* Note we always create filenames in the POSIX namespace.
*/staticintntfs_vnop_link(struct vnop_link_args *a)
{
ntfs_inode *ni, *dir_ni;
ntfs_volume *vol;
struct componentname *cn;
errno_t err;
ni = NTFS_I(a->a_vp);
vol = ni->vol;
dir_ni = NTFS_I(a->a_tdvp);
if (!dir_ni || !ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
cn = a->a_cnp;
ntfs_debug("Creating a hard link to mft_no 0x%llx, named %.*s in ""directory mft_no 0x%llx.",
(unsignedlonglong)ni->mft_no, (int)cn->cn_namelen,
cn->cn_nameptr, (unsignedlonglong)dir_ni->mft_no);
/* Do not allow attribute/raw inodes to be linked to. */if (NInoAttr(ni)) {
ntfs_debug("Mft_no 0x%llx is a%s inode, returning EPERM.",
(unsignedlonglong)ni->mft_no,
NInoRaw(ni) ? " raw" : "n attribute");
return EPERM;
}
/* The target inode must be a directory. */if (!S_ISDIR(dir_ni->mode)) {
ntfs_debug("Target mft_no 0x%llx is not a directory, ""returning ENOTDIR.",
(unsignedlonglong)dir_ni->mft_no);
return ENOTDIR;
}
/* Lock the target directory inode for writing. */
lck_rw_lock_exclusive(&dir_ni->lock);
/* The inode being linked to must not be a directory. */if (S_ISDIR(ni->mode)) {
lck_rw_unlock_exclusive(&dir_ni->lock);
ntfs_debug("Mft_no 0x%llx to link to is a directory, cannot ""create hard link %.*s to it, returning ""EPERM.", (unsignedlonglong)ni->mft_no,
(int)cn->cn_namelen, cn->cn_nameptr);
return EPERM;
}
/* Lock the inode to link to for writing. */
lck_rw_lock_exclusive(&ni->lock);
/* Ensure the target directory has not been deleted. */if (!dir_ni->link_count) {
ntfs_debug("Target directory mft_no 0x%llx has been deleted, ""returning ENOENT.",
(unsignedlonglong)dir_ni->mft_no);
/*
* If the directory is somehow still in the name cache remove
* it now.
*/
cache_purge(dir_ni->vn);
err = ENOENT;
gotoerr;
}
/*
* Ensure the inode has not been deleted. Note we really should be
* checking that the source of the hard link has not been unlinked yet
* but we do not know what the source name was as the caller does not
* provide it to us and we do not know which name we were called for
* from just looking at the source vnode/inode.
*/if (!ni->link_count) {
ntfs_debug("Inode %.*s, mft_no 0x%llx has been deleted, ""returning ENOENT.", (int)cn->cn_namelen,
cn->cn_nameptr, (unsignedlonglong)ni->mft_no);
/*
* If the target is somehow still in the name cache remove it
* now.
*/
cache_purge(ni->vn);
err = ENOENT;
gotoerr;
}
/*
* The inode being linked to must not be a directory or device special
* file. TODO: Extend the checks when we support device special files.
*/if (S_ISDIR(ni->mode)) {
ntfs_debug("Mft_no 0x%llx to link to is a directory, cannot ""create hard link %.*s to it, returning ""EPERM.", (unsignedlonglong)ni->mft_no,
(int)cn->cn_namelen, cn->cn_nameptr);
err = EPERM;
gotoerr;
}
/*
* Do not allow any of the system files to be linked to.
*
* For NTFS 3.0+ volumes do not allow any of the extended system files
* to be linked to, either.
*
* Note we specifically blacklist all system files that we make use of.
*
* TODO: What about all the new metadata files introduced with Windows
* Vista? We are currently ignoring them and allowing them to be
* linked to...
*/if (ni->file_attributes & FILE_ATTR_SYSTEM) {
BOOL is_system = FALSE;
if (vol->major_ver <= 1) {
if (ni->mft_no < FILE_Extend)
is_system = TRUE;
} else {
if (ni->mft_no <= FILE_Extend)
is_system = TRUE;
if (ni == vol->objid_ni || ni == vol->quota_ni ||
ni == vol->usnjrnl_ni)
is_system = TRUE;
}
if (is_system) {
ntfs_debug("Mft_no 0x%llx is a%s system file, ""returning EPERM.",
(unsignedlonglong)ni->mft_no,
(ni->mft_no > FILE_Extend) ?
"n extended" : "");
err = EPERM;
gotoerr;
}
}
/*
* Ensure the inode to link to is not read-only (we already checked
* that @ni is not a directory).
*/if (ni->file_attributes & FILE_ATTR_READONLY) {
ntfs_debug("Mft_no 0x%llx is marked read-only, returning ""EPERM.", (unsignedlonglong)ni->mft_no);
err = EPERM;
gotoerr;
}
/*
* TODO: Test if Windows is happy with a reparse point having a hard
* link and if so remove this check and copy in the reparse point tag
* into the filename attribute below. For mount point reparse points
* the reparse point is a directory so the link attempt would already
* have been aborted.
*
* TODO: Test if Windows is happy with an offline inode having a hard
* link and if so remove this check.
*/if (ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
FILE_ATTR_OFFLINE)) {
ntfs_debug("Mft_no 0x%llx is %s. Creating hard links to such ""inodes is not allowed, returning EPERM.",
(unsignedlonglong)ni->mft_no,
(ni->file_attributes &
FILE_ATTR_REPARSE_POINT) ?
"a reparse point" : "offline");
err = EPERM;
gotoerr;
}
/* Check if the maximum link count is already reached. */if (ni->link_count >= NTFS_MAX_HARD_LINKS) {
ntfs_debug("Cannot create hard link to mft_no 0x%llx because ""it already has too many hard links.",
(unsignedlonglong)ni->mft_no);
err = EMLINK;
gotoerr;
}
/* Go ahead and create the hard link. */
err = ntfs_link_internal(ni, dir_ni, cn, FALSE, NULL, 0);
if (err) {
if (err != EEXIST)
ntfs_error(vol->mp, "Failed to create hard link to ""mft_no 0x%llx, named %.*s, in ""directory mft_no 0x%llx (error %d).",
(unsignedlonglong)ni->mft_no,
(int)cn->cn_namelen, cn->cn_nameptr,
(unsignedlonglong)dir_ni->mft_no,
err);
else
ntfs_debug("Failed to create hard link to mft_no ""0x%llx, named %.*s, in directory ""mft_no 0x%llx (error EEXIST).",
(unsignedlonglong)ni->mft_no,
(int)cn->cn_namelen, cn->cn_nameptr,
(unsignedlonglong)dir_ni->mft_no);
} else
ntfs_debug("Done.");
err:
/* We are done, unlock the inode and the target directory. */
lck_rw_unlock_exclusive(&ni->lock);
lck_rw_unlock_exclusive(&dir_ni->lock);
return err;
}
/**
* ntfs_vnop_rename - rename an inode (file/directory/symbolic link/etc)
* @a: arguments to rename function
*
* @a contains:
* vnode_t a_fdvp; directory containing source inode
* vnode_t a_fvp; source inode to be renamed
* struct componentname *a_fcnp; name of the inode to rename
* vnode_t a_tdvp; target directory to move the source to
* vnode_t a_tvp; target inode to be deleted
* struct componentname *a_tcnp; name of the inode to delete
* vfs_context_t a_context;
*
* Rename the inode @a->a_fvp with name as specified in @a->a_fcnp located in the
* directory @a->a_fdvp to the new name specified in @a->a_tcnp placing it in
* the target directory @a->a_tdvp.
*
* If @a->a_tvp is not NULL it means that the rename target already exists
* which means we have to delete the rename target before we can perform the
* rename. In this case @a->a_tvp is the existing target inode and its name is
* the rename target name specified in @a->a_tcnp and it is located in the
* target directory @a->a_tdvp.
*
* Return 0 on success and errno on error.
*
* Note we always create the target name @a->a_tcnp in the POSIX namespace.
*
* Rename is a complicated operation because there are several special cases
* that need consideration:
*
* First of all unchecked renaming can create directory loops which are not
* attached to the file system root, e.g. take the directory tree /a/b/c and
* perform a rename of /a/b to /a/b/c/ which if allowed to proceed would create
* /a and b/c/b where the latter is a loop in that b points back to c which
* points back to b. Also this loop no longer is attached to the file system
* directory tree and there is no way to access it any more as there is no link
* from /a to b or c any more. Thus we have to check for this case and return
* EINVAL error instead of doing the rename. Also a concurrent rename could
* reshape the tree after our check so that our case would result in a loop
* after all thus all tree reshaping renames must be done under a rename lock.
* Note the VFS already holds the mnt_renamelock mutex for some renames but it
* does not hold it in all cases we need it to be held so we still need our own
* NTFS rename lock.
*
* Further VNOP_RENAME() must observe the following rules:
*
* - Source and destination must either both be directories, or both not be
* directories. If this is not the case return ENOTDIR if the target is not
* a directory and EISDIR if the target is a directory.
*
* - If the target is a directory, it must be empty. Return ENOTEMPTY if not.
*
* - It is not allowed to rename "/", ".", or "..". Return EINVAL if this is
* attempted.
*
* - If the source inode and the target inode are the same and the mount is
* case sensitive or the parent directories are also the same and the names
* are the same do not do anything at all and return success, i.e. 0. Note
* this is a violation of POSIX but it is needed to allow renaming of files
* from one case to another, i.e. when a mount is not case sensitive but case
* preserving (this is the default for NTFS) and the source and target inodes
* and their parent directories match but the names do not match we want to
* perform the rename rather than just return success. If we still find that
* the target exists as a hard link rather than this being a case changing
* rename we still need to abort and return success to comply with POSIX.
*
* FIXME: There is a bug in the VFS in that it never calls VNOP_RENAME() at
* all when it is called with source and target strings being the same. This
* is wrong when the string matches the name but does not have the same case,
* i.e. the rename would normally succeed switching the case to the new case.
* The VFS is currently forbidding this to happen. <rdar://problem/5485782>
*/staticintntfs_vnop_rename(struct vnop_rename_args *a)
{
MFT_REF src_mref, dst_mref;
ntfs_inode *src_dir_ni, *src_ni, *dst_dir_ni, *dst_ni;
struct componentname *src_cn, *dst_cn;
ntfs_volume *vol;
ntfschar *ntfs_name_buf, *orig_ntfs_name, *dst_ntfs_name;
ntfschar *src_ntfs_name, *target_ntfs_name;
ntfs_dir_lookup_name *src_name, *dst_name;
size_t orig_ntfs_name_size, dst_ntfs_name_size;
signed orig_ntfs_name_len, dst_ntfs_name_len, src_ntfs_name_len;
signed target_ntfs_name_len;
errno_t err, err2;
FILENAME_TYPE_FLAGS src_ntfs_name_type, target_ntfs_name_type;
BOOL have_unlinked = FALSE;
dst_name = src_name = NULL;
src_dir_ni = NTFS_I(a->a_fdvp);
src_ni = NTFS_I(a->a_fvp);
src_cn = a->a_fcnp;
dst_dir_ni = NTFS_I(a->a_tdvp);
if (!src_dir_ni || !src_ni || !dst_dir_ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = src_dir_ni->vol;
dst_cn = a->a_tcnp;
if (a->a_tvp) {
dst_ni = NTFS_I(a->a_tvp);
if (!dst_ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
ntfs_debug("Entering for source mft_no 0x%llx, name %.*s, ""parent directory mft_no 0x%llx and ""destination mft_no 0x%llx, name %.*s, parent ""directory mft_no 0x%llx.",
(unsignedlonglong)src_ni->mft_no,
(int)src_cn->cn_namelen, src_cn->cn_nameptr,
(unsignedlonglong)src_dir_ni->mft_no,
(unsignedlonglong)dst_ni->mft_no,
(int)dst_cn->cn_namelen, dst_cn->cn_nameptr,
(unsignedlonglong)dst_dir_ni->mft_no);
if (src_ni == dst_ni && NVolCaseSensitive(vol)) {
ntfs_debug("Source and destination inodes are the ""same and the volume is case ""sensitive. Returning success ""without doing anything as required ""by POSIX.");
return 0;
}
} else {
dst_ni = NULL;
ntfs_debug("Entering for source mft_no 0x%llx, name %.*s, ""parent directory mft_no 0x%llx and no ""destination mft_no, destination name %.*s, ""parent directory mft_no 0x%llx.",
(unsignedlonglong)src_ni->mft_no,
(int)src_cn->cn_namelen, src_cn->cn_nameptr,
(unsignedlonglong)src_dir_ni->mft_no,
(int)dst_cn->cn_namelen, dst_cn->cn_nameptr,
(unsignedlonglong)dst_dir_ni->mft_no);
}
/*
* The source and target parent inodes must be directories which
* implies they are base inodes.
*/if (!S_ISDIR(src_dir_ni->mode) || !S_ISDIR(dst_dir_ni->mode)) {
ntfs_debug("%s parent inode 0x%llx is not a directory, ""returning ENOTDIR.",
!S_ISDIR(src_dir_ni->mode) ?
"Source" : "Destination", (unsignedlonglong)
(!S_ISDIR(src_dir_ni->mode) ?
src_dir_ni->mft_no : dst_dir_ni->mft_no));
return ENOTDIR;
}
/*
* All inodes must be locked in parent -> child order so we need to
* check whether the source and target parent inodes have a
* parent/child relationship with each other.
*
* If both are the same we have the easiest case and we just lock the
* single directory inode.
*
* If the two are not the same we need to exclude all other tree
* reshaping renames from happening as they could change the
* relationship between the parent directory inodes under our feet. To
* do this we use a per ntfs volume lock so we can then go on to
* determine their parent/child relationship.
*
* Once we have established if there is a parent/child relationship we
* lock the parent followed by the child and if the two are completely
* unrelated the order of locking does not matter so we just lock the
* destination followed by the source.
*
* Note that we take this opportunity of walking the directory tree up
* to the root starting from @dst_dir_ni to also check whether @src_ni
* is either equal to or a parent of @dst_dir_ni in which case a
* directory loop would be caused by the rename so we have to abort it
* with EINVAL error.
*/if (src_dir_ni == dst_dir_ni)
lck_rw_lock_exclusive(&src_dir_ni->lock);
else {
BOOL is_parent;
lck_mtx_lock(&vol->rename_lock);
err = ntfs_inode_is_parent(src_dir_ni, dst_dir_ni, &is_parent,
src_ni);
if (err) {
lck_mtx_unlock(&vol->rename_lock);
/*
* @err == EINVAL means @src_ni matches or is a parent
* of @dst_dir_ni. This would create a directory
* loop so abort the rename but do not emit an error
* message as there is no error as such.
*/if (err != EINVAL)
ntfs_error(vol->mp, "Failed to determine ""whether source directory ""mft_no 0x%llx is a parent of ""destination directory mft_no ""0x%llx (error %d).",
(unsignedlonglong)
src_dir_ni->mft_no,
(unsignedlonglong)
dst_dir_ni->mft_no, err);
return err;
}
/*
* If @src_dir_ni is a parent of @dst_dir_ni, lock @src_dir_ni
* followed by @dst_dir_ni.
*
* Otherwise either @dst_dir_ni is a parent of @src_dir_ni, in
* which case we have to lock @dst_dir_ni followed by
* @src_dir_ni, or they are unrelated in which case lock
* ordering does not matter thus we do not need to distinguish
* those two cases and can simply lock @dst_dir_ni followed by
* @src_dir_ni.
*/if (is_parent) {
lck_rw_lock_exclusive(&src_dir_ni->lock);
lck_rw_lock_exclusive(&dst_dir_ni->lock);
} else {
lck_rw_lock_exclusive(&dst_dir_ni->lock);
lck_rw_lock_exclusive(&src_dir_ni->lock);
}
}
/*
* The source cannot be the source directory and the destination cannot
* be the destination directory. Also as we are about to lock the
* target ensure it does not equal the source directory either. We
* have already checked for the source being equal to the target
* directory above so no need to check again.
*/if (dst_ni && dst_ni == src_dir_ni) {
ntfs_debug("The source parent directory equals the target, ""returning ENOTEMPTY.");
err = ENOTEMPTY;
/* Set @dst_ni to NULL so we do not try to unlock it. */
dst_ni = NULL;
gotoerr;
}
if (src_ni == src_dir_ni || (dst_ni && dst_ni == dst_dir_ni)) {
ntfs_debug("The source and/or the target is/are equal to ""their parent directories, returning EINVAL.");
err = EINVAL;
/* Set @dst_ni to NULL so we do not try to unlock it. */
dst_ni = NULL;
gotoerr;
}
/*
* If the destination inode exists lock it so it can be unlinked
* safely. For example if it is a directory we need to ensure that it
* is empty and that no-one creates an entry in it whilst the delete is
* in progress which requires us to hold an exclusive lock on it.
*/if (dst_ni)
lck_rw_lock_exclusive(&dst_ni->lock);
/*
* Because we have locked the parent inode of the source inode there is
* no need to lock the source inode itself. We are not going to unlink
* it completely, just move it from one location/name to another name
* and/or place in the directory tree and the mft record will be mapped
* and thus locked for exclusive access whenever we modify the inode
* which will serialize any potential concurrent operations on the
* inode. The only concurrent operation to watch out for is when the
* source inode is a directory and someone calls VNOP_REMOVE() or
* VNOP_RMDIR() on any of its child inodes. This can end up in the
* situation where the index root node is locked in
* ntfs_index_entry_delete() and hence the mft record is mapped whilst
* the free space in the mft record is evaluated but then before this
* information is used the mft record is unmapped and then mapped again
* as part of a call to ntfs_index_entry_lock_two() and if our
* VNOP_RENAME() manages to map the mft record whilst it is temporarily
* unmapped during the ntfs_index_entry_lock_two() we can cause the
* free space in the mft record to decrease and thus the
* ntfs_index_entry_delete() may then encounter an out of space
* condition when it thought it had determined the amount of free space
* already and thus assume something has gone wrong and panic(). We
* overcome this problem inside ntfs_index_entry_delete() by rechecking
* the free space after reacquiring the lock and dealing with it as
* appropriate.
*
* First, ensure the parent directories have not been deleted.
*/if (!src_dir_ni->link_count || !dst_dir_ni->link_count) {
ntfs_debug("One or both of the parent directories mft_no ""0x%llx and mft_no 0x%llx has/have been ""deleted, returning ENOENT.",
(unsignedlonglong)src_dir_ni->mft_no,
(unsignedlonglong)dst_dir_ni->mft_no);
/*
* If the directory is somehow still in the name cache remove
* it now.
*/if (!src_dir_ni->link_count)
cache_purge(src_dir_ni->vn);
if (!dst_dir_ni->link_count)
cache_purge(dst_dir_ni->vn);
err = ENOENT;
gotoerr;
}
/* Rename is not allowed on attribute/raw inodes. */if (NInoAttr(src_ni) || (dst_ni && NInoAttr(dst_ni))) {
ntfs_debug("Source and/or target inode is/are attribute/raw ""inodes, returning EPERM.");
err = EPERM;
gotoerr;
}
/* Ensure the source has not been deleted by someone else already. */if (!src_ni->link_count) {
ntfs_debug("Source %.*s, mft_no 0x%llx has been deleted, ""returning ENOENT.", (int)src_cn->cn_namelen,
src_cn->cn_nameptr,
(unsignedlonglong)src_ni->mft_no);
/*
* If the source is somehow still in the name cache remove it
* now.
*/
cache_purge(src_ni->vn);
err = ENOENT;
gotoerr;
}
/*
* Ensure the target has not been deleted by someone else already. If
* it has been deleted pretend the caller did not specify a target.
* This is what HFS+ does, too.
*/if (dst_ni && !dst_ni->link_count) {
ntfs_debug("Target %.*s, mft_no 0x%llx has been deleted, ""pretending no target was specified.",
(int)dst_cn->cn_namelen, dst_cn->cn_nameptr,
(unsignedlonglong)dst_ni->mft_no);
/*
* If the target is somehow still in the name cache remove it
* now.
*/
cache_purge(dst_ni->vn);
lck_rw_unlock_exclusive(&dst_ni->lock);
dst_ni = NULL;
}
/*
* If the destination exists need to ensure that it is a directory if
* the source is a directory or that it is not a directory if the
* source is not a directory.
*
* Also, need to ensure the target directory is empty.
*
* If the source and destination are the same none of these checks
* apply so skip them.
*/if (dst_ni && src_ni != dst_ni) {
if (S_ISDIR(src_ni->mode)) {
if (!S_ISDIR(dst_ni->mode)) {
ntfs_debug("Source is a directory but ""destination is not, ""returning ENOTDIR");
err = ENOTDIR;
gotoerr;
}
/* The target is a directory, but is it empty? */
err = ntfs_dir_is_empty(dst_ni);
if (err) {
if (err == ENOTEMPTY)
ntfs_debug("Target directory %.*s, ""mft_no 0x%llx is not ""empty, returning ""ENOTEMPTY.",
(int)dst_cn->cn_namelen,
dst_cn->cn_nameptr,
(unsignedlonglong)
dst_ni->mft_no);
else {
ntfs_error(vol->mp, "Failed to ""determine if target ""directory %.*s, ""mft_no 0x%llx is ""empty (error %d).",
(int)dst_cn->cn_namelen,
dst_cn->cn_nameptr,
(unsignedlonglong)
dst_ni->mft_no, err);
err = EIO;
}
gotoerr;
}
} else/* if (!S_ISDIR(src_ni->mode)) */ {
if (S_ISDIR(dst_ni->mode)) {
ntfs_debug("Source is not a directory but ""destination is, returning ""EISDIR");
err = EISDIR;
gotoerr;
}
}
}
/* Ensure none of the inodes are read-only. */if ((!S_ISDIR(src_ni->mode) &&
src_ni->file_attributes & FILE_ATTR_READONLY) ||
(dst_ni && !S_ISDIR(dst_ni->mode) &&
dst_ni->file_attributes & FILE_ATTR_READONLY)) {
ntfs_debug("One of the inodes involved in the rename is ""read-only, returning EPERM.");
err = EPERM;
gotoerr;
}
/*
* Do not allow any of the system files to be renamed/deleted.
*
* For NTFS 3.0+ volumes do not allow any of the extended system files
* to be renamed/deleted, either.
*
* Note we specifically blacklist all system files that we make use of.
*
* TODO: What about all the new metadata files introduced with Windows
* Vista? We are currently ignoring them and allowing them to be
* renamed/deleted...
*/if (src_ni->file_attributes & FILE_ATTR_SYSTEM || (dst_ni &&
dst_ni->file_attributes & FILE_ATTR_SYSTEM)) {
BOOL is_system = FALSE;
if (vol->major_ver <= 1) {
if (src_ni->mft_no < FILE_Extend || (dst_ni &&
dst_ni->mft_no < FILE_Extend))
is_system = TRUE;
} else {
if (src_ni->mft_no <= FILE_Extend || (dst_ni &&
dst_ni->mft_no <= FILE_Extend))
is_system = TRUE;
if (src_dir_ni == vol->extend_ni) {
if (src_ni == vol->objid_ni ||
src_ni == vol->quota_ni ||
src_ni == vol->usnjrnl_ni)
is_system = TRUE;
}
if (dst_dir_ni == vol->extend_ni) {
if (dst_ni == vol->objid_ni ||
dst_ni == vol->quota_ni ||
dst_ni == vol->usnjrnl_ni)
is_system = TRUE;
}
}
if (is_system) {
ntfs_debug("Source and/or target inode is a system ""file, returning EPERM.");
err = EPERM;
gotoerr;
}
}
/*
* If the source/target inodes are reparse points or if they are
* offline we cannot rename/delete them yet. TODO: Implement this.
*/if (src_ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
FILE_ATTR_OFFLINE) || (dst_ni &&
dst_ni->file_attributes & (FILE_ATTR_REPARSE_POINT |
FILE_ATTR_OFFLINE))) {
ntfs_error(vol->mp, "Source or target inode is a reparse ""point or offline, renaming such indoes is ""notsupported yet, returning ENOTSUP.");
err = ENOTSUP;
gotoerr;
}
/*
* To proceed further we need to convert both the source and target
* names from utf8 to Unicode. This is a good time to do both as the
* conversion also checks for invalid names, too long names, etc.
*
* Note we allocate both source and target names with a single buffer
* so we only have to call once into the allocator.
*/
ntfs_name_buf = OSMalloc(NTFS_MAX_NAME_LEN * 2, ntfs_malloc_tag);
if (!ntfs_name_buf) {
ntfs_debug("Not enough memory to allocate name buffer.");
err = ENOMEM;
gotoerr;
}
orig_ntfs_name = ntfs_name_buf;
dst_ntfs_name = (ntfschar*)((u8*)ntfs_name_buf + NTFS_MAX_NAME_LEN);
dst_ntfs_name_size = orig_ntfs_name_size = NTFS_MAX_NAME_LEN;
orig_ntfs_name_len = utf8_to_ntfs(vol, (u8*)src_cn->cn_nameptr,
src_cn->cn_namelen, &orig_ntfs_name,
&orig_ntfs_name_size);
if (orig_ntfs_name_len < 0) {
err = -orig_ntfs_name_len;
if (err == ENAMETOOLONG)
ntfs_debug("Failed (source name is too long).");
else
ntfs_error(vol->mp, "Failed to convert name to ""Unicode (error %d).", err);
gotofree_err;
}
dst_ntfs_name_len = utf8_to_ntfs(vol, (u8*)dst_cn->cn_nameptr,
dst_cn->cn_namelen, &dst_ntfs_name,
&dst_ntfs_name_size);
if (dst_ntfs_name_len < 0) {
err = -dst_ntfs_name_len;
if (err == ENAMETOOLONG)
ntfs_debug("Failed (target name is too long).");
else
ntfs_error(vol->mp, "Failed to convert target name to ""Unicode (error %d).", err);
gotofree_err;
}
/*
* We need to make sure the source still has the name specified in
* @src_cn. It could have been unlinked or renamed before we took the
* lock on the parent directory.
*
* To do this, look up the converted source name in the source parent
* directory index.
*/
err = ntfs_lookup_inode_by_name(src_dir_ni, orig_ntfs_name,
orig_ntfs_name_len, &src_mref, &src_name);
if (err) {
if (err != ENOENT) {
ntfs_error(vol->mp, "Failed to find source name in ""directory (error %d).", err);
gotofree_err;
}
src_enoent:
/*
* The source name does not exist in the source parent
* directory.
*
* This means someone renamed or deleted the name from the
* directory before we managed to take the locks.
*/
ntfs_debug("Source has been renamed or deleted already, ""returning ENOENT.");
/*
* If the source is somehow still in the name cache remove it
* now.
*/
cache_purge(src_ni->vn);
err = ENOENT;
gotofree_err;
}
/*
* We found the source name in the directory index but does it still
* point to the same mft record? The sequence number check ensures the
* inode was not deleted and recreated with the same name and the same
* mft record number.
*/if (src_mref != MK_MREF(src_ni->mft_no, src_ni->seq_no))
gotosrc_enoent;
/*
* We now have verified everything to do with the source. Set the
* source name to be the correctly cased name (unless it was correctly
* cased already in which case @src_name will be NULL and
* @orig_ntfs_name contains the correcly cased name).
*/if (src_name) {
src_ntfs_name = src_name->name;
src_ntfs_name_len = src_name->len;
src_ntfs_name_type = src_name->type;
} else {
src_ntfs_name = orig_ntfs_name;
src_ntfs_name_len = orig_ntfs_name_len;
src_ntfs_name_type = 0;
}
/*
* Now we need to verify the target. In an ideal world, either it has
* to be specified in @dst_ni in which case it also has to exist in the
* destination parent directory @dst_dir_ni, or @dst_ni has to be NULL
* in which case the target name must not exist in the destination
* parent directory.
*
* But because the VFS obtains the target before we take the necessary
* locks it is possible for the above ideal not to be true. There are
* several possible cases:
*
* - Target was specified but deleted. We have detected this case
* above and have set @dst_ni to NULL thus we do not need to worry
* about this case any more.
* - Target was not specified but another inode was created with the
* same name. In this case we return EEXIST which is what HFS+ does,
* too.
* - Target was specified but renamed. This means we may or may not
* find a directory entry of the same name. If we do not find a
* matching directory entry we know the target has been renamed thus
* we can simply set @dst_ni to NULL and pretend it does not exist.
* If we do find a directory entry that matches in name but does not
* point to the same mft reference we know the target was renamed and
* another inode was created with the same name. In this case we
* return EEXIST which is what HFS+ does, too.
*/
err = ntfs_lookup_inode_by_name(dst_dir_ni, dst_ntfs_name,
dst_ntfs_name_len, &dst_mref, &dst_name);
if (err) {
if (err != ENOENT) {
ntfs_error(vol->mp, "Failed to find target name in ""directory (error %d).", err);
gotofree_err;
}
/*
* The destination name does not exist in the destination
* parent directory which means that the target must have been
* renamed to something else before we took the locks. We
* treat this the same as if had been deleted, i.e. we pretend
* the caller did not specify a target.
*/if (dst_ni) {
ntfs_debug("Target %.*s, mft_no 0x%llx has been ""renamed, pretending no target was ""specified.", (int)dst_cn->cn_namelen,
dst_cn->cn_nameptr,
(unsignedlonglong)dst_ni->mft_no);
lck_rw_unlock_exclusive(&dst_ni->lock);
dst_ni = NULL;
}
} else/* if (!err) */ {
/*
* The destination name exists in the directory index.
*
* If the caller did not specify it in @dst_ni or the
* destination inode has been deleted (in which case we set
* @dst_ni to NULL above) or the target was renamed and another
* inode was created with the same name return error EEXIST
* which is what HFS+ does, too.
*
* FIXME: Technically it would probably be more correct to get
* the new target ntfs inode and restart the function but at
* least for now stick with the same behaviour as HFS+.
*/if (!dst_ni || dst_mref != MK_MREF(dst_ni->mft_no,
dst_ni->seq_no)) {
ntfs_debug("Target name %.*s exists but %s, returning ""EEXIST.", (int)dst_cn->cn_namelen,
dst_cn->cn_nameptr, !dst_ni ?
"target inode was not specified or it ""was already deleted" :
"does not match specified target ""inode (it must have been renamed and ""a new inode created with the same ""name)");
err = EEXIST;
gotofree_err;
}
/*
* We still need the destination name thus use a new variable
* to store the correctly cased target name.
*/if (!dst_name) {
target_ntfs_name = dst_ntfs_name;
target_ntfs_name_len = dst_ntfs_name_len;
target_ntfs_name_type = 0;
} else {
target_ntfs_name = dst_name->name;
target_ntfs_name_len = dst_name->len;
target_ntfs_name_type = dst_name->type;
}
/*
* We have verified everything to do with the target. We now
* need to unlink it unless the source and the target are the
* same, i.e. we are changing the case of an existing filename.
* We need to distinguish two cases. If the volume is mounted
* case sensitive or it is not case sensitive and the source
* and destination names do not match (i.e. they are different
* hard links to the same inode) we do not proceed and return
* success (this is required by POSIX). Otherwise the volume
* is not case sensitive and the source and destination names
* match (i.e. they are the same hard link) and we can either
* return success when the source and destination names are
* identical (same case) or we can proceed with the rename when
* the case differs.
*
* Note we have caught the case of the inodes being equal and
* the volume being mounted case sensitive earlier on so we now
* know that the volume is not mounted case sensitive.
*/if (src_ni == dst_ni) {
/*
* If the two names are not the same hardlink return
* success not doing anything as required by POSIX.
*
* Note we do not need to care about case when
* comparing because we are comparing the correctly
* cased names.
*/if (src_ntfs_name_len != target_ntfs_name_len ||
bcmp(src_ntfs_name, target_ntfs_name,
src_ntfs_name_len * sizeof(ntfschar))) {
ntfs_debug("Source and target inodes are the ""same but the source and ""target names are different ""hard links. Returning ""success without doing ""anything as required by ""POSIX.");
gotodone;
}
/*
* The names are the same hard link. If the existing
* name is the same as the destination name (i.e. the
* target name before case correction) there is
* nothing to do and we can return success.
*/if (src_ntfs_name_len == dst_ntfs_name_len &&
!bcmp(src_ntfs_name, dst_ntfs_name,
src_ntfs_name_len * sizeof(ntfschar))) {
ntfs_debug("Source and destination are ""identical so no need to do ""anything. Returning ""success.");
gotodone;
}
/*
* The names are the same hard link but they differ in
* case thus there is no target to be removed as it
* will be removed as part of the actual rename when
* the source name is removed.
*/
} else/* if (dst_ni && src_ni != dst_ni) */ {
/*
* The source and the target are not the same thus now
* unlink the target. We can do this atomically before
* adding the new entry because both the parent
* directory inode and the target inode are locked for
* writing thus no-one can access either until we have
* finished. FIXME: The only pitfal is what happens if
* the rename fails after we have removed the target?
* We just ignore this problem for now and let the
* target disappear. This is what HFS does also so at
* least we are not the only non-POSIX conformant file
* system on OS X... In fact as long as we return EIO
* on error once we have unlinked the target POSIX
* still considers this ok. (This is what HFS does,
* too.)
*
* Note we do not set @is_rename to true here as this
* is just a normal unlink operation.
*/
err = ntfs_unlink_internal(dst_dir_ni, dst_ni,
target_ntfs_name, target_ntfs_name_len,
target_ntfs_name_type, FALSE);
if (err) {
ntfs_error(vol->mp, "Rename failed because ""the target mft_no 0x%llx ""could not be removed from ""directory mft_no 0x%llx ""(error %d).",
(unsignedlonglong)
dst_ni->mft_no,
(unsignedlonglong)
dst_dir_ni->mft_no, err);
gotofree_err;
}
/*
* Set @have_unlinked to true so that we know that we
* have to return error EIO from now on if we fail to
* complete the rename.
*/
have_unlinked = TRUE;
}
/*
* Release the lock on the destination inode and set it to NULL
* so we assume it does not exist from now on.
*/
lck_rw_unlock_exclusive(&dst_ni->lock);
dst_ni = NULL;
}
/*
* We dealt with the target if there was one thus now we can begin the
* actual rename.
*
* To start with we lock the source inode for writing which allows us
* to split the removal of the source name and the addition of the
* destination name into two events.
*
* Note we cheat a little and set @dst_ni to @src_ni so that @src_ni is
* unlocked at the end of the function/on error.
*/if (dst_ni)
panic("%s(): dst_ni\n", __FUNCTION__);
dst_ni = src_ni;
lck_rw_lock_exclusive(&src_ni->lock);
/*
* As the source inode is now locked for writing we can perform the
* rename in two stages. First we remove the source name and then we
* add the destination name both to the mft record of the inode and to
* the parent directory indexes. We can do this atomically because
* both the parent directory and the source inode are locked for
* writing thus no-one can access either until we are finished.
*
* As removal of the source name can leave the source inode with a zero
* link count we artificially increment the link count here to ensure
* it cannot reach zero. This is required to guarantee that the unlink
* of the source name will remove the filename attribute and to ensure
* that the object id is not deleted. Finally, this also ensures
* no-one can ever see the inode in a deleted state (although this
* should never happen anyway as we have the inode locked for writing).
*
* Note the link count in the ntfs inode is unsigned int type, i.e. at
* least 32-bit, to allow us to overflow 16-bits here if needed. In
* this way we do not need to worry about the link count overflowing
* here which makes the code simpler.
*
* We set @is_rename to true as we have elevated the link count by one.
*/
src_ni->link_count++;
err = ntfs_unlink_internal(src_dir_ni, src_ni, src_ntfs_name,
src_ntfs_name_len, src_ntfs_name_type, TRUE);
if (err) {
ntfs_error(vol->mp, "Rename failed because the source name, ""%.*s mft_no 0x%llx could not be removed from ""directory mft_no 0x%llx (error %d).",
(int)src_cn->cn_namelen, src_cn->cn_nameptr,
(unsignedlonglong)src_ni->mft_no,
(unsignedlonglong)src_dir_ni->mft_no, err);
gotodec_err;
}
/*
* The source name is now removed both from the source parent directory
* index and from the mft record of the source inode.
*
* Now add the destination name as a hard link to the mft record of the
* source inode and to the destination parent directory index.
*
* Calling ntfs_link_internal() also sets the "needs to be archived"
* bit on the ntfs inode unless we are renaming an unencrypted
* directory inode so we do not need to worry about setting it
* ourselves.
*/
err = ntfs_link_internal(src_ni, dst_dir_ni, dst_cn, TRUE,
dst_ntfs_name, dst_ntfs_name_len);
if (err)
gotolink_err;
/* We are done, decrement the link count back to its correct value. */
src_ni->link_count--;
done:
if (src_name)
OSFree(src_name, sizeof(*src_name), ntfs_malloc_tag);
if (dst_name)
OSFree(dst_name, sizeof(*dst_name), ntfs_malloc_tag);
OSFree(ntfs_name_buf, NTFS_MAX_NAME_LEN * 2, ntfs_malloc_tag);
err:
/* If the destination inode existed we locked it so unlock it now. */if (dst_ni)
lck_rw_unlock_exclusive(&dst_ni->lock);
/* Drop the source and destination parent directory inode locks. */
lck_rw_unlock_exclusive(&src_dir_ni->lock);
if (src_dir_ni != dst_dir_ni) {
lck_rw_unlock_exclusive(&dst_dir_ni->lock);
lck_mtx_unlock(&vol->rename_lock);
}
ntfs_debug("Done (error %d).", (int)err);
return err;
link_err:
ntfs_error(vol->mp, "Rename failed because the destination name %.*s, ""mft_ni 0x%llx could not be added to directory mft_no ""0x%llx (error %d).", (int)dst_cn->cn_namelen,
dst_cn->cn_nameptr, (unsignedlonglong)src_ni->mft_no,
(unsignedlonglong)dst_dir_ni->mft_no, err);
/*
* Try to roll back the unlink of the source by creating a new hard
* link with the old name.
*/
err2 = ntfs_link_internal(src_ni, src_dir_ni, src_cn, TRUE,
orig_ntfs_name, orig_ntfs_name_len);
if (err2) {
ntfs_error(vol->mp, "Failed to roll back partially completed ""rename (error %d). Leaving corrupt ""metadata and returning EIO. Unmount and run ""chkdsk.", err2);
NVolSetErrors(vol);
err = EIO;
} else
ntfs_debug("Re-linking of source name succeeded.");
dec_err:
src_ni->link_count--;
free_err:
if (have_unlinked) {
/* We unlinked an existing target, need to re-link it now. */
ntfs_debug("Rename failed but the target was already unlinked ""and relinking it is not implemented (yet), ""returning EIO. (Given you were renaming ""over it chances are you did not care about ""the target anyway.)");
err = EIO;
}
gotodone;
}
/**
* ntfs_vnop_mkdir - create a directory
* @a: arguments to mkdir function
*
* @a contains:
* vnode_t a_dvp; directory in which to create the dir
* vnode_t *a_vpp; destination pointer for the created dir
* struct componentname *a_cnp; name of the directory to create
* struct vnode_attr *a_vap; attributes to set on the created dir
* vfs_context_t a_context;
*
* Create a directory with name as specified in @a->a_cnp in the directory
* specified by the vnode @a->a_dvp. Assign the attributes @a->a_vap to the
* created directory. Finally return the vnode of the created directory in
* *@a->a_vpp.
*
* Return 0 on success and errno on error.
*
* Note we always create directory names in the POSIX namespace.
*/staticintntfs_vnop_mkdir(struct vnop_mkdir_args *a)
{
errno_t err;
#ifdefDEBUG
ntfs_inode *ni = NTFS_I(a->a_dvp);
if (ni)
ntfs_debug("Creating a directory named %.*s in directory ""mft_no 0x%llx.", (int)a->a_cnp->cn_namelen,
a->a_cnp->cn_nameptr,
(unsignedlonglong)ni->mft_no);
#endif
err = ntfs_create(a->a_dvp, a->a_vpp, a->a_cnp, a->a_vap, FALSE);
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
 * ntfs_vnop_rmdir - remove an empty directory
 * @a:		arguments to rmdir function
 *
 * @a contains:
 *	vnode_t a_dvp;			parent directory to remove from
 *	vnode_t a_vp;			directory to remove
 *	struct componentname *a_cnp;	name of the directory to remove
 *	vfs_context_t a_context;
 *
 * Verify that the directory with vnode @a->a_vp and name @a->a_cnp is empty
 * and, if it is, remove it from its parent directory with vnode @a->a_dvp.
 * The actual work is done by ntfs_unlink().
 *
 * Return 0 on success and errno on error.  EINVAL is returned if either vnode
 * has no ntfs inode attached.
 *
 * Note that if the name of the directory to be removed is in the WIN32 or DOS
 * namespaces, both the WIN32 and the corresponding DOS names are removed.
 *
 * Note that this function only removes the directory entry, i.e. it does not
 * remove the name, however it does decrement the hard link count to zero.
 * This is so that the directory can be undeleted and its original name
 * restored.  In any case, we do not actually delete the inode here as it may
 * still be open and UNIX semantics require an unlinked inode to remain
 * accessible through already opened file descriptors.  When the last file
 * descriptor is closed, we cause the inode to be deleted when the VFS
 * notifies us of the last close by calling VNOP_INACTIVE(), i.e.
 * ntfs_vnop_inactive().
 */
static int ntfs_vnop_rmdir(struct vnop_rmdir_args *a)
{
	ntfs_inode *parent_ni = NTFS_I(a->a_dvp);
	ntfs_inode *victim_ni = NTFS_I(a->a_vp);
	errno_t status;

	ntfs_debug("Entering.");
	if (parent_ni && victim_ni) {
		/*
		 * NOTE(review): the trailing TRUE presumably tells
		 * ntfs_unlink() that a directory is being removed - confirm
		 * against its definition.
		 */
		status = ntfs_unlink(parent_ni, victim_ni, a->a_cnp, 0, TRUE);
		ntfs_debug("Done (error %d).", (int)status);
		return status;
	}
	/* At least one of the two vnodes has no ntfs inode behind it. */
	ntfs_debug("Entered with NULL ntfs_inode, aborting.");
	return EINVAL;
}
/**
* ntfs_vnop_symlink - create a symbolic link
* @a: arguments to symlink function
*
* @a contains:
* vnode_t a_dvp; directory to create the symlink in
* vnode_t *a_vpp; destination pointer for the new symlink
* struct componentname *a_cnp; name of the symlink to create
* struct vnode_attr *a_vap; attributes to set on the new symlink
* char *a_target; path to point the created symlink at
* vfs_context_t a_context;
*
* Create a symbolic link to the path string @a->a_target with name as
* specified in @a->a_cnp in directory specified by the vnode @a->a_dvp.
* Assign the attributes @a->a_vap to the created symlink. Finally return the
* vnode of the created symlink in *@a->a_vpp.
*
* We implement symbolic links the same way as SFM, i.e. a symbolic link is a
* regular file as far as NTFS is concerned with an AFP_AfpInfo named stream
* containing the finder info with the type set to 'slnk' and the creator set
* to 'rhap'. This is basically how HFS+ stores symbolic links, too.
*
* Return 0 on success and errno on error.
*
* Note, since IEEE Std 1003.1-2001 does not require any association of file
* times with symbolic links, there is no requirement that file times be
* updated by symlink(). - This is what POSIX says about updating times in
* symlink() thus we do not update any of the times except as an indirect
* result of calling ntfs_write() on the symbolic link inode.
*/staticintntfs_vnop_symlink(struct vnop_symlink_args *a)
{
uio_t uio;
ntfs_inode *dir_ni, *ni, *raw_ni;
int err, err2;
unsigned len;
dir_ni = NTFS_I(a->a_dvp);
if (!dir_ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
ntfs_debug("Creating a symbolic link named %.*s in directory mft_no ""0x%llx and pointing it at path \"%s\".",
(int)a->a_cnp->cn_namelen, a->a_cnp->cn_nameptr,
(unsignedlonglong)dir_ni->mft_no, a->a_target);
len = strlen(a->a_target);
/* Zero length symbolic links are not allowed. */if (!len || len > MAXPATHLEN) {
err = EINVAL;
if (len)
err = ENAMETOOLONG;
ntfs_error(dir_ni->vol->mp, "Invalid symbolic link target ""length %d, returning %s.", len,
len ? "ENAMETOOLONG" : "EINVAL");
return err;
}
retry:
/* Create the symbolic link inode. */
err = ntfs_create(dir_ni->vn, a->a_vpp, a->a_cnp, a->a_vap, TRUE);
if (err) {
if (err != EEXIST)
ntfs_error(dir_ni->vol->mp, "Failed to create ""symbolic link named %.*s in ""directory mft_no 0x%llx and pointing ""to path \"%s\" (error %d).",
(int)a->a_cnp->cn_namelen,
a->a_cnp->cn_nameptr,
(unsignedlonglong)dir_ni->mft_no,
a->a_target, err);
else
ntfs_debug("Failed to create symbolic link named %.*s ""in directory mft_no 0x%llx and ""pointing to path \"%s\" (error ""EEXIST).", (int)a->a_cnp->cn_namelen,
a->a_cnp->cn_nameptr,
(unsignedlonglong)dir_ni->mft_no,
a->a_target);
return err;
}
/* Note the ntfs inode @ni is locked for writing. */
ni = NTFS_I(*a->a_vpp);
/* Make sure no-one deleted it under our feet. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
/* Release the vnode and try the create again. */
lck_rw_unlock_exclusive(&ni->lock);
vnode_put(ni->vn);
gotoretry;
}
/*
* Create a uio and attach the target path to it so we can use
* ntfs_write() to do the work.
*/
uio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
if (!uio) {
err = ENOMEM;
ntfs_error(dir_ni->vol->mp, "Failed to allocate UIO.");
gotoerr;
}
err = uio_addiov(uio, (uintptr_t)a->a_target, len);
if (err)
panic("%s(): Failed to attach target path buffer to UIO ""(error %d).", __FUNCTION__, err);
/*
* FIXME: At present the kernel does not allow VLNK vnodes to use the
* UBC (<rdar://problem/5794900>) thus we need to use a shadow VREG
* vnode to do the actual write of the symbolic link data. Fortunately
* we already implemented this functionality for compressed files where
* we need to read the compressed data using a shadow vnode so we use
* the same implementation here, thus our shadow vnode is a raw inode.
*/
err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_EXCLUSIVE, &raw_ni);
if (err) {
ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
err);
gotoerr;
}
if (!NInoRaw(raw_ni))
panic("%s(): Requested raw inode but got non-raw one.\n",
__FUNCTION__);
/*
* Write the symbolic link target to the created inode. We pass in
* IO_UNIT as we want an atomic i/o operation.
*
* FIXME: ntfs_write() does not always honour the IO_UNIT flag so we
* still have to test for partial writes.
*/
err = ntfs_write(raw_ni, uio, IO_UNIT, TRUE);
/*
* Update the sizes in the base inode. Note there is no need to lock
* @raw_ni->size_lock as the values cannot change at present as we are
* holding the inode lock @raw_ni->lock for write.
*/
lck_spin_lock(&ni->size_lock);
ni->initialized_size = raw_ni->initialized_size;
ni->data_size = raw_ni->data_size;
ni->allocated_size = raw_ni->allocated_size;
ni->compressed_size = raw_ni->compressed_size;
lck_spin_unlock(&ni->size_lock);
if (NInoNonResident(raw_ni))
NInoSetNonResident(ni);
lck_rw_unlock_exclusive(&raw_ni->lock);
vnode_put(raw_ni->vn);
/* Check for write errors. */if (uio_resid(uio) && !err)
err = EIO;
/* We no longer need the uio. */
uio_free(uio);
if (!err) {
lck_rw_unlock_exclusive(&ni->lock);
ntfs_debug("Done.");
return 0;
}
/* Write failed or was partial, unlink the created symbolic link. */
ntfs_error(dir_ni->vol->mp, "Failed to write target path to symbolic ""link inode (error %d).", err);
err:
lck_rw_unlock_exclusive(&ni->lock);
err2 = ntfs_unlink(dir_ni, ni, a->a_cnp, 0, FALSE);
if (err2) {
ntfs_error(dir_ni->vol->mp, "Failed to unlink symbolic link ""inode in error code path (error %d). Run ""chkdsk.", err2);
NVolSetErrors(dir_ni->vol);
}
vnode_put(ni->vn);
return err;
}
/**
* ntfs_vnop_readdir - read directory entries into a supplied buffer
* @a: arguments to readdir function
*
* @a contains:
* vnode_t a_vp; directory vnode to read directory entries from
* uio_t a_uio; destination in which to return the entries
* int a_flags; flags describing the entries to return
* int *a_eofflag; return end of file status (can be NULL)
* int *a_numdirent; return number of entries returned (can be NULL)
* vfs_context_t a_context;
*
* See ntfs_dir.c::ntfs_readdir() for a description of the implemented
* features. In addition to those described features VNOP_READDIR() should
* also implement the below features.
*
* @a->a_flags can have the following bits set:
* VNODE_READDIR_EXTENDED use extended directory entries
* VNODE_READDIR_REQSEEKOFF requires seek offset (cookies)
* VNODE_READDIR_SEEKOFF32 seek offset values should be 32-bit
*
* When VNODE_READDIR_EXTENDED is set, the format of the returned directory
* entry structures changes to the direntry structure which is defined as:
*
* u64 d_ino; inode number of entry
* u64 d_seekoff; seek offset (optional, used by servers)
* u16 d_reclen; length of this record
* u16 d_namlen; length of string in d_name
* u8 d_type; inode type (one of DT_DIR, DT_REG, etc)
* char d_name[MAXPATHLEN]; null terminated filename
*
* If VNODE_READDIR_REQSEEKOFF is set, VNODE_READDIR_EXTENDED must also be set,
* and it means that the seek offset (d_seekoff) in the direntry structure must
* be set. If VNODE_READDIR_REQSEEKOFF is not set, the seek offset can be set
* to zero as the caller will ignore it.
*
* If VNODE_READDIR_SEEKOFF32 is set, both VNODE_READDIR_EXTENDED and
* VNODE_READDIR_REQSEEKOFF must be set and it means that the seek offset must
* be at most 32-bits, i.e. the most significant 32-bits of d_seekoff must be
* zero.
*
* All the VNODE_READDIR_* flags are only ever set by the NFS server and given
* we do not yet support NFS exporting of NTFS volumes we just abort if any of
* them are set.
*
* If the directory is deleted-but-in-use, we do not synthesize entries for "."
* and "..".
*
* Return 0 on success and the error code on error.
*/staticintntfs_vnop_readdir(struct vnop_readdir_args *a)
{
user_ssize_t start_count;
ntfs_inode *dir_ni = NTFS_I(a->a_vp);
errno_t err;
if (!dir_ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
ntfs_debug("Entering for directory inode 0x%llx.",
(unsignedlonglong)dir_ni->mft_no);
/*
* FIXME: Is this check necessary? Can we ever get here for
* non-directories? All current callers (except the NFS server) ensure
* that @dir_ni is a directory. We do not currently support NFS
* exporting so this should indeed definitely never trigger but leave
* it here as a kind of debug assertion.
*/if (!S_ISDIR(dir_ni->mode)) {
ntfs_debug("Not a directory, returning ENOTDIR.");
return ENOTDIR;
}
if (a->a_flags) {
ntfs_error(dir_ni->vol->mp, "None of the VNODE_READDIR_* ""flags are supported yet, sorry.");
return ENOTSUP;
}
lck_rw_lock_shared(&dir_ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(dir_ni)) {
/* Remove the inode from the name cache. */
cache_purge(dir_ni->vn);
lck_rw_unlock_shared(&dir_ni->lock);
ntfs_debug("Directory is deleted.");
return ENOENT;
}
start_count = uio_resid(a->a_uio);
err = ntfs_readdir(dir_ni, a->a_uio, a->a_eofflag, a->a_numdirent);
/*
* Update the last_access_time (atime) if something was read.
*
* Skip the update if atime updates are disabled via the noatime mount
* option or the volume is read only.
*/if (uio_resid(a->a_uio) < start_count && !NVolReadOnly(dir_ni->vol) &&
!(vfs_flags(dir_ni->vol->mp) & MNT_NOATIME)) {
dir_ni->last_access_time = ntfs_utc_current_time();
NInoSetDirtyTimes(dir_ni);
}
lck_rw_unlock_shared(&dir_ni->lock);
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
 * ntfs_vnop_readdirattr - read directory entries with attributes (stub)
 * @a:		arguments to readdirattr function
 *
 * Not implemented yet.  The arguments are passed to nop_readdirattr(), whose
 * return value is ignored, and ENOTSUP is always returned to the caller.
 */
static int ntfs_vnop_readdirattr(struct vnop_readdirattr_args *a)
{
	/* TODO: Implement this; until then the result is always ENOTSUP. */
	errno_t status = ENOTSUP;

	ntfs_debug("Entering.");
	(void)nop_readdirattr(a);
	ntfs_debug("Done (error %d).", (int)status);
	return status;
}
/**
* ntfs_vnop_readlink - read the contents of a symbolic link
* @a: arguments to readlink function
*
* @a contains:
* vnode_t a_vp; vnode of symbolic link whose data to read
* uio_t *a_uio; destination in which to return the read data
* vfs_context_t a_context;
*
* Read the path stored in the symbolic link vnode @a->a_vp and return it in
* the destination buffer pointed to by @a->a_uio.
*
* uio_resid(@a->a_uio) is the maximum number of bytes to read and
* uio_offset(@a->a_uio) must be zero.
*
* We implement symbolic links the same way as SFM, i.e. a symbolic link is a
* regular file as far as NTFS is concerned with an AFP_AfpInfo named stream
* containing the finder info with the type set to 'slnk' and the creator set
* to 'rhap'. This is basically how HFS+ stores symbolic links, too.
*
* Thus obtaining the symbolic link target is a simple matter of calling
* ntfs_read() on the symbolic link inode.
*
* TODO: We may wish to add support for other symbolic link types found on NTFS
* volumes such as the methods used by:
* - Windows Services for Unix (SFU) and the userspace ntfsmount driver,
* - SMB/Samba (when run on a file system without native symbolic links)
* - Cygwin
*
* It may also be worth supporting reparse point based symbolic links but those
* are a lot trickier if at all possible as they contain information that
* cannot be resolved without access to the Windows registry and potentially
* without access to the Windows Domain/Active Directory.
*
* Return 0 on success and errno on error.
*
* Note, since IEEE Std 1003.1-2001 does not require any association of file
* times with symbolic links, there is no requirement that file times be
* updated by readlink().
*/staticintntfs_vnop_readlink(struct vnop_readlink_args *a)
{
s64 size;
user_ssize_t start_count;
ntfs_inode *ni, *raw_ni;
uio_t uio = a->a_uio;
errno_t err;
ni = NTFS_I(a->a_vp);
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
ntfs_debug("Entering for mft_no 0x%llx.",
(unsignedlonglong)ni->mft_no);
/*
* Protect against changes in initialized_size and thus against
* truncation also and against deletion/rename.
*/
lck_rw_lock_shared(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (!ni->link_count || NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
err = ENOENT;
gotoerr;
}
if (!S_ISLNK(ni->mode)) {
ntfs_debug("Not a symbolic link, returning EINVAL.");
err = EINVAL;
gotoerr;
}
if (uio_offset(uio)) {
ntfs_error(ni->vol->mp, "uio_offset(uio) is not zero, ""returning EINVAL.");
err = EINVAL;
gotoerr;
}
/*
* FIXME: At present the kernel does not allow VLNK vnodes to use the
* UBC (<rdar://problem/5794900>) thus we need to use a shadow VREG
* vnode to do the actual read of the symbolic link data. Fortunately
* we already implemented this functionality for compressed files where
* we need to read the compressed data using a shadow vnode so we use
* the same implementation here, thus our shadow vnode is a raw inode.
*
* Doing this has the unfortunate consequence that if the symbolic link
* inode is compressed or encrypted we cannot read it as we are already
* using the raw inode and we can only have one raw inode.
*/
lck_spin_lock(&ni->size_lock);
size = ni->data_size;
lck_spin_unlock(&ni->size_lock);
/* Zero length symbolic links are not allowed. */if (!size || size > MAXPATHLEN) {
ntfs_error(ni->vol->mp, "Invalid symbolic link size %lld in ""mft_no 0x%llx, returning EINVAL.",
(longlong)size,
(unsignedlonglong)ni->mft_no);
err = EINVAL;
gotoerr;
}
start_count = uio_resid(uio);
err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
if (err) {
ntfs_error(ni->vol->mp, "Failed to get raw inode (error %d).",
err);
gotoerr;
}
if (!NInoRaw(raw_ni))
panic("%s(): Requested raw inode but got non-raw one.\n",
__FUNCTION__);
lck_spin_lock(&raw_ni->size_lock);
if (size > ubc_getsize(raw_ni->vn) || size != raw_ni->data_size)
panic("%s(): size (0x%llx) > ubc_getsize(raw_ni->vn, 0x%llx) ""|| size != raw_ni->data_size (0x%llx)\n",
__FUNCTION__, (unsignedlonglong)size,
(unsignedlonglong)ubc_getsize(raw_ni->vn),
(unsignedlonglong)raw_ni->data_size);
lck_spin_unlock(&raw_ni->size_lock);
/* Perform the actual read of the symbolic link data into the uio. */
err = ntfs_read(raw_ni, uio, 0, TRUE);
lck_rw_unlock_shared(&raw_ni->lock);
vnode_put(raw_ni->vn);
/*
* If the read was partial, reset @uio pretending that the read never
* happened unless we used up all the space in the uio and it was
* simply not big enough to hold the entire symbolic link data in which
* case we return a truncated result.
*/if (err || (uio_resid(uio) && start_count - uio_resid(uio) != size)) {
/*
* FIXME: Should we be trying to continue a partial read in
* case we can complete it with multiple calls to ntfs_read()?
*/if (!err) {
ntfs_debug("ntfs_read() returned a partial read, ""pretending the read never happened.");
err = EIO;
}
uio_setoffset(uio, 0);
uio_setresid(uio, start_count);
if (err)
ntfs_error(ni->vol->mp, "Failed to read symbolic link ""data (error %d).", err);
}
ntfs_debug("Done (error %d).", (int)err);
err:
lck_rw_unlock_shared(&ni->lock);
return err;
}
/**
 * ntfs_mft_record_free_all - free clusters referenced by an mft record
 * @base_ni:	base ntfs inode to which the (extent) inode @ni and @m belong
 * @ni:		ntfs inode for which to free all clusters
 * @m:		mft record for which to free all clusters
 *
 * For the ntfs inode @ni and its mft record @m, iterate over all attributes in
 * the mft record and free all clusters referenced by the attributes.  @base_ni
 * is the base ntfs inode to which @ni and @m belong.
 *
 * Also, mark the mft record as not in use, increment its sequence number and
 * mark it dirty to ensure it gets written out later.
 *
 * When any operations fail this function notifies the user about it and marks
 * the volume dirty but does not return an error code as the caller can proceed
 * regardless without caring if some clusters failed to be freed.  A later
 * chkdsk will find them and free them and in the mean time they just waste
 * some space on the volume.
 */
static void ntfs_mft_record_free_all(ntfs_inode *base_ni, ntfs_inode *ni,
		MFT_RECORD *m)
{
	ntfs_volume *vol = base_ni->vol;
	ATTR_RECORD *a;
	errno_t err;
	ntfs_runlist rl;

	for (a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); ;
			a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
		BOOL is_corrupt;

		/*
		 * Check that the attribute record is located inside the in use
		 * part of the mft record before looking at any of its fields,
		 * including the type which tells us whether we have reached
		 * the end of the attributes.  Otherwise a corrupt mft record
		 * would cause us to read outside the record.
		 */
		is_corrupt = (u8*)a < (u8*)m || (u8*)a > (u8*)m +
				le32_to_cpu(m->bytes_in_use) ||
				le32_to_cpu(m->bytes_in_use) >
				le32_to_cpu(m->bytes_allocated);
		if (!is_corrupt) {
			/* The end marker terminates the iteration. */
			if (a->type == AT_END)
				break;
			/*
			 * A zero length attribute would make us loop forever.
			 * This must only be checked after the end marker
			 * test so the end marker itself is never subjected to
			 * the length check.
			 */
			is_corrupt = !a->length;
		}
		if (is_corrupt) {
			ntfs_warning(vol->mp, "Found corrupt attribute whilst "
					"releasing deleted mft_no 0x%llx. "
					"Run chkdsk to recover lost space and "
					"fix any other inconsistencies.",
					(unsigned long long)ni->mft_no);
			NVolSetErrors(vol);
			break;
		}
		/*
		 * For most resident attribute records, there is nothing we
		 * need to do as they do not reference any clusters outside the
		 * mft record itself.
		 */
		if (!a->non_resident) {
			STANDARD_INFORMATION *si;

			/*
			 * We only need to deal with the standard information
			 * attribute.
			 */
			if (a->type != AT_STANDARD_INFORMATION)
				continue;
			/*
			 * We need to update the {a,m,c}times from the ntfs
			 * inode into the corresponding times in the standard
			 * information attribute.  The inode ctime, i.e. the
			 * last_mft_change_time in the standard information
			 * attribute, gives us a de facto deleted time that can
			 * be used by ntfsck and ntfsundelete for example.
			 */
			si = (STANDARD_INFORMATION*)((u8*)a +
					le16_to_cpu(a->value_offset));
			si->last_data_change_time = utc2ntfs(
					base_ni->last_data_change_time);
			si->last_mft_change_time = utc2ntfs(
					base_ni->last_mft_change_time);
			si->last_access_time = utc2ntfs(
					base_ni->last_access_time);
			/* Whilst here also update the file attributes. */
			si->file_attributes = base_ni->file_attributes;
			/*
			 * We need to take care to handle NTFS 1.x style
			 * standard information attributes on NTFS 3.0+ volumes
			 * as they are lazily updated on write after a volume
			 * has been upgraded from 1.x and after a volume has
			 * been accessed by an older NTFS driver such as the
			 * one in Windows NT4.
			 *
			 * The check is compiled out at present because nothing
			 * below touches the extended fields yet.
			 */
#if 0
			if (vol->major_ver <= 3 ||
					le32_to_cpu(a->value_length) <
					sizeof(STANDARD_INFORMATION))
				continue;
#endif
			/*
			 * We have an NTFS 3.0+ style, extended standard
			 * information attribute.
			 */
			/*
			 * TODO: When we implement support for $UsnJrnl, we
			 * will need to journal the delete event and update the
			 * usn field in the standard information attribute.
			 * For now this is not needed as we stamp the
			 * transaction log thus telling applications querying
			 * the transaction log that it does not contain
			 * uptodate information.  We cannot do this at unlink
			 * time because there may still be writes and truncates
			 * happening due to existing open file descriptors and
			 * the delete event has to come last.
			 */
			/*
			 * TODO: When we implement support for quotas, we will
			 * need to update the quota control entry belonging to
			 * the user_id specified in the owner_id field in the
			 * standard information attribute by updating its
			 * change_time field to the current time and
			 * decrementing its bytes_used field by the amount
			 * specified in the quota_charged field in the standard
			 * information attribute as well as setting the
			 * exceeded_time to 0 if we go from over the soft quota
			 * specified in the limit of the quota control entry.
			 * For now this is not needed as we mark all quotas as
			 * invalid when we mount a volume read-write.  We
			 * cannot do the quota update at unlink time because
			 * there may still be writes and truncates happening
			 * due to existing open file descriptors which will
			 * affect the quota related fields.
			 */
			continue;
		}
		/*
		 * For non-resident attribute records, we need to free all the
		 * clusters specified in their mapping pairs array.
		 *
		 * If this is the base extent, we only need to do this if the
		 * allocated size is not zero.  If this is not the base extent
		 * then by definition the allocated size cannot be zero and
		 * more importantly an extent mft record does not have the
		 * allocated_size field set thus it is always zero.
		 */
		if (!a->lowest_vcn && !a->allocated_size)
			continue;
		rl.rl = NULL;
		rl.alloc = rl.elements = 0;
		err = ntfs_mapping_pairs_decompress(vol, a, &rl);
		if (!err) {
			VCN lowest_vcn;

			/*
			 * We need to supply the correct start and count values
			 * otherwise freeing the clusters fails when an
			 * attribute has multiple extent records because the
			 * runlist contains unmapped elements.
			 */
			lowest_vcn = sle64_to_cpu(a->lowest_vcn);
			err = ntfs_cluster_free_from_rl(vol, rl.rl, lowest_vcn,
					sle64_to_cpu(a->highest_vcn) + 1 -
					lowest_vcn, NULL);
			if (err) {
				ntfs_warning(vol->mp, "Failed to free some "
						"allocated clusters belonging "
						"to mft_no 0x%llx (error "
						"%d). Run chkdsk to recover "
						"the lost space.",
						(unsigned long long)ni->mft_no,
						err);
				NVolSetErrors(vol);
			}
			/* The decompressed runlist is no longer needed. */
			OSFree(rl.rl, rl.alloc, ntfs_malloc_tag);
		} else {
			ntfs_error(vol->mp, "Cannot free some allocated space "
					"belonging to mft_no 0x%llx because "
					"the decompression of the mapping "
					"pairs array failed (error %d). Run "
					"chkdsk to recover the lost space.",
					(unsigned long long)ni->mft_no, err);
			NVolSetErrors(vol);
		}
	}
	/*
	 * We have processed all attributes in the mft record thus we can mark
	 * it as not in use, increment its sequence number, and mark it dirty
	 * for later writeout.
	 *
	 * The sequence number skips zero when it wraps around, i.e. 0xffff is
	 * followed by 1.
	 */
	m->flags &= ~MFT_RECORD_IN_USE;
	if (m->sequence_number != const_cpu_to_le16(0xffff))
		m->sequence_number = cpu_to_le16(
				le16_to_cpu(m->sequence_number) + 1);
	else
		m->sequence_number = const_cpu_to_le16(1);
	/* Keep the cached sequence number in the ntfs inode in sync. */
	ni->seq_no = le16_to_cpu(m->sequence_number);
	NInoSetMrecNeedsDirtying(ni);
}
/**
* ntfs_vnop_inactive - the last reference to a vnode has been dropped
* @args: arguments to inactive function
*
* @args contains:
* vnode_t a_vp; vnode whose last reference has been dropped
* vfs_context_t a_context;
*
* Last reference to a vnode has been dropped or a forced unmount is in
* progress.
*
* Note: When called from reclaim, the vnode has a zero v_iocount and
* v_usecount and vnode_isrecycled() is true.
*
* Return 0 on success and errno on error.
*
* Note the current OS X VFS ignores the return value from VNOP_INACTIVE() and
* hence ntfs_vnop_inactive().
*/staticintntfs_vnop_inactive(struct vnop_inactive_args *args)
{
leMFT_REF mref;
vnode_t vn = args->a_vp;
ntfs_inode *base_ni, *mftbmp_ni, *ni = NTFS_I(vn);
ntfs_volume *vol;
MFT_RECORD *m;
leMFT_REF *mrefs;
unsigned nr_mrefs;
errno_t err;
BOOL is_delete;
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return 0;
}
is_delete = !ni->link_count;
vol = ni->vol;
ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x%s.",
(unsignedlonglong)ni->mft_no,
(unsigned)le32_to_cpu(ni->type), (unsigned)ni->name_len,
is_delete ? ", is delete" : "");
base_ni = ni;
if (NInoAttr(ni))
base_ni = ni->base_ni;
/*
* This is the last close thus remove any directory hints.
*
* Note we check for presence of directory hints outside the locks as
* an optimization. It is not a disaster if we miss any as all will be
* released in ntfs_inode_free() before the inode is thrown away at the
* latest.
*/if (ni != base_ni && ni->type == AT_INDEX_ALLOCATION &&
ni->nr_dirhints) {
lck_rw_lock_exclusive(&ni->lock);
ntfs_dirhints_put(ni, 0);
lck_rw_unlock_exclusive(&ni->lock);
}
/*
* If the inode is not being deleted or this is a raw inode sync it and
* we are done.
*/if (!is_delete || NInoRaw(ni)) {
sync:
/*
* Commit dirty data to disk unless mounted read-only.
*
* WARNING: Please see <rdar://problem/7202356> why this causes
* stack exhaustion and kernel panics by creating a loop where
* the VNOP_INACTIVE() calls ntfs_inode_sync() which ends up
* doing ntfs_inode_get() which in turn triggers another
* VNOP_INACTIVE() which in turn calls ntfs_inode_sync() and
* thus ntfs_inode_get() which in turns calls VNOP_INACTIVE()
* and so on until the stack overflows.
*/
err = 0;
if (!NVolReadOnly(vol))
err = ntfs_inode_sync(ni, IO_SYNC | IO_CLOSE, FALSE);
if (!err)
ntfs_debug("Done.");
else
ntfs_error(vol->mp, "Failed to sync mft_no 0x%llx, ""type 0x%x, name_len 0x%x (error %d).",
(unsignedlonglong)ni->mft_no,
(unsigned)le32_to_cpu(ni->type),
(unsigned)ni->name_len, err);
return err;
}
if (ni != base_ni)
lck_rw_lock_exclusive(&base_ni->lock);
lck_rw_lock_exclusive(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(vn);
lck_rw_unlock_exclusive(&ni->lock);
if (ni != base_ni)
lck_rw_unlock_exclusive(&base_ni->lock);
ntfs_debug("Done (was already deleted).");
return 0;
}
/*
* If someone else re-instantiated the inode whilst we were waiting for
* the inode lock sync the inode instead of deleting it.
*/if (ni->link_count) {
lck_rw_unlock_exclusive(&ni->lock);
if (ni != base_ni)
lck_rw_unlock_exclusive(&base_ni->lock);
ntfs_debug("Someone re-instantiated the inode.");
gotosync;
}
/*
* The inode has been unlinked, delete it now freeing all allocated
* space on disk as well as all related resources on disk. Note we
* proceed on errors because there is not much we can do about them.
* We have to carry on regardless as the inode is about to be
* terminated in any case.
*
* On a metadata affecting error, we mark the volume dirty and leave it
* to a subsequent chkdsk to clean up after us. This is not a disaster
* since there are no directory entries pointing to the inode @ni any
* more, thus us failing just means that we will keep some on disk
* resources allocated so chkdsk will just find this file and delete
* it.
*
* First, remove the inode from the inode cache so it cannot be found
* any more.
*/
lck_mtx_lock(&ntfs_inode_hash_lock);
/*
* Mark the inode as having been deleted so we do not try to remove it
* from the ntfs inode hash again in ntfs_inode_reclaim().
*/
NInoSetDeleted(ni);
/*
* Remove the ntfs_inode from the inode hash so it cannot be looked up
* any more.
*/
ntfs_inode_hash_rm_nolock(ni);
lck_mtx_unlock(&ntfs_inode_hash_lock);
/* Remove the inode from the name cache if it is still in it. */
cache_purge(vn);
/*
* The inode/vnode are no longer reachable at all so drop the inode
* lock. Anyone waiting on the lock should test for NInoDeleted() and
* abort once they have taken the lock.
*/
lck_rw_unlock_exclusive(&ni->lock);
/* In case someone is waiting on the inode do a wakeup. */
ntfs_inode_wakeup(ni);
/* Invalidate all buffers to do with the vnode. */
err = buf_invalidateblks(vn, 0, 0, 0);
if (err)
ntfs_error(vol->mp, "Failed to invalidate cached buffers ""(error %d).", err);
/*
* Invalidate all cached pages in the VM.
*
* This will fail for non-regular (VREG) nodes as they do not have UBC
* info attached to them and ubc_msync() returns error in this case.
*/if (vnode_isreg(vn)) {
err = ubc_msync(vn, 0, ubc_getsize(vn), NULL, UBC_INVALIDATE);
if (err)
ntfs_error(vol->mp, "Failed to invalidate cached ""pages (error %d).", err);
}
/*
* Cause the vnode to be reused immediately when we return rather than
* sitting around in the vnode cache.
*/
vnode_recycle(vn);
/*
* ntfs_unlink() and ntfs_vnop_rename() bail out for attribute inodes
* so we cannot get here with an attribute inode unless something has
* gone badly wrong.
*
* When a named stream is deleted via VNOP_REMOVENAMEDSTREAM() its
* link_count is set to zero so we get here on the last close. We have
* to perform the actual freeing of allocated space if the attribute is
* non-resident as well as the removal of the attribute record here.
*/if (ni != base_ni) {
ntfs_attr_search_ctx *ctx;
if (ni->type != AT_DATA || !ni->name_len)
panic("%s(): ni != base_ni && (ni->type != AT_DATA || ""!ni->name_len)\n", __FUNCTION__);
/*
* For simplicity, if the attribute is non-resident, we
* truncate the attribute to zero size first as that causes
* both the allocated clusters to be freed as well as all
* extent attribute records to be deleted.
*
* We then only need to remove the base attribute record and we
* are done.
*/if (NInoNonResident(ni)) {
err = ntfs_attr_resize(ni, 0, 0, NULL);
if (err) {
ntfs_error(vol->mp, "Cannot delete named ""stream from mft_no 0x%llx ""because truncating the ""stream inode to zero size ""failed (error %d).",
(unsignedlonglong)ni->mft_no,
err);
gotoerr;
}
}
/* Remove the named stream. */
err = ntfs_mft_record_map(base_ni, &m);
if (err) {
ntfs_error(vol->mp, "Failed to delete named stream ""because mapping the mft record ""0x%llx failed (error %d).",
(unsignedlonglong)ni->mft_no, err);
gotoerr;
}
ctx = ntfs_attr_search_ctx_get(base_ni, m);
if (!ctx) {
ntfs_error(vol->mp, "Failed to delete named stream ""because allocating an attribute ""search context failed.");
gotounm_err;
}
err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0,
NULL, 0, ctx);
if (err) {
ntfs_error(vol->mp, "Failed to delete named stream ""because looking up the named $DATA ""attribute in the mft record 0x%llx ""failed (error %d).",
(unsignedlonglong)ni->mft_no, err);
gotoput_err;
}
err = ntfs_attr_record_delete(base_ni, ctx);
if (err) {
ntfs_error(vol->mp, "Failed to delete named stream ""because deleting the named $DATA ""attribute from its mft record 0x%llx ""failed (error %d).",
(unsignedlonglong)ctx->ni->mft_no,
err);
gotoput_err;
}
ntfs_debug("Done (deleted attribute inode).");
put_err:
ntfs_attr_search_ctx_put(ctx);
unm_err:
ntfs_mft_record_unmap(base_ni);
err:
lck_rw_unlock_exclusive(&base_ni->lock);
return err;
}
/*
* We only need to be concerned with the allocated space on disk which
* we need to deallocate and any related resources on disk, which we
* also need to deallocate and/or mark unused. To do this, we map the
* base mft record and iterate over all its attributes and deal with
* each of them in sequence.
*/
err = ntfs_mft_record_map(ni, &m);
if (err) {
ntfs_warning(vol->mp, "Cannot release deleted mft_no 0x%llx ""because the mapping of the base mft record ""failed (error %d). Run chkdsk to recover ""lost resources.",
(unsignedlonglong)ni->mft_no, err);
NVolSetErrors(vol);
return 0;
}
/*
* Make sure the mft record was marked as not in use in
* ntfs_unlink_internal().
*/if (m->flags & MFT_RECORD_IN_USE)
panic("%s(): m->flags & MFT_RECORD_IN_USE\n", __FUNCTION__);
/*
* We will need the mft reference of the base mft record below but we
* are about to change it thus make a note of the old one now.
*/
mref = MK_LE_MREF(ni->mft_no, ni->seq_no);
/*
* Release all clusters allocated to attribute records located in the
* extent mft record.
*/
ntfs_mft_record_free_all(ni, ni, m);
/*
* We are finished with the base mft record, if there is an attribute
* list attribute, we iterate over its entries and each time we
* encounter an extent mft record that we have not done yet, we map it
* and iterate over all its attributes as we did above for the base mft
* record, followed by marking the extent mft record as not in use,
* incrementing its sequence number, and marking it dirty, again as we
* did above for the base mft record. Finally, we add it to our list
* of mft records to deallocate from the $MFT/$BITMAP attribute.
*
* As an optimization, we reuse the attribute list buffer as our list
* of mft records to deallocate from the $MFT/$BITMAP attribute. This
* works because each ATTR_LIST_ENTRY record in the attribute list
* attribute is at least 24 bytes long and we only need to store 8
* bytes for each mft reference in our list of mft records to
* deallocate so we are guaranteed to have enough space in the buffer
* for our needs and we are also guaranteed that we will never
* overwrite part of the attribute list attribute data that we have not
* dealt with yet.
*/
nr_mrefs = 1;
mrefs = &mref;
if (NInoAttrList(ni)) {
ATTR_LIST_ENTRY *entry, *next_entry, *end;
ntfs_inode *eni;
if (!ni->attr_list || ni->attr_list_size < sizeof(leMFT_REF) ||
!ni->attr_list_alloc)
panic("%s(): !ni->attr_list || !ni->attr_list_size || ""!ni->attr_list_alloc\n", __FUNCTION__);
entry = (ATTR_LIST_ENTRY*)ni->attr_list;
mrefs = (leMFT_REF*)entry;
next_entry = (ATTR_LIST_ENTRY*)((u8*)entry +
le16_to_cpu(entry->length));
end = (ATTR_LIST_ENTRY*)(ni->attr_list + ni->attr_list_size);
/*
* Add the mft reference of the base mft record as the first
* element in our list as we have already dealt with it.
*/
*mrefs = mref;
while (entry < end) {
unsigned i;
mref = entry->mft_reference;
for (i = 0; i < nr_mrefs; i++) {
if (mref == mrefs[i])
gotodo_next;
}
/*
* This mft reference has not been encountered before.
* Add it to the list of mft references and free all
* disk storage associated with all the attribute
* records stored in the mft record with this mft
* reference.
*/
mrefs[nr_mrefs++] = mref;
err = ntfs_extent_mft_record_map(ni, le64_to_cpu(mref),
&eni, &m);
if (!err) {
/*
* Release all clusters allocated to attribute
* records located in the extent mft record and
* mark the mft record as not in use.
*
* We need to ensure the mft record is marked
* as in use. It can happen that it is not
* marked in use after a system crash occurs
* whilst a file is being extended.
*/if (m->flags & MFT_RECORD_IN_USE)
ntfs_mft_record_free_all(ni, eni, m);
else {
ntfs_warning(vol->mp, "Extent mft_no ""0x%llx, base mft_no ""0x%llx is marked as ""not in use. Cannot ""release allocated ""clusters. Unmount ""and run chkdsk to ""recover the lost ""clusters.",
(unsignedlonglong)
MREF_LE(mref),
(unsignedlonglong)
ni->mft_no);
NVolSetErrors(vol);
}
/* Unmap the mft record again. */
ntfs_extent_mft_record_unmap(eni);
} else {
ntfs_warning(vol->mp, "Failed to release ""allocated clusters because ""mapping extent mft_no 0x%llx, ""base mft_no 0x%llx failed ""(error %d). Unmount and run ""chkdsk to recover the lost ""clusters.",
(unsignedlonglong)MREF_LE(mref),
(unsignedlonglong)ni->mft_no,
err);
NVolSetErrors(vol);
}
do_next:
entry = next_entry;
next_entry = (ATTR_LIST_ENTRY*)((u8*)entry +
le16_to_cpu(entry->length));
}
}
ntfs_mft_record_unmap(ni);
/*
* Mark the base mft record and all extent mft records (if any) as
* unused in the mft bitmap.
*
* Note that this means that ntfs_inode_reclaim() may run when someone
* else has already reused one of the mft records we are freeing now.
* This is ok because all ntfs_inode_reclaim() does is to do some
* memory freeing. And we have already removed the inode from the
* inode cache thus there are no problems from that point of view
* either.
*/
lck_rw_lock_exclusive(&vol->mftbmp_lock);
mftbmp_ni = vol->mftbmp_ni;
err = vnode_get(mftbmp_ni->vn);
if (err)
ntfs_warning(vol->mp, "Failed to get vnode for $MFT/$BITMAP ""(error %d) thus cannot release mft ""record(s). Run chkdsk to recover the lost ""mft record(s).", err);
else {
lck_rw_lock_shared(&mftbmp_ni->lock);
while (nr_mrefs > 0) {
nr_mrefs--;
err = ntfs_bitmap_clear_bit(mftbmp_ni,
MREF_LE(mrefs[nr_mrefs]));
if (!err) {
/*
* We cleared a bit in the mft bitmap thus we
* need to reflect this in the cached number of
* free mft records.
*/
vol->nr_free_mft_records++;
if (vol->nr_free_mft_records >=
vol->nr_mft_records)
panic("%s(): vol->nr_free_mft_records ""> vol->nr_mft_records""\n", __FUNCTION__);
} else {
ntfs_error(vol->mp, "Failed to free mft_no ""0x%llx (error %d). Run ""chkdsk to recover the lost ""mft record.",
(unsignedlonglong)
MREF_LE(mrefs[nr_mrefs]), err);
NVolSetErrors(vol);
}
}
lck_rw_unlock_shared(&mftbmp_ni->lock);
(void)vnode_put(mftbmp_ni->vn);
}
lck_rw_unlock_exclusive(&vol->mftbmp_lock);
ntfs_debug("Done (deleted base inode).");
return 0;
}
/**
* ntfs_vnop_reclaim - free ntfs specific parts of a vnode so it can be reused
* @a: arguments to reclaim function
*
* @a contains:
* vnode_t a_vp; vnode to be reclaimed
* vfs_context_t a_context;
*
* Reclaim a vnode so it can be used for other purposes.
*
* Note: This is called from reclaim. The vnode has a zero v_iocount and
* v_usecount and vnode_isrecycled() is true.
*
* Return 0 on success and errno on error.
*
* Note the current OS X VFS panic()s the machine if VNOP_RECLAIM() and hence
* ntfs_vnop_reclaim() returns an error.
*/staticintntfs_vnop_reclaim(struct vnop_reclaim_args *a)
{
vnode_t vn = a->a_vp;
ntfs_inode *ni = NTFS_I(vn);
errno_t err;
/* Do not dereference @ni if it is NULL. */
#ifdefDEBUGif (ni)
ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len ""0x%x.", (unsignedlonglong)ni->mft_no,
le32_to_cpu(ni->type), (unsigned)ni->name_len);
else
ntfs_debug("Entering for already reclaimed vnode!");
#endif
vnode_removefsref(vn);
err = ntfs_inode_reclaim(ni);
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
* ntfs_vnop_pathconf - get configurable pathname variables
* @a: arguments to pathconf function
*
* @a contains:
* vnode_t a_vp; vnode for which to return pathconf information
* int a_name; the pathconf variable to be queried
* register_t *a_retval; destination for result of query
* vfs_context_t a_context;
*
* Return POSIX pathconf information applicable to ntfs file system. Some
* @a_name values are intercepted by the VFS in vn_pathconf (pathconf(2) ->
* vn_pathconf() -> VNOP_PATHCONF() -> ntfs_vnop_pathconf()) so we do not
* bother with them.
*
* Return 0 on success and EINVAL if an unsupported @a_name was queried for.
*/staticintntfs_vnop_pathconf(struct vnop_pathconf_args *a)
{
ntfs_inode *ni = NTFS_I(a->a_vp);
ntfs_volume *vol = NTFS_MP(vnode_mount(a->a_vp));
errno_t err = 0;
ntfs_debug("Entering for pathconf variable number %d.", a->a_name);
if (ni) {
lck_rw_lock_shared(&ni->lock);
/*
* Do not allow messing with the inode once it has been
* deleted.
*/if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
lck_rw_unlock_shared(&ni->lock);
ntfs_debug("Directory is deleted.");
return ENOENT;
}
}
switch (a->a_name) {
case_PC_LINK_MAX:
/*
* The maximum file link count. For ntfs, the link count is
* stored in the mft record in the link_count field which is of
* type le16, thus 16 bits. For attribute inodes and
* directories however, no hard links are allowed and thus the
* maximum link count is 1.
*/if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
*a->a_retval = NTFS_MAX_HARD_LINKS;
if (NInoAttr(ni) || S_ISDIR(ni->mode))
*a->a_retval = 1;
break;
case_PC_NAME_MAX:
/*
* The maximum number of bytes in a filename. For ntfs, this
* is stored in the attribute record in the name_length field
* which is of type u8, thus 8 bits.
*/
*a->a_retval = NTFS_MAX_NAME_LEN; /* 255 */break;
case_PC_PATH_MAX:
/*
* The maximum number of bytes in a path name. Ntfs imposes no
* restrictions so use the system limit.
*/
*a->a_retval = PATH_MAX; /* 1024 */break;
case_PC_PIPE_BUF:
/*
* The maximum number of bytes which will be written atomically
* to a pipe, again ntfs imposes no restrictions so use the
* system limit.
*/
*a->a_retval = PIPE_BUF; /* 512 */break;
case_PC_CHOWN_RESTRICTED:
/*
* Non-zero if appropriate privileges are required for the
* chown(2) system call. For ntfs, this is always the case.
*/
*a->a_retval = 200112; /* unistd.h: _POSIX_CHOWN_RESTRICTED */break;
case_PC_NO_TRUNC:
/*
* Non-zero if accessing filenames longer than _POSIX_NAME_MAX
* (which we specified above to be NTFS_MAX_NAME_LEN) generates
* an error. For ntfs, this is always the case.
*/
*a->a_retval = 200112; /* unistd.h: _POSIX_NO_TRUNC */break;
case_PC_NAME_CHARS_MAX:
/*
* The maximum number of characters in a filename. This is
* the same as _PC_NAME_MAX, above.
*/
*a->a_retval = NTFS_MAX_NAME_LEN; /* 255 */break;
case_PC_CASE_SENSITIVE:
/*
* Return 1 if case sensitive and 0 if not. For ntfs, this
* depends on the mount options.
*/if (vol)
*a->a_retval = (NVolCaseSensitive(vol) ? 1 : 0);
else
err = EINVAL;
break;
case_PC_CASE_PRESERVING:
/*
* Return 1 if case preserving and 0 if not. For ntfs, this is
* always 1, i.e. ntfs always preserves case.
*/
*a->a_retval = 1;
break;
case_PC_FILESIZEBITS:
/*
* The number of bits to represent file size. For ntfs, the
* file size is stored in the attribute record in the data_size
* field which is of type sle64, thus 63 bits.
*/
*a->a_retval = 63;
break;
default:
err = EINVAL;
}
if (ni)
lck_rw_unlock_shared(&ni->lock);
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
* ntfs_vnop_allocate -
*/staticintntfs_vnop_allocate(struct vnop_allocate_args *a)
{
errno_t err;
ntfs_debug("Entering.");
// TODO:
(void)nop_allocate(a);
err = ENOTSUP;
ntfs_debug("Done (error %d).", (int)err);
return err;
}
/**
* ntfs_vnop_pagein - read a range of pages into memory
* @a: arguments to pagein function
*
* @a contains:
* vnode_t a_vp; vnode whose data to read into the page range
* upl_t a_pl; page list describing destination page range
* upl_offset_t a_pl_offset; byte offset into page list at which to start
* off_t a_f_offset; byte offset in the vnode at which to start
* size_t a_size; number of bytes to read from the vnode
* int a_flags; flags further describing the pagein request
* vfs_context_t a_context;
*
* Read @a->a_size bytes from the vnode @a-a_vp, starting at byte offset
* @a->a_f_offset into the vnode, into the range of pages specified by the page
* list @a->a_pl, starting at byte offset @a->a_pl_offset into the page list.
*
* The flags in @a->a_flags further describe the pagein request. The following
* pagein flags are currently defined in OS X kernel:
* UPL_IOSYNC - Perform synchronous i/o.
* UPL_NOCOMMIT - Do not commit/abort the page range.
* UPL_NORDAHEAD - Do not perform any speculative read-ahead.
* IO_PASSIVE - This is background i/o so do not throttle other i/o.
*
* For encrypted attributes we abort for now as we do not support them yet.
*
* For non-resident, non-compressed attributes we use cluster_pagein_ext()
* which deals with both normal and multi sector transfer protected attributes.
*
* For resident attributes and non-resident, compressed attributes we read the
* data ourselves by mapping the page list, and in the resident case, mapping
* the mft record, looking up the attribute in it, and copying the requested
* data from the mapped attribute into the page list, then unmapping the mft
* record, whilst for non-resident, compressed attributes, we get the raw inode
* and use it with ntfs_read_compressed() to read and decompress the data into
* our mapped page list. We then unmap the page list and finally, if
* UPL_NOCOMMIT is not specified, we commit (success) or abort (error) the page
* range.
*
* Return 0 on success and errno on error.
*
* Note the pages in the page list are marked busy on entry and the busy bit is
* cleared when we commit the page range. Thus it is perfectly safe for us to
* fill the pages with encrypted or mst protected data and to decrypt or mst
* deprotect in place before committing the page range.
*
* Adapted from cluster_pagein_ext().
*/
static int ntfs_vnop_pagein(struct vnop_pagein_args *a)
{
	ntfs_inode *ni = NTFS_I(a->a_vp);
	ntfs_inode *base_ni;
	int err;

	/* Without an ntfs inode there is nothing to read from, give up. */
	if (!ni) {
		ntfs_debug("Entered with NULL ntfs_inode, aborting.");
		if (!(a->a_flags & UPL_NOCOMMIT) && a->a_pl)
			ubc_upl_abort_range(a->a_pl, a->a_pl_offset, a->a_size,
					UPL_ABORT_FREE_ON_EMPTY |
					UPL_ABORT_ERROR);
		return EINVAL;
	}
	/* Attribute inodes keep their times in the base inode. */
	base_ni = NInoAttr(ni) ? ni->base_ni : ni;
	ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%llx, "
			"pagein flags 0x%x, page list offset 0x%llx.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)a->a_f_offset,
			(unsigned long long)a->a_size, a->a_flags,
			(unsigned long long)a->a_pl_offset);
	err = ntfs_pagein(ni, a->a_f_offset, a->a_size, a->a_pl,
			a->a_pl_offset, a->a_flags);
	if (err)
		return err;
	/*
	 * The read succeeded so the last_access_time (atime) is a candidate
	 * for an update.  It is only touched when this is the base ntfs inode
	 * or a named stream (this is what HFS+ does, too).
	 *
	 * No update happens on read only volumes, on volumes mounted with the
	 * noatime option, or for symbolic links.
	 */
	if (NVolReadOnly(ni->vol) ||
			(vfs_flags(ni->vol->mp) & MNT_NOATIME) ||
			S_ISLNK(base_ni->mode))
		return err;
	if (ni != base_ni && ni->type != AT_DATA)
		return err;
	/*
	 * The core system files are left alone, too, with the exception of
	 * the root directory.  Which mft records count as core system files
	 * depends on the major version of the volume.
	 */
	if (base_ni != ni->vol->root_ni) {
		if (ni->vol->major_ver > 1) {
			if (base_ni->mft_no <= FILE_Extend)
				return err;
		} else {
			if (base_ni->mft_no <= FILE_UpCase)
				return err;
		}
	}
	base_ni->last_access_time = ntfs_utc_current_time();
	NInoSetDirtyTimes(base_ni);
	return err;
}
// TODO: Move to ntfs_page.[hc].
/**
 * ntfs_mst_pageout - page out a range of an mst protected attribute
 * @ni:		ntfs inode of the mst protected attribute to write to
 * @upl:	page list holding the data to write
 * @upl_ofs:	byte offset into @upl at which the data to write begins
 * @size:	number of bytes to write
 * @attr_ofs:	byte offset into the attribute at which to begin writing
 * @attr_size:	size of the attribute in bytes (the write is clipped to it)
 * @flags:	pageout flags (see ntfs_vnop_pageout() for details)
 *
 * Apply the multi sector transfer fixups to all ntfs records in the range to
 * be written, write the range out synchronously with cluster_pageout_ext()
 * and, unless UPL_NOCOMMIT is set in @flags, commit (success) or abort (error)
 * the page range.
 *
 * If anything goes wrong after fixups have been applied they are removed again
 * before returning so the cached records are left in their usable form.
 *
 * @attr_ofs, @size and @upl_ofs must be page aligned (EINVAL otherwise) and
 * @attr_ofs and @size must also be aligned to the ntfs record size of @ni (we
 * panic() otherwise).  Only index allocation attributes are supported, any
 * other attribute type causes a panic().
 *
 * Return 0 on success and errno on error.
 */
static int ntfs_mst_pageout(ntfs_inode *ni, upl_t upl, upl_offset_t upl_ofs,
		unsigned size, s64 attr_ofs, s64 attr_size, int flags)
{
	ntfs_volume *vol = ni->vol;
	u8 *kaddr;
	kern_return_t kerr;
	unsigned rec_size, rec_shift, nr_recs, i;
	int err;
	NTFS_RECORD_TYPE magic = 0;
	BOOL do_commit;

	/* Remember the caller's wish as we force UPL_NOCOMMIT on later. */
	do_commit = !(flags & UPL_NOCOMMIT);
	if (ni->type == AT_INDEX_ALLOCATION)
		magic = magic_INDX;
	else
		panic("%s(): Unknown mst protected inode 0x%llx, type 0x%x, "
				"name_len 0x%x.", __FUNCTION__,
				(unsigned long long)ni->mft_no,
				(unsigned)le32_to_cpu(ni->type),
				(unsigned)ni->name_len);
	ntfs_debug("Entering for mft_no 0x%llx, page list offset 0x%llx, size "
			"0x%x, offset 0x%llx, pageout flags 0x%x, magic is "
			"0x%x.", (unsigned long long)ni->mft_no,
			(unsigned long long)upl_ofs, size,
			(unsigned long long)attr_ofs, flags,
			(unsigned)le32_to_cpu(magic));
	if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 ||
			size & PAGE_MASK || upl_ofs & PAGE_MASK) {
		err = EINVAL;
		goto err;
	}
	if (!NInoMstProtected(ni))
		panic("%s(): Called for non-mst protected attribute.\n",
				__FUNCTION__);
	if (!NInoNonResident(ni))
		panic("%s(): Resident mst protected attribute.\n",
				__FUNCTION__);
	rec_size = ni->block_size;
	if (attr_ofs & (rec_size - 1) || size & (rec_size - 1))
		panic("%s(): Write not aligned to NTFS record boundary.\n",
				__FUNCTION__);
	rec_shift = ni->block_size_shift;
	/* Clip the number of records to the size of the attribute. */
	nr_recs = size >> rec_shift;
	if (attr_ofs + size > attr_size) {
		unsigned to_write;

		/* Abort any pages outside the end of the attribute. */
		to_write = attr_size - attr_ofs;
		nr_recs = to_write >> rec_shift;
		to_write = (to_write + PAGE_MASK) & ~PAGE_MASK;
		if (size != to_write) {
			if (size < to_write)
				panic("%s(): size less than to_write.\n",
						__FUNCTION__);
			ntfs_debug("Truncating write past end of attribute.");
			if (do_commit)
				ubc_upl_abort_range(upl, upl_ofs + to_write,
						size - to_write,
						UPL_ABORT_FREE_ON_EMPTY);
			size = to_write;
		}
	}
	if (!nr_recs)
		panic("%s(): NTFS record size greater than write size.\n",
				__FUNCTION__);
	/*
	 * Need to apply the mst fixups and abort on errors.  To apply the
	 * fixups need to map the page list so we can access its contents.
	 */
	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(vol->mp, "ubc_upl_map() failed (error %d).",
				(int)kerr);
		err = EIO;
		goto err;
	}
	/*
	 * Loop over the records in the page list and for each apply the mst
	 * fixups.  On any fixup errors, remove all the applied fixups and
	 * abort the write completely.
	 *
	 * Note @kaddr is the address of the start of the mapped page list
	 * whilst the records we are writing begin @upl_ofs bytes into it, thus
	 * @upl_ofs has to be added when locating a record (this is the same as
	 * what ntfs_vnop_pageout() does for resident attributes).
	 */
	for (i = 0; i < nr_recs; i++) {
		NTFS_RECORD *rec = (NTFS_RECORD*)(kaddr + upl_ofs +
				(i << rec_shift));
		if (__ntfs_is_magic(rec->magic, magic)) {
			err = ntfs_mst_fixup_pre_write(rec, rec_size);
			if (err) {
				ntfs_error(vol->mp, "Failed to apply mst "
						"fixups (mft_no 0x%llx, type "
						"0x%x, offset 0x%llx).",
						(unsigned long long)ni->mft_no,
						(unsigned)le32_to_cpu(ni->type),
						(unsigned long long)attr_ofs +
						(i << rec_shift));
				/* Records 0 to @i - 1 have fixups applied. */
				goto mst_err;
			}
		}
	}
	/* Unmap the page list again so we can call cluster_pageout_ext(). */
	// FIXME: Can we leave the page list mapped throughout the
	// cluster_pageout_ext() call?  That would be a lot more efficient and
	// simplify error handling.
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
		err = EIO;
		goto mst_err;
	}
	/*
	 * We need the write to be synchronous so we do not leave the metadata
	 * with the fixups applied for too long.
	 *
	 * We also need to set the no commit flag so we can still recover from
	 * errors by removing the fixups.
	 */
	flags |= UPL_IOSYNC | UPL_NOCOMMIT;
	/*
	 * On success the fixups will have been removed by the
	 * ntfs_cluster_iodone() callback.
	 */
	err = cluster_pageout_ext(ni->vn, upl, upl_ofs, attr_ofs, size,
			attr_size, flags, ntfs_cluster_iodone, NULL);
	if (!err) {
		if (do_commit) {
			/* Commit the page range we wrote out. */
			ubc_upl_commit_range(upl, upl_ofs, size,
					UPL_COMMIT_FREE_ON_EMPTY |
					UPL_COMMIT_CLEAR_DIRTY);
		}
		ntfs_debug("Done.");
		return err;
	}
	ntfs_error(vol->mp, "Failed (cluster_pageout_ext() returned error "
			"%d).", err);
	/*
	 * We may have some records left with applied fixups thus remove them
	 * again.  It does not matter if it is done twice as this is an error
	 * code path and the only side effect is a little slow down.
	 *
	 * At this point @i equals @nr_recs so all records get processed.
	 */
	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(vol->mp, "ubc_upl_map() failed (error %d), cannot "
				"remove mst fixups. Unmount and run chkdsk.",
				(int)kerr);
		NVolSetErrors(vol);
		goto err;
	}
mst_err:
	/* Remove the applied fixups, unmap the page list and abort. */
	while (i > 0) {
		NTFS_RECORD *rec = (NTFS_RECORD*)(kaddr + upl_ofs +
				(--i << rec_shift));
		if (__ntfs_is_magic(rec->magic, magic))
			ntfs_mst_fixup_post_write(rec);
	}
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS)
		ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
err:
	/* Hand the pages back unmodified unless the caller will do so. */
	if (do_commit)
		ubc_upl_abort_range(upl, upl_ofs, size,
				UPL_ABORT_FREE_ON_EMPTY);
	return err;
}
/**
* ntfs_vnop_pageout - write a range of pages to storage
* @a: arguments to pageout function
*
* @a contains:
* vnode_t a_vp; vnode whose data to write from the page range
* upl_t a_pl; page list describing the source page range
* upl_offset_t a_pl_offset; byte offset into page list at which to start
* off_t a_f_offset; byte offset in the vnode at which to start
* size_t a_size; number of bytes to write to the vnode
* int a_flags; flags further describing the pageout request
* vfs_context_t a_context;
*
* If UPL_NESTED_PAGEOUT is set in the flags (a->a_flags) we are called from
* cluster_io() which is in turn called from cluster_write() which is in turn
* called from ntfs_vnop_write() which means we are already holding the inode
* lock (@ni->lock). Alternatively cluster_io() can be called from
* cluster_push() which can be called from various places in NTFS.
*
* Write @a->a_size bytes to the vnode @a-a_vp, starting at byte offset
* @a->a_f_offset into the vnode, from the range of pages specified by the page
* list @a->a_pl, starting at byte offset @a->a_pl_offset into the page list.
*
* The flags in @a->a_flags further describe the pageout request. The
* following pageout flags are currently defined in OS X kernel:
* UPL_IOSYNC - Perform synchronous i/o.
* UPL_NOCOMMIT - Do not commit/abort the page range.
* UPL_KEEPCACHED - Data is already cached in memory, keep it cached.
* IO_PASSIVE - This is background i/o so do not throttle other i/o.
*
* For encrypted attributes we abort for now as we do not support them yet.
*
* For non-resident, non-compressed attributes we use cluster_pageout_ext()
* which deals with both normal and multi sector transfer protected attributes.
*
* In the case of multi sector transfer protected attributes we apply the
* fixups and then submit the i/o synchronously by setting the UPL_IOSYNC flag.
*
* For resident attributes and non-resident, compressed attributes we write the
* data ourselves by mapping the page list, and in the resident case, mapping
* the mft record, looking up the attribute in it, and copying the data to the
* mapped attribute from the page list, then unmapping the mft record, whilst
* for non-resident, compressed attributes, we get the raw inode and use it
* with ntfs_write_compressed() to compress and write the data from our mapped
* page list. We then unmap the page list and finally, if UPL_NOCOMMIT is not
* specified, we commit (success) or abort (error) the page range.
*
* Return 0 on success and errno on error.
*
* Note the pages in the page list are marked busy on entry and the busy bit is
* cleared when we commit the page range. Thus it is perfectly safe for us to
* apply the mst fixups and write out the data which will then also take away
* the fixups again before committing the page range.
*
* Adapted from cluster_pageout_ext().
*/staticintntfs_vnop_pageout(struct vnop_pageout_args *a)
{
s64 attr_ofs, attr_size, alloc_size, bytes;
ntfs_inode *base_ni, *ni = NTFS_I(a->a_vp);
upl_t upl = a->a_pl;
ntfs_volume *vol;
u8 *kaddr;
upl_offset_t upl_ofs = a->a_pl_offset;
kern_return_t kerr;
unsigned to_write, size = a->a_size;
int err, flags = a->a_flags;
lck_rw_type_t lock_type = LCK_RW_TYPE_SHARED;
BOOL locked = FALSE;
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
if (!(flags & UPL_NOCOMMIT) && upl)
ubc_upl_abort_range(upl, upl_ofs, size,
UPL_ABORT_FREE_ON_EMPTY);
return EINVAL;
}
vol = ni->vol;
attr_ofs = a->a_f_offset;
base_ni = ni;
if (NInoAttr(ni))
base_ni = ni->base_ni;
ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%x, ""pageout flags 0x%x, page list offset 0x%llx.",
(unsignedlonglong)ni->mft_no,
(unsignedlonglong)attr_ofs, size, flags,
(unsignedlonglong)upl_ofs);
/*
* If the caller did not specify any i/o, then we are done. We cannot
* issue an abort because we do not have a upl or we do not know its
* size.
*/if (!upl || size <= 0) {
ntfs_error(vol->mp, "NULL page list passed in or request size ""is below zero (error EINVAL).");
return EINVAL;
}
if (S_ISDIR(ni->mode)) {
ntfs_error(vol->mp, "Called for directory vnode.");
err = EISDIR;
gotoerr;
}
if (NVolReadOnly(vol)) {
err = EROFS;
gotoerr;
}
/*
* Need to clip i/o at maximum file size of 2^63-1 bytes in case
* someone creates a sparse file and is playing silly with seek + write
* note we only need to check for this for sparse files as non-sparse
* files can never reach 2^63-1 because that is also the maximum space
* on the volume thus the write would simply get an ENOSPC when the
* volume is full.
*/if (NInoSparse(ni) && (u64)attr_ofs + size > NTFS_MAX_ATTRIBUTE_SIZE) {
err = EFBIG;
gotoerr;
}
#if 1 // TODO: Removethiswhensparsesupportisdone...
if (NInoSparse(ni)) {
err = ENOTSUP;
gotoerr;
}
#endif/*
* Protect against changes in initialized_size and thus against
* truncation also but only if the VFS is not calling back into the
* NTFS driver after the NTFS driver called it in which case we are
* already holding the lock.
*
* There is a complication in that the UPL is already created by the
* caller thus us taking the lock here is a case of lock reversal wrt
* the UPL keeping the pages locked for exclusive access thus we can
* deadlock with a concurrent file create for example when it holds the
* ntfs inode lock @ni->lock for exclusive access on the index vnode of
* the parent directory and then calls ntfs_page_map() to map a page
* from the index as we already hold the same UPL that ntfs_page_map()
* will try to get thus if we go to sleep on the ntfs inode lock that
* is held exclusive by the create code path we would now deadlock.
*
* To avoid the deadlock, we do a try-lock for the ntfs inode lock and
* if that fails we simply abort the pages returning them to the VM
* without modification thus they should remain dirty and they should
* be paged out at a later point in time.
*
* We then return ENXIO to indicate that this is a temporary failure to
* the caller.
*
* FIXME: There is a complication and that is that we really need to
* hole the inode lock for writing if we are writing to a hole and/or
* writing past the initialized size as we would then be modifying the
* initialized_size. But if UPL_NESTED_PAGEOUT is set we have no idea
* whether the caller is holding the lock for write or not and we
* cannot safely drop/retake the lock in any case... For now we ignore
* the problem and just emit a warning in this case.
*/if (!(flags & UPL_NESTED_PAGEOUT)) {
if (NInoSparse(ni))
lock_type = LCK_RW_TYPE_EXCLUSIVE;
if (!lck_rw_try_lock(&ni->lock, lock_type)) {
ntfs_debug("Failed to take ni->lock for %s for mft_no ""0x%llx, type 0x%x. Aborting with ""ENXIO to avoid deadlock.",
(lock_type == LCK_RW_TYPE_SHARED) ?
"reading" : "writing",
(unsignedlonglong)ni->mft_no,
(unsigned)le32_to_cpu(ni->type));
if (!(flags & UPL_NOCOMMIT))
ubc_upl_abort_range(upl, upl_ofs, size,
UPL_ABORT_FREE_ON_EMPTY);
return ENXIO;
}
locked = TRUE;
} else {
if (NInoSparse(ni))
ntfs_warning(vol->mp, "flags & UPL_NESTED_PAGEOUT && ""NINoSparse(ni), need inode lock ""exclusive but caller holds the lock ""so we do not know if it is exclusive ""or not.");
}
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
err = ENOENT;
gotoerr;
}
retry_pageout:
/*
* TODO: This check may no longer be necessary now that we lock against
* changes in initialized size and thus truncation... Revisit this
* issue when the write code has been written and remove the check if
* appropriate simply using ubc_getsize(vn); without the size_lock.
*/
lck_spin_lock(&ni->size_lock);
attr_size = ubc_getsize(a->a_vp);
if (attr_size > ni->data_size)
attr_size = ni->data_size;
/*
* Cannot pageout to a negative offset or if we are starting beyond the
* end of the attribute or if the attribute offset is not page aligned
* or the size requested is not a multiple of PAGE_SIZE.
*/if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 ||
size & PAGE_MASK || upl_ofs & PAGE_MASK) {
lck_spin_unlock(&ni->size_lock);
err = EINVAL;
gotoerr;
}
// TODO: HERE:
// FIXME: For now abort writes beyond initialized size...
// TODO: This causes a problem and that is in ntfs_vnop_write() we only
// update the initialized size after calling cluster_write() which
// means we cannot zero up to the initialized size here or we could
// trample over data that has just been written out. Also this causes
// our check here to trigger even though we are not really outside the
// initialized size at all and in fact this page out may be part of the
// write itself so it has to succeed. But on the other hand if this is
// a genuine mmap()-based write we do need to do the zeroing. We need
// to somehow be able to tell the difference between the two...
// If the initialized size equals attr_ofs then we can safely perform
// the write and then update the initialized size to attr_ofs + size
// but need to be careful to update the data size appropriately and
// also need to make sure not to exceed the end of the write otherwise
// we would cause a file extension here when we should not do so. In
// fact if this is not part of an extending write then we should not
// modify the data size and only the initialized size instead.
if (attr_ofs + size > ni->initialized_size && ni->initialized_size !=
ni->data_size) {
lck_spin_unlock(&ni->size_lock);
ntfs_error(vol->mp, "Writing beyond the initialized size of ""an attribute is not implemented yet.");
err = ENOTSUP;
gotoerr;
}
alloc_size = ni->allocated_size;
lck_spin_unlock(&ni->size_lock);
/*
* If this is a sparse attribute we need to fill any holes overlapping
* the write. We can skip resident attributes as they cannot have
* sparse regions.
*
* As allocated size goes in units of clusters we need to round down
* the start offset to the nearest cluster boundary and we need to
* round up the end offset to the next cluster boundary.
*/if (NInoSparse(ni) && NInoNonResident(ni) && ni->type == AT_DATA) {
s64 aligned_end, new_end;
aligned_end = (attr_ofs + size + vol->cluster_size_mask) &
~vol->cluster_size_mask;
/*
* Only need to instantiate holes up to the allocated size
* itself. Everything else would be an extension which is not
* allowed from VNOP_PAGEOUT().
*/if (aligned_end > alloc_size)
aligned_end = alloc_size;
err = ntfs_attr_instantiate_holes(ni,
attr_ofs & ~vol->cluster_size_mask,
aligned_end, &new_end, TRUE);
if (err) {
ntfs_error(vol->mp, "Cannot perform pageout of mft_no ""0x%llx because instantiation of ""sparse regions failed (error %d).",
(unsignedlonglong)ni->mft_no, err);
gotoerr;
}
/* The instantiation may not be partial. */if (new_end < aligned_end)
panic("%s(): new_end < aligned_end\n", __FUNCTION__);
}
/*
* Only $DATA attributes can be encrypted/compressed. Index root can
* have the flags set but this means to create compressed/encrypted
* files, not that the attribute is compressed/encrypted. Note we need
* to check for AT_INDEX_ALLOCATION since this is the type of directory
* index inodes.
*/if (ni->type != AT_INDEX_ALLOCATION) {
/* TODO: Deny access to encrypted attributes, just like NT4. */if (NInoEncrypted(ni)) {
if (ni->type != AT_DATA)
panic("%s(): Encrypted non-data attribute.\n",
__FUNCTION__);
ntfs_warning(vol->mp, "Denying write to encrypted ""attribute (EACCES).");
err = EACCES;
gotoerr;
}
/* Compressed data streams need special handling. */if (NInoNonResident(ni) && NInoCompressed(ni) && !NInoRaw(ni)) {
if (ni->type != AT_DATA)
panic("%s(): Compressed non-data attribute.\n",
__FUNCTION__);
gotocompressed;
}
}
/* NInoNonResident() == NInoIndexAllocPresent() */if (NInoNonResident(ni)) {
if (NInoMstProtected(ni))
err = ntfs_mst_pageout(ni, upl, upl_ofs, size,
attr_ofs, attr_size, flags);
else {
err = cluster_pageout_ext(a->a_vp, upl, upl_ofs,
attr_ofs, size, attr_size, flags, NULL,
NULL);
if (!err)
ntfs_debug("Done (cluster_pageout_ext()).");
else
ntfs_error(vol->mp, "Failed ""(cluster_pageout_ext(), ""error %d).", err);
}
gotodone;
}
compressed:
/* The attribute is resident and/or compressed. */
to_write = size;
bytes = attr_size - attr_ofs;
if (to_write > bytes)
to_write = bytes;
/*
* Calculate the number of bytes available in the attribute starting at
* offset @attr_ofs up to a maximum of the number of bytes to be
* written rounded up to a multiple of the system page size.
*/
bytes = (to_write + PAGE_MASK) & ~PAGE_MASK;
/* Abort any pages outside the end of the attribute. */if (size > bytes && !(flags & UPL_NOCOMMIT)) {
ubc_upl_abort_range(upl, upl_ofs + bytes, size - bytes,
UPL_ABORT_FREE_ON_EMPTY);
/* Update @size. */
size = bytes;
}
/* To access the page list contents, we need to map the page list. */
kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
if (kerr != KERN_SUCCESS) {
ntfs_error(vol->mp, "ubc_upl_map() failed (error %d).",
(int)kerr);
err = EIO;
gotoerr;
}
if (!NInoNonResident(ni)) {
/*
* Write the data from the page list into the resident
* attribute in its mft record.
*/
err = ntfs_resident_attr_write(ni, kaddr + upl_ofs, to_write,
attr_ofs);
// TODO: If !err and synchronous i/o, write the mft record now.
// This should probably happen in ntfs_resident_attr_write().
if (err && err != EAGAIN)
ntfs_error(vol->mp, "ntfs_resident_attr_write() ""failed (error %d).", err);
} elseif (NInoCompressed(ni)) {
ntfs_error(vol->mp, "Writing to compressed files is not ""implemented yet, sorry.");
err = ENOTSUP;
#if 0
ntfs_inode *raw_ni;
int ioflags;
/*
* Get the raw inode and lock it for writing to protect against
* concurrent readers and writers as the compressed data is
* invalid whilst a write is in progress.
*/
err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_EXCLUSIVE, &raw_ni);
if (err)
ntfs_error(vol->mp, "Failed to get raw inode (error ""%d).", err);
else {
if (!NInoRaw(raw_ni))
panic("%s(): Requested raw inode but got ""non-raw one.\n", __FUNCTION__);
ioflags = 0;
if (vnode_isnocache(ni->vn) ||
vnode_isnocache(raw_ni->vn))
ioflags |= IO_NOCACHE;
if (vnode_isnoreadahead(ni->vn) ||
vnode_isnoreadahead(raw_ni->vn))
ioflags |= IO_RAOFF;
err = ntfs_write_compressed(ni, raw_ni, attr_ofs, size,
kaddr + upl_ofs, NULL, ioflags);
if (err)
ntfs_error(vol->mp, "ntfs_write_compressed() ""failed (error %d).", err);
lck_rw_unlock_exclusive(&raw_ni->lock);
(void)vnode_put(raw_ni->vn);
}
#endif
} else {
/*
* The attribute was converted to non-resident under our nose
* we need to retry the pageout.
*
* TODO: This may no longer be possible to happen now that we
* lock against changes in initialized size and thus
* truncation... Revisit this issue when the write code has
* been finished and replace this with a panic().
*/
err = EAGAIN;
}
kerr = ubc_upl_unmap(upl);
if (kerr != KERN_SUCCESS) {
ntfs_error(vol->mp, "ubc_upl_unmap() failed (error %d).",
(int)kerr);
if (!err)
err = EIO;
}
if (!err) {
if (!(flags & UPL_NOCOMMIT)) {
/* Commit the page range we wrote out. */
ubc_upl_commit_range(upl, upl_ofs, size,
UPL_COMMIT_FREE_ON_EMPTY);
}
// TODO: If we wrote anything at all we have to clear the
// setuid and setgid bits as a precaution against tampering
// (see xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_pageout()).
ntfs_debug("Done (%s).", !NInoNonResident(ni) ?
"ntfs_resident_attr_write()" :
"ntfs_write_compressed()");
} else/* if (err) */ {
/*
* If the attribute was converted to non-resident under our
* nose, retry the pageout.
*
* TODO: This may no longer be possible to happen now that we
* lock against changes in initialized size and thus
* truncation... Revisit this issue when the write code has
* been finished and remove the check and goto if appropriate.
*/if (err == EAGAIN)
gotoretry_pageout;
err:
if (!(flags & UPL_NOCOMMIT))
ubc_upl_abort_range(upl, upl_ofs, size,
UPL_ABORT_FREE_ON_EMPTY);
ntfs_error(vol->mp, "Failed (error %d).", err);
}
done:
// TODO: If we wrote anything at all we have to clear the setuid and
// setgid bits as a precaution against tampering (see
// xnu/bsd/hfs/hfs_readwrite.c::hfs_vnop_pageout()).
/*
* If this is not a directory or it is an encrypted directory, set the
* needs archiving bit except for the core system files.
*/if (!err && (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni))) {
BOOL need_set_archive_bit = TRUE;
if (vol->major_ver > 1) {
if (base_ni->mft_no <= FILE_Extend)
need_set_archive_bit = FALSE;
} else {
if (base_ni->mft_no <= FILE_UpCase)
need_set_archive_bit = FALSE;
}
if (need_set_archive_bit) {
base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
NInoSetDirtyFileAttributes(base_ni);
}
}
/*
* Update the last_data_change_time (mtime) and last_mft_change_time
* (ctime) on the base ntfs inode @base_ni but not on the core system
* files. However do set it on the root directory.
*
* Do not update the times on symbolic links.
*/if (!err && !S_ISLNK(base_ni->mode)) {
BOOL need_update_time = TRUE;
if (vol->major_ver > 1) {
if (base_ni->mft_no <= FILE_Extend &&
base_ni != vol->root_ni)
need_update_time = FALSE;
} else {
if (base_ni->mft_no <= FILE_UpCase &&
base_ni != vol->root_ni)
need_update_time = FALSE;
}
if (need_update_time) {
base_ni->last_mft_change_time =
base_ni->last_data_change_time =
ntfs_utc_current_time();
NInoSetDirtyTimes(base_ni);
}
}
if (locked) {
if (lock_type == LCK_RW_TYPE_SHARED)
lck_rw_unlock_shared(&ni->lock);
else
lck_rw_unlock_exclusive(&ni->lock);
}
return err;
}
/**
 * ntfs_vnop_searchfs - placeholder for the searchfs vnode operation
 * @a:		arguments to searchfs function
 *
 * Searching is not implemented by the NTFS driver itself yet.  The request is
 * simply handed to err_searchfs() and whatever that returns is passed back to
 * the caller.
 *
 * Return the error code obtained from err_searchfs().
 */
static int ntfs_vnop_searchfs(struct vnop_searchfs_args *a)
{
	errno_t result;

	ntfs_debug("Entering.");
	// TODO: Implement searchfs instead of forwarding to err_searchfs().
	result = err_searchfs(a);
	ntfs_debug("Done (error %d).", (int)result);
	return result;
}
/**
* ntfs_vnop_getxattr - get the data of an extended attribute of an ntfs inode
* @a: arguments to getxattr function
*
* @a contains:
* vnode_t a_vp; vnode whose extended attribute to get
* char *a_name; name of extended attribute to get in utf8
* uio_t a_uio; destination in which to return the extended attribute
* size_t *a_size; size of the extended attribute in bytes
* int a_options; flags controlling how the attribute is obtained
* vfs_context_t a_context;
*
* Get the named stream with the name @a->a_name (we map named streams 1:1 with
* extended attributes for NTFS as the NTFS native EAs are useless) contained
* in the vnode @a->a_vp and return its data in the destination specified by
* @a->a_uio.
*
* If there was not enough space to return the whole extended attribute in the
* destination @a->a_uio we return error ERANGE. The only exception to this is
* the resource fork (@a->a_name is XATTR_RESOURCEFORK_NAME) for which we just
* return up to uio_resid(@a->a_uio) bytes (or up to the end of the resource
* fork if that is smaller).
*
* Note that uio_offset(@a->a_uio) must be zero except for the resource fork
* where it can specify the offset into the resource fork at which to begin
* returning the data.
*
* If @a->a_uio is NULL, do not return the data of the attribute and instead
* return the current data size of the named stream in *@a->a_size. Note that
* when @a->a_uio is not NULL @a->a_size is ignored as the size of the named
* stream is implicitly returned in the @a->a_uio and it can be obtained by
* taking the original buffer size and subtracting uio_resid(@a->a_uio) from
* it.
*
* The flags in @a->a_options control how the attribute is obtained. The
* following flags are currently defined in OS X kernel:
* XATTR_NOFOLLOW - Do not follow symbolic links.
* XATTR_CREATE - Set the value, fail if already exists (setxattr only).
* XATTR_REPLACE - Set the value, fail if does not exist (setxattr only).
* XATTR_NOSECURITY- Bypass authorization checking.
* XATTR_NODEFAULT - Bypass default extended attribute file ('._' file).
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_getxattr(struct vnop_getxattr_args *a)
{
s64 size;
user_ssize_t start_count;
off_t start_ofs;
ntfs_inode *ani, *ni = NTFS_I(a->a_vp);
constchar *name = a->a_name;
uio_t uio = a->a_uio;
ntfs_volume *vol;
ntfschar *ntfs_name;
size_t ntfs_name_size;
signed ntfs_name_len;
errno_t err;
ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN];
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = ni->vol;
/* Check for invalid names. */if (!name || name[0] == '\0')
return EINVAL;
start_ofs = uio_offset(uio);
start_count = uio_resid(uio);
ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, ""offset 0x%llx, size 0x%llx, options 0x%x.",
(unsignedlonglong)ni->mft_no, name, start_ofs,
start_count, a->a_options);
lck_rw_lock_shared(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
err = ENOENT;
gotoerr;
}
/*
* Only regular files, directories, and symbolic links can have
* extended attributes. (Specifically named streams cannot have them.)
*
* Thus the check is for attribute inodes as all base inodes are
* allowed. Raw inodes are also attribute inodes so they are excluded
* automatically, too.
*/if (NInoAttr(ni)) {
ntfs_debug("Mft_no 0x%llx is an attribute inode.",
(unsignedlonglong)ni->mft_no);
err = EPERM;
gotoerr;
}
/*
* First of all deal with requests for the Finder info as that is
* special because we cache it in the base ntfs inode @ni and we only
* want to return it if the Finder info is non-zero. This is what HFS
* does, too.
*
* Thus we need to check the status of the cache in the ntfs inode
* first and if that it valid we can use it to check the content of the
* Finder info for being zero. And if it is not valid then we need to
* read it into the cache in the ntfs inode and then we can check the
* Finder info in the cache for being zero. In fact we do this the
* other way round, i.e. if the Finder info cache is not valid we read
* the Finder info into the cache first and then the cache is
* definitely valid thus we can check the Finder info for being
* non-zero and the Finder info data if so.
*
* A further complication is in the event of symbolic links where we do
* not return the type and creator and instead return zero for them as
* that is what HFS+ does, too.
*
* FIXME: This comparison is case sensitive.
*/if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) {
FINDER_INFO fi;
if (!NInoValidFinderInfo(ni)) {
if (!lck_rw_lock_shared_to_exclusive(&ni->lock)) {
lck_rw_lock_exclusive(&ni->lock);
if (NInoDeleted(ni)) {
cache_purge(ni->vn);
lck_rw_unlock_exclusive(&ni->lock);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)
ni->mft_no);
return ENOENT;
}
}
/*
* Load the AFP_AfpInfo stream and initialize the
* backup time and Finder info (if they are not already
* valid).
*/
err = ntfs_inode_afpinfo_read(ni);
if (err) {
ntfs_error(vol->mp, "Failed to obtain AfpInfo ""for mft_no 0x%llx (error %d).",
(unsignedlonglong)ni->mft_no,
err);
lck_rw_unlock_exclusive(&ni->lock);
return err;
}
lck_rw_lock_exclusive_to_shared(&ni->lock);
if (!NInoValidFinderInfo(ni))
panic("%s(): !NInoValidFinderInfo(ni)\n",
__FUNCTION__);
}
/*
* Make a copy of the Finder info and mask out the hidden bit
* if this is the root directory and the type and creator if
* this is a symbolic link.
*/
memcpy(&fi, &ni->finder_info, sizeof(fi));
if (ni == vol->root_ni)
fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
if (S_ISLNK(ni->mode)) {
fi.type = 0;
fi.creator = 0;
}
/* If the Finder info is zero, pretend it does not exist. */if (!bcmp(&fi, &ntfs_empty_finder_info,
sizeof(ni->finder_info))) {
ntfs_debug("Mft_no 0x%llx has zero Finder info, ""returning ENOATTR.",
(unsignedlonglong)ni->mft_no);
err = ENOATTR;
gotoerr;
}
/* The Finder info is not zero, return it. */if (!uio) {
*a->a_size = sizeof(FINDER_INFO);
err = 0;
} elseif (start_ofs)
err = EINVAL;
elseif (uio_resid(uio) < (user_ssize_t)sizeof(FINDER_INFO))
err = ERANGE;
else {
err = uiomove((caddr_t)&fi, sizeof(fi), uio);
if (err)
ntfs_error(vol->mp, "uiomove() failed (error ""%d).", err);
}
gotoerr;
}
/*
* Now deal with requests for the resource fork as that is special
* because on one hand we need to translate its name from
* XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert
* the utf8 name @name to Unicode and on the other hand the offset
* @start_ofs may be non-zero and the read may be only from a partial
* region of the resource fork.
*
* FIXME: This comparison is case sensitive.
*/if (!bcmp(name, XATTR_RESOURCEFORK_NAME,
sizeof(XATTR_RESOURCEFORK_NAME))) {
ntfs_name = NTFS_SFM_RESOURCEFORK_NAME;
ntfs_name_len = 12;
} else {
/*
* The request is not for the resource fork (nor for the Finder
* info). This means that the offset @start_ofs must be zero.
*/if (start_ofs) {
err = EINVAL;
gotoerr;
}
/* Convert the requested name from utf8 to Unicode. */
ntfs_name = ntfs_name_buf;
ntfs_name_size = sizeof(ntfs_name_buf);
ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name),
&ntfs_name, &ntfs_name_size);
if (ntfs_name_len < 0) {
err = -ntfs_name_len;
if (err == ENAMETOOLONG)
ntfs_debug("Failed (name is too long).");
else
ntfs_error(vol->mp, "Failed to convert name to ""Unicode (error %d).", err);
gotoerr;
}
/*
* If this is one of the SFM named streams, skip it, as they
* contain effectively metadata information so should not be
* exposed directly.
*/if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) {
ntfs_debug("Not allowing access to protected SFM name ""(returning EINVAL).");
err = EINVAL;
gotoerr;
}
}
/*
* We now have the name of the requested attribute in @ntfs_name and it
* is @ntfs_name_len characters long and we have verified that the
* start offset is zero (unless this is the resource fork in which case
* a non-zero start offset is fine).
*
* Start by getting the ntfs inode for the $DATA:@ntfs_name attribute.
*/
err = ntfs_attr_inode_get(ni, AT_DATA, ntfs_name, ntfs_name_len, FALSE,
LCK_RW_TYPE_SHARED, &ani);
if (err) {
if (err == ENOENT)
err = ENOATTR;
elseif (err != ENOATTR)
ntfs_error(vol->mp, "Failed to get $DATA/%s attribute ""inode mft_no 0x%llx (error %d).", name,
(unsignedlonglong)ni->mft_no, err);
gotoerr;
}
/*
* TODO: This check may no longer be necessary now that we lock against
* changes in initialized size and thus truncation... Revisit this
* issue when the write code has been written and remove the check if
* appropriate simply using ubc_getsize(ni->vn); without the size_lock.
*/
lck_spin_lock(&ani->size_lock);
size = ubc_getsize(ani->vn);
if (size > ani->data_size)
size = ani->data_size;
lck_spin_unlock(&ani->size_lock);
if (!uio)
*a->a_size = size;
elseif (ntfs_name != NTFS_SFM_RESOURCEFORK_NAME &&
start_count < size) {
/* Partial reads are only allowed for the resource fork. */
err = ERANGE;
} else {
/*
* Perform the actual read from the attribute inode. We pass
* in IO_UNIT as we want an atomic i/o operation.
*
* FIXME: ntfs_read() currently ignores the IO_UNIT flag so we
* still have to test for partial reads.
*/
err = ntfs_read(ani, uio, IO_UNIT, TRUE);
/*
* If the read was partial, reset @uio pretending that the read
* never happened. This is because extended attribute i/o is
* meant to be atomic, i.e. either we get it all or we do not
* get anything.
*
* Note we also accept the case where uio_resid() has gone to
* zero as this covers the exception of the resource fork for
* which we do not need to return the whole resource fork in
* one go.
*/if (uio_resid(uio) && start_count - uio_resid(uio) !=
size - start_ofs) {
/*
* FIXME: Should we be trying to continue a partial
* read in case we can complete it with multiple calls
* to ntfs_read()? If we do that we could also drop
* the IO_UNIT flag above.
*/if (!err) {
ntfs_debug("ntfs_read() returned a partial ""read, pretending the read ""never happened.");
err = EIO;
}
uio_setoffset(uio, start_ofs);
uio_setresid(uio, start_count);
}
}
lck_rw_unlock_shared(&ani->lock);
(void)vnode_put(ani->vn);
err:
lck_rw_unlock_shared(&ni->lock);
ntfs_debug("Done (error %d).", err);
return err;
}
/**
* ntfs_vnop_setxattr - set the data of an extended attribute of an ntfs inode
* @a: arguments to setxattr function
*
* @a contains:
* vnode_t a_vp; vnode whose extended attribute to set
* char *a_name; name of extented attribute to set in utf8
* uio_t a_uio; source data to which to set the exteneded attribute
* int a_options; flags controlling how the attribute is set
* vfs_context_t a_context;
*
* Get the named stream with the name @a->a_name (we map named streams 1:1 with
* extended attributes for NTFS as the NTFS native EAs are useless) contained
* in the vnode @a->a_vp and set its data to the source specified by @a->a_uio.
*
* If @a->a_options does not specify XATTR_CREATE nor XATTR_REPLACE the
* attribute will be created if it does not exist already and if it exists
* already the old value will be replaced with the new one, i.e. if the old
* value does not have the same size as the new value the attribute is
* truncated to the new size.
*
* If @a->a_options specifies XATTR_CREATE the call will fail if the attribute
* already exists, i.e. the existing attribute will not be replaced.
*
* If @a->a_options specifies XATTR_REPLACE the call will fail if the attribute
* does not exist, i.e. the new attribute will not be created.
*
* An exception is the resource fork (@a->a_name is XATTR_RESOURCEFORK_NAME)
* for which we do not replace the existing attribute and instead we write over
* the existing attribute starting at offset uio_offset(@a->a_uio) and writing
* uio_resid(@a->a_uio) bytes. Writing past the end of the resource fork will
* cause the resource fork to be extended just like a regular file write would
* do but a write to any existing part of the attribute will not cause the
* attribute to be shrunk.
*
* Simillar to other extended attributes, if @a->a_options specifies
* XATTR_CREATE the call will fail if the resource fork already exists, i.e.
* the write to the existing resource fork will be denied and if @a->a_options
* specified XATTR_REPLACE the call will fail if the resource fork does not yet
* exist, i.e. the new resource fork will not be created.
*
* Note that uio_offset(@a->a_uio) must be zero except for the resource fork
* where it can specify the offset into the resource fork at which to begin
* writing the data.
*
* The flags in @a->a_options control how the attribute is set. The following
* flags are currently defined in OS X kernel:
* XATTR_NOFOLLOW - Do not follow symbolic links.
* XATTR_CREATE - Set the value, fail if already exists (setxattr only).
* XATTR_REPLACE - Set the value, fail if does not exist (setxattr only).
* XATTR_NOSECURITY- Bypass authorization checking.
* XATTR_NODEFAULT - Bypass default extended attribute file ('._' file).
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_setxattr(struct vnop_setxattr_args *a)
{
s64 size;
user_ssize_t start_count;
off_t start_ofs;
ntfs_inode *ani, *ni = NTFS_I(a->a_vp);
ntfs_volume *vol;
constchar *name = a->a_name;
uio_t uio = a->a_uio;
ntfschar *ntfs_name;
size_t ntfs_name_size;
signed ntfs_name_len;
constint options = a->a_options;
errno_t err;
ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN];
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = ni->vol;
/* Check for invalid names. */if (!name || name[0] == '\0')
return EINVAL;
start_ofs = uio_offset(uio);
start_count = uio_resid(uio);
ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, ""offset 0x%llx, size 0x%llx, options 0x%x.",
(unsignedlonglong)ni->mft_no, name, start_ofs,
start_count, options);
/*
* Access to extended attributes must be atomic which we ensure by
* locking the base ntfs inode for writing.
*/
lck_rw_lock_exclusive(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
err = ENOENT;
gotoerr;
}
/*
* Only regular files, directories, and symbolic links can have
* extended attributes. (Specifically named streams cannot have them.)
*
* Thus the check is for attribute inodes as all base inodes are
* allowed. Raw inodes are also attribute inodes so they are excluded
* automatically, too.
*/if (NInoAttr(ni)) {
ntfs_debug("Mft_no 0x%llx is an attribute inode.",
(unsignedlonglong)ni->mft_no);
err = EPERM;
gotoerr;
}
/*
* XATTR_CREATE and XATTR_REPLACE may not be specified at the same time
* or weird things would happen so test for and abort this case here.
*/if ((options & (XATTR_CREATE | XATTR_REPLACE)) ==
(XATTR_CREATE | XATTR_REPLACE)) {
ntfs_debug("Either XATTR_CREATE or XATTR_REPLACE but not both ""may be specified.");
err = EINVAL;
gotoerr;
}
/*
* First of all deal with requests to set the Finder info as that is
* special because we cache it in the base ntfs inode @ni thus we need
* to copy the new Finder info into the cache and then write the
* changes out to the AFP_AfpInfo attribute (creating it if it did not
* exist before).
*
* The only exception to the above description is when the XATTR_CREATE
* or XATTR_REPLACE flags are set in @options in which case we need to
* know whether the Finder info extists already or not and thus if the
* Finder info cache is not valid we need to make it valid first and
* then we can check it against being zero to determine whether the
* Finder info exists already or not and then we know whether or not to
* proceed with setting the Finder info.
*
* FIXME: This comparison is case sensitive.
*/if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) {
FINDER_INFO fi;
if (start_count != sizeof(ni->finder_info)) {
ntfs_debug("Number of bytes to write (%lld) does not ""equal Finder info size (%ld), ""returning ERANGE.",
(unsignedlonglong)start_count,
sizeof(ni->finder_info));
err = ERANGE;
gotoerr;
}
/*
* If @options does not specify XATTR_CREATE nor XATTR_REPLACE
* there is no need to bring the Finder info up-to-date before
* the write.
*/if (options & (XATTR_CREATE | XATTR_REPLACE)) {
if (!NInoValidFinderInfo(ni)) {
/*
* Load the AFP_AfpInfo stream and initialize
* the backup time and Finder info (at least
* the Finder info is not yet valid).
*/
err = ntfs_inode_afpinfo_read(ni);
if (err) {
ntfs_error(vol->mp, "Failed to obtain ""AfpInfo for mft_no ""0x%llx (error %d).",
(unsignedlonglong)
ni->mft_no, err);
gotoerr;
}
if (!NInoValidFinderInfo(ni))
panic("%s(): !NInoValidFinderInfo(ni)""\n", __FUNCTION__);
}
/*
* Make a copy of the Finder info and mask out the
* hidden bit if this is the root directory and the
* type and creator if this is a symbolic link.
*/
memcpy(&fi, &ni->finder_info, sizeof(fi));
if (ni == vol->root_ni)
fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
if (S_ISLNK(ni->mode)) {
fi.type = 0;
fi.creator = 0;
}
if (bcmp(&ni->finder_info, &ntfs_empty_finder_info,
sizeof(ni->finder_info))) {
/*
* Finder info is non-zero, i.e. it exists, and
* XATTR_CREATE was specified.
*/if (options & XATTR_CREATE) {
ntfs_debug("Mft_no 0x%llx has ""non-zero Finder info ""and XATTR_CREATE was ""specified, returning ""EEXIST.",
(unsignedlonglong)
ni->mft_no);
err = EEXIST;
gotoerr;
}
} else {
/*
* Finder info is zero, i.e. it does not exist,
* and XATTR_REPLACE was specified.
*/if (options & XATTR_REPLACE) {
ntfs_debug("Mft_no 0x%llx has zero ""Finder info and ""XATTR_REPLACE was ""specified, returning ""ENOATTR.",
(unsignedlonglong)
ni->mft_no);
err = ENOATTR;
gotoerr;
}
}
}
/* Copy the new Finder info value to our buffer. */
err = uiomove((caddr_t)&fi, sizeof(fi), uio);
if (!err) {
/*
* Set the Finder info to the new value after masking
* out the hidden bit if this is the root directory and
* enforcing the type and creator if this is a symbolic
* link to be our private values for symbolic links.
*/if (ni == vol->root_ni)
fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
if (S_ISLNK(ni->mode)) {
fi.type = FINDER_TYPE_SYMBOLIC_LINK;
fi.creator = FINDER_CREATOR_SYMBOLIC_LINK;
}
memcpy((u8*)&ni->finder_info, (u8*)&fi, sizeof(fi));
NInoSetValidFinderInfo(ni);
NInoSetDirtyFinderInfo(ni);
/*
* If the file is not hidden but the Finder info hidden
* bit is being set, we need to cause the file to be
* hidden, i.e. we need to set the FILE_ATTR_HIDDEN bit
* in the file_attributes of the $STANDARD_INFORMATION
* attribute.
*/if (fi.attrs & FINDER_ATTR_IS_HIDDEN &&
!(ni->file_attributes &
FILE_ATTR_HIDDEN)) {
ni->file_attributes |= FILE_ATTR_HIDDEN;
NInoSetDirtyFileAttributes(ni);
}
/*
* Updating the Finder info causes both the
* last_data_change_time (mtime) and
* last_mft_change_time (ctime) to be updated.
*/
ni->last_mft_change_time = ni->last_data_change_time =
ntfs_utc_current_time();
NInoSetDirtyTimes(ni);
/*
* Now write (if needed creating) the AFP_AfpInfo
* attribute with the specified Finder Info.
*/
err = ntfs_inode_afpinfo_write(ni);
if (err)
ntfs_error(vol->mp, "Failed to write/create ""AFP_AfpInfo attribute in ""inode 0x%llx (error %d).",
(unsignedlonglong)ni->mft_no,
err);
} else
ntfs_error(vol->mp, "uiomove() failed (error %d).",
err);
gotoerr;
}
/*
* Now deal with requests to write to the resource fork as that is
* special because on one hand we need to translate its name from
* XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert
* the utf8 name @name to Unicode and on the other hand the offset
* @start_ofs may be non-zero, the write may be only to a partial
* region of the resource fork, and the write may not shrink the
* resource fork though it may extend it.
*
* FIXME: This comparison is case sensitive.
*/if (!bcmp(name, XATTR_RESOURCEFORK_NAME,
sizeof(XATTR_RESOURCEFORK_NAME))) {
ntfs_name = NTFS_SFM_RESOURCEFORK_NAME;
ntfs_name_len = 12;
} else {
/*
* The request is not for the resource fork (nor for the Finder
* info). This means that the offset @start_ofs must be zero.
*/if (start_ofs) {
err = EINVAL;
gotoerr;
}
/* Convert the requested name from utf8 to Unicode. */
ntfs_name = ntfs_name_buf;
ntfs_name_size = sizeof(ntfs_name_buf);
ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name),
&ntfs_name, &ntfs_name_size);
if (ntfs_name_len < 0) {
err = -ntfs_name_len;
if (err == ENAMETOOLONG)
ntfs_debug("Failed (name is too long).");
else
ntfs_error(vol->mp, "Failed to convert name to ""Unicode (error %d).", err);
gotoerr;
}
/*
* If this is one of the SFM named streams, skip it, as they
* contain effectively metadata information so should not be
* exposed directly.
*/if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) {
ntfs_debug("Not allowing access to protected SFM name ""(returning EINVAL).");
err = EINVAL;
gotoerr;
}
}
/*
* We now have the name of the requested attribute in @ntfs_name and it
* is @ntfs_name_len characters long and we have verified that the
* start offset is zero (unless this is the resource fork in which case
* a non-zero start offset is fine).
*
* Get the ntfs attribute inode of the $DATA:@ntfs_name attribute
* (unless XATTR_CREATE is specified in @options) and if it does not
* exist create it first (unless XATTR_REPLACE is specified in
* @options).
*/
err = ntfs_attr_inode_get_or_create(ni, AT_DATA, ntfs_name,
ntfs_name_len, FALSE, FALSE, options,
LCK_RW_TYPE_EXCLUSIVE, &ani);
if (err) {
if (err == ENOENT)
err = ENOATTR;
elseif (err != ENOATTR && err != EEXIST)
ntfs_error(vol->mp, "Failed to get or create $DATA/%s ""attribute inode mft_no 0x%llx (error ""%d).", name,
(unsignedlonglong)ni->mft_no, err);
gotoerr;
}
/*
* TODO: This check may no longer be necessary now that we lock against
* changes in initialized size and thus truncation... Revisit this
* issue when the write code has been written and remove the check if
* appropriate simply using ubc_getsize(ni->vn); without the size_lock.
*/
lck_spin_lock(&ani->size_lock);
size = ubc_getsize(ani->vn);
if (size > ani->data_size)
size = ani->data_size;
lck_spin_unlock(&ani->size_lock);
/*
* Perform the actual write to the attribute inode. We pass in IO_UNIT
* as we want an atomic i/o operation.
*
* FIXME: ntfs_write() does not always honour the IO_UNIT flag so we
* still have to test for partial writes.
*/
err = ntfs_write(ani, uio, IO_UNIT, TRUE);
/*
* If the write was successful, need to shrink the attribute if the new
* size is smaller than the old size.
*
* If the write was partial or failed, reset @uio pretending that the
* write never happened. This is because extended attribute i/o is
* meant to be atomic, i.e. either we get it all or we do not get
* anything.
*
* In the partial/failed case, if @options specifies XATTR_REPLACE we
* know the extended attribute existed already thus we truncate it to
* zero size to simulate that the old value has been replaced. And if
* @options specifies XATTR_CREATE we know we created the extended
* attribute thus we delete it again. And if @options does not specify
* XATTR_REPLACE nor XATTR_CREATE then we do not know whether we
* created it or not and in this case we assume the caller does not
* care so we delete it to conserve disk space.
*/if (!err && !uio_resid(uio)) {
/*
* Shrink the attribute if the new value is smaller than the
* old value. We do not do this for the resource fork as that
* is a special case.
*/if (ntfs_name != NTFS_SFM_RESOURCEFORK_NAME) {
if (size > start_count) {
err = ntfs_attr_resize(ani, start_count, 0,
NULL);
if (err) {
ntfs_error(vol->mp, "Failed to resize ""extended attribute ""to its new size ""(error %d).", err);
gotoundo_err;
}
}
}
} else {
/*
* FIXME: Should we be trying to continue a partial write in
* case we can complete it with multiple calls to ntfs_write()?
*/if (!err) {
ntfs_debug("ntfs_write() returned a partial write, ""pretending the write never happened ""and removing or truncating to zero ""size the old attribute value.");
err = EIO;
}
undo_err:
uio_setoffset(uio, start_ofs);
uio_setresid(uio, start_count);
if (options & XATTR_REPLACE) {
errno_t err2;
err2 = ntfs_attr_resize(ani, 0, 0, NULL);
if (err2) {
ntfs_error(vol->mp, "Failed to truncate ""extended attribute to zero ""size in error code path ""(error %d), attempting to ""delete it instead.", err2);
gotorm_err;
}
} else {
rm_err:
/*
* Unlink the named stream. The last close will cause
* the VFS to call ntfs_vnop_inactive() which will do
* the actual removal.
*/
ani->link_count = 0;
/*
* Update the last_mft_change_time (ctime) in the inode
* as named stream/extended attribute semantics expect
* on OS X.
*/
ni->last_mft_change_time = ntfs_utc_current_time();
NInoSetDirtyTimes(ni);
/*
* If this is not a directory or it is an encrypted
* directory, set the needs archiving bit except for
* the core system files.
*/if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) {
BOOL need_set_archive_bit = TRUE;
if (ni->vol->major_ver >= 2) {
if (ni->mft_no <= FILE_Extend)
need_set_archive_bit = FALSE;
} else {
if (ni->mft_no <= FILE_UpCase)
need_set_archive_bit = FALSE;
}
if (need_set_archive_bit) {
ni->file_attributes |=
FILE_ATTR_ARCHIVE;
NInoSetDirtyFileAttributes(ni);
}
}
}
}
lck_rw_unlock_exclusive(&ani->lock);
(void)vnode_put(ani->vn);
err:
lck_rw_unlock_exclusive(&ni->lock);
ntfs_debug("Done (error %d).", err);
return err;
}
/**
* ntfs_vnop_removexattr - remove an extended attribute from an ntfs inode
* @a: arguments to removexattr function
*
* @a contains:
* vnode_t a_vp; vnode whose extended attribute to remove
* char *a_name; name of extented attribute to remove in utf8
* int a_options; flags controlling how the attribute is removed
* vfs_context_t a_context;
*
* Remove the named stream with the name @a->a_name (we map named streams 1:1
* with extended attributes for NTFS as the NTFS native EAs are useless) from
* the vnode @a->a_vp.
*
* The flags in @a->a_options control how the attribute is set. The following
* flags are currently defined in OS X kernel:
* XATTR_NOFOLLOW - Do not follow symbolic links.
* XATTR_CREATE - Set the value, fail if already exists (setxattr only).
* XATTR_REPLACE - Set the value, fail if does not exist (setxattr only).
* XATTR_NOSECURITY- Bypass authorization checking.
* XATTR_NODEFAULT - Bypass default extended attribute file ('._' file).
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_removexattr(struct vnop_removexattr_args *a)
{
ntfs_inode *ani, *ni = NTFS_I(a->a_vp);
constchar *name = a->a_name;
ntfs_volume *vol;
ntfschar *ntfs_name;
size_t ntfs_name_size;
signed ntfs_name_len;
errno_t err;
ntfschar ntfs_name_buf[NTFS_MAX_ATTR_NAME_LEN];
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = ni->vol;
/* Check for invalid names. */if (!name || name[0] == '\0')
return EINVAL;
ntfs_debug("Entering for mft_no 0x%llx, extended attribute name %s, ""options 0x%x.", (unsignedlonglong)ni->mft_no, name,
a->a_options);
/*
* Access to extended attributes must be atomic which we ensure by
* locking the base ntfs inode for writing.
*/
lck_rw_lock_exclusive(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
err = ENOENT;
gotoerr;
}
/*
* Only regular files, directories, and symbolic links can have
* extended attributes. (Specifically named streams cannot have them.)
*
* Thus the check is for attribute inodes as all base inodes are
* allowed. Raw inodes are also attribute inodes so they are excluded
* automatically, too.
*/if (NInoAttr(ni)) {
ntfs_debug("Mft_no 0x%llx is an attribute inode.",
(unsignedlonglong)ni->mft_no);
err = EPERM;
gotoerr;
}
/*
* First of all deal with requests to remove the Finder info as that is
* special because we cache it in the base ntfs inode @ni thus we need
* to zero the cached Finder info and then write the changes out to the
* AFP_AfpInfo attribute (deleting it if it is no longer needed). This
* is sufficient as a zero Finder info is treated the same as
* non-existent Finder info and vice versa.
*
* Note if the Finder info is already zero it does not exist thus we
* need to return ENOATTR instead thus we may need to load the Finder
* info first to find out whether it is zero or not.
*
* FIXME: This comparison is case sensitive.
*/if (!bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME))) {
FINDER_INFO fi;
if (!NInoValidFinderInfo(ni)) {
/*
* Load the AFP_AfpInfo stream and initialize the
* backup time and Finder info (at least the Finder
* info is not yet valid).
*/
err = ntfs_inode_afpinfo_read(ni);
if (err) {
ntfs_error(vol->mp, "Failed to obtain AfpInfo ""for mft_no 0x%llx (error %d).",
(unsignedlonglong)ni->mft_no,
err);
gotoerr;
}
if (!NInoValidFinderInfo(ni))
panic("%s(): !NInoValidFinderInfo(ni)\n",
__FUNCTION__);
}
/*
* Make a copy of the Finder info and mask out the hidden bit
* if this is the root directory and the type and creator if
* this is a symbolic link.
*/
memcpy(&fi, &ni->finder_info, sizeof(fi));
if (ni == vol->root_ni)
fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
if (S_ISLNK(ni->mode)) {
fi.type = 0;
fi.creator = 0;
}
if (!bcmp(&fi, &ntfs_empty_finder_info, sizeof(fi))) {
/* Finder info is zero, i.e. it does not exist. */
ntfs_debug("Mft_no 0x%llx has zero Finder info, ""returning ENOATTR.",
(unsignedlonglong)ni->mft_no);
err = ENOATTR;
gotoerr;
}
/* Zero the Finder info. */
bzero(&ni->finder_info, sizeof(ni->finder_info));
/*
* If the file is hidden, we need to reflect this fact in the
* Finder info, too.
*/if (ni->file_attributes & FILE_ATTR_HIDDEN)
ni->finder_info.attrs |= FINDER_ATTR_IS_HIDDEN;
/*
* Also, enforce the type and creator if this is a symbolic
* link to be our private values for symbolic links. This in
* fact causes the Finder info not to be deleted on disk and we
* cannot allow that to happen as we would then no longer know
* that this is a symbolic link.
*/if (S_ISLNK(ni->mode)) {
ni->finder_info.type = FINDER_TYPE_SYMBOLIC_LINK;
ni->finder_info.creator = FINDER_CREATOR_SYMBOLIC_LINK;
}
NInoSetValidFinderInfo(ni);
NInoSetDirtyFinderInfo(ni);
/*
* Updating the Finder info causes both the
* last_data_change_time (mtime) and last_mft_change_time
* (ctime) to be updated.
*/
ni->last_mft_change_time = ni->last_data_change_time =
ntfs_utc_current_time();
NInoSetDirtyTimes(ni);
/* Now write (if needed deleting) the AFP_AfpInfo attribute. */
err = ntfs_inode_afpinfo_write(ni);
if (!err)
ntfs_debug("Deleted Finder info from mft_no 0x%llx.",
(unsignedlonglong)ni->mft_no);
else
ntfs_error(vol->mp, "Failed to write/delete ""AFP_AfpInfo attribute in inode ""0x%llx (error %d).",
(unsignedlonglong)ni->mft_no, err);
gotoerr;
}
/*
* Now deal with requests to remove the resource fork as that is
* special because we need to translate its name from
* XATTR_RESOURCEFORK_NAME to AFP_Resource so we do not need to convert
* the utf8 name @name to Unicode.
*
* FIXME: This comparison is case sensitive.
*/if (!bcmp(name, XATTR_RESOURCEFORK_NAME,
sizeof(XATTR_RESOURCEFORK_NAME))) {
ntfs_name = NTFS_SFM_RESOURCEFORK_NAME;
ntfs_name_len = 12;
} else {
/*
* The request is not for the resource fork (nor for the Finder
* info).
*
* Convert the requested name from utf8 to Unicode.
*/
ntfs_name = ntfs_name_buf;
ntfs_name_size = sizeof(ntfs_name_buf);
ntfs_name_len = utf8_to_ntfs(vol, (const u8*)name, strlen(name),
&ntfs_name, &ntfs_name_size);
if (ntfs_name_len < 0) {
err = -ntfs_name_len;
if (err == ENAMETOOLONG)
ntfs_debug("Failed (name is too long).");
else
ntfs_error(vol->mp, "Failed to convert name to ""Unicode (error %d).", err);
gotoerr;
}
/*
* If this is one of the SFM named streams, skip it, as they
* contain effectively metadata information so should not be
* exposed directly.
*/if (ntfs_is_sfm_name(vol, ntfs_name, ntfs_name_len)) {
ntfs_debug("Not allowing access to protected SFM name ""%s in mft_no 0x%llx (returning ""EINVAL).", name,
(unsignedlonglong)ni->mft_no);
err = EINVAL;
gotoerr;
}
}
/*
* We now have the name of the requested attribute in @ntfs_name and it
* is @ntfs_name_len characters long.
*
* Get the ntfs attribute inode of the $DATA:@ntfs_name attribute.
*/
err = ntfs_attr_inode_get(ni, AT_DATA, ntfs_name, ntfs_name_len, FALSE,
LCK_RW_TYPE_EXCLUSIVE, &ani);
if (err) {
if (err == ENOENT)
err = ENOATTR;
elseif (err != ENOATTR)
ntfs_error(vol->mp, "Failed to get $DATA/%s attribute ""inode mft_no 0x%llx (error %d).",
name, (unsignedlonglong)ni->mft_no,
err);
gotoerr;
}
/*
* Unlink the named stream. The last close will cause the VFS to call
* ntfs_vnop_inactive() which will do the actual removal.
*/
ani->link_count = 0;
/*
* Update the last_mft_change_time (ctime) in the inode as named
* stream/extended attribute semantics expect on OS X.
*/
ni->last_mft_change_time = ntfs_utc_current_time();
NInoSetDirtyTimes(ni);
/*
* If this is not a directory or it is an encrypted directory, set the
* needs archiving bit except for the core system files.
*/if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) {
BOOL need_set_archive_bit = TRUE;
if (ni->vol->major_ver >= 2) {
if (ni->mft_no <= FILE_Extend)
need_set_archive_bit = FALSE;
} else {
if (ni->mft_no <= FILE_UpCase)
need_set_archive_bit = FALSE;
}
if (need_set_archive_bit) {
ni->file_attributes |= FILE_ATTR_ARCHIVE;
NInoSetDirtyFileAttributes(ni);
}
}
ntfs_debug("Done.");
lck_rw_unlock_exclusive(&ani->lock);
(void)vnode_put(ani->vn);
err:
lck_rw_unlock_exclusive(&ni->lock);
return err;
}
/**
* ntfs_vnop_listxattr - list the names of the extended attributes of an inode
* @args: arguments to listxattr function
*
* @args contains:
* vnode_t a_vp; vnode whose extended attributes to list
* uio_t a_uio; destination in which to return the list
* size_t *a_size; size of the list of extended attributes in bytes
* int a_options; flags controlling how the attribute list is generated
* vfs_context_t a_context;
*
* Iterate over the list of named streams (which we map 1:1 with extended
* attributes for NTFS as the NTFS native EAs are useless) in the vnode
* @args->a_vp and for each encountered stream copy its name (converted to a
* NUL-terminated utf8 string) to the destination as specified by
* @args->a_uio.
*
* If @args->a_uio is NULL, do not copy anything and simply iterate over all
* named streams and add up the number of bytes needed to create a full list of
* their names and return that in *@args->a_size. Note that when @args->a_uio
* is not NULL @args->a_size is ignored as the number of bytes is implicitly
* returned in the @args->a_uio and it can be obtained by taking the original
* buffer size and subtracting uio_resid(@args->a_uio) from it.
*
* The flags in @args->a_options control how the attribute list is generated.
* The following flags are currently defined in OS X kernel:
* XATTR_NOFOLLOW - Do not follow symbolic links.
* XATTR_CREATE - Set the value, fail if already exists (setxattr only).
* XATTR_REPLACE - Set the value, fail if does not exist (setxattr only).
* XATTR_NOSECURITY- Bypass authorization checking.
* XATTR_NODEFAULT - Bypass default extended attribute file ('._' file).
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_listxattr(struct vnop_listxattr_args *args)
{
ntfs_inode *ni = NTFS_I(args->a_vp);
uio_t uio = args->a_uio;
ntfs_volume *vol;
MFT_RECORD *m;
ntfs_attr_search_ctx *ctx;
u8 *utf8_name;
ntfschar *upcase;
unsigned upcase_len;
size_t size, utf8_size;
errno_t err;
BOOL case_sensitive;
FINDER_INFO fi;
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = ni->vol;
upcase = vol->upcase;
upcase_len = vol->upcase_len;
case_sensitive = NVolCaseSensitive(vol);
ntfs_debug("Entering.");
lck_rw_lock_shared(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
err = ENOENT;
gotoerr;
}
/*
* Only regular files, directories, and symbolic links can have
* extended attributes. (Specifically named streams cannot have them.)
*
* Thus the check is for attribute inodes as all base inodes are
* allowed. Raw inodes are also attribute inodes so they are excluded
* automatically, too.
*/if (NInoAttr(ni)) {
ntfs_debug("Mft_no 0x%llx is an attribute inode.",
(unsignedlonglong)ni->mft_no);
err = EPERM;
gotoerr;
}
size = 0;
/*
* First of all deal with the Finder info as that is special because we
* cache it in the base ntfs inode @ni and we only want to export the
* name for the Finder info, XATTR_FINDERINFO_NAME, if the Finder info
* is non-zero. This is what HFS does, too.
*
* Thus we need to check the status of the cache in the ntfs inode
* first and if that is valid we can use it to check the content of the
* Finder info for being zero. And if it is not valid then it must be
* non-resident in which case we need to read it into the cache in the
* ntfs inode and then we can check the Finder info in the cache for
* being zero. In fact we do this the other way round, i.e. if the
* Finder info cache is not valid we read the Finder info into the
* cache first and then the cache is definitely valid thus we can check
* the Finder info for being non-zero and export XATTR_FINDERINFO_NAME
* if so.
*/if (!NInoValidFinderInfo(ni)) {
if (!lck_rw_lock_shared_to_exclusive(&ni->lock)) {
lck_rw_lock_exclusive(&ni->lock);
if (NInoDeleted(ni)) {
cache_purge(ni->vn);
lck_rw_unlock_exclusive(&ni->lock);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
return ENOENT;
}
}
/*
* Load the AFP_AfpInfo stream and initialize the backup time
* and Finder info (if they are not already valid).
*/
err = ntfs_inode_afpinfo_read(ni);
if (err) {
ntfs_error(vol->mp, "Failed to obtain AfpInfo for ""mft_no 0x%llx (error %d).",
(unsignedlonglong)ni->mft_no, err);
lck_rw_unlock_exclusive(&ni->lock);
return err;
}
if (!NInoValidFinderInfo(ni))
panic("%s(): !NInoValidFinderInfo(ni)\n", __FUNCTION__);
lck_rw_lock_exclusive_to_shared(&ni->lock);
}
/*
* Make a copy of the Finder info and mask out the hidden bit if this
* is the root directory and the type and creator if this is a symbolic
* link.
*/
memcpy(&fi, &ni->finder_info, sizeof(fi));
if (ni == vol->root_ni)
fi.attrs &= ~FINDER_ATTR_IS_HIDDEN;
if (S_ISLNK(ni->mode)) {
fi.type = 0;
fi.creator = 0;
}
if (bcmp(&fi, &ntfs_empty_finder_info, sizeof(fi))) {
if (!uio)
size += sizeof(XATTR_FINDERINFO_NAME);
elseif (uio_resid(uio) <
(user_ssize_t)sizeof(XATTR_FINDERINFO_NAME)) {
err = ERANGE;
gotoerr;
} else {
err = uiomove((caddr_t)XATTR_FINDERINFO_NAME,
sizeof(XATTR_FINDERINFO_NAME), uio);
if (err) {
ntfs_error(vol->mp, "uiomove() failed (error ""%d).", err);
gotoerr;
}
}
ntfs_debug("Exporting Finder info name %s.",
XATTR_FINDERINFO_NAME);
}
/* Iterate over all the named $DATA attributes. */
err = ntfs_mft_record_map(ni, &m);
if (err) {
ntfs_error(vol->mp, "Failed to map mft record (error %d).",
err);
gotoerr;
}
ctx = ntfs_attr_search_ctx_get(ni, m);
if (!ctx) {
ntfs_error(vol->mp, "Failed to allocate search context.");
err = ENOMEM;
gotounm_err;
}
/*
* Allocate a buffer we can use when converting the names of the named
* $DATA attributes to utf8. We want enough space to definitely be
* able to convert the name as well as a byte for the NULL terminator.
*/
utf8_size = NTFS_MAX_ATTR_NAME_LEN * 4 + 1;
utf8_name = OSMalloc(utf8_size, ntfs_malloc_tag);
if (!utf8_name) {
ntfs_error(vol->mp, "Failed to allocate name buffer.");
err = ENOMEM;
gotoput_err;
}
do {
ntfs_inode *ani;
ATTR_RECORD *a;
ntfschar *name;
unsigned name_len;
signed utf8_len;
/* Get the next $DATA attribute. */
err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, NULL, 0, ctx);
if (err) {
if (err == ENOENT) {
err = 0;
break;
}
ntfs_error(vol->mp, "Failed to iterate over named ""$DATA attributes (error %d).", err);
gotofree_err;
}
/* Got the next attribute, deal with it. */
a = ctx->a;
/* If this is the unnamed $DATA attribute, skip it. */if (!a->name_length) {
ntfs_debug("Skipping unnamed $DATA attribute.");
continue;
}
name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset));
name_len = a->name_length;
if ((u8*)name < (u8*)a || (u8*)name + name_len > (u8*)a +
le32_to_cpu(a->length)) {
ntfs_error(vol->mp, "Found corrupt named $DATA ""attribute. Run chkdsk.");
NVolSetErrors(vol);
err = EIO;
gotofree_err;
}
/*
* Check if this attribute currently has a cached inode/vnode
* and if so check if it has been unlinked/deleted and if so
* skip it.
*/
err = ntfs_attr_inode_lookup(ni, a->type, name, name_len,
FALSE, &ani);
if (err != ENOENT) {
BOOL skip_it;
if (err)
panic("%s() inode lookup failed (error %d).\n",
__FUNCTION__, err);
/* Got the cached attribute inode. */
skip_it = FALSE;
if (NInoDeleted(ani) || !ani->link_count ||
(ntfs_are_names_equal(name, name_len,
NTFS_SFM_RESOURCEFORK_NAME, 12,
case_sensitive, upcase, upcase_len) &&
!ubc_getsize(ani->vn)))
skip_it = TRUE;
if (skip_it) {
if (NInoDeleted(ani) || !ani->link_count)
ntfs_debug("Skipping deleted/unlinked ""attribute.");
else
ntfs_debug("Mft_no 0x%llx has zero ""size resource fork, ""pretending it does ""not exist.",
(unsignedlonglong)
ani->mft_no);
(void)vnode_put(ani->vn);
continue;
}
(void)vnode_put(ani->vn);
}
/*
* If AFP_Resource named stream exists, i.e. the resource fork
* is present, and it is non-empty export the name
* XATTR_RESOURCEFORK_NAME. This is what HFS does, too.
*/if (ntfs_are_names_equal(name, name_len,
NTFS_SFM_RESOURCEFORK_NAME, 12, case_sensitive,
upcase, upcase_len)) {
if (!ntfs_attr_size(a)) {
ntfs_debug("Skipping empty resource fork ""name %s.",
XATTR_RESOURCEFORK_NAME);
continue;
}
if (!uio)
size += sizeof(XATTR_RESOURCEFORK_NAME);
elseif (uio_resid(uio) < (user_ssize_t)sizeof(
XATTR_RESOURCEFORK_NAME)) {
err = ERANGE;
gotofree_err;
} else {
err = uiomove((caddr_t)XATTR_RESOURCEFORK_NAME,
sizeof(XATTR_RESOURCEFORK_NAME),
uio);
if (err) {
ntfs_error(vol->mp, "uiomove() failed ""(error %d).", err);
gotofree_err;
}
}
ntfs_debug("Exporting resource fork name %s.",
XATTR_RESOURCEFORK_NAME);
continue;
}
/*
* If this is one of the SFM named streams, skip it, as they
* contain effectively metadata information so should not be
* exposed directly.
*/if (ntfs_is_sfm_name(vol, name, name_len)) {
ntfs_debug("Skipping protected SFM name.");
continue;
}
/* Convert the name to utf8. */
utf8_len = ntfs_to_utf8(vol, name, name_len <<
NTFSCHAR_SIZE_SHIFT, &utf8_name, &utf8_size);
if (utf8_len < 0) {
ntfs_warning(vol->mp, "Skipping unrepresentable name ""in mft_no 0x%llx (error %d).",
(unsignedlonglong)ni->mft_no,
-utf8_len);
continue;
}
/*
* If this is a protected attribute, skip it.
*
* FIXME: xattr_protected() is case sensitive so it does not
* exclude protected attributes when they are not correctly
* cased on disk.
*
* However we do call it to be consistent with HFS and SMB but
* it is pointless as anyone can call getxattr() for a case
* variant and the getxattr() system call would use
* xattr_protected() which would not filter it out so the
* VNOP_GETXATTR() call would happen and we would return the
* attribute just fine. Simillarly anyone could set and remove
* such "protected" attributes by just calling the system call
* with a case variant even when they are correctly filtered
* out here.
*/if (xattr_protected((char*)utf8_name)) {
ntfs_debug("Skipping protected name %.*s.", utf8_len,
utf8_name);
continue;
}
/*
* Increment the length of the name by one for the NULL
* terminator.
*/
utf8_len++;
/* Export the utf8_name. */if (!uio)
size += utf8_len;
elseif (uio_resid(uio) < utf8_len) {
err = ERANGE;
gotofree_err;
} else {
err = uiomove((caddr_t)utf8_name, utf8_len, uio);
if (err) {
ntfs_error(vol->mp, "uiomove() failed (error ""%d).", err);
gotofree_err;
}
}
ntfs_debug("Exporting name %.*s.", utf8_len, utf8_name);
/* Continue to the next name. */
} while (1);
if (!uio)
*args->a_size = size;
ntfs_debug("Done.");
free_err:
OSFree(utf8_name, utf8_size, ntfs_malloc_tag);
put_err:
ntfs_attr_search_ctx_put(ctx);
unm_err:
ntfs_mft_record_unmap(ni);
err:
lck_rw_unlock_shared(&ni->lock);
return err;
}
/**
* ntfs_vnop_blktooff - map a logical block number to its byte offset
* @a: arguments to blktooff function
*
* @a contains:
* vnode_t a_vp; vnode to which the logical block number belongs
* daddr64_t a_lblkno; logical block number to map
* off_t *a_offset; destination for returning the result
*
* Map the logical block number @a->a_lblkno belonging to the vnode @a->a_vp to
* the corresponding byte offset, i.e. the offset in the vnode in bytes and
* return the result in @a->a_offset.
*
* Return 0 on success and EINVAL if no vnode was specified in @a->a_vp.
*/staticintntfs_vnop_blktooff(struct vnop_blktooff_args *a)
{
ntfs_inode *ni;
ntfs_volume *vol;
unsigned block_size_shift;
if (!a->a_vp) {
ntfs_warning(NULL, "Called with NULL vnode!");
return EINVAL;
}
ni = NTFS_I(a->a_vp);
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
if (S_ISDIR(ni->mode)) {
ntfs_error(ni->vol->mp, "Called for directory vnode.");
return EINVAL;
}
ntfs_debug("Entering for logical block 0x%llx, mft_no 0x%llx, type ""0x%x, name_len 0x%x.", (unsignedlonglong)a->a_lblkno,
(unsignedlonglong)ni->mft_no, le32_to_cpu(ni->type),
(unsigned)ni->name_len);
vol = ni->vol;
block_size_shift = PAGE_SHIFT;
/*
* For $MFT/$DATA and $MFTMirr/$DATA the logical block number is the
* mft record number and the block size is the mft record size which is
* also in @ni->block_size{,_shift}.
*/if (ni == vol->mft_ni || ni == vol->mftmirr_ni)
block_size_shift = ni->block_size_shift;
*a->a_offset = a->a_lblkno << block_size_shift;
ntfs_debug("Done (byte offset 0x%llx).",
(unsignedlonglong)*a->a_offset);
return 0;
}
/**
* ntfs_vnop_offtoblk - map a byte offset to its logical block number
* @a: arguments to offtoblk function
*
* @a contains:
* vnode_t a_vp; vnode to which the byte offset belongs
* off_t a_offset; byte offset to map
* daddr64_t *a_lblkno; destination for returning the result
*
* Map the byte offset @a->a_offset belonging to the vnode @a->a_vp to the
* corresponding logical block number, i.e. the offset in the vnode in units of
* the vnode block size and return the result in @a->a_lblkno.
*
* Return 0 on success and EINVAL if no vnode was specified in @a->a_vp.
*/staticintntfs_vnop_offtoblk(struct vnop_offtoblk_args *a)
{
ntfs_inode *ni;
ntfs_volume *vol;
unsigned block_size_shift;
if (!a->a_vp) {
ntfs_warning(NULL, "Called with NULL vnode.");
return EINVAL;
}
ni = NTFS_I(a->a_vp);
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
if (S_ISDIR(ni->mode)) {
ntfs_error(ni->vol->mp, "Called for directory vnode.");
return EINVAL;
}
ntfs_debug("Entering for byte offset 0x%llx, mft_no 0x%llx, type ""0x%x, name_len 0x%x.", (unsignedlonglong)a->a_offset,
(unsignedlonglong)ni->mft_no, le32_to_cpu(ni->type),
(unsigned)ni->name_len);
vol = ni->vol;
block_size_shift = PAGE_SHIFT;
/*
* For $MFT/$DATA and $MFTMirr/$DATA the logical block number is the
* mft record number and the block size is the mft record size which is
* also in @ni->block_size{,_shift}.
*/if (ni == vol->mft_ni || ni == vol->mftmirr_ni)
block_size_shift = ni->block_size_shift;
*a->a_lblkno = a->a_offset >> block_size_shift;
ntfs_debug("Done (logical block 0x%llx).",
(unsignedlonglong)*a->a_lblkno);
return 0;
}
/**
* ntfs_vnop_blockmap - map a file offset to its physical block number
* @a: arguments to blockmap function
*
* @a contains:
* vnode_t a_vp; vnode to which the byte offset belongs
* off_t a_foffset; starting byte offset to map
* size_t a_size; number of bytes to map starting at @a_foffset
* daddr64_t *a_bpn; destination for starting physical block number
* size_t *a_run; destination for contiguous bytes from @a_bpn
* void *a_poff; physical offset into @a_bpn
* int a_flags; reason for map (VNODE_READ, VNODE_WRITE, or 0)
* vfs_context_t a_context;
*
* Map @a->a_size bytes starting at the file offset @a->a_foffset to the
* corresponding physical block number and return the result in @a->a_bpn
* (starting block number), @a->a_run (number of contiguous bytes starting at
* @a->a_bpn), and @a->a_poff (byte offset into @a->a_bpn corresponding to the
* file offset @a->a_foffset, this will be zero if @a_foffset is block aligned
* and non-zero otherwise).
*
* FIXME: At present the OS X kernel completely ignores @a->a_poff and in fact
* it is always either NULL on entry or the returned value is ignored. Thus,
* for now, if @a->a_foffset is not aligned to the physical block size, we
* always return error (EINVAL) unless @a->a_foffset equals the initialized
* size in the ntfs inode in which case we return a block number of -1 in
* @a->a_bpn thus alignment to the block and hence @a->a_poff are not relevant.
* Thus we always return 0 in @a->a_poff.
*
* @a->a_flags is either VNODE_READ or VNODE_WRITE but can be 0 in certain call
* paths such as the system call fcntl(F_LOG2PHYS) for example.
*
* Note, all the return pointers (@a->a_bpn, @a->a_run, @a->a_poff) are NULL in
* some code paths in xnu (one or more of them at a time), thus all of them
* need to be checked for being NULL before writing to them. If @a->a_bpn is
* NULL then there is nothing to do and success is returned immediately.
*
* For ntfs mapping to physical blocks is special because some attributes do
* not have block aligned data. This is the case for all resident attributes
* as well as for all non-resident attributes which are compressed or
* encrypted. For all of those it would be logical to return an error however
* this leads to a kernel panic in current xnu because a buf_bread() can cause
* ntfs_vnop_blockmap() to be called when an uptodate page is in memory but no
* buffer is in memory. This can happen under memory pressure when the buffer
* has been recycled for something else but the page has not been reused yet.
* In that case ntfs_vnop_blockmap() is only called to recreate the physical
* mapping of the buffer and is not actually used for anything as the data is
* already present in the uptodate page. Thus, instead of returning error, we
* set the physical block @a->a_bpn to equal the logical block corresponding to
* the byte offset @a->a_foffset and return success. Doing this signals to the
* VFS that the physical mapping cannot be cached in the buffer and all is
* well. Note this call path always has a non-zero @a->a_flags whilst other
* "weird" code paths like fcntl(F_LOG2PHYS) set @a->a_flags to zero, thus we
* can do the above workaround when @a->a_flags is not zero and return error
* EINVAL when @a->a_flags is zero.
*
* In the read case and when @a->a_flags is zero, if @a->a_foffset is beyond
* the end of the attribute, return error ERANGE. HFS returns ERANGE in this
* case so we follow suit. Although some other OS X file systems return EFBIG
* and some E2BIG instead so it does not seem to be very standardized, so maybe
* we should return the IMHO more correct "invalid seek" (ESPIPE), instead. (-;
*
* In the write case we need to allow the mapping of blocks beyond the end of
* the attribute as we will already have extended the allocated size but not
* yet the data size nor the initialized size. Thus in this case we only
* return ERANGE if the requested @a->a_foffset is beyond the end of the
* allocated size.
*
* Return 0 on success and errno on error.
*/staticintntfs_vnop_blockmap(struct vnop_blockmap_args *a)
{
const s64 byte_offset = a->a_foffset;
const s64 byte_size = a->a_size;
s64 max_size, data_size, init_size, clusters, bytes = 0;
VCN vcn;
LCN lcn;
ntfs_inode *ni = NTFS_I(a->a_vp);
ntfs_volume *vol;
unsigned vcn_ofs;
BOOL is_write = (a->a_flags & VNODE_WRITE);
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vol = ni->vol;
ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x, ""offset 0x%llx, size 0x%llx, for %s operation.",
(unsignedlonglong)ni->mft_no,
(unsigned)le32_to_cpu(ni->type),
(unsigned)ni->name_len,
(unsignedlonglong)byte_offset,
(unsignedlonglong)byte_size,
a->a_flags ? (is_write ? "write" : "read") :
"unspecified");
if (S_ISDIR(ni->mode)) {
ntfs_error(vol->mp, "Called for directory vnode.");
return EINVAL;
}
if (is_write && NVolReadOnly(vol)) {
ntfs_warning(vol->mp, "Called for VNODE_WRITE but mount is ""read-only.");
return EROFS;
}
if (!a->a_bpn) {
ntfs_debug("Called with a_bpn == NULL, nothing to do. ""Returning success (0).");
return 0;
}
/*
* We cannot take the inode lock as it may be held already so we just
* check the deleted bit and abort if it is set which is better than
* nothing.
*/if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(ni->vn);
ntfs_debug("Inode has been deleted.");
return ENOENT;
}
/*
* Note it does not matter if we are racing with truncate because that
* will be detected during the runlist lookup below.
*/
lck_spin_lock(&ni->size_lock);
if (is_write)
max_size = ni->allocated_size;
else
max_size = ni->data_size;
data_size = ni->data_size;
init_size = ni->initialized_size;
lck_spin_unlock(&ni->size_lock);
if (byte_offset >= max_size) {
eof:
ntfs_error(vol->mp, "Called for inode 0x%llx, size 0x%llx, ""byte offset 0x%llx, for %s operation, which ""is beyond the end of the inode %s size ""0x%llx. Returning error: ERANGE.",
(unsignedlonglong)ni->mft_no,
(unsignedlonglong)byte_size,
(unsignedlonglong)byte_offset, a->a_flags ?
(is_write ? "write" : "read") : "unspecified",
is_write ? "allocated" : "data",
(unsignedlonglong)max_size);
return ERANGE;
}
if (byte_offset & vol->sector_size_mask && byte_offset != init_size) {
ntfs_error(vol->mp, "Called for inode 0x%llx, byte offset ""0x%llx. This is not a multiple of the ""physical block size %u thus the mapping ""cannot be performed. Returning error: ""EINVAL.", (unsignedlonglong)ni->mft_no,
(unsignedlonglong)byte_offset,
(unsigned)vol->sector_size);
return EINVAL;
}
/*
* In the read case, if the requested byte offset is at or beyond the
* initialized size simply return a hole. We already checked for being
* at or beyond the data size so we know we are in an uninitialized
* region in this case rather than at or beyond the end of the
* attribute.
*/if (!is_write && byte_offset >= init_size) {
*a->a_bpn = -1; /* -1 means hole. *//*
* Set the size of the block to the number of uninitialized
* bytes in the attribute starting at the requested byte offset
* @a->a_foffset.
*/
bytes = data_size - byte_offset;
gotodone;
}
/*
* Blockmap does not make sense for resident attributes and neither
* does it make sense for non-resident, compressed or encrypted
* attributes. The only special case is for directory inodes because
* their flags are only defaults to be used when creating new files
* rather than having any meaning for their actual data contents.
*/if (!NInoNonResident(ni) || (ni->type != AT_INDEX_ALLOCATION &&
(NInoCompressed(ni) || NInoEncrypted(ni)) &&
!NInoRaw(ni))) {
if (!a->a_flags) {
ntfs_error(vol->mp, "Called for inode 0x%llx, which ""is resident, compressed, or ""encrypted and VNOP_BLOCKMAP() does ""not make sense for such inodes. ""Returning error: EINVAL.",
(unsignedlonglong)ni->mft_no);
return EINVAL;
}
*a->a_bpn = byte_offset >> PAGE_SHIFT;
bytes = ni->block_size;
ntfs_debug("Called for inode 0x%llx which is resident, ""compressed, or encrypted and VNOP_BLOCKMAP() ""does not make sense for such inodes. ""Returning success and setting physical == ""logical block number to signal to VFS that ""the mapping cannot be cached in the buffer.",
(unsignedlonglong)ni->mft_no);
gotodone;
}
/*
* All is ok, do the mapping. First, work out the vcn and vcn offset
* corresponding to the @a->a_foffset.
*/
vcn = byte_offset >> vol->cluster_size_shift;
vcn_ofs = (u32)byte_offset & vol->cluster_size_mask;
/*
* Convert the vcn to the corresponding lcn and obtain the number of
* contiguous clusters starting at the vcn.
*/
lck_rw_lock_shared(&ni->rl.lock);
lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, FALSE,
a->a_run ? &clusters : 0);
if (lcn < LCN_HOLE) {
errno_t err;
/* Error: deal with it. */
lck_rw_unlock_shared(&ni->rl.lock);
switch (lcn) {
caseLCN_ENOENT:
/*
* Raced with a concurrent truncate which caused the
* byte offset @a->a_foffset to become outside the
* attribute size.
*/gotoeof;
caseLCN_ENOMEM:
ntfs_error(vol->mp, "Not enough memory to complete ""mapping for inode 0x%llx. ""Returning error: ENOMEM.",
(unsignedlonglong)ni->mft_no);
err = ENOMEM;
break;
default:
ntfs_error(vol->mp, "Failed to complete mapping for ""inode 0x%llx. Run chkdsk. ""Returning error: EIO.",
(unsignedlonglong)ni->mft_no);
err = EIO;
break;
}
return err;
}
if (lcn < 0) {
/*
* It is a hole, return it. If this is a VNODE_WRITE request,
* output a warning as this should never happen. Both
* VNOP_WRITE() and VNOP_PAGEOUT() should have instantiated the
* hole before performing the write.
*
* Note we could potentially fill the hole here in the write
* case. However this is quite hard to do as the caller will
* likely have pages around the hole locked in UBC UPLs thus we
* would have difficulties zeroing the surrounding regions when
* the cluster size is larger than the page size. Also a
* problem is what happens if the write fails for some reason
* but we have instantiated the hole here and not zeroed it
* completely (because we are expecting the write to go into
* the allocated clusters). We would have no way of fixing up
* in this case and we would end up exposing stale data. This
* all is why we choose not to fill the hole here but to do it
* in advance in ntfs_vnop_write() and ntfs_vnop_pageout().
*
* The only thing that will happen when we return a hole in the
* write case is that when the caller is cluster_io(), it will
* page out page by page and this will fill the hole in pieces
* which will degrade performance.
*/if (is_write)
ntfs_warning(vol->mp, "Returning hole but flags ""specify VNODE_WRITE. This causes ""very inefficient allocation and I/O ""patterns.");
/* Return the hole. */
lck_rw_unlock_shared(&ni->rl.lock);
*a->a_bpn = -1; /* -1 means hole. */if (a->a_run) {
bytes = (clusters << vol->cluster_size_shift) - vcn_ofs;
/*
* If the run overlaps the initialized size, extend the
* run length so it goes up to the data size thus
* merging the hole with the uninitialized region.
*
* Note, do not do this in the write case as we want to
* return the real clusters even beyond the initialized
* size as the initialized size will only be updated
* after the write has completed.
*/if (!is_write && byte_offset + bytes > init_size)
bytes = data_size - byte_offset;
}
gotodone;
} else
lck_rw_unlock_shared(&ni->rl.lock);
/* The vcn was mapped successfully to a physical lcn, return it. */
*a->a_bpn = ((lcn << vol->cluster_size_shift) + vcn_ofs) >>
vol->sector_size_shift;
if (a->a_run) {
bytes = (clusters << vol->cluster_size_shift) - vcn_ofs;
/*
* In the read case, if the run overlaps the initialized size,
* truncate the run length so it only goes up to the
* initialized size. The caller will then be able to access
* this region on disk directly and will then call us again
* with a byte offset equal to the initialized size and we will
* then return the entire initialized region as a hole. Thus
* the caller does not need to know about the fact that NTFS
* has such a thing as the initialized_size.
*
* We already handled the case where the byte offset is beyond
* the initialized size so no need to check for that here.
*
* However do not do this if the initialized size is equal to
* the data size. The caller is responsible for not returning
* data beyond the attribute size to user space. If this is
* not done the last page of an attribute read is broken into
* two separate i/os, one with a read and one with a hole.
* cluster_io() will zero beyond the end of attribute in any
* case so it is faster to do it with a single call.
*/if (!is_write && byte_offset + bytes > init_size &&
init_size < data_size)
bytes = init_size - byte_offset;
}
done:
if (a->a_run) {
if (bytes > byte_size)
bytes = byte_size;
*a->a_run = bytes;
}
if (a->a_poff)
*(int*)a->a_poff = 0;
ntfs_debug("Done (a_bpn 0x%llx, a_run 0x%lx, a_poff 0x%x).",
(unsignedlonglong)*a->a_bpn,
a->a_run ? (unsignedlong)*a->a_run : 0,
a->a_poff ? *(int*)a->a_poff : 0);
return 0;
}
/**
* ntfs_vnop_getnamedstream - find a named stream in an inode given its name
* @a: arguments to getnamedstream function
*
* @a contains:
* vnode_t a_vp; vnode containing the named stream
* vnode_t *a_svpp; destination for the named stream vnode
* const char *a_name; name of the named stream to get
* enum nsoperation a_operation; reason for getnamedstream
* int a_flags; flags describing the request
* vfs_context_t a_context;
*
* Find the named stream with name @a->a_name in the vnode @a->a_vp and return
* the vnode of the named stream in *@a->a_svpp if it was found.
*
* @a->a_operation specifies the reason for the lookup of the named stream.
* The following operations are currently defined in OS X kernel:
* NS_OPEN - Want to open the named stream for access.
* NS_CREATE - Want to create the named stream so checking it does not
* exist already.
* NS_DELETE - Want to delete the named stream so making sure it exists.
*
* The flags in @a->a_flags further describe the getnamedstream request. At
* present no flags are defined in OS X kernel.
*
* Note that at present Mac OS X only supports the "com.apple.ResourceFork"
* stream so we follow suit.
*
* Return 0 on success and the error code on error. A return value of ENOATTR
* does not signify an error as such but merely the fact that the named stream
* @name is not present in the vnode @a->a_vp.
*/staticintntfs_vnop_getnamedstream(struct vnop_getnamedstream_args *a)
{
vnode_t vn = a->a_vp;
ntfs_inode *sni, *ni = NTFS_I(vn);
constchar *name = a->a_name;
int options;
constenum nsoperation op = a->a_operation;
errno_t err;
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
ntfs_debug("Entering for mft_no 0x%llx, stream name %s, operation %s ""(0x%x), flags 0x%x.", (unsignedlonglong)ni->mft_no,
name, op == NS_OPEN ? "NS_OPEN" :
(op == NS_CREATE ? "NS_CREATE" :
(op == NS_DELETE ? "NS_DELETE" : "unknown")), op,
a->a_flags);
/*
* Mac OS X only supports the resource fork stream.
* Note that this comparison is case sensitive.
*/if (bcmp(name, XATTR_RESOURCEFORK_NAME,
sizeof(XATTR_RESOURCEFORK_NAME))) {
ntfs_warning(ni->vol->mp, "Unsupported named stream %s ""specified, only the resource fork named ""stream (%s) is supported at present. ""Returning ENOATTR.", name,
XATTR_RESOURCEFORK_NAME);
return ENOATTR;
}
/* Only regular files may have a resource fork stream. */if (!S_ISREG(ni->mode)) {
ntfs_warning(ni->vol->mp, "The resource fork may only be ""attached to regular files and mft_no 0x%llx ""is not a regular file. Returning EPERM.",
(unsignedlonglong)ni->mft_no);
return EPERM;
}
/*
* Attempt to get the inode for the named stream. For the resource
* fork we need to return it even if it is zero size if the caller has
* specified @op == NS_OPEN so we set @options to zero in this case.
* Otherwise we want to treat a zero size resource fork as a
* non-existent resource fork se we set @options to XATTR_REPLACE which
* is the behaviour of ntfs_attr_inode_get().
*/if (op == NS_OPEN) {
options = 0;
lck_rw_lock_exclusive(&ni->lock);
} else {
options = XATTR_REPLACE;
lck_rw_lock_shared(&ni->lock);
}
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(vn);
if (op == NS_OPEN)
lck_rw_unlock_exclusive(&ni->lock);
else
lck_rw_unlock_shared(&ni->lock);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
return ENOENT;
}
err = ntfs_attr_inode_get_or_create(ni, AT_DATA,
NTFS_SFM_RESOURCEFORK_NAME, 12, FALSE, FALSE, options,
LCK_RW_TYPE_SHARED, &sni);
if (!err) {
/* We have successfully opened the named stream. */
*a->a_svpp = sni->vn;
lck_rw_unlock_shared(&sni->lock);
ntfs_debug("Done.");
} else {
if (err == ENOENT) {
err = ENOATTR;
ntfs_debug("Done (named stream %s does not exist in ""mft_no 0x%llx.", name,
(unsignedlonglong)ni->mft_no);
} else
ntfs_error(ni->vol->mp, "Failed to get named stream ""%s, mft_no 0x%llx (error %d).", name,
(unsignedlonglong)ni->mft_no, err);
}
if (op == NS_OPEN)
lck_rw_unlock_exclusive(&ni->lock);
else
lck_rw_unlock_shared(&ni->lock);
return err;
}
/**
* ntfs_vnop_makenamedstream - create a named stream in an ntfs inode
* @a: arguments to makenamedstream function
*
* @a contains:
* vnode_t a_vp; vnode in which to create the named stream
* vnode_t *a_svpp; destination for the named stream vnode
* const char *a_name; name of the named stream to create
* int a_flags; flags describing the request
* vfs_context_t a_context;
*
* Create the named stream with name @a->a_name in the vnode @a->a_vp and
* return the created vnode of the named stream in *@a->a_svpp. If the named
* stream already exists than it is obtained instead, i.e. if the named stream
* already exists then ntfs_vnop_makenamedstream() does exactly the same thing
* as ntfs_vnop_getnamedstream().
*
* The flags in @a->a_flags further describe the makenamedstream request. At
* present no flags are defined in OS X kernel.
*
* Note that at present Mac OS X only supports the "com.apple.ResourceFork"
* stream so we follow suit.
*
* Return 0 on success and the error code on error.
*/staticintntfs_vnop_makenamedstream(struct vnop_makenamedstream_args *a)
{
vnode_t vn = a->a_vp;
ntfs_inode *sni, *ni = NTFS_I(vn);
constchar *name = a->a_name;
errno_t err;
if (!ni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
ntfs_debug("Entering for mft_no 0x%llx, stream name %s, flags 0x%x.",
(unsignedlonglong)ni->mft_no, name, a->a_flags);
/*
* Mac OS X only supports the resource fork stream.
* Note that this comparison is case sensitive.
*/if (bcmp(name, XATTR_RESOURCEFORK_NAME,
sizeof(XATTR_RESOURCEFORK_NAME))) {
ntfs_warning(ni->vol->mp, "Unsupported named stream %s ""specified, only the resource fork named ""stream (%s) is supported at present. ""Returning ENOATTR.", name,
XATTR_RESOURCEFORK_NAME);
return ENOATTR;
}
/* Only regular files may have a resource fork stream. */if (!S_ISREG(ni->mode)) {
ntfs_warning(ni->vol->mp, "The resource fork may only be ""attached to regular files and mft_no 0x%llx ""is not a regular file. Returning EPERM.",
(unsignedlonglong)ni->mft_no);
return EPERM;
}
lck_rw_lock_exclusive(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(vn);
lck_rw_unlock_exclusive(&ni->lock);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
return ENOENT;
}
/*
* Attempt to create the named stream.
*
* HFS allows an existing resource fork to be opened. We want to
* follow suit so we specify 0 for @options when calling
* ntfs_attr_inode_get_or_create().
*
* FIXME: I think this is actually wrong behaviour. If I am right and
* this is one day fixed in HFS, then we can trivially fix the
* behaviour here by setting @options to XATTR_CREATE.
*/
err = ntfs_attr_inode_get_or_create(ni, AT_DATA,
NTFS_SFM_RESOURCEFORK_NAME, 12, FALSE, FALSE, 0,
LCK_RW_TYPE_SHARED, &sni);
if (!err) {
/* We have successfully opened the (created) named stream. */
*a->a_svpp = sni->vn;
lck_rw_unlock_shared(&sni->lock);
ntfs_debug("Done.");
} else {
if (err == EEXIST)
ntfs_debug("Named stream %s already exists in mft_no ""0x%llx.", name,
(unsignedlonglong)ni->mft_no);
else
ntfs_error(ni->vol->mp, "Failed to create named ""stream %s in mft_no 0x%llx (error ""%d).", name,
(unsignedlonglong)ni->mft_no, err);
}
lck_rw_unlock_exclusive(&ni->lock);
return err;
}
/**
* ntfs_vnop_removenamedstream - remove a named stream from an ntfs inode
* @a: arguments to removenamedstream function
*
* @a contains:
* vnode_t a_vp; vnode from which to remove the named stream
* vnode_t a_svp; vnode of named stream to remove
* const char *a_name; name of the named stream to remove
* int a_flags; flags describing the request
* vfs_context_t a_context;
*
* Delete the named stream described by the vnode @a->a_svp with name
* @a->a_name from the vnode @a->a_vp.
*
* The flags in @a->a_flags further describe the removenamedstream request. At
* present no flags are defined in OS X kernel.
*
* Note we obey POSIX open unlink semantics thus an open named stream will
* remain accessible for read/write/lseek purproses until the last open
* instance is closed when the VFS will call ntfs_vnop_inactive() which will in
* turn actually remove the named stream.
*
* Note that at present Mac OS X only supports the "com.apple.ResourceFork"
* stream so we follow suit.
*
* Return 0 on success and the error code on error. A return value of ENOATTR
* does not signify an error as such but merely the fact that the named stream
* @name is not present in the vnode @a->a_vp.
*/staticintntfs_vnop_removenamedstream(struct vnop_removenamedstream_args *a)
{
vnode_t svn, vn = a->a_vp;
ntfs_inode *sni, *ni = NTFS_I(vn);
constchar *vname, *name = a->a_name;
svn = a->a_svp;
sni = NTFS_I(svn);
if (!ni || !sni) {
ntfs_debug("Entered with NULL ntfs_inode, aborting.");
return EINVAL;
}
vname = vnode_getname(svn);
ntfs_debug("Entering for mft_no 0x%llx, stream mft_no 0x%llx, stream ""name %s, flags 0x%x, stream vnode name %s.",
(unsignedlonglong)ni->mft_no,
(unsignedlonglong)sni->mft_no, name, a->a_flags,
vname ? vname : "not present");
if (vname)
(void)vnode_putname(vname);
/*
* Mac OS X only supports the resource fork stream.
* Note that this comparison is case sensitive.
*/if (bcmp(name, XATTR_RESOURCEFORK_NAME,
sizeof(XATTR_RESOURCEFORK_NAME))) {
ntfs_warning(ni->vol->mp, "Unsupported named stream %s ""specified, only the resource fork named ""stream (%s) is supported at present. ""Returning ENOATTR.", name,
XATTR_RESOURCEFORK_NAME);
return ENOATTR;
}
/* Only regular files may have a resource fork stream. */if (!S_ISREG(ni->mode)) {
ntfs_warning(ni->vol->mp, "The resource fork may only be ""attached to regular files and mft_no 0x%llx ""is not a regular file. Returning EPERM.",
(unsignedlonglong)ni->mft_no);
return EPERM;
}
lck_rw_lock_exclusive(&ni->lock);
/* Do not allow messing with the inode once it has been deleted. */if (NInoDeleted(ni)) {
/* Remove the inode from the name cache. */
cache_purge(vn);
lck_rw_unlock_exclusive(&ni->lock);
ntfs_debug("Mft_no 0x%llx is deleted.",
(unsignedlonglong)ni->mft_no);
return ENOATTR;
}
lck_rw_lock_exclusive(&sni->lock);
/* Do not allow messing with the stream once it has been deleted. */if (NInoDeleted(sni)) {
/* Remove the inode from the name cache. */
cache_purge(svn);
lck_rw_unlock_exclusive(&sni->lock);
lck_rw_unlock_exclusive(&ni->lock);
ntfs_debug("Stream mft_no 0x%llx, name %s is deleted.",
(unsignedlonglong)sni->mft_no, name);
return ENOATTR;
}
/*
* The base inode of the stream inode must be the same as the parent
* inode specified by the caller.
*/if (!NInoAttr(sni) || sni->base_ni != ni)
panic("%s(): !NInoAttr(sni) || sni->base_ni != ni\n",
__FUNCTION__);
/*
* Unlink the named stream. The last close will cause the VFS to call
* ntfs_vnop_inactive() which will do the actual removal.
*
* And if the named stream is already unlinked there is nothing to do.
* This is what HFS does so we follow suit.
*/if (sni->link_count) {
sni->link_count = 0;
/*
* Update the last_mft_change_time (ctime) in the inode as
* named stream/extended attribute semantics expect on OS X.
*/
ni->last_mft_change_time = ntfs_utc_current_time();
NInoSetDirtyTimes(ni);
/*
* If this is not a directory or it is an encrypted directory,
* set the needs archiving bit except for the core system
* files.
*/if (!S_ISDIR(ni->mode) || NInoEncrypted(ni)) {
BOOL need_set_archive_bit = TRUE;
if (ni->vol->major_ver >= 2) {
if (ni->mft_no <= FILE_Extend)
need_set_archive_bit = FALSE;
} else {
if (ni->mft_no <= FILE_UpCase)
need_set_archive_bit = FALSE;
}
if (need_set_archive_bit) {
ni->file_attributes |= FILE_ATTR_ARCHIVE;
NInoSetDirtyFileAttributes(ni);
}
}
ntfs_debug("Done.");
} else
ntfs_debug("$DATA/%s attribute has already been unlinked from ""mft_no 0x%llx.", name,
(unsignedlonglong)sni->mft_no);
lck_rw_unlock_exclusive(&sni->lock);
lck_rw_unlock_exclusive(&ni->lock);
return 0;
}
staticstruct vnodeopv_entry_desc ntfs_vnodeop_entries[] = {
/*
* Set vn_default_error() to be our default vnop, thus any vnops we do
* not specify (or specify as NULL) will be set to it and this function
* just returns ENOTSUP.
*/
{ &vnop_default_desc, (vnop_t*)vn_default_error },
{ &vnop_strategy_desc, (vnop_t*)ntfs_vnop_strategy },
/*
* vn_bwrite() is a simple wrapper for buf_bwrite() which in turn uses
* VNOP_STRATEGY() and hence ntfs_vnop_strategy() to do the i/o and the
* latter handles all NTFS specifics thus we can simply use the generic
* vn_bwrite() for our VNOP_BWRITE() method.
*/
{ &vnop_bwrite_desc, (vnop_t*)vn_bwrite },
{ &vnop_lookup_desc, (vnop_t*)ntfs_vnop_lookup },
{ &vnop_create_desc, (vnop_t*)ntfs_vnop_create },
{ &vnop_mknod_desc, (vnop_t*)ntfs_vnop_mknod },
{ &vnop_open_desc, (vnop_t*)ntfs_vnop_open },
{ &vnop_close_desc, (vnop_t*)ntfs_vnop_close },
{ &vnop_access_desc, (vnop_t*)ntfs_vnop_access },
{ &vnop_getattr_desc, (vnop_t*)ntfs_vnop_getattr },
{ &vnop_setattr_desc, (vnop_t*)ntfs_vnop_setattr },
{ &vnop_read_desc, (vnop_t*)ntfs_vnop_read },
{ &vnop_write_desc, (vnop_t*)ntfs_vnop_write },
{ &vnop_ioctl_desc, (vnop_t*)ntfs_vnop_ioctl },
{ &vnop_select_desc, (vnop_t*)ntfs_vnop_select },
{ &vnop_exchange_desc, (vnop_t*)ntfs_vnop_exchange },
/* Let the VFS deal with revoking a vnode. */
{ &vnop_revoke_desc, (vnop_t*)nop_revoke },
{ &vnop_mmap_desc, (vnop_t*)ntfs_vnop_mmap },
{ &vnop_mnomap_desc, (vnop_t*)ntfs_vnop_mnomap },
{ &vnop_fsync_desc, (vnop_t*)ntfs_vnop_fsync },
{ &vnop_remove_desc, (vnop_t*)ntfs_vnop_remove },
{ &vnop_link_desc, (vnop_t*)ntfs_vnop_link },
{ &vnop_rename_desc, (vnop_t*)ntfs_vnop_rename },
{ &vnop_mkdir_desc, (vnop_t*)ntfs_vnop_mkdir },
{ &vnop_rmdir_desc, (vnop_t*)ntfs_vnop_rmdir },
{ &vnop_symlink_desc, (vnop_t*)ntfs_vnop_symlink },
{ &vnop_readdir_desc, (vnop_t*)ntfs_vnop_readdir },
{ &vnop_readdirattr_desc, (vnop_t*)ntfs_vnop_readdirattr },
{ &vnop_readlink_desc, (vnop_t*)ntfs_vnop_readlink },
{ &vnop_inactive_desc, (vnop_t*)ntfs_vnop_inactive },
{ &vnop_reclaim_desc, (vnop_t*)ntfs_vnop_reclaim },
{ &vnop_pathconf_desc, (vnop_t*)ntfs_vnop_pathconf },
/*
* Let the VFS deal with advisory locking for us, so our advlock method
* should never get called and if it were to get called for some
* reason, we make sure to return error (ENOTSUP).
*/
{ &vnop_advlock_desc, (vnop_t*)err_advlock },
{ &vnop_allocate_desc, (vnop_t*)ntfs_vnop_allocate },
{ &vnop_pagein_desc, (vnop_t*)ntfs_vnop_pagein },
{ &vnop_pageout_desc, (vnop_t*)ntfs_vnop_pageout },
{ &vnop_searchfs_desc, (vnop_t*)ntfs_vnop_searchfs },
/*
* Nothing supports copyfile in current xnu and it is not documented so
* we do not support it either.
*/
{ &vnop_copyfile_desc, (vnop_t*)err_copyfile },
{ &vnop_getxattr_desc, (vnop_t*)ntfs_vnop_getxattr },
{ &vnop_setxattr_desc, (vnop_t*)ntfs_vnop_setxattr },
{ &vnop_removexattr_desc, (vnop_t*)ntfs_vnop_removexattr },
{ &vnop_listxattr_desc, (vnop_t*)ntfs_vnop_listxattr },
{ &vnop_blktooff_desc, (vnop_t*)ntfs_vnop_blktooff },
{ &vnop_offtoblk_desc, (vnop_t*)ntfs_vnop_offtoblk },
{ &vnop_blockmap_desc, (vnop_t*)ntfs_vnop_blockmap },
{ &vnop_getnamedstream_desc, (vnop_t*)ntfs_vnop_getnamedstream },
{ &vnop_makenamedstream_desc, (vnop_t*)ntfs_vnop_makenamedstream },
{ &vnop_removenamedstream_desc, (vnop_t*)ntfs_vnop_removenamedstream },
{ NULL, (vnop_t*)NULL }
};
/*
 * Descriptor tying the NTFS vnode operations vector pointer (ntfs_vnodeop_p)
 * to the table of NTFS vnode operation entries (ntfs_vnodeop_entries).
 *
 * NOTE(review): presumably this is what is handed to the VFS when the file
 * system is registered so that it can fill in the vector - confirm against the
 * registration code.
 */
struct vnodeopv_desc ntfs_vnodeopv_desc = {
	&ntfs_vnodeop_p, ntfs_vnodeop_entries
};