| // Copyright 2021 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| use crate::security; |
| use crate::task::CurrentTask; |
| use crate::vfs::{ |
| CheckAccessReason, FileHandle, FileObject, FsNodeHandle, FsNodeLinkBehavior, FsStr, FsString, |
| MountInfo, Mounts, NamespaceNode, UnlinkKind, path, |
| }; |
| use bitflags::bitflags; |
| use fuchsia_rcu::{RcuOptionArc, RcuReadScope}; |
| use starnix_rcu::RcuString; |
| use starnix_sync::{FileOpsCore, LockEqualOrBefore, Locked, RwLock, RwLockWriteGuard}; |
| use starnix_uapi::auth::FsCred; |
| use starnix_uapi::errors::{ENOENT, Errno}; |
| use starnix_uapi::file_mode::{Access, FileMode}; |
| use starnix_uapi::inotify_mask::InotifyMask; |
| use starnix_uapi::open_flags::OpenFlags; |
| use starnix_uapi::{NAME_MAX, RENAME_EXCHANGE, RENAME_NOREPLACE, RENAME_WHITEOUT, errno, error}; |
| use std::collections::BTreeMap; |
| use std::collections::btree_map::Entry; |
| use std::fmt; |
| use std::ops::Deref; |
| use std::sync::atomic::{AtomicU8, Ordering}; |
| use std::sync::{Arc, Weak}; |
| #[cfg(detect_lock_cycles)] |
| use tracing_mutex::util; |
| |
| bitflags! { |
| #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] |
| pub struct RenameFlags: u32 { |
| // Exchange the entries. |
| const EXCHANGE = RENAME_EXCHANGE; |
| |
| // Don't overwrite an existing DirEntry. |
| const NOREPLACE = RENAME_NOREPLACE; |
| |
| // Create a "whiteout" object to replace the file. |
| const WHITEOUT = RENAME_WHITEOUT; |
| |
| // Allow replacing any file with a directory. This is an internal flag used only |
| // internally inside Starnix for OverlayFS. |
| const REPLACE_ANY = 0x80000000; |
| |
| // Internal flags that cannot be passed to `sys_rename()` |
| const INTERNAL = Self::REPLACE_ANY.bits(); |
| } |
| } |
| |
| pub trait DirEntryOps: Send + Sync + 'static { |
| /// Revalidate the [`DirEntry`], if needed. |
| /// |
| /// Most filesystems don't need to do any revalidations because they are "local" |
| /// and all changes to nodes go through the kernel. However some filesystems |
| /// allow changes to happen through other means (e.g. NFS, FUSE) and these |
| /// filesystems need a way to let the kernel know it may need to refresh its |
| /// cached metadata. This method provides that hook for such filesystems. |
| /// |
| /// For more details, see: |
| /// - https://www.halolinux.us/kernel-reference/the-dentry-cache.html |
| /// - https://www.kernel.org/doc/html/latest/filesystems/path-lookup.html#revalidation-and-automounts |
| /// - https://lwn.net/Articles/649115/ |
| /// - https://www.infradead.org/~mchehab/kernel_docs/filesystems/path-walking.html |
| /// |
| /// Returns `Ok(valid)` where `valid` indicates if the `DirEntry` is still valid, |
| /// or an error. |
| fn revalidate( |
| &self, |
| _locked: &mut Locked<FileOpsCore>, |
| _: &CurrentTask, |
| _: &DirEntry, |
| ) -> Result<bool, Errno> { |
| Ok(true) |
| } |
| } |
| |
| bitflags! { |
| #[derive(Clone, Copy, Debug, PartialEq, Eq)] |
| pub struct DirEntryFlags: u8 { |
| /// Whether this directory entry has been removed from the tree. |
| const IS_DEAD = 1 << 0; |
| |
| /// Whether the entry has filesystems mounted on top of it. |
| const HAS_MOUNTS = 1 << 1; |
| } |
| } |
| |
| pub struct DefaultDirEntryOps; |
| |
| impl DirEntryOps for DefaultDirEntryOps {} |
| |
| /// An entry in a directory. |
| /// |
| /// This structure assigns a name to an FsNode in a given file system. An |
| /// FsNode might have multiple directory entries, for example if there are more |
| /// than one hard link to the same FsNode. In those cases, each hard link will |
| /// have a different parent and a different local_name because each hard link |
| /// has its own DirEntry object. |
| /// |
| /// A directory cannot have more than one hard link, which means there is a |
| /// single DirEntry for each Directory FsNode. That invariant lets us store the |
| /// children for a directory in the DirEntry rather than in the FsNode. |
| pub struct DirEntry { |
| /// The FsNode referenced by this DirEntry. |
| /// |
| /// A given FsNode can be referenced by multiple DirEntry objects, for |
| /// example if there are multiple hard links to a given FsNode. |
| pub node: FsNodeHandle, |
| |
| /// The [`DirEntryOps`] for this `DirEntry`. |
| /// |
| /// The `DirEntryOps` are implemented by the individual file systems to provide |
| /// specific behaviours for this `DirEntry`. |
| ops: Box<dyn DirEntryOps>, |
| |
| /// The parent DirEntry. |
| /// |
| /// The DirEntry tree has strong references from child-to-parent and weak |
| /// references from parent-to-child. This design ensures that the parent |
| /// chain is always populated in the cache, but some children might be |
| /// missing from the cache. |
| parent: RcuOptionArc<DirEntry>, |
| |
| /// The [`DirEntryFlags`] for this `DirEntry`. |
| flags: AtomicU8, |
| |
| /// The name that this parent calls this child. |
| /// |
| /// This name might not be reflected in the full path in the namespace that |
| /// contains this DirEntry. For example, this DirEntry might be the root of |
| /// a chroot. |
| /// |
| /// Most callers that want to work with names for DirEntries should use the |
| /// NamespaceNodes. |
| local_name: RcuString, |
| |
| /// A partial cache of the children of this DirEntry. |
| /// |
| /// DirEntries are added to this cache when they are looked up and removed |
| /// when they are no longer referenced. |
| /// |
| // FIXME(b/379929394): The lock ordering here assumes parent-to-child lock acquisition, which |
| // a number of algorithms in the DirEntry operations also assume. This assumption can be broken |
| // by the rename operation, which can move nodes around the hierarchy. See the referenced bug |
| // for more details, the current mitigations, and potentials for long-term solutions. |
| children: RwLock<DirEntryChildren>, |
| } |
| type DirEntryChildren = BTreeMap<FsString, Weak<DirEntry>>; |
| |
| pub type DirEntryHandle = Arc<DirEntry>; |
| |
| impl DirEntry { |
| #[allow(clippy::let_and_return)] |
| pub fn new_uncached( |
| node: FsNodeHandle, |
| parent: Option<DirEntryHandle>, |
| local_name: FsString, |
| ) -> DirEntryHandle { |
| let ops = node.create_dir_entry_ops(); |
| let result = Arc::new(DirEntry { |
| node, |
| ops, |
| parent: RcuOptionArc::new(parent), |
| flags: Default::default(), |
| local_name: local_name.into(), |
| children: Default::default(), |
| }); |
| #[cfg(any(test, debug_assertions))] |
| { |
| // Taking this lock tells the lock tracing system about the parent/child ordering |
| // relation. |
| let _l1 = result.children.read(); |
| } |
| result |
| } |
| |
| pub fn new( |
| node: FsNodeHandle, |
| parent: Option<DirEntryHandle>, |
| local_name: FsString, |
| ) -> DirEntryHandle { |
| let result = Self::new_uncached(node, parent, local_name); |
| result.node.fs().did_create_dir_entry(&result); |
| result |
| } |
| |
| /// Returns a new DirEntry for the given `node` without parent. The entry has no local name and |
| /// is not cached. |
| pub fn new_unrooted(node: FsNodeHandle) -> DirEntryHandle { |
| Self::new_uncached(node, None, FsString::default()) |
| } |
| |
| /// Returns a new `DirEntry` that is ready marked as having been deleted. |
| pub fn new_deleted( |
| node: FsNodeHandle, |
| parent: Option<DirEntryHandle>, |
| local_name: FsString, |
| ) -> DirEntryHandle { |
| let entry = DirEntry::new_uncached(node, parent, local_name); |
| entry.raise_flags(DirEntryFlags::IS_DEAD); |
| entry |
| } |
| |
| /// Returns a file handle to this entry, associated with an anonymous namespace. |
| pub fn open_anonymous<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| flags: OpenFlags, |
| ) -> Result<FileHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| let ops = self.node.create_file_ops(locked, current_task, flags)?; |
| FileObject::new( |
| locked, |
| current_task, |
| ops, |
| NamespaceNode::new_anonymous(self.clone()), |
| flags, |
| ) |
| } |
| |
| /// Set the children of this DirEntry to the given `children`. This should only ever be called |
| /// when children is empty. |
| pub fn set_children(self: &DirEntryHandle, children: BTreeMap<FsString, DirEntryHandle>) { |
| let mut dir_entry_children = self.lock_children(); |
| assert!(dir_entry_children.children.is_empty()); |
| for (name, child) in children.into_iter() { |
| child.set_parent(self.clone()); |
| dir_entry_children.children.insert(name, Arc::downgrade(&child)); |
| } |
| } |
| |
| fn lock_children<'a>(self: &'a DirEntryHandle) -> DirEntryLockedChildren<'a> { |
| DirEntryLockedChildren { entry: self, children: self.children.write() } |
| } |
| |
| /// The parent DirEntry. |
| pub fn parent(&self) -> Option<DirEntryHandle> { |
| self.parent.to_option_arc() |
| } |
| |
| /// Returns a reference to the parent DirEntry. |
| /// |
| /// The reference is only valid for the duration of the RCU read scope. |
| pub fn parent_ref<'a>(&'a self, scope: &'a RcuReadScope) -> Option<&'a DirEntry> { |
| self.parent.as_ref(scope) |
| } |
| |
| /// Set the parent of this DirEntry. |
| pub fn set_parent(&self, parent: DirEntryHandle) { |
| self.parent.update(Some(parent)); |
| } |
| |
| /// The parent DirEntry object or this DirEntry if this entry is the root. |
| /// |
| /// Useful when traversing up the tree if you always want to find a parent |
| /// (e.g., for ".."). |
| /// |
| /// Be aware that the root of one file system might be mounted as a child |
| /// in another file system. For that reason, consider walking the |
| /// NamespaceNode tree (which understands mounts) rather than the DirEntry |
| /// tree. |
| pub fn parent_or_self(self: &DirEntryHandle) -> DirEntryHandle { |
| self.parent().unwrap_or_else(|| self.clone()) |
| } |
| |
| /// The name that this parent calls this child. |
| /// |
| /// The reference is only valid for the duration of the RCU read scope. |
| pub fn local_name<'a>(&self, scope: &'a RcuReadScope) -> &'a FsStr { |
| self.local_name.read(scope) |
| } |
| |
| /// Whether the given name has special semantics as a directory entry. |
| /// |
| /// Specifically, whether the name is empty (which means "self"), dot |
| /// (which also means "self"), or dot dot (which means "parent"). |
| pub fn is_reserved_name(name: &FsStr) -> bool { |
| name.is_empty() || name == "." || name == ".." |
| } |
| |
| /// Returns the flags of this DirEntry. |
| pub fn flags(&self) -> DirEntryFlags { |
| DirEntryFlags::from_bits_truncate(self.flags.load(Ordering::Acquire)) |
| } |
| |
| /// Raises the flags of this DirEntry. |
| /// |
| /// Returns the flags of this DirEntry before the flags were raised. |
| pub fn raise_flags(&self, flags: DirEntryFlags) -> DirEntryFlags { |
| let old_flags = self.flags.fetch_or(flags.bits(), Ordering::AcqRel); |
| DirEntryFlags::from_bits_truncate(old_flags) |
| } |
| |
| /// Lowers the flags of this DirEntry. |
| /// |
| /// Returns the flags of this DirEntry before the flags were lowered. |
| pub fn lower_flags(&self, flags: DirEntryFlags) -> DirEntryFlags { |
| let old_flags = self.flags.fetch_and(!flags.bits(), Ordering::AcqRel); |
| DirEntryFlags::from_bits_truncate(old_flags) |
| } |
| |
| /// Returns true if this DirEntry is dead. |
| pub fn is_dead(&self) -> bool { |
| self.flags().contains(DirEntryFlags::IS_DEAD) |
| } |
| |
| /// Look up a directory entry with the given name as direct child of this |
| /// entry. |
| pub fn component_lookup<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| ) -> Result<DirEntryHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| let (node, _) = self.get_or_create_child( |
| locked, |
| current_task, |
| mount, |
| name, |
| |locked, d, mount, name| d.lookup(locked, current_task, mount, name), |
| )?; |
| Ok(node) |
| } |
| |
| /// Creates a new DirEntry |
| /// |
| /// The create_node_fn function is called to create the underlying FsNode |
| /// for the DirEntry. |
| /// |
| /// If the entry already exists, create_node_fn is not called, and EEXIST is |
| /// returned. |
| pub fn create_entry<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| create_node_fn: impl FnOnce( |
| &mut Locked<L>, |
| &FsNodeHandle, |
| &MountInfo, |
| &FsStr, |
| ) -> Result<FsNodeHandle, Errno>, |
| ) -> Result<DirEntryHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| let (entry, exists) = |
| self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?; |
| if exists { |
| return error!(EEXIST); |
| } |
| Ok(entry) |
| } |
| |
| /// Creates a new DirEntry. Works just like create_entry, except if the entry already exists, |
| /// it is returned. |
| pub fn get_or_create_entry<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| create_node_fn: impl FnOnce( |
| &mut Locked<L>, |
| &FsNodeHandle, |
| &MountInfo, |
| &FsStr, |
| ) -> Result<FsNodeHandle, Errno>, |
| ) -> Result<DirEntryHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| let (entry, _exists) = |
| self.create_entry_internal(locked, current_task, mount, name, create_node_fn)?; |
| Ok(entry) |
| } |
| |
| fn create_entry_internal<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| create_node_fn: impl FnOnce( |
| &mut Locked<L>, |
| &FsNodeHandle, |
| &MountInfo, |
| &FsStr, |
| ) -> Result<FsNodeHandle, Errno>, |
| ) -> Result<(DirEntryHandle, bool), Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| if DirEntry::is_reserved_name(name) { |
| return error!(EEXIST); |
| } |
| // TODO: Do we need to check name for embedded NUL characters? |
| if name.len() > NAME_MAX as usize { |
| return error!(ENAMETOOLONG); |
| } |
| if name.contains(&path::SEPARATOR) { |
| return error!(EINVAL); |
| } |
| let (entry, exists) = |
| self.get_or_create_child(locked, current_task, mount, name, create_node_fn)?; |
| if !exists { |
| // An entry was created. Update the ctime and mtime of this directory. |
| self.node.update_ctime_mtime(); |
| entry.notify_creation(); |
| } |
| Ok((entry, exists)) |
| } |
| |
| // This is marked as test-only because it sets the owner/group to root instead of the current |
| // user to save a bit of typing in tests, but this shouldn't happen silently in production. |
| #[cfg(test)] |
| pub fn create_dir<L>( |
| self: &DirEntryHandle, |
| locked: &mut starnix_sync::Locked<L>, |
| current_task: &CurrentTask, |
| name: &FsStr, |
| ) -> Result<DirEntryHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| self.create_dir_for_testing(locked, current_task, name) |
| } |
| |
| // This function is for testing because it sets the owner/group to root instead of the current |
| // user to save a bit of typing in tests, but this shouldn't happen silently in production. |
| pub fn create_dir_for_testing<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| name: &FsStr, |
| ) -> Result<DirEntryHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| // TODO: apply_umask |
| self.create_entry( |
| locked, |
| current_task, |
| &MountInfo::detached(), |
| name, |
| |locked, dir, mount, name| { |
| dir.create_node( |
| locked, |
| current_task, |
| mount, |
| name, |
| starnix_uapi::file_mode::mode!(IFDIR, 0o777), |
| starnix_uapi::device_type::DeviceType::NONE, |
| FsCred::root(), |
| ) |
| }, |
| ) |
| } |
| |
| /// Creates an anonymous file. |
| /// |
| /// The FileMode::IFMT of the FileMode is always FileMode::IFREG. |
| /// |
| /// Used by O_TMPFILE. |
| pub fn create_tmpfile<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| mode: FileMode, |
| owner: FsCred, |
| flags: OpenFlags, |
| ) -> Result<DirEntryHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| // Only directories can have children. |
| if !self.node.is_dir() { |
| return error!(ENOTDIR); |
| } |
| assert!(mode.is_reg()); |
| |
| // From <https://man7.org/linux/man-pages/man2/open.2.html>: |
| // |
| // Specifying O_EXCL in conjunction with O_TMPFILE prevents a |
| // temporary file from being linked into the filesystem in |
| // the above manner. (Note that the meaning of O_EXCL in |
| // this case is different from the meaning of O_EXCL |
| // otherwise.) |
| let link_behavior = if flags.contains(OpenFlags::EXCL) { |
| FsNodeLinkBehavior::Disallowed |
| } else { |
| FsNodeLinkBehavior::Allowed |
| }; |
| |
| let node = |
| self.node.create_tmpfile(locked, current_task, mount, mode, owner, link_behavior)?; |
| let local_name = format!("#{}", node.ino).into(); |
| Ok(DirEntry::new_deleted(node, Some(self.clone()), local_name)) |
| } |
| |
| pub fn unlink<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| kind: UnlinkKind, |
| must_be_directory: bool, |
| ) -> Result<(), Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| assert!(!DirEntry::is_reserved_name(name)); |
| |
| // child_to_unlink *must* be dropped after self_children (even in the error paths). |
| let child_to_unlink; |
| |
| let mut self_children = self.lock_children(); |
| child_to_unlink = self_children.component_lookup(locked, current_task, mount, name)?; |
| child_to_unlink.require_no_mounts(mount)?; |
| |
| // Check that this filesystem entry must be a directory. This can |
| // happen if the path terminates with a trailing slash. |
| // |
| // Example: If we're unlinking a symlink `/foo/bar/`, this would |
| // result in `ENOTDIR` because of the trailing slash, even if |
| // `UnlinkKind::NonDirectory` was used. |
| if must_be_directory && !child_to_unlink.node.is_dir() { |
| return error!(ENOTDIR); |
| } |
| |
| match kind { |
| UnlinkKind::Directory => { |
| if !child_to_unlink.node.is_dir() { |
| return error!(ENOTDIR); |
| } |
| } |
| UnlinkKind::NonDirectory => { |
| if child_to_unlink.node.is_dir() { |
| return error!(EISDIR); |
| } |
| } |
| } |
| |
| self.node.unlink(locked, current_task, mount, name, &child_to_unlink.node)?; |
| self_children.children.remove(name); |
| |
| std::mem::drop(self_children); |
| child_to_unlink.destroy(¤t_task.kernel().mounts); |
| |
| Ok(()) |
| } |
| |
| /// Destroy this directory entry. |
| /// |
| /// Notice that this method takes `self` by value to destroy this reference. |
| fn destroy(self: DirEntryHandle, mounts: &Mounts) { |
| let was_already_dead = |
| self.raise_flags(DirEntryFlags::IS_DEAD).contains(DirEntryFlags::IS_DEAD); |
| if was_already_dead { |
| return; |
| } |
| let unmount = |
| self.lower_flags(DirEntryFlags::HAS_MOUNTS).contains(DirEntryFlags::HAS_MOUNTS); |
| self.node.fs().will_destroy_dir_entry(&self); |
| if unmount { |
| mounts.unmount(&self); |
| } |
| self.notify_deletion(); |
| } |
| |
| /// Returns whether this entry is a descendant of |other|. |
| pub fn is_descendant_of(self: &DirEntryHandle, other: &DirEntryHandle) -> bool { |
| let scope = RcuReadScope::new(); |
| let mut current = self.deref(); |
| loop { |
| if std::ptr::eq(current, other.deref()) { |
| // We found |other|. |
| return true; |
| } |
| if let Some(parent) = current.parent_ref(&scope) { |
| current = parent; |
| } else { |
| // We reached the root of the file system. |
| return false; |
| } |
| } |
| } |
| |
| /// Rename the file with old_basename in old_parent to new_basename in |
| /// new_parent. |
| /// |
| /// old_parent and new_parent must belong to the same file system. |
| pub fn rename<L>( |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| old_parent: &DirEntryHandle, |
| old_mount: &MountInfo, |
| old_basename: &FsStr, |
| new_parent: &DirEntryHandle, |
| new_mount: &MountInfo, |
| new_basename: &FsStr, |
| flags: RenameFlags, |
| ) -> Result<(), Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| // The nodes we are touching must be part of the same mount. |
| if old_mount != new_mount { |
| return error!(EXDEV); |
| } |
| // The mounts are equals, choose one. |
| let mount = old_mount; |
| |
| // If either the old_basename or the new_basename is a reserved name |
| // (e.g., "." or ".."), then we cannot do the rename. |
| if DirEntry::is_reserved_name(old_basename) || DirEntry::is_reserved_name(new_basename) { |
| if flags.contains(RenameFlags::NOREPLACE) { |
| return error!(EEXIST); |
| } |
| return error!(EBUSY); |
| } |
| |
| // If the names and parents are the same, then there's nothing to do |
| // and we can report success. |
| if Arc::ptr_eq(&old_parent.node, &new_parent.node) && old_basename == new_basename { |
| return Ok(()); |
| } |
| |
| // This task must have write access to the old and new parent nodes. |
| old_parent.node.check_access( |
| locked, |
| current_task, |
| mount, |
| Access::WRITE, |
| CheckAccessReason::InternalPermissionChecks, |
| old_parent, |
| )?; |
| new_parent.node.check_access( |
| locked, |
| current_task, |
| mount, |
| Access::WRITE, |
| CheckAccessReason::InternalPermissionChecks, |
| new_parent, |
| )?; |
| |
| // The mount check ensures that the nodes we're touching are part of the |
| // same file system. It doesn't matter where we grab the FileSystem reference from. |
| let fs = old_parent.node.fs(); |
| |
| // We need to hold these DirEntryHandles until after we drop all the |
| // locks so that we do not deadlock when we drop them. |
| let renamed; |
| let mut maybe_replaced = None; |
| |
| { |
| // Before we take any locks, we need to take the rename mutex on |
| // the file system. This lock ensures that no other rename |
| // operations are happening in this file system while we're |
| // analyzing this rename operation. |
| // |
| // For example, we grab writer locks on both old_parent and |
| // new_parent. If there was another rename operation in flight with |
| // old_parent and new_parent reversed, then we could deadlock while |
| // trying to acquire these locks. |
| let _lock = fs.rename_mutex.lock(); |
| |
| // Compute the list of ancestors of new_parent to check whether new_parent is a |
| // descendant of the renamed node. This must be computed before taking any lock to |
| // prevent lock inversions. |
| // |
| // TODO: This walk is now lock-free, which means we don't need to materialize this |
| // list. We can just walk the tree and check for cycles. |
| let mut new_parent_ancestor_list = Vec::<DirEntryHandle>::new(); |
| { |
| let mut current = Some(new_parent.clone()); |
| while let Some(entry) = current { |
| current = entry.parent(); |
| new_parent_ancestor_list.push(entry); |
| } |
| } |
| |
| // We cannot simply grab the locks on old_parent and new_parent |
| // independently because old_parent and new_parent might be the |
| // same directory entry. Instead, we use the RenameGuard helper to |
| // grab the appropriate locks. |
| let mut state = RenameGuard::lock(old_parent, new_parent); |
| |
| // Now that we know the old_parent child list cannot change, we |
| // establish the DirEntry that we are going to try to rename. |
| renamed = |
| state.old_parent().component_lookup(locked, current_task, mount, old_basename)?; |
| |
| // Check whether the sticky bit on the old parent prevents us from |
| // removing this child. |
| old_parent.node.check_sticky_bit(current_task, &renamed.node)?; |
| |
| // If new_parent is a descendant of renamed, the operation would |
| // create a cycle. That's disallowed. |
| if new_parent_ancestor_list.into_iter().any(|entry| Arc::ptr_eq(&entry, &renamed)) { |
| return error!(EINVAL); |
| } |
| |
| // Check whether the renamed entry is a mountpoint. |
| // TODO: We should hold a read lock on the mount points for this |
| // namespace to prevent the child from becoming a mount point |
| // while this function is executing. |
| renamed.require_no_mounts(mount)?; |
| |
| // We need to check if there is already a DirEntry with |
| // new_basename in new_parent. If so, there are additional checks |
| // we need to perform. |
| match state.new_parent().component_lookup(locked, current_task, mount, new_basename) { |
| Ok(replaced) => { |
| // Set `maybe_replaced` now to ensure it gets dropped in the right order. |
| let replaced = maybe_replaced.insert(replaced); |
| |
| if flags.contains(RenameFlags::NOREPLACE) { |
| return error!(EEXIST); |
| } |
| |
| // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html: |
| // |
| // "If oldpath and newpath are existing hard links referring to the |
| // same file, then rename() does nothing, and returns a success |
| // status." |
| if Arc::ptr_eq(&renamed.node, &replaced.node) { |
| return Ok(()); |
| } |
| |
| // Sayeth https://man7.org/linux/man-pages/man2/rename.2.html: |
| // |
| // "oldpath can specify a directory. In this case, newpath must" |
| // either not exist, or it must specify an empty directory." |
| if replaced.node.is_dir() { |
| // Check whether the replaced entry is a mountpoint. |
| // TODO: We should hold a read lock on the mount points for this |
| // namespace to prevent the child from becoming a mount point |
| // while this function is executing. |
| replaced.require_no_mounts(mount)?; |
| } |
| |
| if !flags.intersects(RenameFlags::EXCHANGE | RenameFlags::REPLACE_ANY) { |
| if renamed.node.is_dir() && !replaced.node.is_dir() { |
| return error!(ENOTDIR); |
| } else if !renamed.node.is_dir() && replaced.node.is_dir() { |
| return error!(EISDIR); |
| } |
| } |
| } |
| // It's fine for the lookup to fail to find a child. |
| Err(errno) if errno == ENOENT => {} |
| // However, other errors are fatal. |
| Err(e) => return Err(e), |
| } |
| |
| security::check_fs_node_rename_access( |
| current_task, |
| &old_parent.node, |
| &renamed.node, |
| &new_parent.node, |
| maybe_replaced.as_ref().map(|dir_entry| dir_entry.node.deref().as_ref()), |
| old_basename, |
| new_basename, |
| )?; |
| |
| // We've found all the errors that we know how to find. Ask the |
| // file system to actually execute the rename operation. Once the |
| // file system has executed the rename, we are no longer allowed to |
| // fail because we will not be able to return the system to a |
| // consistent state. |
| |
| if flags.contains(RenameFlags::EXCHANGE) { |
| let replaced = maybe_replaced.as_ref().ok_or_else(|| errno!(ENOENT))?; |
| fs.exchange( |
| current_task, |
| &renamed.node, |
| &old_parent.node, |
| old_basename, |
| &replaced.node, |
| &new_parent.node, |
| new_basename, |
| )?; |
| } else { |
| fs.rename( |
| locked, |
| current_task, |
| &old_parent.node, |
| old_basename, |
| &new_parent.node, |
| new_basename, |
| &renamed.node, |
| maybe_replaced.as_ref().map(|replaced| &replaced.node), |
| )?; |
| } |
| |
| // We need to update the parent and local name for the DirEntry |
| // we are renaming to reflect its new parent and its new name. |
| renamed.set_parent(new_parent.clone()); |
| renamed.local_name.update(new_basename.to_owned()); |
| |
| // Actually add the renamed child to the new_parent's child list. |
| // This operation implicitly removes the replaced child (if any) |
| // from the child list. |
| state.new_parent().children.insert(new_basename.into(), Arc::downgrade(&renamed)); |
| |
| #[cfg(detect_lock_cycles)] |
| unsafe { |
| // Lock ordering is enforced from parent-to-child, and therefore we need to |
| // reset the lock ordering constraints when we reorder the tree nodes. |
| util::reset_dependencies(renamed.children.raw()); |
| } |
| |
| if flags.contains(RenameFlags::EXCHANGE) { |
| // Reparent `replaced` when exchanging. |
| let replaced = |
| maybe_replaced.as_ref().expect("replaced expected with RENAME_EXCHANGE"); |
| replaced.set_parent(old_parent.clone()); |
| replaced.local_name.update(old_basename.to_owned()); |
| state.old_parent().children.insert(old_basename.into(), Arc::downgrade(replaced)); |
| |
| #[cfg(detect_lock_cycles)] |
| unsafe { |
| // Lock ordering is enforced from parent-to-child, and therefore we need to |
| // reset the lock ordering constraints when we reorder the tree nodes. |
| util::reset_dependencies(replaced.children.raw()); |
| } |
| } else { |
| // Remove the renamed child from the old_parent's child list. |
| state.old_parent().children.remove(old_basename); |
| } |
| }; |
| |
| fs.purge_old_entries(); |
| |
| if let Some(replaced) = maybe_replaced { |
| if !flags.contains(RenameFlags::EXCHANGE) { |
| replaced.destroy(¤t_task.kernel().mounts); |
| } |
| } |
| |
| // Renaming a file updates its ctime. |
| renamed.node.update_ctime(); |
| |
| let mode = renamed.node.info().mode; |
| let cookie = current_task.kernel().get_next_inotify_cookie(); |
| old_parent.node.notify(InotifyMask::MOVE_FROM, cookie, old_basename, mode, false); |
| new_parent.node.notify(InotifyMask::MOVE_TO, cookie, new_basename, mode, false); |
| renamed.node.notify(InotifyMask::MOVE_SELF, 0, Default::default(), mode, false); |
| |
| Ok(()) |
| } |
| |
| pub fn get_children<F, T>(&self, callback: F) -> T |
| where |
| F: FnOnce(&DirEntryChildren) -> T, |
| { |
| let children = self.children.read(); |
| callback(&children) |
| } |
| |
| /// Remove the child with the given name from the children cache. The child must not have any |
| /// mounts. |
| pub fn remove_child(&self, name: &FsStr, mounts: &Mounts) { |
| let mut children = self.children.write(); |
| let child = children.get(name).and_then(Weak::upgrade); |
| if let Some(child) = child { |
| children.remove(name); |
| std::mem::drop(children); |
| child.destroy(mounts); |
| } |
| } |
| |
| fn get_or_create_child<L>( |
| self: &DirEntryHandle, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| create_fn: impl FnOnce( |
| &mut Locked<L>, |
| &FsNodeHandle, |
| &MountInfo, |
| &FsStr, |
| ) -> Result<FsNodeHandle, Errno>, |
| ) -> Result<(DirEntryHandle, bool), Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| assert!(!DirEntry::is_reserved_name(name)); |
| // Only directories can have children. |
| if !self.node.is_dir() { |
| return error!(ENOTDIR); |
| } |
| // The user must be able to search the directory (requires the EXEC permission) |
| self.node.check_access( |
| locked, |
| current_task, |
| mount, |
| Access::EXEC, |
| CheckAccessReason::InternalPermissionChecks, |
| self, |
| )?; |
| |
| // Check if the child is already in children. In that case, we can |
| // simply return the child and we do not need to call init_fn. |
| let child = self.children.read().get(name).and_then(Weak::upgrade); |
| let (child, create_result) = if let Some(child) = child { |
| // Do not cache a child in a locked directory |
| if self.node.fail_if_locked(current_task).is_ok() { |
| child.node.fs().did_access_dir_entry(&child); |
| } |
| (child, CreationResult::Existed { create_fn }) |
| } else { |
| let (child, create_result) = self.lock_children().get_or_create_child( |
| locked, |
| current_task, |
| mount, |
| name, |
| create_fn, |
| )?; |
| child.node.fs().purge_old_entries(); |
| (child, create_result) |
| }; |
| |
| let (child, exists) = match create_result { |
| CreationResult::Created => (child, false), |
| CreationResult::Existed { create_fn } => { |
| if child.ops.revalidate( |
| locked.cast_locked::<FileOpsCore>(), |
| current_task, |
| &child, |
| )? { |
| (child, true) |
| } else { |
| self.internal_remove_child(&child); |
| child.destroy(¤t_task.kernel().mounts); |
| |
| let (child, create_result) = self.lock_children().get_or_create_child( |
| locked, |
| current_task, |
| mount, |
| name, |
| create_fn, |
| )?; |
| child.node.fs().purge_old_entries(); |
| (child, matches!(create_result, CreationResult::Existed { .. })) |
| } |
| } |
| }; |
| |
| Ok((child, exists)) |
| } |
| |
| // This function is only useful for tests and has some oddities. |
| // |
| // For example, not all the children might have been looked up yet, which |
| // means the returned vector could be missing some names. |
| // |
| // Also, the vector might have "extra" names that are in the process of |
| // being looked up. If the lookup fails, they'll be removed. |
| #[cfg(test)] |
| pub fn copy_child_names(&self) -> Vec<FsString> { |
| let scope = RcuReadScope::new(); |
| self.children |
| .read() |
| .values() |
| .filter_map(|child| Weak::upgrade(child).map(|c| c.local_name.read(&scope).to_owned())) |
| .collect() |
| } |
| |
| fn internal_remove_child(&self, child: &DirEntry) { |
| let mut children = self.children.write(); |
| let scope = RcuReadScope::new(); |
| let local_name = child.local_name.read(&scope); |
| if let Some(weak_child) = children.get(local_name) { |
| // If this entry is occupied, we need to check whether child is |
| // the current occupant. If so, we should remove the entry |
| // because the child no longer exists. |
| if std::ptr::eq(weak_child.as_ptr(), child) { |
| children.remove(local_name); |
| } |
| } |
| } |
| |
| /// Notifies watchers on the current node and its parent about an event. |
| pub fn notify(&self, event_mask: InotifyMask) { |
| self.notify_watchers(event_mask, self.is_dead()); |
| } |
| |
| /// Notifies watchers on the current node and its parent about an event. |
| /// |
| /// Used for FSNOTIFY_EVENT_INODE events, which ignore IN_EXCL_UNLINK. |
| pub fn notify_ignoring_excl_unlink(&self, event_mask: InotifyMask) { |
| // We pretend that this directory entry is not dead to ignore IN_EXCL_UNLINK. |
| self.notify_watchers(event_mask, false); |
| } |
| |
| fn notify_watchers(&self, event_mask: InotifyMask, is_dead: bool) { |
| let mode = self.node.info().mode; |
| { |
| let scope = RcuReadScope::new(); |
| if let Some(parent) = self.parent_ref(&scope) { |
| let local_name = self.local_name.read(&scope); |
| parent.node.notify(event_mask, 0, local_name, mode, is_dead); |
| } |
| } |
| self.node.notify(event_mask, 0, Default::default(), mode, is_dead); |
| } |
| |
| /// Notifies parents about creation, and notifies current node about link_count change. |
| fn notify_creation(&self) { |
| let mode = self.node.info().mode; |
| if Arc::strong_count(&self.node) > 1 { |
| // Notify about link change only if there is already a hardlink. |
| self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false); |
| } |
| let scope = RcuReadScope::new(); |
| if let Some(parent) = self.parent_ref(&scope) { |
| let local_name = self.local_name.read(&scope); |
| parent.node.notify(InotifyMask::CREATE, 0, local_name, mode, false); |
| } |
| } |
| |
| /// Notifies watchers on the current node about deletion if this is the |
| /// last hardlink, and drops the DirEntryHandle kept by Inotify. |
| /// Parent is also notified about deletion. |
| fn notify_deletion(&self) { |
| let mode = self.node.info().mode; |
| if !mode.is_dir() { |
| // Linux notifies link count change for non-directories. |
| self.node.notify(InotifyMask::ATTRIB, 0, Default::default(), mode, false); |
| } |
| |
| let scope = RcuReadScope::new(); |
| if let Some(parent) = self.parent_ref(&scope) { |
| let local_name = self.local_name.read(&scope); |
| parent.node.notify(InotifyMask::DELETE, 0, local_name, mode, false); |
| } |
| |
| // This check is incorrect if there's another hard link to this FsNode that isn't in |
| // memory at the moment. |
| if Arc::strong_count(&self.node) == 1 { |
| self.node.notify(InotifyMask::DELETE_SELF, 0, Default::default(), mode, false); |
| } |
| } |
| |
| /// Returns true if this entry has mounts. |
| pub fn has_mounts(&self) -> bool { |
| self.flags().contains(DirEntryFlags::HAS_MOUNTS) |
| } |
| |
| /// Records whether or not the entry has mounts. |
| pub fn set_has_mounts(&self, v: bool) { |
| if v { |
| self.raise_flags(DirEntryFlags::HAS_MOUNTS); |
| } else { |
| self.lower_flags(DirEntryFlags::HAS_MOUNTS); |
| } |
| } |
| |
| /// Verifies this directory has nothing mounted on it. |
| fn require_no_mounts(self: &Arc<Self>, parent_mount: &MountInfo) -> Result<(), Errno> { |
| if self.has_mounts() { |
| if let Some(mount) = parent_mount.as_ref() { |
| if mount.read().has_submount(self) { |
| return error!(EBUSY); |
| } |
| } |
| } |
| Ok(()) |
| } |
| } |
| |
| struct DirEntryLockedChildren<'a> { |
| entry: &'a DirEntryHandle, |
| children: RwLockWriteGuard<'a, DirEntryChildren>, |
| } |
| |
| enum CreationResult<F> { |
| Created, |
| Existed { create_fn: F }, |
| } |
| |
| impl<'a> DirEntryLockedChildren<'a> { |
| fn component_lookup<L>( |
| &mut self, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| ) -> Result<DirEntryHandle, Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| assert!(!DirEntry::is_reserved_name(name)); |
| let (node, _) = |
| self.get_or_create_child(locked, current_task, mount, name, |_, _, _, _| { |
| error!(ENOENT) |
| })?; |
| Ok(node) |
| } |
| |
| fn get_or_create_child< |
| L, |
| F: FnOnce(&mut Locked<L>, &FsNodeHandle, &MountInfo, &FsStr) -> Result<FsNodeHandle, Errno>, |
| >( |
| &mut self, |
| locked: &mut Locked<L>, |
| current_task: &CurrentTask, |
| mount: &MountInfo, |
| name: &FsStr, |
| create_fn: F, |
| ) -> Result<(DirEntryHandle, CreationResult<F>), Errno> |
| where |
| L: LockEqualOrBefore<FileOpsCore>, |
| { |
| let create_child = |locked: &mut Locked<L>, create_fn: F| { |
| // Before creating the child, check for existence. |
| let (node, create_result) = |
| match self.entry.node.lookup(locked, current_task, mount, name) { |
| Ok(node) => (node, CreationResult::Existed { create_fn }), |
| Err(e) if e == ENOENT => { |
| (create_fn(locked, &self.entry.node, mount, name)?, CreationResult::Created) |
| } |
| Err(e) => return Err(e), |
| }; |
| |
| assert!( |
| node.info().mode & FileMode::IFMT != FileMode::EMPTY, |
| "FsNode initialization did not populate the FileMode in FsNodeInfo." |
| ); |
| |
| let entry = DirEntry::new(node, Some(self.entry.clone()), name.to_owned()); |
| Ok((entry, create_result)) |
| }; |
| |
| let (child, create_result) = match self.children.entry(name.to_owned()) { |
| Entry::Vacant(entry) => { |
| let (child, create_result) = create_child(locked, create_fn)?; |
| // Do not cache a child in a locked directory |
| if self.entry.node.fail_if_locked(current_task).is_ok() { |
| entry.insert(Arc::downgrade(&child)); |
| } |
| (child, create_result) |
| } |
| Entry::Occupied(mut entry) => { |
| // It's possible that the upgrade will succeed this time around because we dropped |
| // the read lock before acquiring the write lock. Another thread might have |
| // populated this entry while we were not holding any locks. |
| if let Some(child) = Weak::upgrade(entry.get()) { |
| // Do not cache a child in a locked directory |
| if self.entry.node.fail_if_locked(current_task).is_ok() { |
| child.node.fs().did_access_dir_entry(&child); |
| } |
| return Ok((child, CreationResult::Existed { create_fn })); |
| } |
| let (child, create_result) = create_child(locked, create_fn)?; |
| // Do not cache a child in a locked directory |
| if self.entry.node.fail_if_locked(current_task).is_ok() { |
| entry.insert(Arc::downgrade(&child)); |
| } |
| (child, create_result) |
| } |
| }; |
| |
| security::fs_node_init_with_dentry(locked, current_task, &child)?; |
| |
| Ok((child, create_result)) |
| } |
| } |
| |
| impl fmt::Debug for DirEntry { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| let scope = RcuReadScope::new(); |
| let mut parents = vec![]; |
| let mut maybe_parent = self.parent_ref(&scope); |
| while let Some(parent) = maybe_parent { |
| parents.push(parent.local_name.read(&scope)); |
| maybe_parent = parent.parent_ref(&scope); |
| } |
| let mut builder = f.debug_struct("DirEntry"); |
| builder.field("id", &(self as *const DirEntry)); |
| builder.field("local_name", &self.local_name.read(&scope).to_owned()); |
| if !parents.is_empty() { |
| builder.field("parents", &parents); |
| } |
| builder.finish() |
| } |
| } |
| |
| struct RenameGuard<'a> { |
| old_parent_guard: DirEntryLockedChildren<'a>, |
| new_parent_guard: Option<DirEntryLockedChildren<'a>>, |
| } |
| |
| impl<'a> RenameGuard<'a> { |
| fn lock(old_parent: &'a DirEntryHandle, new_parent: &'a DirEntryHandle) -> Self { |
| if Arc::ptr_eq(old_parent, new_parent) { |
| Self { old_parent_guard: old_parent.lock_children(), new_parent_guard: None } |
| } else { |
| // Following gVisor, these locks are taken in ancestor-to-descendant order. |
| // Moreover, if the node are not comparable, they are taken from smallest inode to |
| // biggest. |
| if new_parent.is_descendant_of(old_parent) |
| || (!old_parent.is_descendant_of(new_parent) |
| && old_parent.node.node_key() < new_parent.node.node_key()) |
| { |
| let old_parent_guard = old_parent.lock_children(); |
| let new_parent_guard = new_parent.lock_children(); |
| Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) } |
| } else { |
| let new_parent_guard = new_parent.lock_children(); |
| let old_parent_guard = old_parent.lock_children(); |
| Self { old_parent_guard, new_parent_guard: Some(new_parent_guard) } |
| } |
| } |
| } |
| |
| fn old_parent(&mut self) -> &mut DirEntryLockedChildren<'a> { |
| &mut self.old_parent_guard |
| } |
| |
| fn new_parent(&mut self) -> &mut DirEntryLockedChildren<'a> { |
| if let Some(new_guard) = self.new_parent_guard.as_mut() { |
| new_guard |
| } else { |
| &mut self.old_parent_guard |
| } |
| } |
| } |
| |
| /// The Drop trait for DirEntry removes the entry from the child list of the |
| /// parent entry, which means we cannot drop DirEntry objects while holding a |
| /// lock on the parent's child list. |
| impl Drop for DirEntry { |
| fn drop(&mut self) { |
| let scope = RcuReadScope::new(); |
| let maybe_parent = self.parent_ref(&scope); |
| self.parent.update(None); |
| if let Some(parent) = maybe_parent { |
| parent.internal_remove_child(self); |
| } |
| } |
| } |