Centos-kernel-stream-9/fs/kernfs/inode.c

455 lines
11 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/kernfs/inode.c - kernfs inode implementation
*
* Copyright (c) 2001-3 Patrick Mochel
* Copyright (c) 2007 SUSE Linux Products GmbH
* Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
*/
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include "kernfs-internal.h"
static const struct inode_operations kernfs_iops = {
.permission = kernfs_iop_permission,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
.listxattr = kernfs_iop_listxattr,
};
static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
{
static DEFINE_MUTEX(iattr_mutex);
struct kernfs_iattrs *ret;
mutex_lock(&iattr_mutex);
if (kn->iattr || !alloc)
goto out_unlock;
kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
if (!kn->iattr)
goto out_unlock;
/* assign default attributes */
kn->iattr->ia_uid = GLOBAL_ROOT_UID;
kn->iattr->ia_gid = GLOBAL_ROOT_GID;
ktime_get_real_ts64(&kn->iattr->ia_atime);
kn->iattr->ia_mtime = kn->iattr->ia_atime;
kn->iattr->ia_ctime = kn->iattr->ia_atime;
simple_xattrs_init(&kn->iattr->xattrs);
atomic_set(&kn->iattr->nr_user_xattrs, 0);
atomic_set(&kn->iattr->user_xattr_size, 0);
out_unlock:
ret = kn->iattr;
mutex_unlock(&iattr_mutex);
return ret;
}
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
return __kernfs_iattrs(kn, 1);
}
static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
{
return __kernfs_iattrs(kn, 0);
}
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
struct kernfs_iattrs *attrs;
unsigned int ia_valid = iattr->ia_valid;
attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
if (ia_valid & ATTR_UID)
attrs->ia_uid = iattr->ia_uid;
if (ia_valid & ATTR_GID)
attrs->ia_gid = iattr->ia_gid;
if (ia_valid & ATTR_ATIME)
attrs->ia_atime = iattr->ia_atime;
if (ia_valid & ATTR_MTIME)
attrs->ia_mtime = iattr->ia_mtime;
if (ia_valid & ATTR_CTIME)
attrs->ia_ctime = iattr->ia_ctime;
if (ia_valid & ATTR_MODE)
kn->mode = iattr->ia_mode;
return 0;
}
/**
* kernfs_setattr - set iattr on a node
* @kn: target node
* @iattr: iattr to set
*
kernfs: fix all kernel-doc warnings and multiple typos JIRA: https://issues.redhat.com/browse/RHEL-56023 commit 24b3e3dd9c9c742a4dd18e71b6963f9e7ab72911 Author: Randy Dunlap <rdunlap@infradead.org> Date: Fri, 11 Nov 2022 19:14:56 -0800 kernfs: fix all kernel-doc warnings and multiple typos Fix kernel-doc warnings. Many of these are about a function's return value, so use the kernel-doc Return: format to fix those Use % prefix on numeric constant values. dir.c: fix typos/spellos file.c fix typo: s/taret/target/ Fix all of these kernel-doc warnings: dir.c:305: warning: missing initial short description on line: * kernfs_name_hash dir.c:137: warning: No description found for return value of 'kernfs_path_from_node_locked' dir.c:196: warning: No description found for return value of 'kernfs_name' dir.c:224: warning: No description found for return value of 'kernfs_path_from_node' dir.c:292: warning: No description found for return value of 'kernfs_get_parent' dir.c:312: warning: No description found for return value of 'kernfs_name_hash' dir.c:404: warning: No description found for return value of 'kernfs_unlink_sibling' dir.c:588: warning: No description found for return value of 'kernfs_node_from_dentry' dir.c:806: warning: No description found for return value of 'kernfs_find_ns' dir.c:879: warning: No description found for return value of 'kernfs_find_and_get_ns' dir.c:904: warning: No description found for return value of 'kernfs_walk_and_get_ns' dir.c:927: warning: No description found for return value of 'kernfs_create_root' dir.c:996: warning: No description found for return value of 'kernfs_root_to_node' dir.c:1016: warning: No description found for return value of 'kernfs_create_dir_ns' dir.c:1048: warning: No description found for return value of 'kernfs_create_empty_dir' dir.c:1306: warning: No description found for return value of 'kernfs_next_descendant_post' dir.c:1568: warning: No description found for return value of 'kernfs_remove_self' dir.c:1630: warning: No description found for return value of 'kernfs_remove_by_name_ns' dir.c:1667: warning: No description found for return value of 'kernfs_rename_ns' file.c:66: warning: No description found for return value of 'of_on' file.c:88: warning: No description found for return value of 'kernfs_deref_open_node_locked' file.c:1036: warning: No description found for return value of '__kernfs_create_file' inode.c:100: warning: No description found for return value of 'kernfs_setattr' mount.c:160: warning: No description found for return value of 'kernfs_root_from_sb' mount.c:198: warning: No description found for return value of 'kernfs_node_dentry' mount.c:302: warning: No description found for return value of 'kernfs_super_ns' mount.c:318: warning: No description found for return value of 'kernfs_get_tree' symlink.c:28: warning: No description found for return value of 'kernfs_create_link' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Tejun Heo <tj@kernel.org> Acked-by: Tejun Heo <tj@kernel.org> Link: https://lore.kernel.org/r/20221112031456.22980-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Waiman Long <longman@redhat.com>
2024-09-27 01:03:43 +00:00
* Return: %0 on success, -errno on failure.
*/
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
int ret;
struct kernfs_root *root = kernfs_root(kn);
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
down_write(&root->kernfs_iattr_rwsem);
ret = __kernfs_setattr(kn, iattr);
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
up_write(&root->kernfs_iattr_rwsem);
return ret;
}
fs: port ->setattr() to pass mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: CentOS Stream commit 3c29fadfb1ba7 ("afs: split afs_pagecache_valid() out of afs_validate()") is present, manually adjust hunk #1 of fs/afs/internal.h. For consistency drop btrfs hunks because it isn't supported in CentOS Stream and other backports also drop such hunks. CentOS Stream commit 48fa94aacd100 ("ceph: fscrypt_auth handling for ceph") alters the definition of _ceph_setattr(), adjust manually. CentOS Stream commit 34b2a2b5a3b7e {"ceph: add some fscrypt guardrails") introduces a call to fscrypt_prepare_setattr() which causes fuzz when applying. The cifs source has been moved in CentOS Stream so manually apply rejected hunks to fs/smb/client/cifsfs.h and fs/smb/client/inode.c. Upstream commit 5a646fb3a3e2d ("coda: avoid doing bad things on inode type changes during revalidation") is not present which causes fuzz in fs/coda/coda_linux.h. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") is presnt so manually apply hunk. CentOS Stream commit 892da692fa5bc ("shmem: support idmapped mounts for tmpfs") is present so it's ok to pass idmap to setattr_prepare() and setattr_copy(). Update to add incremental changes needed due to CentOS Stream commit 469e1d13f6e5f ("shmem: quota support"). Allow for CentOS Stream commit 6c3396a0d8f2c ("kernfs: Introduce separate rwsem to protect inode attributes") which is already present. CentOS Stream commit f5219db0c03b6 ("KVM: fix Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") updated the upstream commit a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") to account for missing idmapping commits. Now we have updated one of the two places these changes were made make one of the needed adjustments to match the original upstream patch. commit c1632a0f11209338fc300c66252bcc4686e609e8 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:11 2023 +0100 fs: port ->setattr() to pass mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-21 03:43:39 +00:00
int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root;
int error;
if (!kn)
return -EINVAL;
root = kernfs_root(kn);
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
down_write(&root->kernfs_iattr_rwsem);
fs: port ->setattr() to pass mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: CentOS Stream commit 3c29fadfb1ba7 ("afs: split afs_pagecache_valid() out of afs_validate()") is present, manually adjust hunk #1 of fs/afs/internal.h. For consistency drop btrfs hunks because it isn't supported in CentOS Stream and other backports also drop such hunks. CentOS Stream commit 48fa94aacd100 ("ceph: fscrypt_auth handling for ceph") alters the definition of _ceph_setattr(), adjust manually. CentOS Stream commit 34b2a2b5a3b7e {"ceph: add some fscrypt guardrails") introduces a call to fscrypt_prepare_setattr() which causes fuzz when applying. The cifs source has been moved in CentOS Stream so manually apply rejected hunks to fs/smb/client/cifsfs.h and fs/smb/client/inode.c. Upstream commit 5a646fb3a3e2d ("coda: avoid doing bad things on inode type changes during revalidation") is not present which causes fuzz in fs/coda/coda_linux.h. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") is presnt so manually apply hunk. CentOS Stream commit 892da692fa5bc ("shmem: support idmapped mounts for tmpfs") is present so it's ok to pass idmap to setattr_prepare() and setattr_copy(). Update to add incremental changes needed due to CentOS Stream commit 469e1d13f6e5f ("shmem: quota support"). Allow for CentOS Stream commit 6c3396a0d8f2c ("kernfs: Introduce separate rwsem to protect inode attributes") which is already present. CentOS Stream commit f5219db0c03b6 ("KVM: fix Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") updated the upstream commit a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") to account for missing idmapping commits. Now we have updated one of the two places these changes were made make one of the needed adjustments to match the original upstream patch. commit c1632a0f11209338fc300c66252bcc4686e609e8 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:11 2023 +0100 fs: port ->setattr() to pass mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-21 03:43:39 +00:00
error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
if (error)
goto out;
error = __kernfs_setattr(kn, iattr);
if (error)
goto out;
/* this ignores size changes */
fs: port ->setattr() to pass mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: CentOS Stream commit 3c29fadfb1ba7 ("afs: split afs_pagecache_valid() out of afs_validate()") is present, manually adjust hunk #1 of fs/afs/internal.h. For consistency drop btrfs hunks because it isn't supported in CentOS Stream and other backports also drop such hunks. CentOS Stream commit 48fa94aacd100 ("ceph: fscrypt_auth handling for ceph") alters the definition of _ceph_setattr(), adjust manually. CentOS Stream commit 34b2a2b5a3b7e {"ceph: add some fscrypt guardrails") introduces a call to fscrypt_prepare_setattr() which causes fuzz when applying. The cifs source has been moved in CentOS Stream so manually apply rejected hunks to fs/smb/client/cifsfs.h and fs/smb/client/inode.c. Upstream commit 5a646fb3a3e2d ("coda: avoid doing bad things on inode type changes during revalidation") is not present which causes fuzz in fs/coda/coda_linux.h. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") is presnt so manually apply hunk. CentOS Stream commit 892da692fa5bc ("shmem: support idmapped mounts for tmpfs") is present so it's ok to pass idmap to setattr_prepare() and setattr_copy(). Update to add incremental changes needed due to CentOS Stream commit 469e1d13f6e5f ("shmem: quota support"). Allow for CentOS Stream commit 6c3396a0d8f2c ("kernfs: Introduce separate rwsem to protect inode attributes") which is already present. CentOS Stream commit f5219db0c03b6 ("KVM: fix Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") updated the upstream commit a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") to account for missing idmapping commits. Now we have updated one of the two places these changes were made make one of the needed adjustments to match the original upstream patch. commit c1632a0f11209338fc300c66252bcc4686e609e8 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:11 2023 +0100 fs: port ->setattr() to pass mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-21 03:43:39 +00:00
setattr_copy(&nop_mnt_idmap, inode, iattr);
out:
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
up_write(&root->kernfs_iattr_rwsem);
return error;
}
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
{
struct kernfs_node *kn = kernfs_dentry_node(dentry);
struct kernfs_iattrs *attrs;
attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
inode->i_atime = inode->i_mtime =
inode->i_ctime = current_time(inode);
}
static inline void set_inode_attr(struct inode *inode,
struct kernfs_iattrs *attrs)
{
inode->i_uid = attrs->ia_uid;
inode->i_gid = attrs->ia_gid;
inode->i_atime = attrs->ia_atime;
inode->i_mtime = attrs->ia_mtime;
inode->i_ctime = attrs->ia_ctime;
}
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
{
struct kernfs_iattrs *attrs = kn->iattr;
inode->i_mode = kn->mode;
if (attrs)
/*
* kernfs_node has non-default attributes get them from
* persistent copy in kernfs_node.
*/
set_inode_attr(inode, attrs);
if (kernfs_type(kn) == KERNFS_DIR)
set_nlink(inode, kn->dir.subdirs + 2);
}
fs: port ->getattr() to pass mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: CentOS Stream has commit 3e0b6f1fa9a1c ("afs: use read_seqbegin() in afs_check_validity() and afs_getattr()"), manually apply hunk #2 to fs/afs/inode.c. CentOS Stream commit 3b06927229296 {"afs: split afs_pagecache_valid() out of afs_validate()") is present which causes a reject in fs/afs/internal.h, manually apply hunk to fs/afs/internal.h. For consistency drop btrfs hunks because it isn't supported in CentOS Stream and other backports also drop such hunks. CentOS Stream commit 48fa94aacd100 ("ceph: fscrypt_auth handling for ceph") alters the definition of _ceph_setattr() causing fuzz. The cifs source has been moved in CentOS Stream so manually apply rejected hunks to fs/smb/client/cifsfs.h and fs/smb/client/inode.c. Upstream commit 2e1d66379ece5 ("staging: erofs: drop the extern prefix for function definitions") caused strange behaviour when applying this patch, there was a conflict in fs/erofs/internal.h but after a refresh the hunk and context looked ok. The hunk had to be manually applied. Upstream commit 2db0487faa211 ("f2fs: move f2fs_force_buffered_io() into file.c") is not present in CentOS Stream which causes fuzz when applying the first hunk to fs/f2fs/file.c. Upstream commit 30abce053f811 ("fat: report creation time in statx") is not present in CentOS Stream which caused a reject so apply change manually. Dropped hunks for ksmbd because the source is not present in the CentOS Stream source tree. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. There was fuzz with hunk #2 against fs/nfs/inode.c but I was unable to see any difference. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") is present which caused fuzz in fs/overlayfs/overlayfs.h. Upstream commit d919a1e79bac8 ("proc: fix a dentry lock race between release_task and lookup") is not present in CentOS Stream causing fuzz applying hunk #1 against fs/proc/base.c. CentOS Stream commit 20c470188c2eb ("vfs: plumb i_version handling into struct kstat") is present causing fuzz in hunk #2 against fs/stat.c. Upstream commit e0c49bd2b4d3c ("fs: sysv: Fix sysv_nblocks() returns wrong value") is not present in CentOS Stream causing fuzz applying hunk#1 against fs/sysv/itree.c. CentOS Stream commit 892da692fa5bc ("shmem: support idmapped mounts for tmpfs") is present so it's ok to pass idmap to generic_fillattr(). CentOS Stream commit f0f830cd7e01b {"ceph: create symlinks with encrypted and base64-encoded targets") uses the old struct user_namespace and so leaves those changes out, make those getattr() changes here. Allow for CentOS Stream commit 6c3396a0d8f2c ("kernfs: Introduce separate rwsem to protect inode attributes") which is already present. CentOS Stream commit f5219db0c03b6 ("KVM: fix Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") updated the upstream commit a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") to account for missing idmapping commits. Now we have updated the second and final place these changes were made make the final needed adjustment to match the original upstream patch. commit b74d24f7a74ffd2d42ca883d84b7422b8d545901 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:12 2023 +0100 fs: port ->getattr() to pass mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-21 04:47:00 +00:00
int kernfs_iop_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
statx: Add a system call to make enhanced file info available Add a system call to make extended file information available, including file creation and some attribute flags where available through the underlying filesystem. The getattr inode operation is altered to take two additional arguments: a u32 request_mask and an unsigned int flags that indicate the synchronisation mode. This change is propagated to the vfs_getattr*() function. Functions like vfs_stat() are now inline wrappers around new functions vfs_statx() and vfs_statx_fd() to reduce stack usage. ======== OVERVIEW ======== The idea was initially proposed as a set of xattrs that could be retrieved with getxattr(), but the general preference proved to be for a new syscall with an extended stat structure. A number of requests were gathered for features to be included. The following have been included: (1) Make the fields a consistent size on all arches and make them large. (2) Spare space, request flags and information flags are provided for future expansion. (3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an __s64). (4) Creation time: The SMB protocol carries the creation time, which could be exported by Samba, which will in turn help CIFS make use of FS-Cache as that can be used for coherency data (stx_btime). This is also specified in NFSv4 as a recommended attribute and could be exported by NFSD [Steve French]. (5) Lightweight stat: Ask for just those details of interest, and allow a netfs (such as NFS) to approximate anything not of interest, possibly without going to the server [Trond Myklebust, Ulrich Drepper, Andreas Dilger] (AT_STATX_DONT_SYNC). (6) Heavyweight stat: Force a netfs to go to the server, even if it thinks its cached attributes are up to date [Trond Myklebust] (AT_STATX_FORCE_SYNC). And the following have been left out for future extension: (7) Data version number: Could be used by userspace NFS servers [Aneesh Kumar]. Can also be used to modify fill_post_wcc() in NFSD which retrieves i_version directly, but has just called vfs_getattr(). It could get it from the kstat struct if it used vfs_xgetattr() instead. (There's disagreement on the exact semantics of a single field, since not all filesystems do this the same way). (8) BSD stat compatibility: Including more fields from the BSD stat such as creation time (st_btime) and inode generation number (st_gen) [Jeremy Allison, Bernd Schubert]. (9) Inode generation number: Useful for FUSE and userspace NFS servers [Bernd Schubert]. (This was asked for but later deemed unnecessary with the open-by-handle capability available and caused disagreement as to whether it's a security hole or not). (10) Extra coherency data may be useful in making backups [Andreas Dilger]. (No particular data were offered, but things like last backup timestamp, the data version number and the DOS archive bit would come into this category). (11) Allow the filesystem to indicate what it can/cannot provide: A filesystem can now say it doesn't support a standard stat feature if that isn't available, so if, for instance, inode numbers or UIDs don't exist or are fabricated locally... (This requires a separate system call - I have an fsinfo() call idea for this). (12) Store a 16-byte volume ID in the superblock that can be returned in struct xstat [Steve French]. (Deferred to fsinfo). (13) Include granularity fields in the time data to indicate the granularity of each of the times (NFSv4 time_delta) [Steve French]. (Deferred to fsinfo). (14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags. Note that the Linux IOC flags are a mess and filesystems such as Ext4 define flags that aren't in linux/fs.h, so translation in the kernel may be a necessity (or, possibly, we provide the filesystem type too). (Some attributes are made available in stx_attributes, but the general feeling was that the IOC flags were to ext[234]-specific and shouldn't be exposed through statx this way). (15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer, Michael Kerrisk]. (Deferred, probably to fsinfo. Finding out if there's an ACL or seclabal might require extra filesystem operations). (16) Femtosecond-resolution timestamps [Dave Chinner]. (A __reserved field has been left in the statx_timestamp struct for this - if there proves to be a need). (17) A set multiple attributes syscall to go with this. =============== NEW SYSTEM CALL =============== The new system call is: int ret = statx(int dfd, const char *filename, unsigned int flags, unsigned int mask, struct statx *buffer); The dfd, filename and flags parameters indicate the file to query, in a similar way to fstatat(). There is no equivalent of lstat() as that can be emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is also no equivalent of fstat() as that can be emulated by passing a NULL filename to statx() with the fd of interest in dfd. Whether or not statx() synchronises the attributes with the backing store can be controlled by OR'ing a value into the flags argument (this typically only affects network filesystems): (1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this respect. (2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise its attributes with the server - which might require data writeback to occur to get the timestamps correct. (3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a network filesystem. The resulting values should be considered approximate. mask is a bitmask indicating the fields in struct statx that are of interest to the caller. The user should set this to STATX_BASIC_STATS to get the basic set returned by stat(). It should be noted that asking for more information may entail extra I/O operations. buffer points to the destination for the data. This must be 256 bytes in size. ====================== MAIN ATTRIBUTES RECORD ====================== The following structures are defined in which to return the main attribute set: struct statx_timestamp { __s64 tv_sec; __s32 tv_nsec; __s32 __reserved; }; struct statx { __u32 stx_mask; __u32 stx_blksize; __u64 stx_attributes; __u32 stx_nlink; __u32 stx_uid; __u32 stx_gid; __u16 stx_mode; __u16 __spare0[1]; __u64 stx_ino; __u64 stx_size; __u64 stx_blocks; __u64 __spare1[1]; struct statx_timestamp stx_atime; struct statx_timestamp stx_btime; struct statx_timestamp stx_ctime; struct statx_timestamp stx_mtime; __u32 stx_rdev_major; __u32 stx_rdev_minor; __u32 stx_dev_major; __u32 stx_dev_minor; __u64 __spare2[14]; }; The defined bits in request_mask and stx_mask are: STATX_TYPE Want/got stx_mode & S_IFMT STATX_MODE Want/got stx_mode & ~S_IFMT STATX_NLINK Want/got stx_nlink STATX_UID Want/got stx_uid STATX_GID Want/got stx_gid STATX_ATIME Want/got stx_atime{,_ns} STATX_MTIME Want/got stx_mtime{,_ns} STATX_CTIME Want/got stx_ctime{,_ns} STATX_INO Want/got stx_ino STATX_SIZE Want/got stx_size STATX_BLOCKS Want/got stx_blocks STATX_BASIC_STATS [The stuff in the normal stat struct] STATX_BTIME Want/got stx_btime{,_ns} STATX_ALL [All currently available stuff] stx_btime is the file creation time, stx_mask is a bitmask indicating the data provided and __spares*[] are where as-yet undefined fields can be placed. Time fields are structures with separate seconds and nanoseconds fields plus a reserved field in case we want to add even finer resolution. Note that times will be negative if before 1970; in such a case, the nanosecond fields will also be negative if not zero. The bits defined in the stx_attributes field convey information about a file, how it is accessed, where it is and what it does. The following attributes map to FS_*_FL flags and are the same numerical value: STATX_ATTR_COMPRESSED File is compressed by the fs STATX_ATTR_IMMUTABLE File is marked immutable STATX_ATTR_APPEND File is append-only STATX_ATTR_NODUMP File is not to be dumped STATX_ATTR_ENCRYPTED File requires key to decrypt in fs Within the kernel, the supported flags are listed by: KSTAT_ATTR_FS_IOC_FLAGS [Are any other IOC flags of sufficient general interest to be exposed through this interface?] New flags include: STATX_ATTR_AUTOMOUNT Object is an automount trigger These are for the use of GUI tools that might want to mark files specially, depending on what they are. Fields in struct statx come in a number of classes: (0) stx_dev_*, stx_blksize. These are local system information and are always available. (1) stx_mode, stx_nlinks, stx_uid, stx_gid, stx_[amc]time, stx_ino, stx_size, stx_blocks. These will be returned whether the caller asks for them or not. The corresponding bits in stx_mask will be set to indicate whether they actually have valid values. If the caller didn't ask for them, then they may be approximated. For example, NFS won't waste any time updating them from the server, unless as a byproduct of updating something requested. If the values don't actually exist for the underlying object (such as UID or GID on a DOS file), then the bit won't be set in the stx_mask, even if the caller asked for the value. In such a case, the returned value will be a fabrication. Note that there are instances where the type might not be valid, for instance Windows reparse points. (2) stx_rdev_*. This will be set only if stx_mode indicates we're looking at a blockdev or a chardev, otherwise will be 0. (3) stx_btime. Similar to (1), except this will be set to 0 if it doesn't exist. ======= TESTING ======= The following test program can be used to test the statx system call: samples/statx/test-statx.c Just compile and run, passing it paths to the files you want to examine. The file is built automatically if CONFIG_SAMPLES is enabled. Here's some example output. Firstly, an NFS directory that crosses to another FSID. Note that the AUTOMOUNT attribute is set because transiting this directory will cause d_automount to be invoked by the VFS. [root@andromeda ~]# /tmp/test-statx -A /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:26 Inode: 1703937 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------) Secondly, the result of automounting on that directory. [root@andromeda ~]# /tmp/test-statx /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:27 Inode: 2 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
u32 request_mask, unsigned int query_flags)
{
statx: Add a system call to make enhanced file info available Add a system call to make extended file information available, including file creation and some attribute flags where available through the underlying filesystem. The getattr inode operation is altered to take two additional arguments: a u32 request_mask and an unsigned int flags that indicate the synchronisation mode. This change is propagated to the vfs_getattr*() function. Functions like vfs_stat() are now inline wrappers around new functions vfs_statx() and vfs_statx_fd() to reduce stack usage. ======== OVERVIEW ======== The idea was initially proposed as a set of xattrs that could be retrieved with getxattr(), but the general preference proved to be for a new syscall with an extended stat structure. A number of requests were gathered for features to be included. The following have been included: (1) Make the fields a consistent size on all arches and make them large. (2) Spare space, request flags and information flags are provided for future expansion. (3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an __s64). (4) Creation time: The SMB protocol carries the creation time, which could be exported by Samba, which will in turn help CIFS make use of FS-Cache as that can be used for coherency data (stx_btime). This is also specified in NFSv4 as a recommended attribute and could be exported by NFSD [Steve French]. (5) Lightweight stat: Ask for just those details of interest, and allow a netfs (such as NFS) to approximate anything not of interest, possibly without going to the server [Trond Myklebust, Ulrich Drepper, Andreas Dilger] (AT_STATX_DONT_SYNC). (6) Heavyweight stat: Force a netfs to go to the server, even if it thinks its cached attributes are up to date [Trond Myklebust] (AT_STATX_FORCE_SYNC). And the following have been left out for future extension: (7) Data version number: Could be used by userspace NFS servers [Aneesh Kumar]. Can also be used to modify fill_post_wcc() in NFSD which retrieves i_version directly, but has just called vfs_getattr(). It could get it from the kstat struct if it used vfs_xgetattr() instead. (There's disagreement on the exact semantics of a single field, since not all filesystems do this the same way). (8) BSD stat compatibility: Including more fields from the BSD stat such as creation time (st_btime) and inode generation number (st_gen) [Jeremy Allison, Bernd Schubert]. (9) Inode generation number: Useful for FUSE and userspace NFS servers [Bernd Schubert]. (This was asked for but later deemed unnecessary with the open-by-handle capability available and caused disagreement as to whether it's a security hole or not). (10) Extra coherency data may be useful in making backups [Andreas Dilger]. (No particular data were offered, but things like last backup timestamp, the data version number and the DOS archive bit would come into this category). (11) Allow the filesystem to indicate what it can/cannot provide: A filesystem can now say it doesn't support a standard stat feature if that isn't available, so if, for instance, inode numbers or UIDs don't exist or are fabricated locally... (This requires a separate system call - I have an fsinfo() call idea for this). (12) Store a 16-byte volume ID in the superblock that can be returned in struct xstat [Steve French]. (Deferred to fsinfo). (13) Include granularity fields in the time data to indicate the granularity of each of the times (NFSv4 time_delta) [Steve French]. (Deferred to fsinfo). (14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags. Note that the Linux IOC flags are a mess and filesystems such as Ext4 define flags that aren't in linux/fs.h, so translation in the kernel may be a necessity (or, possibly, we provide the filesystem type too). (Some attributes are made available in stx_attributes, but the general feeling was that the IOC flags were to ext[234]-specific and shouldn't be exposed through statx this way). (15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer, Michael Kerrisk]. (Deferred, probably to fsinfo. Finding out if there's an ACL or seclabal might require extra filesystem operations). (16) Femtosecond-resolution timestamps [Dave Chinner]. (A __reserved field has been left in the statx_timestamp struct for this - if there proves to be a need). (17) A set multiple attributes syscall to go with this. =============== NEW SYSTEM CALL =============== The new system call is: int ret = statx(int dfd, const char *filename, unsigned int flags, unsigned int mask, struct statx *buffer); The dfd, filename and flags parameters indicate the file to query, in a similar way to fstatat(). There is no equivalent of lstat() as that can be emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is also no equivalent of fstat() as that can be emulated by passing a NULL filename to statx() with the fd of interest in dfd. Whether or not statx() synchronises the attributes with the backing store can be controlled by OR'ing a value into the flags argument (this typically only affects network filesystems): (1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this respect. (2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise its attributes with the server - which might require data writeback to occur to get the timestamps correct. (3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a network filesystem. The resulting values should be considered approximate. mask is a bitmask indicating the fields in struct statx that are of interest to the caller. The user should set this to STATX_BASIC_STATS to get the basic set returned by stat(). It should be noted that asking for more information may entail extra I/O operations. buffer points to the destination for the data. This must be 256 bytes in size. ====================== MAIN ATTRIBUTES RECORD ====================== The following structures are defined in which to return the main attribute set: struct statx_timestamp { __s64 tv_sec; __s32 tv_nsec; __s32 __reserved; }; struct statx { __u32 stx_mask; __u32 stx_blksize; __u64 stx_attributes; __u32 stx_nlink; __u32 stx_uid; __u32 stx_gid; __u16 stx_mode; __u16 __spare0[1]; __u64 stx_ino; __u64 stx_size; __u64 stx_blocks; __u64 __spare1[1]; struct statx_timestamp stx_atime; struct statx_timestamp stx_btime; struct statx_timestamp stx_ctime; struct statx_timestamp stx_mtime; __u32 stx_rdev_major; __u32 stx_rdev_minor; __u32 stx_dev_major; __u32 stx_dev_minor; __u64 __spare2[14]; }; The defined bits in request_mask and stx_mask are: STATX_TYPE Want/got stx_mode & S_IFMT STATX_MODE Want/got stx_mode & ~S_IFMT STATX_NLINK Want/got stx_nlink STATX_UID Want/got stx_uid STATX_GID Want/got stx_gid STATX_ATIME Want/got stx_atime{,_ns} STATX_MTIME Want/got stx_mtime{,_ns} STATX_CTIME Want/got stx_ctime{,_ns} STATX_INO Want/got stx_ino STATX_SIZE Want/got stx_size STATX_BLOCKS Want/got stx_blocks STATX_BASIC_STATS [The stuff in the normal stat struct] STATX_BTIME Want/got stx_btime{,_ns} STATX_ALL [All currently available stuff] stx_btime is the file creation time, stx_mask is a bitmask indicating the data provided and __spares*[] are where as-yet undefined fields can be placed. Time fields are structures with separate seconds and nanoseconds fields plus a reserved field in case we want to add even finer resolution. Note that times will be negative if before 1970; in such a case, the nanosecond fields will also be negative if not zero. The bits defined in the stx_attributes field convey information about a file, how it is accessed, where it is and what it does. The following attributes map to FS_*_FL flags and are the same numerical value: STATX_ATTR_COMPRESSED File is compressed by the fs STATX_ATTR_IMMUTABLE File is marked immutable STATX_ATTR_APPEND File is append-only STATX_ATTR_NODUMP File is not to be dumped STATX_ATTR_ENCRYPTED File requires key to decrypt in fs Within the kernel, the supported flags are listed by: KSTAT_ATTR_FS_IOC_FLAGS [Are any other IOC flags of sufficient general interest to be exposed through this interface?] New flags include: STATX_ATTR_AUTOMOUNT Object is an automount trigger These are for the use of GUI tools that might want to mark files specially, depending on what they are. Fields in struct statx come in a number of classes: (0) stx_dev_*, stx_blksize. These are local system information and are always available. (1) stx_mode, stx_nlinks, stx_uid, stx_gid, stx_[amc]time, stx_ino, stx_size, stx_blocks. These will be returned whether the caller asks for them or not. The corresponding bits in stx_mask will be set to indicate whether they actually have valid values. If the caller didn't ask for them, then they may be approximated. For example, NFS won't waste any time updating them from the server, unless as a byproduct of updating something requested. If the values don't actually exist for the underlying object (such as UID or GID on a DOS file), then the bit won't be set in the stx_mask, even if the caller asked for the value. In such a case, the returned value will be a fabrication. Note that there are instances where the type might not be valid, for instance Windows reparse points. (2) stx_rdev_*. This will be set only if stx_mode indicates we're looking at a blockdev or a chardev, otherwise will be 0. (3) stx_btime. Similar to (1), except this will be set to 0 if it doesn't exist. ======= TESTING ======= The following test program can be used to test the statx system call: samples/statx/test-statx.c Just compile and run, passing it paths to the files you want to examine. The file is built automatically if CONFIG_SAMPLES is enabled. Here's some example output. Firstly, an NFS directory that crosses to another FSID. Note that the AUTOMOUNT attribute is set because transiting this directory will cause d_automount to be invoked by the VFS. [root@andromeda ~]# /tmp/test-statx -A /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:26 Inode: 1703937 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------) Secondly, the result of automounting on that directory. [root@andromeda ~]# /tmp/test-statx /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:27 Inode: 2 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
struct inode *inode = d_inode(path->dentry);
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
fs: port ->getattr() to pass mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: CentOS Stream has commit 3e0b6f1fa9a1c ("afs: use read_seqbegin() in afs_check_validity() and afs_getattr()"), manually apply hunk #2 to fs/afs/inode.c. CentOS Stream commit 3b06927229296 {"afs: split afs_pagecache_valid() out of afs_validate()") is present which causes a reject in fs/afs/internal.h, manually apply hunk to fs/afs/internal.h. For consistency drop btrfs hunks because it isn't supported in CentOS Stream and other backports also drop such hunks. CentOS Stream commit 48fa94aacd100 ("ceph: fscrypt_auth handling for ceph") alters the definition of _ceph_setattr() causing fuzz. The cifs source has been moved in CentOS Stream so manually apply rejected hunks to fs/smb/client/cifsfs.h and fs/smb/client/inode.c. Upstream commit 2e1d66379ece5 ("staging: erofs: drop the extern prefix for function definitions") caused strange behaviour when applying this patch, there was a conflict in fs/erofs/internal.h but after a refresh the hunk and context looked ok. The hunk had to be manually applied. Upstream commit 2db0487faa211 ("f2fs: move f2fs_force_buffered_io() into file.c") is not present in CentOS Stream which causes fuzz when applying the first hunk to fs/f2fs/file.c. Upstream commit 30abce053f811 ("fat: report creation time in statx") is not present in CentOS Stream which caused a reject so apply change manually. Dropped hunks for ksmbd because the source is not present in the CentOS Stream source tree. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. There was fuzz with hunk #2 against fs/nfs/inode.c but I was unable to see any difference. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") is present which caused fuzz in fs/overlayfs/overlayfs.h. Upstream commit d919a1e79bac8 ("proc: fix a dentry lock race between release_task and lookup") is not present in CentOS Stream causing fuzz applying hunk #1 against fs/proc/base.c. CentOS Stream commit 20c470188c2eb ("vfs: plumb i_version handling into struct kstat") is present causing fuzz in hunk #2 against fs/stat.c. Upstream commit e0c49bd2b4d3c ("fs: sysv: Fix sysv_nblocks() returns wrong value") is not present in CentOS Stream causing fuzz applying hunk#1 against fs/sysv/itree.c. CentOS Stream commit 892da692fa5bc ("shmem: support idmapped mounts for tmpfs") is present so it's ok to pass idmap to generic_fillattr(). CentOS Stream commit f0f830cd7e01b {"ceph: create symlinks with encrypted and base64-encoded targets") uses the old struct user_namespace and so leaves those changes out, make those getattr() changes here. Allow for CentOS Stream commit 6c3396a0d8f2c ("kernfs: Introduce separate rwsem to protect inode attributes") which is already present. CentOS Stream commit f5219db0c03b6 ("KVM: fix Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") updated the upstream commit a7800aa80ea4d ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") to account for missing idmapping commits. Now we have updated the second and final place these changes were made make the final needed adjustment to match the original upstream patch. commit b74d24f7a74ffd2d42ca883d84b7422b8d545901 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:12 2023 +0100 fs: port ->getattr() to pass mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-21 04:47:00 +00:00
generic_fillattr(&nop_mnt_idmap, inode, stat);
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
up_read(&root->kernfs_iattr_rwsem);
kernfs: use i_lock to protect concurrent inode updates Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2004858 Upstream status: Linus Testing: The series has been included in RHEL-8 and customer testing has been done there. The upstreaming process includes fairly broad general testing as well. commit 47b5c64d0ab5e7136db2b78c6ec710e0d8a5a36b From: Ian Kent <raven@themaw.net> Date: 2021-07-16 17:28:34 +0800 kernfs: use i_lock to protect concurrent inode updates The inode operations .permission() and .getattr() use the kernfs node write lock but all that's needed is the read lock to protect against partial updates of these kernfs node fields which are all done under the write lock. And .permission() is called frequently during path walks and can cause quite a bit of contention between kernfs node operations and path walks when the number of concurrent walks is high. To change kernfs_iop_getattr() and kernfs_iop_permission() to take the rw sem read lock instead of the write lock an additional lock is needed to protect against multiple processes concurrently updating the inode attributes and link count in kernfs_refresh_inode(). The inode i_lock seems like the sensible thing to use to protect these inode attribute updates so use it in kernfs_refresh_inode(). The last hunk in the patch, applied to kernfs_fill_super(), is possibly not needed but taking the lock was present originally. I prefer to continue to take it to protect against a partial update of the source kernfs fields during the call to kernfs_refresh_inode() made by kernfs_get_inode(). Reviewed-by: Miklos Szeredi <mszeredi@redhat.com> Signed-off-by: Ian Kent <raven@themaw.net> Link: https://lore.kernel.org/r/162642771474.63632.16295959115893904470.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2021-11-29 05:54:21 +00:00
return 0;
}
static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
{
kernfs_get(kn);
inode->i_private = kn;
inode->i_mapping->a_ops = &ram_aops;
inode->i_op = &kernfs_iops;
inode->i_generation = kernfs_gen(kn);
set_default_inode_attr(inode, kn->mode);
kernfs_refresh_inode(kn, inode);
/* initialize inode according to type */
switch (kernfs_type(kn)) {
case KERNFS_DIR:
inode->i_op = &kernfs_dir_iops;
inode->i_fop = &kernfs_dir_fops;
if (kn->flags & KERNFS_EMPTY_DIR)
make_empty_dir_inode(inode);
break;
case KERNFS_FILE:
inode->i_size = kn->attr.size;
inode->i_fop = &kernfs_file_fops;
break;
case KERNFS_LINK:
inode->i_op = &kernfs_symlink_iops;
break;
default:
BUG();
}
unlock_new_inode(inode);
}
/**
* kernfs_get_inode - get inode for kernfs_node
* @sb: super block
* @kn: kernfs_node to allocate inode for
*
* Get inode for @kn. If such inode doesn't exist, a new inode is
* allocated and basics are initialized. New inode is returned
* locked.
*
kernfs: fix all kernel-doc warnings and multiple typos JIRA: https://issues.redhat.com/browse/RHEL-56023 commit 24b3e3dd9c9c742a4dd18e71b6963f9e7ab72911 Author: Randy Dunlap <rdunlap@infradead.org> Date: Fri, 11 Nov 2022 19:14:56 -0800 kernfs: fix all kernel-doc warnings and multiple typos Fix kernel-doc warnings. Many of these are about a function's return value, so use the kernel-doc Return: format to fix those Use % prefix on numeric constant values. dir.c: fix typos/spellos file.c fix typo: s/taret/target/ Fix all of these kernel-doc warnings: dir.c:305: warning: missing initial short description on line: * kernfs_name_hash dir.c:137: warning: No description found for return value of 'kernfs_path_from_node_locked' dir.c:196: warning: No description found for return value of 'kernfs_name' dir.c:224: warning: No description found for return value of 'kernfs_path_from_node' dir.c:292: warning: No description found for return value of 'kernfs_get_parent' dir.c:312: warning: No description found for return value of 'kernfs_name_hash' dir.c:404: warning: No description found for return value of 'kernfs_unlink_sibling' dir.c:588: warning: No description found for return value of 'kernfs_node_from_dentry' dir.c:806: warning: No description found for return value of 'kernfs_find_ns' dir.c:879: warning: No description found for return value of 'kernfs_find_and_get_ns' dir.c:904: warning: No description found for return value of 'kernfs_walk_and_get_ns' dir.c:927: warning: No description found for return value of 'kernfs_create_root' dir.c:996: warning: No description found for return value of 'kernfs_root_to_node' dir.c:1016: warning: No description found for return value of 'kernfs_create_dir_ns' dir.c:1048: warning: No description found for return value of 'kernfs_create_empty_dir' dir.c:1306: warning: No description found for return value of 'kernfs_next_descendant_post' dir.c:1568: warning: No description found for return value of 'kernfs_remove_self' dir.c:1630: warning: No description found for return value of 'kernfs_remove_by_name_ns' dir.c:1667: warning: No description found for return value of 'kernfs_rename_ns' file.c:66: warning: No description found for return value of 'of_on' file.c:88: warning: No description found for return value of 'kernfs_deref_open_node_locked' file.c:1036: warning: No description found for return value of '__kernfs_create_file' inode.c:100: warning: No description found for return value of 'kernfs_setattr' mount.c:160: warning: No description found for return value of 'kernfs_root_from_sb' mount.c:198: warning: No description found for return value of 'kernfs_node_dentry' mount.c:302: warning: No description found for return value of 'kernfs_super_ns' mount.c:318: warning: No description found for return value of 'kernfs_get_tree' symlink.c:28: warning: No description found for return value of 'kernfs_create_link' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Tejun Heo <tj@kernel.org> Acked-by: Tejun Heo <tj@kernel.org> Link: https://lore.kernel.org/r/20221112031456.22980-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Waiman Long <longman@redhat.com>
2024-09-27 01:03:43 +00:00
* Locking:
* Kernel thread context (may sleep).
*
kernfs: fix all kernel-doc warnings and multiple typos JIRA: https://issues.redhat.com/browse/RHEL-56023 commit 24b3e3dd9c9c742a4dd18e71b6963f9e7ab72911 Author: Randy Dunlap <rdunlap@infradead.org> Date: Fri, 11 Nov 2022 19:14:56 -0800 kernfs: fix all kernel-doc warnings and multiple typos Fix kernel-doc warnings. Many of these are about a function's return value, so use the kernel-doc Return: format to fix those Use % prefix on numeric constant values. dir.c: fix typos/spellos file.c fix typo: s/taret/target/ Fix all of these kernel-doc warnings: dir.c:305: warning: missing initial short description on line: * kernfs_name_hash dir.c:137: warning: No description found for return value of 'kernfs_path_from_node_locked' dir.c:196: warning: No description found for return value of 'kernfs_name' dir.c:224: warning: No description found for return value of 'kernfs_path_from_node' dir.c:292: warning: No description found for return value of 'kernfs_get_parent' dir.c:312: warning: No description found for return value of 'kernfs_name_hash' dir.c:404: warning: No description found for return value of 'kernfs_unlink_sibling' dir.c:588: warning: No description found for return value of 'kernfs_node_from_dentry' dir.c:806: warning: No description found for return value of 'kernfs_find_ns' dir.c:879: warning: No description found for return value of 'kernfs_find_and_get_ns' dir.c:904: warning: No description found for return value of 'kernfs_walk_and_get_ns' dir.c:927: warning: No description found for return value of 'kernfs_create_root' dir.c:996: warning: No description found for return value of 'kernfs_root_to_node' dir.c:1016: warning: No description found for return value of 'kernfs_create_dir_ns' dir.c:1048: warning: No description found for return value of 'kernfs_create_empty_dir' dir.c:1306: warning: No description found for return value of 'kernfs_next_descendant_post' dir.c:1568: warning: No description found for return value of 'kernfs_remove_self' dir.c:1630: warning: No description found for return value of 'kernfs_remove_by_name_ns' dir.c:1667: warning: No description found for return value of 'kernfs_rename_ns' file.c:66: warning: No description found for return value of 'of_on' file.c:88: warning: No description found for return value of 'kernfs_deref_open_node_locked' file.c:1036: warning: No description found for return value of '__kernfs_create_file' inode.c:100: warning: No description found for return value of 'kernfs_setattr' mount.c:160: warning: No description found for return value of 'kernfs_root_from_sb' mount.c:198: warning: No description found for return value of 'kernfs_node_dentry' mount.c:302: warning: No description found for return value of 'kernfs_super_ns' mount.c:318: warning: No description found for return value of 'kernfs_get_tree' symlink.c:28: warning: No description found for return value of 'kernfs_create_link' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Tejun Heo <tj@kernel.org> Acked-by: Tejun Heo <tj@kernel.org> Link: https://lore.kernel.org/r/20221112031456.22980-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Waiman Long <longman@redhat.com>
2024-09-27 01:03:43 +00:00
* Return:
* Pointer to allocated inode on success, %NULL on failure.
*/
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
struct inode *inode;
inode = iget_locked(sb, kernfs_ino(kn));
if (inode && (inode->i_state & I_NEW))
kernfs_init_inode(kn, inode);
return inode;
}
/*
* The kernfs_node serves as both an inode and a directory entry for
* kernfs. To prevent the kernfs inode numbers from being freed
* prematurely we take a reference to kernfs_node from the kernfs inode. A
* super_operations.evict_inode() implementation is needed to drop that
* reference upon inode destruction.
*/
void kernfs_evict_inode(struct inode *inode)
{
struct kernfs_node *kn = inode->i_private;
mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Minchan Kim <minchan@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Luigi Semenzato <semenzato@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Metin Doslu <metin@citusdata.com> Cc: Michel Lespinasse <walken@google.com> Cc: Ozgun Erdogan <ozgun@citusdata.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Roman Gushchin <klamm@yandex-team.ru> Cc: Ryan Mallon <rmallon@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-03 21:47:49 +00:00
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
kernfs_put(kn);
}
fs: port ->permission() to pass mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: For consistency drop btrfs hunks because it isn't supported in CentOS Stream and other backports also drop such hunks. CentOS Stream commit 48fa94aacd100 ("ceph: fscrypt_auth handling for ceph") is presnt which causes fuzz 2 in hunk #1 in fs/ceph/super.h. Upstream commit 427505ffeaa46 ("exportfs: use pr_debug for unreachable debug statements") is not present causing fuzz 2 in hunk #1 against fs/exportfs/expfs.c. Dropped hunks for ksmbd because the source is not present in the CentOS Stream source tree. Upstream commit 03fa86e9f79d8 ("namei: stash the sampled ->d_seq into nameidata") is not present causing a fuzz 1 for hunk #14 against fs/namei.c. CentOS Stream c4f3dd0731ba6 ("nfsd: handle failure to collect pre/post-op attrs more sanely") is present and causes a rejects for hunks #4 and #5 against fs/nfsd/vfs.c, apply manually. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") moves ovl_xattr_set() and ovl_xattr_get() from fs/overlayfs/inode.c to fs/overlayfs/xattrs.c which causes hunks #4 and #5 to fail, manually apply to fs/overlayfs/xattrs.c. CentOS Stream commit 55177e4b8365f ("ovl: mark xwhiteouts directory with overlay.opaque='x'") and commit d17b324bb6e9d ("ovl: use ovl_numlower() and ovl_lowerstack() accessors") change the first and third hunks of fs/overlayfs/namei.c causing them to fail, manually apply. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") causes fuzz 2 in hunk #5 of fs/overlayfs/overlayfs.h CentOS Stream commit 355a9c490a076 ("ovl: Add an alternative type of whiteout") changes ovl_cache_update_ino() to ovl_cache_update() in fs/overlayfs/readdir.c, make the change manually. Upstream commit 217af7e2f4deb ("apparmor: refactor profile rules and attachments") is not in CentOS Stream causing hunk #1 to fail to apply so manually apply the change. commit 4609e1f18e19c3b302e1eb4858334bca1532f780 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:22 2023 +0100 fs: port ->permission() to pass mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-22 05:41:52 +00:00
int kernfs_iop_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask)
{
struct kernfs_node *kn;
struct kernfs_root *root;
kernfs: use i_lock to protect concurrent inode updates Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2004858 Upstream status: Linus Testing: The series has been included in RHEL-8 and customer testing has been done there. The upstreaming process includes fairly broad general testing as well. commit 47b5c64d0ab5e7136db2b78c6ec710e0d8a5a36b From: Ian Kent <raven@themaw.net> Date: 2021-07-16 17:28:34 +0800 kernfs: use i_lock to protect concurrent inode updates The inode operations .permission() and .getattr() use the kernfs node write lock but all that's needed is the read lock to protect against partial updates of these kernfs node fields which are all done under the write lock. And .permission() is called frequently during path walks and can cause quite a bit of contention between kernfs node operations and path walks when the number of concurrent walks is high. To change kernfs_iop_getattr() and kernfs_iop_permission() to take the rw sem read lock instead of the write lock an additional lock is needed to protect against multiple processes concurrently updating the inode attributes and link count in kernfs_refresh_inode(). The inode i_lock seems like the sensible thing to use to protect these inode attribute updates so use it in kernfs_refresh_inode(). The last hunk in the patch, applied to kernfs_fill_super(), is possibly not needed but taking the lock was present originally. I prefer to continue to take it to protect against a partial update of the source kernfs fields during the call to kernfs_refresh_inode() made by kernfs_get_inode(). Reviewed-by: Miklos Szeredi <mszeredi@redhat.com> Signed-off-by: Ian Kent <raven@themaw.net> Link: https://lore.kernel.org/r/162642771474.63632.16295959115893904470.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2021-11-29 05:54:21 +00:00
int ret;
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
kn = inode->i_private;
root = kernfs_root(kn);
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
fs: port ->permission() to pass mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: For consistency drop btrfs hunks because it isn't supported in CentOS Stream and other backports also drop such hunks. CentOS Stream commit 48fa94aacd100 ("ceph: fscrypt_auth handling for ceph") is presnt which causes fuzz 2 in hunk #1 in fs/ceph/super.h. Upstream commit 427505ffeaa46 ("exportfs: use pr_debug for unreachable debug statements") is not present causing fuzz 2 in hunk #1 against fs/exportfs/expfs.c. Dropped hunks for ksmbd because the source is not present in the CentOS Stream source tree. Upstream commit 03fa86e9f79d8 ("namei: stash the sampled ->d_seq into nameidata") is not present causing a fuzz 1 for hunk #14 against fs/namei.c. CentOS Stream c4f3dd0731ba6 ("nfsd: handle failure to collect pre/post-op attrs more sanely") is present and causes a rejects for hunks #4 and #5 against fs/nfsd/vfs.c, apply manually. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") moves ovl_xattr_set() and ovl_xattr_get() from fs/overlayfs/inode.c to fs/overlayfs/xattrs.c which causes hunks #4 and #5 to fail, manually apply to fs/overlayfs/xattrs.c. CentOS Stream commit 55177e4b8365f ("ovl: mark xwhiteouts directory with overlay.opaque='x'") and commit d17b324bb6e9d ("ovl: use ovl_numlower() and ovl_lowerstack() accessors") change the first and third hunks of fs/overlayfs/namei.c causing them to fail, manually apply. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") causes fuzz 2 in hunk #5 of fs/overlayfs/overlayfs.h CentOS Stream commit 355a9c490a076 ("ovl: Add an alternative type of whiteout") changes ovl_cache_update_ino() to ovl_cache_update() in fs/overlayfs/readdir.c, make the change manually. Upstream commit 217af7e2f4deb ("apparmor: refactor profile rules and attachments") is not in CentOS Stream causing hunk #1 to fail to apply so manually apply the change. commit 4609e1f18e19c3b302e1eb4858334bca1532f780 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:22 2023 +0100 fs: port ->permission() to pass mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-22 05:41:52 +00:00
ret = generic_permission(&nop_mnt_idmap, inode, mask);
kernfs: Introduce separate rwsem to protect inode attributes JIRA: https://issues.redhat.com/browse/RHEL-52956 Upstream status: Linus Conflicts: There's a reject of hunks 2, 4 and 5 against fs/kernfs/inode.c because idmapping updates have not yet been made to the RHEL9 kernel. The RH_KABI_EXTEND() macro is used to add the new lock to the kernfs_root structure. This should be fine because CentOS Stream commit 25d13c0bf2507 ("kernfs: move struct kernfs_root out of the public view") has already been included in RHEL-9 making the kernfs_root structure private to kernfs. commit 9caf696142252a466fb89e629d0eddcdced027b0 Author: Imran Khan <imran.f.khan@oracle.com> Date: Thu Mar 9 22:09:30 2023 +1100 kernfs: Introduce separate rwsem to protect inode attributes. Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs operations. On a large system with few hundred CPUs and few hundred applications simultaneoulsy trying to access sysfs, this results in multiple sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate. For example on a system with 384 cores, if I run 200 instances of an application which is mostly executing the following loop: for (int loop = 0; loop <100 ; loop++) { for (int port_num = 1; port_num < 2; port_num++) { for (int gid_index = 0; gid_index < 254; gid_index++ ) { char ret_buf[64], ret_buf_lo[64]; char gid_file_path[1024]; int ret_len; int ret_fd; ssize_t ret_rd; ub4 i, saved_errno; memset(ret_buf, 0, sizeof(ret_buf)); memset(gid_file_path, 0, sizeof(gid_file_path)); ret_len = snprintf(gid_file_path, sizeof(gid_file_path), "/sys/class/infiniband/%s/ports/%d/gids/%d", dev_name, port_num, gid_index); ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC); if (ret_fd < 0) { printf("Failed to open %s\n", gid_file_path); continue; } /* Read the GID */ ret_rd = read(ret_fd, ret_buf, 40); if (ret_rd == -1) { printf("Failed to read from file %s, errno: %u\n", gid_file_path, saved_errno); continue; } close(ret_fd); } } I see contention around kernfs_rwsem as follows: path_openat | |----link_path_walk.part.0.constprop.0 | | | |--49.92%--inode_permission | | | | | --48.69%--kernfs_iop_permission | | | | | |--18.16%--down_read | | | | | |--15.38%--up_read | | | | | --14.58%--_raw_spin_lock | | | | | ----- | | | |--29.08%--walk_component | | | | | --29.02%--lookup_fast | | | | | |--24.26%--kernfs_dop_revalidate | | | | | | | |--14.97%--down_read | | | | | | | --9.01%--up_read | | | | | --4.74%--__d_lookup | | | | | --4.64%--_raw_spin_lock | | | | | ---- Having a separate per-fs rwsem to protect kernfs inode attributes, will avoid the above mentioned contention and result in better performance as can bee seen below: path_openat | |----link_path_walk.part.0.constprop.0 | | | | | |--27.06%--inode_permission | | | | | --25.84%--kernfs_iop_permission | | | | | |--9.29%--up_read | | | | | |--8.19%--down_read | | | | | --7.89%--_raw_spin_lock | | | | | ---- | | | |--22.42%--walk_component | | | | | --22.36%--lookup_fast | | | | | |--16.07%--__d_lookup | | | | | | | --16.01%--_raw_spin_lock | | | | | | | ---- | | | | | --6.28%--kernfs_dop_revalidate | | | | | |--3.76%--down_read | | | | | --2.26%--up_read As can be seen from the above data the overhead due to both kerfs_iop_permission and kernfs_dop_revalidate have gone down and this also reduces overall run time of the earlier mentioned loop. Signed-off-by: Imran Khan <imran.f.khan@oracle.com> Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-08-07 01:55:39 +00:00
up_read(&root->kernfs_iattr_rwsem);
kernfs: use i_lock to protect concurrent inode updates Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2004858 Upstream status: Linus Testing: The series has been included in RHEL-8 and customer testing has been done there. The upstreaming process includes fairly broad general testing as well. commit 47b5c64d0ab5e7136db2b78c6ec710e0d8a5a36b From: Ian Kent <raven@themaw.net> Date: 2021-07-16 17:28:34 +0800 kernfs: use i_lock to protect concurrent inode updates The inode operations .permission() and .getattr() use the kernfs node write lock but all that's needed is the read lock to protect against partial updates of these kernfs node fields which are all done under the write lock. And .permission() is called frequently during path walks and can cause quite a bit of contention between kernfs node operations and path walks when the number of concurrent walks is high. To change kernfs_iop_getattr() and kernfs_iop_permission() to take the rw sem read lock instead of the write lock an additional lock is needed to protect against multiple processes concurrently updating the inode attributes and link count in kernfs_refresh_inode(). The inode i_lock seems like the sensible thing to use to protect these inode attribute updates so use it in kernfs_refresh_inode(). The last hunk in the patch, applied to kernfs_fill_super(), is possibly not needed but taking the lock was present originally. I prefer to continue to take it to protect against a partial update of the source kernfs fields during the call to kernfs_refresh_inode() made by kernfs_get_inode(). Reviewed-by: Miklos Szeredi <mszeredi@redhat.com> Signed-off-by: Ian Kent <raven@themaw.net> Link: https://lore.kernel.org/r/162642771474.63632.16295959115893904470.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2021-11-29 05:54:21 +00:00
return ret;
}
int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
void *value, size_t size)
{
struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn);
if (!attrs)
return -ENODATA;
return simple_xattr_get(&attrs->xattrs, name, value, size);
}
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
const void *value, size_t size, int flags)
{
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
struct simple_xattr *old_xattr;
struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags);
if (IS_ERR(old_xattr))
return PTR_ERR(old_xattr);
simple_xattr_free(old_xattr);
return 0;
}
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *suffix, void *value, size_t size)
{
const char *name = xattr_full_name(handler, suffix);
struct kernfs_node *kn = inode->i_private;
return kernfs_xattr_get(kn, name, value, size);
}
static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
fs: port xattr to mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: The cifs source has been moved in CentOS Stream so manually apply rejected hunk to fs/smb/client/xattr.c. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") moved ovl_own_xattr_set(), manually apply changes. CentOS Stream commit 67e2fcb2f34bd ("evm: don't copy up 'security.evm' xattr") is present causing hunk #1 against include/linux/evm.h to be rejected, manually apply. Upstream commit 5d1ef2ce13a90 ("ima: Introduce ima_get_current_hash_algo()") is not present in CentOS Stream which causes fuzz 1 for hunk #1 against include/linux/ima.h. There's a reject of hunk #1 for include/linux/lsm_hooks.h but I can't see any reason for it, manually applied the hunk. CentOS Stream does not have upstream commit ce5bb5a86e5eb ("ima: Return int in the functions to measure a buffer") which results in a reject of hunk #2 against security/integrity/ima/ima.h and hunks #8 and #11 against security/integrity/ima/ima_main.c, so manually apply hunks. There also appears to be a whitespace mismatch causing hunk #7 to report fuzz 2 on application. CentOS Stream does not have upstream commit c7423dbdbc9ec ("ima: Handle -ESTALE returned by ima_filter_rule_match()") which results in a reject of hunk #3 against security/integrity/ima/ima_policy.c, so manually apply hunk. commit 39f60c1ccee72caa0104145b5dbf5d37cce1ea39 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:23 2023 +0100 fs: port xattr to mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-22 06:41:29 +00:00
struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *suffix, const void *value,
size_t size, int flags)
{
const char *name = xattr_full_name(handler, suffix);
struct kernfs_node *kn = inode->i_private;
return kernfs_xattr_set(kn, name, value, size, flags);
}
static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
const char *full_name,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
struct simple_xattr *old_xattr;
int ret;
if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
ret = -ENOSPC;
goto dec_count_out;
}
if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
ret = -ENOSPC;
goto dec_size_out;
}
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
if (!old_xattr)
return 0;
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
if (IS_ERR(old_xattr)) {
ret = PTR_ERR(old_xattr);
goto dec_size_out;
}
ret = 0;
size = old_xattr->size;
simple_xattr_free(old_xattr);
dec_size_out:
atomic_sub(size, sz);
dec_count_out:
atomic_dec(nr);
return ret;
}
static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
const char *full_name,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
struct simple_xattr *old_xattr;
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
if (!old_xattr)
return 0;
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
if (IS_ERR(old_xattr))
return PTR_ERR(old_xattr);
xattr: simple_xattr_set() return old_xattr to be freed JIRA: https://issues.redhat.com/browse/RHEL-27743 Conflicts: * mm/shmem.c: this commit had a merge conflict upstream with commit 6528733416f1 ("shmem: convert to ctime accessor functions"), backported earlier in this set. The conflict was solved via merge commit ecd7db20474c ("Merge tag 'v6.6-vfs.tmpfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs"), from which we borrow the hunk adjustment for this backport. This patch is a backport of the following upstream commit: commit 5de75970c9fd7220e394b76e6d20fbafa1369b5a Author: Hugh Dickins <hughd@google.com> Date: Tue Aug 8 21:30:59 2023 -0700 xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christian Brauner <brauner@kernel.org> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner <brauner@kernel.org> Signed-off-by: Rafael Aquini <raquini@redhat.com>
2024-09-27 14:44:59 +00:00
atomic_sub(old_xattr->size, sz);
atomic_dec(nr);
simple_xattr_free(old_xattr);
return 0;
}
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
fs: port xattr to mnt_idmap JIRA: https://issues.redhat.com/browse/RHEL-33888 Status: Linus Conflicts: The cifs source has been moved in CentOS Stream so manually apply rejected hunk to fs/smb/client/xattr.c. Dropped hunks for ntfs3 because the source is not present in the CentOS Stream source tree. CentOS Stream commit 98ba731fc7eae ("ovl: Move xattr support to new xattrs.c file") moved ovl_own_xattr_set(), manually apply changes. CentOS Stream commit 67e2fcb2f34bd ("evm: don't copy up 'security.evm' xattr") is present causing hunk #1 against include/linux/evm.h to be rejected, manually apply. Upstream commit 5d1ef2ce13a90 ("ima: Introduce ima_get_current_hash_algo()") is not present in CentOS Stream which causes fuzz 1 for hunk #1 against include/linux/ima.h. There's a reject of hunk #1 for include/linux/lsm_hooks.h but I can't see any reason for it, manually applied the hunk. CentOS Stream does not have upstream commit ce5bb5a86e5eb ("ima: Return int in the functions to measure a buffer") which results in a reject of hunk #2 against security/integrity/ima/ima.h and hunks #8 and #11 against security/integrity/ima/ima_main.c, so manually apply hunks. There also appears to be a whitespace mismatch causing hunk #7 to report fuzz 2 on application. CentOS Stream does not have upstream commit c7423dbdbc9ec ("ima: Handle -ESTALE returned by ima_filter_rule_match()") which results in a reject of hunk #3 against security/integrity/ima/ima_policy.c, so manually apply hunk. commit 39f60c1ccee72caa0104145b5dbf5d37cce1ea39 Author: Christian Brauner <brauner@kernel.org> Date: Fri Jan 13 12:49:23 2023 +0100 fs: port xattr to mnt_idmap Convert to struct mnt_idmap. Last cycle we merged the necessary infrastructure in 256c8aed2b42 ("fs: introduce dedicated idmap type for mounts"). This is just the conversion to struct mnt_idmap. Currently we still pass around the plain namespace that was attached to a mount. This is in general pretty convenient but it makes it easy to conflate namespaces that are relevant on the filesystem with namespaces that are relevent on the mount level. Especially for non-vfs developers without detailed knowledge in this area this can be a potential source for bugs. Once the conversion to struct mnt_idmap is done all helpers down to the really low-level helpers will take a struct mnt_idmap argument instead of two namespace arguments. This way it becomes impossible to conflate the two eliminating the possibility of any bugs. All of the vfs and all filesystems only operate on struct mnt_idmap. Acked-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org> Signed-off-by: Ian Kent <ikent@redhat.com>
2024-05-22 06:41:29 +00:00
struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
const char *suffix, const void *value,
size_t size, int flags)
{
const char *full_name = xattr_full_name(handler, suffix);
struct kernfs_node *kn = inode->i_private;
struct kernfs_iattrs *attrs;
if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
return -EOPNOTSUPP;
attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
if (value)
return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
value, size, flags);
else
return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
value, size, flags);
}
static const struct xattr_handler kernfs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = kernfs_vfs_xattr_get,
.set = kernfs_vfs_xattr_set,
};
static const struct xattr_handler kernfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = kernfs_vfs_xattr_get,
.set = kernfs_vfs_xattr_set,
};
static const struct xattr_handler kernfs_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
.get = kernfs_vfs_xattr_get,
.set = kernfs_vfs_user_xattr_set,
};
const struct xattr_handler *kernfs_xattr_handlers[] = {
&kernfs_trusted_xattr_handler,
&kernfs_security_xattr_handler,
&kernfs_user_xattr_handler,
NULL
};