2017-11-01 14:08:43 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
2012-10-13 09:46:48 +00:00
|
|
|
#ifndef _UAPI_LINUX_FS_H
|
|
|
|
#define _UAPI_LINUX_FS_H
|
|
|
|
|
|
|
|
/*
|
2016-01-08 21:01:25 +00:00
|
|
|
* This file has definitions for some important file table structures
|
|
|
|
* and constants and structures used by various generic file system
|
|
|
|
* ioctl's. Please do not make any changes in this file before
|
|
|
|
* sending patches for review to linux-fsdevel@vger.kernel.org and
|
|
|
|
* linux-api@vger.kernel.org.
|
2012-10-13 09:46:48 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/limits.h>
|
|
|
|
#include <linux/ioctl.h>
|
|
|
|
#include <linux/types.h>
|
2019-08-05 02:35:43 +00:00
|
|
|
#ifndef __KERNEL__
|
|
|
|
#include <linux/fscrypt.h>
|
|
|
|
#endif
|
2012-10-13 09:46:48 +00:00
|
|
|
|
2018-11-01 23:07:23 +00:00
|
|
|
/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */
|
|
|
|
#if !defined(__KERNEL__)
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#endif
|
|
|
|
|
2012-10-13 09:46:48 +00:00
|
|
|
/*
|
|
|
|
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
|
|
|
|
* the file limit at runtime and only root can increase the per-process
|
|
|
|
* nr_file rlimit, so it's safe to set up a ridiculously high absolute
|
|
|
|
* upper limit on files-per-process.
|
|
|
|
*
|
|
|
|
* Some programs (notably those using select()) may have to be
|
|
|
|
* recompiled to take full advantage of the new limits..
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Fixed constants first: */
#undef NR_OPEN
#define INR_OPEN_CUR 1024	/* Initial setting for nfile rlimits */
#define INR_OPEN_MAX 4096	/* Hard limit for nfile rlimits */

/* Basic block size used throughout the VFS: 1 << 10 == 1024 bytes. */
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
|
|
|
|
|
2024-11-28 11:22:34 +00:00
|
|
|
/* flags for integrity meta */
#define IO_INTEGRITY_CHK_GUARD	(1U << 0)	/* enforce guard check */
#define IO_INTEGRITY_CHK_REFTAG	(1U << 1)	/* enforce ref check */
#define IO_INTEGRITY_CHK_APPTAG	(1U << 2)	/* enforce app check */

/* All integrity-check flags currently accepted by the kernel. */
#define IO_INTEGRITY_VALID_FLAGS (IO_INTEGRITY_CHK_GUARD | \
				  IO_INTEGRITY_CHK_REFTAG | \
				  IO_INTEGRITY_CHK_APPTAG)
|
|
|
|
|
2012-10-13 09:46:48 +00:00
|
|
|
/* lseek(2)/llseek whence values; SEEK_MAX is the highest valid one. */
#define SEEK_SET	0	/* seek relative to beginning of file */
#define SEEK_CUR	1	/* seek relative to current file position */
#define SEEK_END	2	/* seek relative to end of file */
#define SEEK_DATA	3	/* seek to the next data */
#define SEEK_HOLE	4	/* seek to the next hole */
#define SEEK_MAX	SEEK_HOLE
|
|
|
|
|
2014-04-01 15:08:43 +00:00
|
|
|
/* Flags for renameat2(2). */
#define RENAME_NOREPLACE	(1 << 0)	/* Don't overwrite target */
#define RENAME_EXCHANGE		(1 << 1)	/* Exchange source and dest */
#define RENAME_WHITEOUT		(1 << 2)	/* Whiteout source */
|
2014-04-01 15:08:43 +00:00
|
|
|
|
2015-12-03 11:59:50 +00:00
|
|
|
/* Argument for the FICLONERANGE ioctl: clone src_length bytes starting at
 * src_offset of src_fd into the target file at dest_offset. */
struct file_clone_range {
	__s64 src_fd;		/* in - source file descriptor */
	__u64 src_offset;	/* in - byte offset within the source file */
	__u64 src_length;	/* in - number of bytes to clone */
	__u64 dest_offset;	/* in - byte offset within the destination */
};
|
|
|
|
|
2012-10-13 09:46:48 +00:00
|
|
|
/* Argument for the FITRIM ioctl: discard free space in [start, start+len),
 * ignoring extents shorter than minlen bytes. */
struct fstrim_range {
	__u64 start;	/* in - first byte to search for free ranges */
	__u64 len;	/* in - number of bytes to search */
	__u64 minlen;	/* in - minimum extent length to discard */
};
|
|
|
|
|
2024-02-07 02:56:17 +00:00
|
|
|
/*
 * We include a length field because some filesystems (vfat) have an identifier
 * that we do want to expose as a UUID, but doesn't have the standard length.
 *
 * We use a fixed size buffer because this interface will, by fiat, never
 * support "UUIDs" longer than 16 bytes; we don't want to force all downstream
 * users to have to deal with that.
 */
struct fsuuid2 {
	__u8	len;		/* number of valid bytes in uuid[] */
	__u8	uuid[16];	/* filesystem UUID, big enough by fiat */
};
|
|
|
|
|
2024-02-07 02:56:19 +00:00
|
|
|
/* Returned by FS_IOC_GETFSSYSFSPATH: the path component under /sys/fs/
 * that refers to a filesystem instance. */
struct fs_sysfs_path {
	__u8			len;		/* number of valid bytes in name[] */
	__u8			name[128];	/* path component, not NUL-terminated */
};
|
|
|
|
|
2015-12-19 08:55:59 +00:00
|
|
|
/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
#define FILE_DEDUPE_RANGE_SAME		0
#define FILE_DEDUPE_RANGE_DIFFERS	1

/* from struct btrfs_ioctl_file_extent_same_info */
struct file_dedupe_range_info {
	__s64 dest_fd;		/* in - destination file */
	__u64 dest_offset;	/* in - start of extent in destination */
	__u64 bytes_deduped;	/* out - total # of bytes we were able
				 * to dedupe from this file. */
	/* status of this dedupe operation:
	 * < 0 for error
	 * == FILE_DEDUPE_RANGE_SAME if dedupe succeeds
	 * == FILE_DEDUPE_RANGE_DIFFERS if data differs
	 */
	__s32 status;		/* out - see above description */
	__u32 reserved;		/* must be zero */
};
|
|
|
|
|
|
|
|
/* from struct btrfs_ioctl_file_extent_same_args */
|
|
|
|
struct file_dedupe_range {
|
|
|
|
__u64 src_offset; /* in - start of extent in source */
|
|
|
|
__u64 src_length; /* in - length of extent */
|
|
|
|
__u16 dest_count; /* in - total elements in info array */
|
|
|
|
__u16 reserved1; /* must be zero */
|
|
|
|
__u32 reserved2; /* must be zero */
|
2022-04-07 00:36:51 +00:00
|
|
|
struct file_dedupe_range_info info[];
|
2015-12-19 08:55:59 +00:00
|
|
|
};
|
|
|
|
|
2012-10-13 09:46:48 +00:00
|
|
|
/* And dynamically-tunable limits and defaults: */
struct files_stat_struct {
	unsigned long nr_files;		/* read only */
	unsigned long nr_free_files;	/* read only */
	unsigned long max_files;	/* tunable */
};
|
|
|
|
|
|
|
|
/* Inode statistics exported via sysctl; counters are long so very large
 * machines don't overflow them. */
struct inodes_stat_t {
	long nr_inodes;		/* total number of inodes */
	long nr_unused;		/* inodes on the unused list */
	long dummy[5];		/* padding for sysctl ABI compatibility */
};
|
|
|
|
|
|
|
|
|
|
|
|
#define NR_FILE 8192	/* default max number of file objects; this can well be larger on a larger system */
|
|
|
|
|
2016-01-04 05:44:15 +00:00
|
|
|
/*
 * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR.
 */
struct fsxattr {
	__u32		fsx_xflags;	/* xflags field value (get/set) */
	__u32		fsx_extsize;	/* extsize field value (get/set)*/
	__u32		fsx_nextents;	/* nextents field value (get) */
	__u32		fsx_projid;	/* project identifier (get/set) */
	__u32		fsx_cowextsize;	/* CoW extsize field value (get/set)*/
	unsigned char	fsx_pad[8];	/* reserved; room for growth */
};
|
|
|
|
|
|
|
|
/*
 * Flags for the fsx_xflags field
 */
#define FS_XFLAG_REALTIME	0x00000001	/* data in realtime volume */
#define FS_XFLAG_PREALLOC	0x00000002	/* preallocated file extents */
#define FS_XFLAG_IMMUTABLE	0x00000008	/* file cannot be modified */
#define FS_XFLAG_APPEND		0x00000010	/* all writes append */
#define FS_XFLAG_SYNC		0x00000020	/* all writes synchronous */
#define FS_XFLAG_NOATIME	0x00000040	/* do not update access time */
#define FS_XFLAG_NODUMP		0x00000080	/* do not include in backups */
#define FS_XFLAG_RTINHERIT	0x00000100	/* create with rt bit set */
#define FS_XFLAG_PROJINHERIT	0x00000200	/* create with parents projid */
#define FS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */
#define FS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */
#define FS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */
#define FS_XFLAG_NODEFRAG	0x00002000	/* do not defragment */
#define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
#define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
#define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
#define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this */
|
|
|
|
|
2012-10-13 09:46:48 +00:00
|
|
|
/* the read-only stuff doesn't really belong here, but any other place is
   probably as bad and I don't want to create yet another include file. */

#define BLKROSET   _IO(0x12,93)	/* set device read-only (0 = read-write) */
#define BLKROGET   _IO(0x12,94)	/* get read-only status (0 = read_write) */
#define BLKRRPART  _IO(0x12,95)	/* re-read partition table */
#define BLKGETSIZE _IO(0x12,96)	/* return device size /512 (long *arg) */
#define BLKFLSBUF  _IO(0x12,97)	/* flush buffer cache */
#define BLKRASET   _IO(0x12,98)	/* set read ahead for block device */
#define BLKRAGET   _IO(0x12,99)	/* get current read ahead setting */
#define BLKFRASET  _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
#define BLKFRAGET  _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
#define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
#define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
#define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
#if 0
#define BLKPG      _IO(0x12,105)/* See blkpg.h */

/* Some people are morons. Do not use sizeof! */

#define BLKELVGET  _IOR(0x12,106,size_t)/* elevator get */
#define BLKELVSET  _IOW(0x12,107,size_t)/* elevator set */
/* This was here just to show that the number is taken -
   probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
#endif
/* A jump here: 108-111 have been used for various private purposes. */
#define BLKBSZGET  _IOR(0x12,112,size_t)
#define BLKBSZSET  _IOW(0x12,113,size_t)
#define BLKGETSIZE64 _IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
#define BLKTRACESTART _IO(0x12,116)
#define BLKTRACESTOP _IO(0x12,117)
#define BLKTRACETEARDOWN _IO(0x12,118)
#define BLKDISCARD _IO(0x12,119)
#define BLKIOMIN _IO(0x12,120)
#define BLKIOOPT _IO(0x12,121)
#define BLKALIGNOFF _IO(0x12,122)
#define BLKPBSZGET _IO(0x12,123)
#define BLKDISCARDZEROES _IO(0x12,124)
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
/* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */
|
2012-10-13 09:46:48 +00:00
|
|
|
|
|
|
|
#define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
#define FIBMAP	   _IO(0x00,1)	/* bmap access */
#define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
#define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
#define FITHAW		_IOWR('X', 120, int)	/* Thaw */
#define FITRIM		_IOWR('X', 121, struct fstrim_range)	/* Trim */
#define FICLONE		_IOW(0x94, 9, int)
#define FICLONERANGE	_IOW(0x94, 13, struct file_clone_range)
#define FIDEDUPERANGE	_IOWR(0x94, 54, struct file_dedupe_range)
|
2012-10-13 09:46:48 +00:00
|
|
|
|
2018-05-15 20:20:03 +00:00
|
|
|
#define FSLABEL_MAX 256	/* Max chars for the interface; each fs may differ */

#define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
#define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
#define	FS_IOC_GETVERSION		_IOR('v', 1, long)
#define	FS_IOC_SETVERSION		_IOW('v', 2, long)
#define FS_IOC_FIEMAP			_IOWR('f', 11, struct fiemap)
#define FS_IOC32_GETFLAGS		_IOR('f', 1, int)
#define FS_IOC32_SETFLAGS		_IOW('f', 2, int)
#define FS_IOC32_GETVERSION		_IOR('v', 1, int)
#define FS_IOC32_SETVERSION		_IOW('v', 2, int)
#define FS_IOC_FSGETXATTR		_IOR('X', 31, struct fsxattr)
#define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
#define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
#define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
/* Returns the external filesystem UUID, the same one blkid returns */
#define FS_IOC_GETFSUUID		_IOR(0x15, 0, struct fsuuid2)
/*
 * Returns the path component under /sys/fs/ that refers to this filesystem;
 * also /sys/kernel/debug/ for filesystems with debugfs exports
 */
#define FS_IOC_GETFSSYSFSPATH		_IOR(0x15, 1, struct fs_sysfs_path)
|
2012-10-13 09:46:48 +00:00
|
|
|
|
|
|
|
/*
 * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
 *
 * Note: for historical reasons, these flags were originally used and
 * defined for use by ext2/ext3, and then other file systems started
 * using these flags so they wouldn't need to write their own version
 * of chattr/lsattr (which was shipped as part of e2fsprogs). You
 * should think twice before trying to use these flags in new
 * contexts, or trying to assign these flags, since they are used both
 * as the UAPI and the on-disk encoding for ext2/3/4. Also, we are
 * almost out of 32-bit flags. :-)
 *
 * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from
 * XFS to the generic FS level interface. This uses a structure that
 * has padding and hence has more room to grow, so it may be more
 * appropriate for many new use cases.
 *
 * Please do not change these flags or interfaces before checking with
 * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org.
 */
#define	FS_SECRM_FL			0x00000001 /* Secure deletion */
#define	FS_UNRM_FL			0x00000002 /* Undelete */
#define	FS_COMPR_FL			0x00000004 /* Compress file */
#define FS_SYNC_FL			0x00000008 /* Synchronous updates */
#define FS_IMMUTABLE_FL			0x00000010 /* Immutable file */
#define FS_APPEND_FL			0x00000020 /* writes to file may only append */
#define FS_NODUMP_FL			0x00000040 /* do not dump file */
#define FS_NOATIME_FL			0x00000080 /* do not update atime */
/* Reserved for compression usage... */
#define FS_DIRTY_FL			0x00000100
#define FS_COMPRBLK_FL			0x00000200 /* One or more compressed clusters */
#define FS_NOCOMP_FL			0x00000400 /* Don't compress */
/* End compression flags --- maybe not all used */
#define FS_ENCRYPT_FL			0x00000800 /* Encrypted file */
#define FS_BTREE_FL			0x00001000 /* btree format dir */
#define FS_INDEX_FL			0x00001000 /* hash-indexed directory */
#define FS_IMAGIC_FL			0x00002000 /* AFS directory */
#define FS_JOURNAL_DATA_FL		0x00004000 /* Reserved for ext3 */
#define FS_NOTAIL_FL			0x00008000 /* file tail should not be merged */
#define FS_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
#define FS_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
#define FS_HUGE_FILE_FL			0x00040000 /* Reserved for ext4 */
#define FS_EXTENT_FL			0x00080000 /* Extents */
#define FS_VERITY_FL			0x00100000 /* Verity protected inode */
#define FS_EA_INODE_FL			0x00200000 /* Inode used for large EA */
#define FS_EOFBLOCKS_FL			0x00400000 /* Reserved for ext4 */
#define FS_NOCOW_FL			0x00800000 /* Do not cow file */
#define FS_DAX_FL			0x02000000 /* Inode is DAX */
#define FS_INLINE_DATA_FL		0x10000000 /* Reserved for ext4 */
#define FS_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
#define FS_CASEFOLD_FL			0x40000000 /* Folder is case insensitive */
#define FS_RESERVED_FL			0x80000000 /* reserved for ext2 lib */

#define FS_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
#define FS_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
|
|
|
|
|
|
|
|
|
|
|
|
/* Flags for sync_file_range(2). */
#define SYNC_FILE_RANGE_WAIT_BEFORE	1
#define SYNC_FILE_RANGE_WRITE		2
#define SYNC_FILE_RANGE_WAIT_AFTER	4
/* Convenience combination: wait, write, then wait again. */
#define SYNC_FILE_RANGE_WRITE_AND_WAIT	(SYNC_FILE_RANGE_WRITE | \
					 SYNC_FILE_RANGE_WAIT_BEFORE | \
					 SYNC_FILE_RANGE_WAIT_AFTER)
|
2012-10-13 09:46:48 +00:00
|
|
|
|
2017-07-06 16:58:37 +00:00
|
|
|
/*
 * Flags for preadv2/pwritev2:
 */

typedef int __bitwise __kernel_rwf_t;

/* high priority request, poll if possible */
#define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)

/* per-IO O_DSYNC */
#define RWF_DSYNC	((__force __kernel_rwf_t)0x00000002)

/* per-IO O_SYNC */
#define RWF_SYNC	((__force __kernel_rwf_t)0x00000004)

/* per-IO, return -EAGAIN if operation would block */
#define RWF_NOWAIT	((__force __kernel_rwf_t)0x00000008)

/* per-IO O_APPEND */
#define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)

/* per-IO negation of O_APPEND */
#define RWF_NOAPPEND	((__force __kernel_rwf_t)0x00000020)

/* Atomic Write */
#define RWF_ATOMIC	((__force __kernel_rwf_t)0x00000040)

/* buffered IO that drops the cache after reading or writing data */
#define RWF_DONTCACHE	((__force __kernel_rwf_t)0x00000080)

/* mask of flags supported by the kernel */
#define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
			 RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
			 RWF_DONTCACHE)
|
2017-06-20 12:05:40 +00:00
|
|
|
|
fs/procfs: implement efficient VMA querying API for /proc/<pid>/maps
/proc/<pid>/maps file is extremely useful in practice for various tasks
involving figuring out process memory layout, what files are backing any
given memory range, etc. One important class of applications that
absolutely rely on this are profilers/stack symbolizers (perf tool being
one of them). Patterns of use differ, but they generally would fall into
two categories.
In on-demand pattern, a profiler/symbolizer would normally capture stack
trace containing absolute memory addresses of some functions, and would
then use /proc/<pid>/maps file to find corresponding backing ELF files
(normally, only executable VMAs are of interest), file offsets within
them, and then continue from there to get yet more information (ELF
symbols, DWARF information) to get human-readable symbolic information.
This pattern is used by Meta's fleet-wide profiler, as one example.
In preprocessing pattern, application doesn't know the set of addresses of
interest, so it has to fetch all relevant VMAs (again, probably only
executable ones), store or cache them, then proceed with profiling and
stack trace capture. Once done, it would do symbolization based on stored
VMA information. This can happen at much later point in time. This
patterns is used by perf tool, as an example.
In either case, there are both performance and correctness requirement
involved. This address to VMA information translation has to be done as
efficiently as possible, but also not miss any VMA (especially in the case
of loading/unloading shared libraries). In practice, correctness can't be
guaranteed (due to process dying before VMA data can be captured, or
shared library being unloaded, etc), but any effort to maximize the chance
of finding the VMA is appreciated.
Unfortunately, for all the /proc/<pid>/maps file universality and
usefulness, it doesn't fit the above use cases 100%.
First, it's main purpose is to emit all VMAs sequentially, but in practice
captured addresses would fall only into a smaller subset of all process'
VMAs, mainly containing executable text. Yet, library would need to parse
most or all of the contents to find needed VMAs, as there is no way to
skip VMAs that are of no use. Efficient library can do the linear pass
and it is still relatively efficient, but it's definitely an overhead that
can be avoided, if there was a way to do more targeted querying of the
relevant VMA information.
Second, it's a text based interface, which makes its programmatic use from
applications and libraries more cumbersome and inefficient due to the need
to handle text parsing to get necessary pieces of information. The
overhead is actually payed both by kernel, formatting originally binary
VMA data into text, and then by user space application, parsing it back
into binary data for further use.
For the on-demand pattern of usage, described above, another problem when
writing generic stack trace symbolization library is an unfortunate
performance-vs-correctness tradeoff that needs to be made. Library has to
make a decision to either cache parsed contents of /proc/<pid>/maps (after
initial processing) to service future requests (if application requests to
symbolize another set of addresses (for the same process), captured at
some later time, which is typical for periodic/continuous profiling cases)
to avoid higher costs of re-parsing this file. Or it has to choose to
cache the contents in memory to speed up future requests. In the former
case, more memory is used for the cache and there is a risk of getting
stale data if application loads or unloads shared libraries, or otherwise
changed its set of VMAs somehow, e.g., through additional mmap() calls.
In the latter case, it's the performance hit that comes from re-opening
the file and re-parsing its contents all over again.
This patch aims to solve this problem by providing a new API built on top
of /proc/<pid>/maps. It's meant to address both non-selectiveness and
text nature of /proc/<pid>/maps, by giving user more control of what sort
of VMA(s) needs to be queried, and being binary-based interface eliminates
the overhead of text formatting (on kernel side) and parsing (on user
space side).
It's also designed to be extensible and forward/backward compatible by
including required struct size field, which user has to provide. We use
established copy_struct_from_user() approach to handle extensibility.
User has a choice to pick either getting VMA that covers provided address
or -ENOENT if none is found (exact, least surprising, case). Or, with an
extra query flag (PROCMAP_QUERY_COVERING_OR_NEXT_VMA), they can get either
VMA that covers the address (if there is one), or the closest next VMA
(i.e., VMA with the smallest vm_start > addr). The latter allows more
efficient use, but, given it could be a surprising behavior, requires an
explicit opt-in.
There is another query flag that is useful for some use cases.
PROCMAP_QUERY_FILE_BACKED_VMA instructs this API to only return
file-backed VMAs. Combining this with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
makes it possible to efficiently iterate only file-backed VMAs of the
process, which is what profilers/symbolizers are normally interested in.
All the above querying flags can be combined with (also optional) set of
desired VMA permissions flags. This allows to, for example, iterate only
an executable subset of VMAs, which is what preprocessing pattern, used by
perf tool, would benefit from, as the assumption is that captured stack
traces would have addresses of executable code. This saves time by
skipping non-executable VMAs altogether efficienty.
All these querying flags (modifiers) are orthogonal and can be combined in
a semantically meaningful and natural way.
Basing this ioctl()-based API on top of /proc/<pid>/maps's FD makes sense
given it's querying the same set of VMA data. It's also benefitial
because permission checks for /proc/<pid>/maps is performed at open time
once, and the actual data read of text contents of /proc/<pid>/maps is
done without further permission checks. We piggyback on this pattern with
ioctl()-based API as well, as that's a desired property. Both for
performance reasons, but also for security and flexibility reasons.
Allowing application to open an FD for /proc/self/maps without any extra
capabilities, and then passing it to some sort of profiling agent through
Unix-domain socket, would allow such profiling agent to not require some
of the capabilities that are otherwise expected when opening
/proc/<pid>/maps file for *another* process. This is a desirable property
for some more restricted setups.
This new ioctl-based implementation doesn't interfere with seq_file-based
implementation of /proc/<pid>/maps textual interface, and so could be used
together or independently without paying any price for that.
Note also, that fetching VMA name (e.g., backing file path, or special
hard-coded or user-provided names) is optional just like build ID. If
user sets vma_name_size to zero, kernel code won't attempt to retrieve it,
saving resources.
Earlier versions of this patch set were adding per-VMA locking, which is
why we have a code structure that is ready for abstracting mmap_lock vs
vm_lock differences (query_vma_setup(), query_vma_teardown(), and
query_vma_find_by_addr()), but given anon_vma_name() is not yet compatible
with per-VMA locking, initial implementation sticks to using only
mmap_lock for now. It will be easy to add back per-VMA locking once all
the pieces are ready later on. Which is why we keep existing code
structure with setup/teardown/query helper functions.
[andrii@kernel.org: improve PROCMAP_QUERY's compat mode handling]
Link: https://lkml.kernel.org/r/20240701174805.1897344-2-andrii@kernel.org
Link: https://lkml.kernel.org/r/20240627170900.1672542-3-andrii@kernel.org
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-27 17:08:54 +00:00
|
|
|
#define PROCFS_IOCTL_MAGIC 'f'
|
|
|
|
|
2023-08-21 14:15:14 +00:00
|
|
|
/* Pagemap ioctl */
|
fs/procfs: implement efficient VMA querying API for /proc/<pid>/maps
/proc/<pid>/maps file is extremely useful in practice for various tasks
involving figuring out process memory layout, what files are backing any
given memory range, etc. One important class of applications that
absolutely rely on this are profilers/stack symbolizers (perf tool being
one of them). Patterns of use differ, but they generally would fall into
two categories.
In on-demand pattern, a profiler/symbolizer would normally capture stack
trace containing absolute memory addresses of some functions, and would
then use /proc/<pid>/maps file to find corresponding backing ELF files
(normally, only executable VMAs are of interest), file offsets within
them, and then continue from there to get yet more information (ELF
symbols, DWARF information) to get human-readable symbolic information.
This pattern is used by Meta's fleet-wide profiler, as one example.
In preprocessing pattern, application doesn't know the set of addresses of
interest, so it has to fetch all relevant VMAs (again, probably only
executable ones), store or cache them, then proceed with profiling and
stack trace capture. Once done, it would do symbolization based on stored
VMA information. This can happen at much later point in time. This
pattern is used by perf tool, as an example.
In either case, there are both performance and correctness requirement
involved. This address to VMA information translation has to be done as
efficiently as possible, but also not miss any VMA (especially in the case
of loading/unloading shared libraries). In practice, correctness can't be
guaranteed (due to process dying before VMA data can be captured, or
shared library being unloaded, etc), but any effort to maximize the chance
of finding the VMA is appreciated.
Unfortunately, for all the /proc/<pid>/maps file universality and
usefulness, it doesn't fit the above use cases 100%.
First, its main purpose is to emit all VMAs sequentially, but in practice
captured addresses would fall only into a smaller subset of all process'
VMAs, mainly containing executable text. Yet, library would need to parse
most or all of the contents to find needed VMAs, as there is no way to
skip VMAs that are of no use. Efficient library can do the linear pass
and it is still relatively efficient, but it's definitely an overhead that
can be avoided, if there was a way to do more targeted querying of the
relevant VMA information.
Second, it's a text based interface, which makes its programmatic use from
applications and libraries more cumbersome and inefficient due to the need
to handle text parsing to get necessary pieces of information. The
overhead is actually paid both by kernel, formatting originally binary
VMA data into text, and then by user space application, parsing it back
into binary data for further use.
For the on-demand pattern of usage, described above, another problem when
writing generic stack trace symbolization library is an unfortunate
performance-vs-correctness tradeoff that needs to be made. Library has to
make a decision to either cache parsed contents of /proc/<pid>/maps (after
initial processing) to service future requests (if application requests to
symbolize another set of addresses (for the same process), captured at
some later time, which is typical for periodic/continuous profiling cases)
to avoid higher costs of re-parsing this file. Or it has to choose to
cache the contents in memory to speed up future requests. In the former
case, more memory is used for the cache and there is a risk of getting
stale data if application loads or unloads shared libraries, or otherwise
changed its set of VMAs somehow, e.g., through additional mmap() calls.
In the latter case, it's the performance hit that comes from re-opening
the file and re-parsing its contents all over again.
This patch aims to solve this problem by providing a new API built on top
of /proc/<pid>/maps. It's meant to address both non-selectiveness and
text nature of /proc/<pid>/maps, by giving user more control of what sort
of VMA(s) needs to be queried, and being binary-based interface eliminates
the overhead of text formatting (on kernel side) and parsing (on user
space side).
It's also designed to be extensible and forward/backward compatible by
including required struct size field, which user has to provide. We use
established copy_struct_from_user() approach to handle extensibility.
User has a choice to pick either getting VMA that covers provided address
or -ENOENT if none is found (exact, least surprising, case). Or, with an
extra query flag (PROCMAP_QUERY_COVERING_OR_NEXT_VMA), they can get either
VMA that covers the address (if there is one), or the closest next VMA
(i.e., VMA with the smallest vm_start > addr). The latter allows more
efficient use, but, given it could be a surprising behavior, requires an
explicit opt-in.
There is another query flag that is useful for some use cases.
PROCMAP_QUERY_FILE_BACKED_VMA instructs this API to only return
file-backed VMAs. Combining this with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
makes it possible to efficiently iterate only file-backed VMAs of the
process, which is what profilers/symbolizers are normally interested in.
All the above querying flags can be combined with (also optional) set of
desired VMA permissions flags. This allows to, for example, iterate only
an executable subset of VMAs, which is what preprocessing pattern, used by
perf tool, would benefit from, as the assumption is that captured stack
traces would have addresses of executable code. This saves time by
skipping non-executable VMAs altogether efficiently.
All these querying flags (modifiers) are orthogonal and can be combined in
a semantically meaningful and natural way.
Basing this ioctl()-based API on top of /proc/<pid>/maps's FD makes sense
given it's querying the same set of VMA data. It's also beneficial
because permission checks for /proc/<pid>/maps is performed at open time
once, and the actual data read of text contents of /proc/<pid>/maps is
done without further permission checks. We piggyback on this pattern with
ioctl()-based API as well, as that's a desired property. Both for
performance reasons, but also for security and flexibility reasons.
Allowing application to open an FD for /proc/self/maps without any extra
capabilities, and then passing it to some sort of profiling agent through
Unix-domain socket, would allow such profiling agent to not require some
of the capabilities that are otherwise expected when opening
/proc/<pid>/maps file for *another* process. This is a desirable property
for some more restricted setups.
This new ioctl-based implementation doesn't interfere with seq_file-based
implementation of /proc/<pid>/maps textual interface, and so could be used
together or independently without paying any price for that.
Note also, that fetching VMA name (e.g., backing file path, or special
hard-coded or user-provided names) is optional just like build ID. If
user sets vma_name_size to zero, kernel code won't attempt to retrieve it,
saving resources.
Earlier versions of this patch set were adding per-VMA locking, which is
why we have a code structure that is ready for abstracting mmap_lock vs
vm_lock differences (query_vma_setup(), query_vma_teardown(), and
query_vma_find_by_addr()), but given anon_vma_name() is not yet compatible
with per-VMA locking, initial implementation sticks to using only
mmap_lock for now. It will be easy to add back per-VMA locking once all
the pieces are ready later on. Which is why we keep existing code
structure with setup/teardown/query helper functions.
[andrii@kernel.org: improve PROCMAP_QUERY's compat mode handling]
Link: https://lkml.kernel.org/r/20240701174805.1897344-2-andrii@kernel.org
Link: https://lkml.kernel.org/r/20240627170900.1672542-3-andrii@kernel.org
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-27 17:08:54 +00:00
|
|
|
#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg)
|
2023-08-21 14:15:14 +00:00
|
|
|
|
|
|
|
/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */
|
|
|
|
#define PAGE_IS_WPALLOWED (1 << 0)
|
|
|
|
#define PAGE_IS_WRITTEN (1 << 1)
|
|
|
|
#define PAGE_IS_FILE (1 << 2)
|
|
|
|
#define PAGE_IS_PRESENT (1 << 3)
|
|
|
|
#define PAGE_IS_SWAPPED (1 << 4)
|
|
|
|
#define PAGE_IS_PFNZERO (1 << 5)
|
|
|
|
#define PAGE_IS_HUGE (1 << 6)
|
2023-11-06 22:09:58 +00:00
|
|
|
#define PAGE_IS_SOFT_DIRTY (1 << 7)
|
2025-03-24 06:53:26 +00:00
|
|
|
#define PAGE_IS_GUARD (1 << 8)
|
2023-08-21 14:15:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* struct page_region - Page region with flags
|
|
|
|
* @start: Start of the region
|
|
|
|
* @end: End of the region (exclusive)
|
|
|
|
* @categories: PAGE_IS_* category bitmask for the region
|
|
|
|
*/
|
|
|
|
struct page_region {
|
|
|
|
__u64 start;
|
|
|
|
__u64 end;
|
|
|
|
__u64 categories;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Flags for PAGEMAP_SCAN ioctl */
|
|
|
|
#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */
|
|
|
|
#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* struct pm_scan_arg - Pagemap ioctl argument
|
|
|
|
* @size: Size of the structure
|
|
|
|
* @flags: Flags for the IOCTL
|
|
|
|
* @start: Starting address of the region
|
|
|
|
* @end: Ending address of the region
|
|
|
|
* @walk_end Address where the scan stopped (written by kernel).
|
|
|
|
* walk_end == end (address tags cleared) informs that the scan completed on entire range.
|
|
|
|
* @vec: Address of page_region struct array for output
|
|
|
|
* @vec_len: Length of the page_region struct array
|
|
|
|
* @max_pages: Optional limit for number of returned pages (0 = disabled)
|
|
|
|
* @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1
|
|
|
|
* @category_mask: Skip pages for which any category doesn't match
|
|
|
|
* @category_anyof_mask: Skip pages for which no category matches
|
|
|
|
* @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned
|
|
|
|
*/
|
|
|
|
struct pm_scan_arg {
|
|
|
|
__u64 size;
|
|
|
|
__u64 flags;
|
|
|
|
__u64 start;
|
|
|
|
__u64 end;
|
|
|
|
__u64 walk_end;
|
|
|
|
__u64 vec;
|
|
|
|
__u64 vec_len;
|
|
|
|
__u64 max_pages;
|
|
|
|
__u64 category_inverted;
|
|
|
|
__u64 category_mask;
|
|
|
|
__u64 category_anyof_mask;
|
|
|
|
__u64 return_mask;
|
|
|
|
};
|
|
|
|
|
fs/procfs: implement efficient VMA querying API for /proc/<pid>/maps
/proc/<pid>/maps file is extremely useful in practice for various tasks
involving figuring out process memory layout, what files are backing any
given memory range, etc. One important class of applications that
absolutely rely on this are profilers/stack symbolizers (perf tool being
one of them). Patterns of use differ, but they generally would fall into
two categories.
In on-demand pattern, a profiler/symbolizer would normally capture stack
trace containing absolute memory addresses of some functions, and would
then use /proc/<pid>/maps file to find corresponding backing ELF files
(normally, only executable VMAs are of interest), file offsets within
them, and then continue from there to get yet more information (ELF
symbols, DWARF information) to get human-readable symbolic information.
This pattern is used by Meta's fleet-wide profiler, as one example.
In preprocessing pattern, application doesn't know the set of addresses of
interest, so it has to fetch all relevant VMAs (again, probably only
executable ones), store or cache them, then proceed with profiling and
stack trace capture. Once done, it would do symbolization based on stored
VMA information. This can happen at much later point in time. This
pattern is used by perf tool, as an example.
In either case, there are both performance and correctness requirement
involved. This address to VMA information translation has to be done as
efficiently as possible, but also not miss any VMA (especially in the case
of loading/unloading shared libraries). In practice, correctness can't be
guaranteed (due to process dying before VMA data can be captured, or
shared library being unloaded, etc), but any effort to maximize the chance
of finding the VMA is appreciated.
Unfortunately, for all the /proc/<pid>/maps file universality and
usefulness, it doesn't fit the above use cases 100%.
First, its main purpose is to emit all VMAs sequentially, but in practice
captured addresses would fall only into a smaller subset of all process'
VMAs, mainly containing executable text. Yet, library would need to parse
most or all of the contents to find needed VMAs, as there is no way to
skip VMAs that are of no use. Efficient library can do the linear pass
and it is still relatively efficient, but it's definitely an overhead that
can be avoided, if there was a way to do more targeted querying of the
relevant VMA information.
Second, it's a text based interface, which makes its programmatic use from
applications and libraries more cumbersome and inefficient due to the need
to handle text parsing to get necessary pieces of information. The
overhead is actually paid both by kernel, formatting originally binary
VMA data into text, and then by user space application, parsing it back
into binary data for further use.
For the on-demand pattern of usage, described above, another problem when
writing generic stack trace symbolization library is an unfortunate
performance-vs-correctness tradeoff that needs to be made. Library has to
make a decision to either cache parsed contents of /proc/<pid>/maps (after
initial processing) to service future requests (if application requests to
symbolize another set of addresses (for the same process), captured at
some later time, which is typical for periodic/continuous profiling cases)
to avoid higher costs of re-parsing this file. Or it has to choose to
cache the contents in memory to speed up future requests. In the former
case, more memory is used for the cache and there is a risk of getting
stale data if application loads or unloads shared libraries, or otherwise
changed its set of VMAs somehow, e.g., through additional mmap() calls.
In the latter case, it's the performance hit that comes from re-opening
the file and re-parsing its contents all over again.
This patch aims to solve this problem by providing a new API built on top
of /proc/<pid>/maps. It's meant to address both non-selectiveness and
text nature of /proc/<pid>/maps, by giving user more control of what sort
of VMA(s) needs to be queried, and being binary-based interface eliminates
the overhead of text formatting (on kernel side) and parsing (on user
space side).
It's also designed to be extensible and forward/backward compatible by
including required struct size field, which user has to provide. We use
established copy_struct_from_user() approach to handle extensibility.
User has a choice to pick either getting VMA that covers provided address
or -ENOENT if none is found (exact, least surprising, case). Or, with an
extra query flag (PROCMAP_QUERY_COVERING_OR_NEXT_VMA), they can get either
VMA that covers the address (if there is one), or the closest next VMA
(i.e., VMA with the smallest vm_start > addr). The latter allows more
efficient use, but, given it could be a surprising behavior, requires an
explicit opt-in.
There is another query flag that is useful for some use cases.
PROCMAP_QUERY_FILE_BACKED_VMA instructs this API to only return
file-backed VMAs. Combining this with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
makes it possible to efficiently iterate only file-backed VMAs of the
process, which is what profilers/symbolizers are normally interested in.
All the above querying flags can be combined with (also optional) set of
desired VMA permissions flags. This allows to, for example, iterate only
an executable subset of VMAs, which is what preprocessing pattern, used by
perf tool, would benefit from, as the assumption is that captured stack
traces would have addresses of executable code. This saves time by
skipping non-executable VMAs altogether efficiently.
All these querying flags (modifiers) are orthogonal and can be combined in
a semantically meaningful and natural way.
Basing this ioctl()-based API on top of /proc/<pid>/maps's FD makes sense
given it's querying the same set of VMA data. It's also beneficial
because permission checks for /proc/<pid>/maps is performed at open time
once, and the actual data read of text contents of /proc/<pid>/maps is
done without further permission checks. We piggyback on this pattern with
ioctl()-based API as well, as that's a desired property. Both for
performance reasons, but also for security and flexibility reasons.
Allowing application to open an FD for /proc/self/maps without any extra
capabilities, and then passing it to some sort of profiling agent through
Unix-domain socket, would allow such profiling agent to not require some
of the capabilities that are otherwise expected when opening
/proc/<pid>/maps file for *another* process. This is a desirable property
for some more restricted setups.
This new ioctl-based implementation doesn't interfere with seq_file-based
implementation of /proc/<pid>/maps textual interface, and so could be used
together or independently without paying any price for that.
Note also, that fetching VMA name (e.g., backing file path, or special
hard-coded or user-provided names) is optional just like build ID. If
user sets vma_name_size to zero, kernel code won't attempt to retrieve it,
saving resources.
Earlier versions of this patch set were adding per-VMA locking, which is
why we have a code structure that is ready for abstracting mmap_lock vs
vm_lock differences (query_vma_setup(), query_vma_teardown(), and
query_vma_find_by_addr()), but given anon_vma_name() is not yet compatible
with per-VMA locking, initial implementation sticks to using only
mmap_lock for now. It will be easy to add back per-VMA locking once all
the pieces are ready later on. Which is why we keep existing code
structure with setup/teardown/query helper functions.
[andrii@kernel.org: improve PROCMAP_QUERY's compat mode handling]
Link: https://lkml.kernel.org/r/20240701174805.1897344-2-andrii@kernel.org
Link: https://lkml.kernel.org/r/20240627170900.1672542-3-andrii@kernel.org
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-27 17:08:54 +00:00
|
|
|
/* /proc/<pid>/maps ioctl */
|
|
|
|
#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query)
|
|
|
|
|
|
|
|
enum procmap_query_flags {
|
|
|
|
/*
|
|
|
|
* VMA permission flags.
|
|
|
|
*
|
|
|
|
* Can be used as part of procmap_query.query_flags field to look up
|
|
|
|
* only VMAs satisfying specified subset of permissions. E.g., specifying
|
|
|
|
* PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs,
|
|
|
|
* while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only
|
|
|
|
* return read/write VMAs, though both executable/non-executable and
|
|
|
|
* private/shared will be ignored.
|
|
|
|
*
|
|
|
|
* PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags
|
|
|
|
* field to specify actual VMA permissions.
|
|
|
|
*/
|
|
|
|
PROCMAP_QUERY_VMA_READABLE = 0x01,
|
|
|
|
PROCMAP_QUERY_VMA_WRITABLE = 0x02,
|
|
|
|
PROCMAP_QUERY_VMA_EXECUTABLE = 0x04,
|
|
|
|
PROCMAP_QUERY_VMA_SHARED = 0x08,
|
|
|
|
/*
|
|
|
|
* Query modifier flags.
|
|
|
|
*
|
|
|
|
* By default VMA that covers provided address is returned, or -ENOENT
|
|
|
|
* is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest
|
|
|
|
* VMA with vma_start > addr will be returned if no covering VMA is
|
|
|
|
* found.
|
|
|
|
*
|
|
|
|
* PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that
|
|
|
|
* have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
|
|
|
|
* to iterate all VMAs with file backing.
|
|
|
|
*/
|
|
|
|
PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10,
|
|
|
|
PROCMAP_QUERY_FILE_BACKED_VMA = 0x20,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Input/output argument structured passed into ioctl() call. It can be used
|
|
|
|
* to query a set of VMAs (Virtual Memory Areas) of a process.
|
|
|
|
*
|
|
|
|
* Each field can be one of three kinds, marked in a short comment to the
|
|
|
|
* right of the field:
|
|
|
|
* - "in", input argument, user has to provide this value, kernel doesn't modify it;
|
|
|
|
* - "out", output argument, kernel sets this field with VMA data;
|
|
|
|
* - "in/out", input and output argument; user provides initial value (used
|
|
|
|
* to specify maximum allowable buffer size), and kernel sets it to actual
|
|
|
|
* amount of data written (or zero, if there is no data).
|
|
|
|
*
|
|
|
|
* If matching VMA is found (according to criteria specified by
|
|
|
|
* query_addr/query_flags, all the out fields are filled out, and ioctl()
|
|
|
|
* returns 0. If there is no matching VMA, -ENOENT will be returned.
|
|
|
|
* In case of any other error, negative error code other than -ENOENT is
|
|
|
|
* returned.
|
|
|
|
*
|
|
|
|
* Most of the data is similar to the one returned as text in /proc/<pid>/maps
|
|
|
|
* file, but procmap_query provides more querying flexibility. There are no
|
|
|
|
* consistency guarantees between subsequent ioctl() calls, but data returned
|
|
|
|
* for matched VMA is self-consistent.
|
|
|
|
*/
|
|
|
|
struct procmap_query {
|
|
|
|
/* Query struct size, for backwards/forward compatibility */
|
|
|
|
__u64 size;
|
|
|
|
/*
|
|
|
|
* Query flags, a combination of enum procmap_query_flags values.
|
|
|
|
* Defines query filtering and behavior, see enum procmap_query_flags.
|
|
|
|
*
|
|
|
|
* Input argument, provided by user. Kernel doesn't modify it.
|
|
|
|
*/
|
|
|
|
__u64 query_flags; /* in */
|
|
|
|
/*
|
|
|
|
* Query address. By default, VMA that covers this address will
|
|
|
|
* be looked up. PROCMAP_QUERY_* flags above modify this default
|
|
|
|
* behavior further.
|
|
|
|
*
|
|
|
|
* Input argument, provided by user. Kernel doesn't modify it.
|
|
|
|
*/
|
|
|
|
__u64 query_addr; /* in */
|
|
|
|
/* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */
|
|
|
|
__u64 vma_start; /* out */
|
|
|
|
__u64 vma_end; /* out */
|
|
|
|
/* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */
|
|
|
|
__u64 vma_flags; /* out */
|
|
|
|
/* VMA backing page size granularity. */
|
|
|
|
__u64 vma_page_size; /* out */
|
|
|
|
/*
|
|
|
|
* VMA file offset. If VMA has file backing, this specifies offset
|
|
|
|
* within the file that VMA's start address corresponds to.
|
|
|
|
* Is set to zero if VMA has no backing file.
|
|
|
|
*/
|
|
|
|
__u64 vma_offset; /* out */
|
|
|
|
/* Backing file's inode number, or zero, if VMA has no backing file. */
|
|
|
|
__u64 inode; /* out */
|
|
|
|
/* Backing file's device major/minor number, or zero, if VMA has no backing file. */
|
|
|
|
__u32 dev_major; /* out */
|
|
|
|
__u32 dev_minor; /* out */
|
|
|
|
/*
|
|
|
|
* If set to non-zero value, signals the request to return VMA name
|
|
|
|
* (i.e., VMA's backing file's absolute path, with " (deleted)" suffix
|
|
|
|
* appended, if file was unlinked from FS) for matched VMA. VMA name
|
|
|
|
* can also be some special name (e.g., "[heap]", "[stack]") or could
|
|
|
|
* be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME).
|
|
|
|
*
|
|
|
|
* Kernel will set this field to zero, if VMA has no associated name.
|
|
|
|
* Otherwise kernel will return actual amount of bytes filled in
|
|
|
|
* user-supplied buffer (see vma_name_addr field below), including the
|
|
|
|
* terminating zero.
|
|
|
|
*
|
|
|
|
* If VMA name is longer than user-supplied maximum buffer size,
|
|
|
|
* -E2BIG error is returned.
|
|
|
|
*
|
|
|
|
* If this field is set to non-zero value, vma_name_addr should point
|
|
|
|
* to valid user space memory buffer of at least vma_name_size bytes.
|
|
|
|
* If set to zero, vma_name_addr should be set to zero as well
|
|
|
|
*/
|
|
|
|
__u32 vma_name_size; /* in/out */
|
fs/procfs: add build ID fetching to PROCMAP_QUERY API
The need to get ELF build ID reliably is an important aspect when dealing
with profiling and stack trace symbolization, and /proc/<pid>/maps textual
representation doesn't help with this.
To get backing file's ELF build ID, application has to first resolve VMA,
then use its start/end address range to follow a special
/proc/<pid>/map_files/<start>-<end> symlink to open the ELF file (this is
necessary because backing file might have been removed from the disk or
was already replaced with another binary in the same file path).
Such approach, beyond just adding complexity of having to do a bunch of
extra work, has extra security implications. Because application opens
underlying ELF file and needs read access to its entire contents (as far
as kernel is concerned), kernel puts additional capable() checks on
following /proc/<pid>/map_files/<start>-<end> symlink. And that makes
sense in general.
But in the case of build ID, profiler/symbolizer doesn't need the contents
of ELF file, per se. It's only build ID that is of interest, and ELF
build ID itself doesn't provide any sensitive information.
So this patch adds a way to request backing file's ELF build ID along the
rest of VMA information in the same API. User has control over whether
this piece of information is requested or not by either setting
build_id_size field to zero or non-zero maximum buffer size they provided
through build_id_addr field (which encodes user pointer as __u64 field).
This is a completely optional piece of information, and so has no
performance implications for user cases that don't care about build ID,
while improving performance and simplifying the setup for those
application that do need it.
Kernel already implements build ID fetching, which is used from BPF
subsystem. We are reusing this code here, but plan a follow up changes to
make it work better under more relaxed assumption (compared to what
existing code assumes) of being called from user process context, in which
page faults are allowed. BPF-specific implementation currently bails out
if necessary part of ELF file is not paged in, all due to extra
BPF-specific restrictions (like the need to fetch build ID in restrictive
contexts such as NMI handler).
[andrii@kernel.org: fix integer to pointer cast warning in do_procmap_query()]
Link: https://lkml.kernel.org/r/20240701174805.1897344-1-andrii@kernel.org
Link: https://lkml.kernel.org/r/20240627170900.1672542-4-andrii@kernel.org
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-27 17:08:55 +00:00
|
|
|
/*
|
|
|
|
* If set to non-zero value, signals the request to extract and return
|
|
|
|
* VMA's backing file's build ID, if the backing file is an ELF file
|
|
|
|
* and it contains embedded build ID.
|
|
|
|
*
|
|
|
|
* Kernel will set this field to zero, if VMA has no backing file,
|
|
|
|
* backing file is not an ELF file, or ELF file has no build ID
|
|
|
|
* embedded.
|
|
|
|
*
|
|
|
|
* Build ID is a binary value (not a string). Kernel will set
|
|
|
|
* build_id_size field to exact number of bytes used for build ID.
|
|
|
|
* If build ID is requested and present, but needs more bytes than
|
|
|
|
* user-supplied maximum buffer size (see build_id_addr field below),
|
|
|
|
* -E2BIG error will be returned.
|
|
|
|
*
|
|
|
|
* If this field is set to non-zero value, build_id_addr should point
|
|
|
|
* to valid user space memory buffer of at least build_id_size bytes.
|
|
|
|
* If set to zero, build_id_addr should be set to zero as well
|
|
|
|
*/
|
|
|
|
__u32 build_id_size; /* in/out */
|
fs/procfs: implement efficient VMA querying API for /proc/<pid>/maps
/proc/<pid>/maps file is extremely useful in practice for various tasks
involving figuring out process memory layout, what files are backing any
given memory range, etc. One important class of applications that
absolutely rely on this are profilers/stack symbolizers (perf tool being
one of them). Patterns of use differ, but they generally would fall into
two categories.
In on-demand pattern, a profiler/symbolizer would normally capture stack
trace containing absolute memory addresses of some functions, and would
then use /proc/<pid>/maps file to find corresponding backing ELF files
(normally, only executable VMAs are of interest), file offsets within
them, and then continue from there to get yet more information (ELF
symbols, DWARF information) to get human-readable symbolic information.
This pattern is used by Meta's fleet-wide profiler, as one example.
In preprocessing pattern, application doesn't know the set of addresses of
interest, so it has to fetch all relevant VMAs (again, probably only
executable ones), store or cache them, then proceed with profiling and
stack trace capture. Once done, it would do symbolization based on stored
VMA information. This can happen at much later point in time. This
pattern is used by perf tool, as an example.
In either case, there are both performance and correctness requirement
involved. This address to VMA information translation has to be done as
efficiently as possible, but also not miss any VMA (especially in the case
of loading/unloading shared libraries). In practice, correctness can't be
guaranteed (due to process dying before VMA data can be captured, or
shared library being unloaded, etc), but any effort to maximize the chance
of finding the VMA is appreciated.
Unfortunately, for all the /proc/<pid>/maps file universality and
usefulness, it doesn't fit the above use cases 100%.
First, its main purpose is to emit all VMAs sequentially, but in practice
captured addresses would fall only into a smaller subset of all process'
VMAs, mainly containing executable text. Yet, library would need to parse
most or all of the contents to find needed VMAs, as there is no way to
skip VMAs that are of no use. Efficient library can do the linear pass
and it is still relatively efficient, but it's definitely an overhead that
can be avoided, if there was a way to do more targeted querying of the
relevant VMA information.
Second, it's a text based interface, which makes its programmatic use from
applications and libraries more cumbersome and inefficient due to the need
to handle text parsing to get necessary pieces of information. The
overhead is actually paid both by kernel, formatting originally binary
VMA data into text, and then by user space application, parsing it back
into binary data for further use.
For the on-demand pattern of usage, described above, another problem when
writing generic stack trace symbolization library is an unfortunate
performance-vs-correctness tradeoff that needs to be made. Library has to
make a decision to either cache parsed contents of /proc/<pid>/maps (after
initial processing) to service future requests (if application requests to
symbolize another set of addresses (for the same process), captured at
some later time, which is typical for periodic/continuous profiling cases)
to avoid higher costs of re-parsing this file. Or it has to choose to
re-open and re-parse the file's contents anew on each request. In the former
case, more memory is used for the cache and there is a risk of getting
stale data if application loads or unloads shared libraries, or otherwise
changed its set of VMAs somehow, e.g., through additional mmap() calls.
In the latter case, it's the performance hit that comes from re-opening
the file and re-parsing its contents all over again.
This patch aims to solve this problem by providing a new API built on top
of /proc/<pid>/maps. It's meant to address both non-selectiveness and
text nature of /proc/<pid>/maps, by giving user more control of what sort
of VMA(s) needs to be queried, and being binary-based interface eliminates
the overhead of text formatting (on kernel side) and parsing (on user
space side).
It's also designed to be extensible and forward/backward compatible by
including required struct size field, which user has to provide. We use
established copy_struct_from_user() approach to handle extensibility.
User has a choice to pick either getting VMA that covers provided address
or -ENOENT if none is found (exact, least surprising, case). Or, with an
extra query flag (PROCMAP_QUERY_COVERING_OR_NEXT_VMA), they can get either
VMA that covers the address (if there is one), or the closest next VMA
(i.e., VMA with the smallest vm_start > addr). The latter allows more
efficient use, but, given it could be a surprising behavior, requires an
explicit opt-in.
There is another query flag that is useful for some use cases.
PROCMAP_QUERY_FILE_BACKED_VMA instructs this API to only return
file-backed VMAs. Combining this with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
makes it possible to efficiently iterate only file-backed VMAs of the
process, which is what profilers/symbolizers are normally interested in.
All the above querying flags can be combined with (also optional) set of
desired VMA permissions flags. This allows to, for example, iterate only
an executable subset of VMAs, which is what preprocessing pattern, used by
perf tool, would benefit from, as the assumption is that captured stack
traces would have addresses of executable code. This saves time by
skipping non-executable VMAs altogether efficiently.
All these querying flags (modifiers) are orthogonal and can be combined in
a semantically meaningful and natural way.
Basing this ioctl()-based API on top of /proc/<pid>/maps's FD makes sense
given it's querying the same set of VMA data. It's also beneficial
because permission checks for /proc/<pid>/maps are performed at open time
once, and the actual data read of text contents of /proc/<pid>/maps is
done without further permission checks. We piggyback on this pattern with
ioctl()-based API as well, as that's a desired property. Both for
performance reasons, but also for security and flexibility reasons.
Allowing application to open an FD for /proc/self/maps without any extra
capabilities, and then passing it to some sort of profiling agent through
Unix-domain socket, would allow such profiling agent to not require some
of the capabilities that are otherwise expected when opening
/proc/<pid>/maps file for *another* process. This is a desirable property
for some more restricted setups.
This new ioctl-based implementation doesn't interfere with seq_file-based
implementation of /proc/<pid>/maps textual interface, and so could be used
together or independently without paying any price for that.
Note also, that fetching VMA name (e.g., backing file path, or special
hard-coded or user-provided names) is optional just like build ID. If
user sets vma_name_size to zero, kernel code won't attempt to retrieve it,
saving resources.
Earlier versions of this patch set were adding per-VMA locking, which is
why we have a code structure that is ready for abstracting mmap_lock vs
vm_lock differences (query_vma_setup(), query_vma_teardown(), and
query_vma_find_by_addr()), but given anon_vma_name() is not yet compatible
with per-VMA locking, initial implementation sticks to using only
mmap_lock for now. It will be easy to add back per-VMA locking once all
the pieces are ready later on. Which is why we keep existing code
structure with setup/teardown/query helper functions.
[andrii@kernel.org: improve PROCMAP_QUERY's compat mode handling]
Link: https://lkml.kernel.org/r/20240701174805.1897344-2-andrii@kernel.org
Link: https://lkml.kernel.org/r/20240627170900.1672542-3-andrii@kernel.org
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-27 17:08:54 +00:00
|
|
|
/*
|
|
|
|
* User-supplied address of a buffer of at least vma_name_size bytes
|
|
|
|
* for kernel to fill with matched VMA's name (see vma_name_size field
|
|
|
|
* description above for details).
|
|
|
|
*
|
|
|
|
* Should be set to zero if VMA name should not be returned.
|
|
|
|
*/
|
|
|
|
__u64 vma_name_addr; /* in */
|
fs/procfs: add build ID fetching to PROCMAP_QUERY API
The need to get ELF build ID reliably is an important aspect when dealing
with profiling and stack trace symbolization, and /proc/<pid>/maps textual
representation doesn't help with this.
To get backing file's ELF build ID, application has to first resolve VMA,
then use its start/end address range to follow a special
/proc/<pid>/map_files/<start>-<end> symlink to open the ELF file (this is
necessary because backing file might have been removed from the disk or
was already replaced with another binary in the same file path).
Such approach, beyond just adding complexity of having to do a bunch of
extra work, has extra security implications. Because application opens
underlying ELF file and needs read access to its entire contents (as far
as kernel is concerned), kernel puts additional capable() checks on
following /proc/<pid>/map_files/<start>-<end> symlink. And that makes
sense in general.
But in the case of build ID, profiler/symbolizer doesn't need the contents
of ELF file, per se. It's only build ID that is of interest, and ELF
build ID itself doesn't provide any sensitive information.
So this patch adds a way to request backing file's ELF build ID along the
rest of VMA information in the same API. User has control over whether
this piece of information is requested or not by either setting
build_id_size field to zero or non-zero maximum buffer size they provided
through build_id_addr field (which encodes user pointer as __u64 field).
This is a completely optional piece of information, and so has no
performance implications for user cases that don't care about build ID,
while improving performance and simplifying the setup for those
application that do need it.
Kernel already implements build ID fetching, which is used from BPF
subsystem. We are reusing this code here, but plan follow-up changes to
make it work better under more relaxed assumption (compared to what
existing code assumes) of being called from user process context, in which
page faults are allowed. BPF-specific implementation currently bails out
if necessary part of ELF file is not paged in, all due to extra
BPF-specific restrictions (like the need to fetch build ID in restrictive
contexts such as NMI handler).
[andrii@kernel.org: fix integer to pointer cast warning in do_procmap_query()]
Link: https://lkml.kernel.org/r/20240701174805.1897344-1-andrii@kernel.org
Link: https://lkml.kernel.org/r/20240627170900.1672542-4-andrii@kernel.org
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-27 17:08:55 +00:00
|
|
|
/*
|
|
|
|
* User-supplied address of a buffer of at least build_id_size bytes
|
|
|
|
* for kernel to fill with matched VMA's ELF build ID, if available
|
|
|
|
* (see build_id_size field description above for details).
|
|
|
|
*
|
|
|
|
* Should be set to zero if build ID should not be returned.
|
|
|
|
*/
|
|
|
|
__u64 build_id_addr; /* in */
|
fs/procfs: implement efficient VMA querying API for /proc/<pid>/maps
/proc/<pid>/maps file is extremely useful in practice for various tasks
involving figuring out process memory layout, what files are backing any
given memory range, etc. One important class of applications that
absolutely rely on this are profilers/stack symbolizers (perf tool being
one of them). Patterns of use differ, but they generally would fall into
two categories.
In on-demand pattern, a profiler/symbolizer would normally capture stack
trace containing absolute memory addresses of some functions, and would
then use /proc/<pid>/maps file to find corresponding backing ELF files
(normally, only executable VMAs are of interest), file offsets within
them, and then continue from there to get yet more information (ELF
symbols, DWARF information) to get human-readable symbolic information.
This pattern is used by Meta's fleet-wide profiler, as one example.
In preprocessing pattern, application doesn't know the set of addresses of
interest, so it has to fetch all relevant VMAs (again, probably only
executable ones), store or cache them, then proceed with profiling and
stack trace capture. Once done, it would do symbolization based on stored
VMA information. This can happen at much later point in time. This
pattern is used by the perf tool, as an example.
In either case, there are both performance and correctness requirements
involved. This address to VMA information translation has to be done as
efficiently as possible, but also not miss any VMA (especially in the case
of loading/unloading shared libraries). In practice, correctness can't be
guaranteed (due to process dying before VMA data can be captured, or
shared library being unloaded, etc), but any effort to maximize the chance
of finding the VMA is appreciated.
Unfortunately, for all the /proc/<pid>/maps file universality and
usefulness, it doesn't fit the above use cases 100%.
First, its main purpose is to emit all VMAs sequentially, but in practice
captured addresses would fall only into a smaller subset of all process'
VMAs, mainly containing executable text. Yet, library would need to parse
most or all of the contents to find needed VMAs, as there is no way to
skip VMAs that are of no use. Efficient library can do the linear pass
and it is still relatively efficient, but it's definitely an overhead that
can be avoided, if there was a way to do more targeted querying of the
relevant VMA information.
Second, it's a text based interface, which makes its programmatic use from
applications and libraries more cumbersome and inefficient due to the need
to handle text parsing to get necessary pieces of information. The
overhead is actually paid both by the kernel, formatting originally binary
VMA data into text, and then by user space application, parsing it back
into binary data for further use.
For the on-demand pattern of usage, described above, another problem when
writing generic stack trace symbolization library is an unfortunate
performance-vs-correctness tradeoff that needs to be made. Library has to
make a decision to either cache parsed contents of /proc/<pid>/maps (after
initial processing) to service future requests (if application requests to
symbolize another set of addresses (for the same process), captured at
some later time, which is typical for periodic/continuous profiling cases)
to avoid higher costs of re-parsing this file. Or it has to choose to
re-open and re-parse the file's contents anew on each request. In the former
case, more memory is used for the cache and there is a risk of getting
stale data if application loads or unloads shared libraries, or otherwise
changed its set of VMAs somehow, e.g., through additional mmap() calls.
In the latter case, it's the performance hit that comes from re-opening
the file and re-parsing its contents all over again.
This patch aims to solve this problem by providing a new API built on top
of /proc/<pid>/maps. It's meant to address both non-selectiveness and
text nature of /proc/<pid>/maps, by giving user more control of what sort
of VMA(s) needs to be queried, and being binary-based interface eliminates
the overhead of text formatting (on kernel side) and parsing (on user
space side).
It's also designed to be extensible and forward/backward compatible by
including required struct size field, which user has to provide. We use
established copy_struct_from_user() approach to handle extensibility.
User has a choice to pick either getting VMA that covers provided address
or -ENOENT if none is found (exact, least surprising, case). Or, with an
extra query flag (PROCMAP_QUERY_COVERING_OR_NEXT_VMA), they can get either
VMA that covers the address (if there is one), or the closest next VMA
(i.e., VMA with the smallest vm_start > addr). The latter allows more
efficient use, but, given it could be a surprising behavior, requires an
explicit opt-in.
There is another query flag that is useful for some use cases.
PROCMAP_QUERY_FILE_BACKED_VMA instructs this API to only return
file-backed VMAs. Combining this with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
makes it possible to efficiently iterate only file-backed VMAs of the
process, which is what profilers/symbolizers are normally interested in.
All the above querying flags can be combined with (also optional) set of
desired VMA permissions flags. This allows to, for example, iterate only
an executable subset of VMAs, which is what preprocessing pattern, used by
perf tool, would benefit from, as the assumption is that captured stack
traces would have addresses of executable code. This saves time by
skipping non-executable VMAs altogether efficiently.
All these querying flags (modifiers) are orthogonal and can be combined in
a semantically meaningful and natural way.
Basing this ioctl()-based API on top of /proc/<pid>/maps's FD makes sense
given it's querying the same set of VMA data. It's also beneficial
because permission checks for /proc/<pid>/maps are performed at open time
once, and the actual data read of text contents of /proc/<pid>/maps is
done without further permission checks. We piggyback on this pattern with
ioctl()-based API as well, as that's a desired property. Both for
performance reasons, but also for security and flexibility reasons.
Allowing application to open an FD for /proc/self/maps without any extra
capabilities, and then passing it to some sort of profiling agent through
Unix-domain socket, would allow such profiling agent to not require some
of the capabilities that are otherwise expected when opening
/proc/<pid>/maps file for *another* process. This is a desirable property
for some more restricted setups.
This new ioctl-based implementation doesn't interfere with seq_file-based
implementation of /proc/<pid>/maps textual interface, and so could be used
together or independently without paying any price for that.
Note also, that fetching VMA name (e.g., backing file path, or special
hard-coded or user-provided names) is optional just like build ID. If
user sets vma_name_size to zero, kernel code won't attempt to retrieve it,
saving resources.
Earlier versions of this patch set were adding per-VMA locking, which is
why we have a code structure that is ready for abstracting mmap_lock vs
vm_lock differences (query_vma_setup(), query_vma_teardown(), and
query_vma_find_by_addr()), but given anon_vma_name() is not yet compatible
with per-VMA locking, initial implementation sticks to using only
mmap_lock for now. It will be easy to add back per-VMA locking once all
the pieces are ready later on. Which is why we keep existing code
structure with setup/teardown/query helper functions.
[andrii@kernel.org: improve PROCMAP_QUERY's compat mode handling]
Link: https://lkml.kernel.org/r/20240701174805.1897344-2-andrii@kernel.org
Link: https://lkml.kernel.org/r/20240627170900.1672542-3-andrii@kernel.org
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-27 17:08:54 +00:00
|
|
|
};
|
|
|
|
|
2012-10-13 09:46:48 +00:00
|
|
|
#endif /* _UAPI_LINUX_FS_H */
|