#include <linux/btrfs.h>
#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/parser.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/statfs.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
#include <linux/uidgid.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>

struct shiftfs_super_info {
	struct vfsmount *mnt;
	struct user_namespace *userns;
	/* creds of process who created the super block */
	const struct cred *creator_cred;
	bool mark;
	unsigned int passthrough;
	unsigned int passthrough_mark;
};

static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
			       umode_t mode, dev_t dev, struct dentry *dentry);

#define SHIFTFS_PASSTHROUGH_NONE 0
#define SHIFTFS_PASSTHROUGH_STAT 1
#define SHIFTFS_PASSTHROUGH_IOCTL 2
#define SHIFTFS_PASSTHROUGH_ALL                                                \
	(SHIFTFS_PASSTHROUGH_STAT | SHIFTFS_PASSTHROUGH_IOCTL)

static inline bool shiftfs_passthrough_ioctls(struct shiftfs_super_info *info)
{
	if (!(info->passthrough & SHIFTFS_PASSTHROUGH_IOCTL))
		return false;

	return true;
}

static inline bool shiftfs_passthrough_statfs(struct shiftfs_super_info *info)
{
	if (!(info->passthrough & SHIFTFS_PASSTHROUGH_STAT))
		return false;

	return true;
}

enum {
	OPT_MARK,
	OPT_PASSTHROUGH,
	OPT_LAST,
};

/* global filesystem options */
static const match_table_t tokens = {
	{ OPT_MARK, "mark" },
	{ OPT_PASSTHROUGH, "passthrough=%u" },
	{ OPT_LAST, NULL }
};

static const struct cred *shiftfs_override_creds(const struct super_block *sb)
{
	struct shiftfs_super_info *sbinfo = sb->s_fs_info;

	return override_creds(sbinfo->creator_cred);
}

static inline void shiftfs_revert_object_creds(const struct cred *oldcred,
					       struct cred *newcred)
{
	revert_creds(oldcred);
	put_cred(newcred);
}

static kuid_t shift_kuid(struct user_namespace *from, struct user_namespace *to,
			 kuid_t kuid)
{
	uid_t uid = from_kuid(from, kuid);
	return make_kuid(to, uid);
}

static kgid_t shift_kgid(struct user_namespace *from, struct user_namespace *to,
			 kgid_t kgid)
{
	gid_t gid = from_kgid(from, kgid);
	return make_kgid(to, gid);
}

static int shiftfs_override_object_creds(const struct super_block *sb,
					 const struct cred **oldcred,
					 struct cred **newcred,
					 struct dentry *dentry, umode_t mode,
					 bool hardlink)
{
	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
	kuid_t fsuid = current_fsuid();
	kgid_t fsgid = current_fsgid();

	*oldcred = shiftfs_override_creds(sb);

	*newcred = prepare_creds();
	if (!*newcred) {
		revert_creds(*oldcred);
		return -ENOMEM;
	}

	(*newcred)->fsuid = shift_kuid(sb->s_user_ns, sbinfo->userns, fsuid);
	(*newcred)->fsgid = shift_kgid(sb->s_user_ns, sbinfo->userns, fsgid);

	if (!hardlink) {
		int err = security_dentry_create_files_as(dentry, mode,
							  &dentry->d_name,
							  *oldcred, *newcred);
		if (err) {
			shiftfs_revert_object_creds(*oldcred, *newcred);
			return err;
		}
	}

	put_cred(override_creds(*newcred));
	return 0;
}

static void shiftfs_copyattr(struct inode *from, struct inode *to)
{
	struct user_namespace *from_ns = from->i_sb->s_user_ns;
	struct user_namespace *to_ns = to->i_sb->s_user_ns;

	to->i_uid = shift_kuid(from_ns, to_ns, from->i_uid);
	to->i_gid = shift_kgid(from_ns, to_ns, from->i_gid);
	to->i_mode = from->i_mode;
	to->i_atime = from->i_atime;
	to->i_mtime = from->i_mtime;
	to->i_ctime = from->i_ctime;
	i_size_write(to, i_size_read(from));
}

static void shiftfs_copyflags(struct inode *from, struct inode *to)
{
	unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME;

	inode_set_flags(to, from->i_flags & mask, mask);
}

static void shiftfs_file_accessed(struct file *file)
{
	struct inode *upperi, *loweri;

	if (file->f_flags & O_NOATIME)
		return;

	upperi = file_inode(file);
	loweri = upperi->i_private;

	if (!loweri)
		return;

	upperi->i_mtime = loweri->i_mtime;
	upperi->i_ctime = loweri->i_ctime;

	touch_atime(&file->f_path);
}

static int shiftfs_parse_mount_options(struct shiftfs_super_info *sbinfo,
				       char *options)
{
	char *p;
	substring_t args[MAX_OPT_ARGS];

	sbinfo->mark = false;
	sbinfo->passthrough = 0;

	while ((p = strsep(&options, ",")) != NULL) {
		int err, intarg, token;

		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case OPT_MARK:
			sbinfo->mark = true;
			break;
		case OPT_PASSTHROUGH:
			err = match_int(&args[0], &intarg);
			if (err)
				return err;

			if (intarg & ~SHIFTFS_PASSTHROUGH_ALL)
				return -EINVAL;

			sbinfo->passthrough = intarg;
			break;
		default:
			return -EINVAL;
		}
	}

	return 0;
}

static void shiftfs_d_release(struct dentry *dentry)
{
	struct dentry *lowerd = dentry->d_fsdata;

	if (lowerd)
		dput(lowerd);
}

static struct dentry *shiftfs_d_real(struct dentry *dentry,
				     const struct inode *inode)
{
	struct dentry *lowerd = dentry->d_fsdata;

	if (inode && d_inode(dentry) == inode)
		return dentry;

	lowerd = d_real(lowerd, inode);
	if (lowerd && (!inode || inode == d_inode(lowerd)))
		return lowerd;

	WARN(1, "shiftfs_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
	     inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
	return dentry;
}

static int shiftfs_d_weak_revalidate(struct dentry *dentry, unsigned int flags)
{
	int err = 1;
	struct dentry *lowerd = dentry->d_fsdata;

	if (d_is_negative(lowerd) != d_is_negative(dentry))
		return 0;

	if ((lowerd->d_flags & DCACHE_OP_WEAK_REVALIDATE))
		err = lowerd->d_op->d_weak_revalidate(lowerd, flags);

	if (d_really_is_positive(dentry)) {
		struct inode *inode = d_inode(dentry);
		struct inode *loweri = d_inode(lowerd);

		shiftfs_copyattr(loweri, inode);
	}

	return err;
}

static int shiftfs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	int err = 1;
	struct dentry *lowerd = dentry->d_fsdata;

	if (d_unhashed(lowerd) ||
	    ((d_is_negative(lowerd) != d_is_negative(dentry))))
		return 0;

	if (flags & LOOKUP_RCU)
		return -ECHILD;

	if ((lowerd->d_flags & DCACHE_OP_REVALIDATE))
		err = lowerd->d_op->d_revalidate(lowerd, flags);

	if (d_really_is_positive(dentry)) {
		struct inode *inode = d_inode(dentry);
		struct inode *loweri = d_inode(lowerd);

		shiftfs_copyattr(loweri, inode);
	}

	return err;
}

static const struct dentry_operations shiftfs_dentry_ops = {
	.d_release	   = shiftfs_d_release,
	.d_real		   = shiftfs_d_real,
	.d_revalidate	   = shiftfs_d_revalidate,
	.d_weak_revalidate = shiftfs_d_weak_revalidate,
};

static const char *shiftfs_get_link(struct dentry *dentry, struct inode *inode,
				    struct delayed_call *done)
{
	const char *p;
	const struct cred *oldcred;
	struct dentry *lowerd;

	/* RCU lookup not supported */
	if (!dentry)
		return ERR_PTR(-ECHILD);

	lowerd = dentry->d_fsdata;
	oldcred = shiftfs_override_creds(dentry->d_sb);
	p = vfs_get_link(lowerd, done);
	revert_creds(oldcred);

	return p;
}

static int shiftfs_setxattr(struct dentry *dentry, struct inode *inode,
			    const char *name, const void *value,
			    size_t size, int flags)
{
	struct dentry *lowerd = dentry->d_fsdata;
	int err;
	const struct cred *oldcred;

	oldcred = shiftfs_override_creds(dentry->d_sb);
	err = vfs_setxattr(lowerd, name, value, size, flags);
	revert_creds(oldcred);

	shiftfs_copyattr(lowerd->d_inode, inode);

	return err;
}

static int shiftfs_xattr_get(const struct xattr_handler *handler,
			     struct dentry *dentry, struct inode *inode,
			     const char *name, void *value, size_t size)
{
	struct dentry *lowerd = dentry->d_fsdata;
	int err;
	const struct cred *oldcred;

	oldcred = shiftfs_override_creds(dentry->d_sb);
	err = vfs_getxattr(lowerd, name, value, size);
	revert_creds(oldcred);

	return err;
}

static ssize_t shiftfs_listxattr(struct dentry *dentry, char *list,
				 size_t size)
{
	struct dentry *lowerd = dentry->d_fsdata;
	int err;
	const struct cred *oldcred;

	oldcred = shiftfs_override_creds(dentry->d_sb);
	err = vfs_listxattr(lowerd, list, size);
	revert_creds(oldcred);

	return err;
}

static int shiftfs_removexattr(struct dentry *dentry, const char *name)
{
	struct dentry *lowerd = dentry->d_fsdata;
	int err;
	const struct cred *oldcred;

	oldcred = shiftfs_override_creds(dentry->d_sb);
	err = vfs_removexattr(lowerd, name);
	revert_creds(oldcred);

	/* update c/mtime */
	shiftfs_copyattr(lowerd->d_inode, d_inode(dentry));

	return err;
}

static int shiftfs_xattr_set(const struct xattr_handler *handler,
			     struct dentry *dentry, struct inode *inode,
			     const char *name, const void *value, size_t size,
			     int flags)
{
	if (!value)
		return shiftfs_removexattr(dentry, name);
	return shiftfs_setxattr(dentry, inode, name, value, size, flags);
}

static int shiftfs_inode_test(struct inode *inode, void *data)
{
	return inode->i_private == data;
}

static int shiftfs_inode_set(struct inode *inode, void *data)
{
	inode->i_private = data;
	return 0;
}

static int shiftfs_create_object(struct inode *diri, struct dentry *dentry,
				 umode_t mode, const char *symlink,
				 struct dentry *hardlink, bool excl)
{
	int err;
	const struct cred *oldcred;
	struct cred *newcred;
	void *loweri_iop_ptr = NULL;
	umode_t modei = mode;
	struct super_block *dir_sb = diri->i_sb;
	struct dentry *lowerd_new = dentry->d_fsdata;
	struct inode *inode = NULL, *loweri_dir = diri->i_private;
	const struct inode_operations *loweri_dir_iop = loweri_dir->i_op;
	struct dentry *lowerd_link = NULL;

	inode_lock_nested(loweri_dir, I_MUTEX_PARENT);

	if (hardlink) {
		loweri_iop_ptr = loweri_dir_iop->link;
	} else {
		switch (mode & S_IFMT) {
		case S_IFDIR:
			loweri_iop_ptr = loweri_dir_iop->mkdir;
			break;
		case S_IFREG:
			loweri_iop_ptr = loweri_dir_iop->create;
			break;
		case S_IFLNK:
			loweri_iop_ptr = loweri_dir_iop->symlink;
			break;
		case S_IFSOCK:
			/* fall through */
		case S_IFIFO:
			loweri_iop_ptr = loweri_dir_iop->mknod;
			break;
		}
	}
	if (!loweri_iop_ptr) {
		err = -EINVAL;
		goto out_iput;
	}

	if (!hardlink) {
		inode = new_inode(dir_sb);
		if (!inode) {
			err = -ENOMEM;
			goto out_iput;
		}

		/*
		 * new_inode() will have added the new inode to the super
		 * block's list of inodes. Further below we will call
		 * inode_insert5() Which would perform the same operation again
		 * thereby corrupting the list. To avoid this raise I_CREATING
		 * in i_state which will cause inode_insert5() to skip this
		 * step. I_CREATING will be cleared by d_instantiate_new()
		 * below.
		 */
		spin_lock(&inode->i_lock);
		inode->i_state |= I_CREATING;
		spin_unlock(&inode->i_lock);

		inode_init_owner(inode, diri, mode);
		modei = inode->i_mode;
	}

	err = shiftfs_override_object_creds(dentry->d_sb, &oldcred, &newcred,
					    dentry, modei, hardlink != NULL);
	if (err)
		goto out_iput;

	if (hardlink) {
		lowerd_link = hardlink->d_fsdata;
		err = vfs_link(lowerd_link, loweri_dir, lowerd_new, NULL);
	} else {
		switch (modei & S_IFMT) {
		case S_IFDIR:
			err = vfs_mkdir(loweri_dir, lowerd_new, modei);
			break;
		case S_IFREG:
			err = vfs_create(loweri_dir, lowerd_new, modei, excl);
			break;
		case S_IFLNK:
			err = vfs_symlink(loweri_dir, lowerd_new, symlink);
			break;
		case S_IFSOCK:
			/* fall through */
		case S_IFIFO:
			err = vfs_mknod(loweri_dir, lowerd_new, modei, 0);
			break;
		default:
			err = -EINVAL;
			break;
		}
	}

	shiftfs_revert_object_creds(oldcred, newcred);

	if (!err && WARN_ON(!lowerd_new->d_inode))
		err = -EIO;
	if (err)
		goto out_iput;

	if (hardlink) {
		inode = d_inode(hardlink);
		ihold(inode);

		/* copy up times from lower inode */
		shiftfs_copyattr(d_inode(lowerd_link), inode);
		set_nlink(d_inode(hardlink), d_inode(lowerd_link)->i_nlink);
		d_instantiate(dentry, inode);
	} else {
		struct inode *inode_tmp;
		struct inode *loweri_new = d_inode(lowerd_new);

		inode_tmp = inode_insert5(inode, (unsigned long)loweri_new,
					  shiftfs_inode_test, shiftfs_inode_set,
					  loweri_new);
		if (unlikely(inode_tmp != inode)) {
			pr_err_ratelimited("shiftfs: newly created inode found in cache\n");
			iput(inode_tmp);
			err = -EINVAL;
			goto out_iput;
		}

		ihold(loweri_new);
		shiftfs_fill_inode(inode, loweri_new->i_ino, loweri_new->i_mode,
				   0, lowerd_new);
		d_instantiate_new(dentry, inode);
	}

	shiftfs_copyattr(loweri_dir, diri);
	if (loweri_iop_ptr == loweri_dir_iop->mkdir)
		set_nlink(diri, loweri_dir->i_nlink);

	inode = NULL;

out_iput:
	iput(inode);
	inode_unlock(loweri_dir);

	return err;
}

static int shiftfs_create(struct inode *dir, struct dentry *dentry,
			  umode_t mode,  bool excl)
{
	mode |= S_IFREG;

	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, excl);
}

static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
			 umode_t mode)
{
	mode |= S_IFDIR;

	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
}

static int shiftfs_link(struct dentry *hardlink, struct inode *dir,
			struct dentry *dentry)
{
	return shiftfs_create_object(dir, dentry, 0, NULL, hardlink, false);
}

static int shiftfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
			 dev_t rdev)
{
	if (!S_ISFIFO(mode) && !S_ISSOCK(mode))
		return -EPERM;

	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
}

static int shiftfs_symlink(struct inode *dir, struct dentry *dentry,
			   const char *symlink)
{
	return shiftfs_create_object(dir, dentry, S_IFLNK, symlink, NULL, false);
}

static int shiftfs_rm(struct inode *dir, struct dentry *dentry, bool rmdir)
{
	struct dentry *lowerd = dentry->d_fsdata;
	struct inode *loweri = dir->i_private;
	struct inode *inode = d_inode(dentry);
	int err;
	const struct cred *oldcred;

	dget(lowerd);
	oldcred = shiftfs_override_creds(dentry->d_sb);
	inode_lock_nested(loweri, I_MUTEX_PARENT);
	if (rmdir)
		err = vfs_rmdir(loweri, lowerd);
	else
		err = vfs_unlink(loweri, lowerd, NULL);
	revert_creds(oldcred);

	if (!err) {
		d_drop(dentry);

		if (rmdir)
			clear_nlink(inode);
		else
			drop_nlink(inode);
	}
	inode_unlock(loweri);

	shiftfs_copyattr(loweri, dir);
	dput(lowerd);

	return err;
}

static int shiftfs_unlink(struct inode *dir, struct dentry *dentry)
{
	return shiftfs_rm(dir, dentry, false);
}

static int shiftfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	return shiftfs_rm(dir, dentry, true);
}

static int shiftfs_rename(struct inode *olddir, struct dentry *old,
			  struct inode *newdir, struct dentry *new,
			  unsigned int flags)
{
	struct dentry *lowerd_dir_old = old->d_parent->d_fsdata,
		      *lowerd_dir_new = new->d_parent->d_fsdata,
		      *lowerd_old = old->d_fsdata, *lowerd_new = new->d_fsdata,
		      *trapd;
	struct inode *loweri_dir_old = lowerd_dir_old->d_inode,
		     *loweri_dir_new = lowerd_dir_new->d_inode;
	int err = -EINVAL;
	const struct cred *oldcred;

	trapd = lock_rename(lowerd_dir_new, lowerd_dir_old);

	if (trapd == lowerd_old || trapd == lowerd_new)
		goto out_unlock;

	oldcred = shiftfs_override_creds(old->d_sb);
	err = vfs_rename(loweri_dir_old, lowerd_old, loweri_dir_new, lowerd_new,
			 NULL, flags);
	revert_creds(oldcred);

	shiftfs_copyattr(loweri_dir_old, olddir);
	shiftfs_copyattr(loweri_dir_new, newdir);

out_unlock:
	unlock_rename(lowerd_dir_new, lowerd_dir_old);

	return err;
}

static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
				     unsigned int flags)
{
	struct dentry *new;
	struct inode *newi;
	const struct cred *oldcred;
	struct dentry *lowerd = dentry->d_parent->d_fsdata;
	struct inode *inode = NULL, *loweri = lowerd->d_inode;

	inode_lock(loweri);
	oldcred = shiftfs_override_creds(dentry->d_sb);
	new = lookup_one_len(dentry->d_name.name, lowerd, dentry->d_name.len);
	revert_creds(oldcred);
	inode_unlock(loweri);

	if (IS_ERR(new))
		return new;

	dentry->d_fsdata = new;

	newi = new->d_inode;
	if (!newi)
		goto out;

	inode = iget5_locked(dentry->d_sb, (unsigned long)newi,
			     shiftfs_inode_test, shiftfs_inode_set, newi);
	if (!inode) {
		dput(new);
		return ERR_PTR(-ENOMEM);
	}
	if (inode->i_state & I_NEW) {
		/*
		 * inode->i_private set by shiftfs_inode_set(), but we still
		 * need to take a reference
		*/
		ihold(newi);
		shiftfs_fill_inode(inode, newi->i_ino, newi->i_mode, 0, new);
		unlock_new_inode(inode);
	}

out:
	return d_splice_alias(inode, dentry);
}

static int shiftfs_permission(struct inode *inode, int mask)
{
	int err;
	const struct cred *oldcred;
	struct inode *loweri = inode->i_private;

	if (!loweri) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		return -ECHILD;
	}

	err = generic_permission(inode, mask);
	if (err)
		return err;

	oldcred = shiftfs_override_creds(inode->i_sb);
	err = inode_permission(loweri, mask);
	revert_creds(oldcred);

	return err;
}

static int shiftfs_fiemap(struct inode *inode,
			  struct fiemap_extent_info *fieinfo, u64 start,
			  u64 len)
{
	int err;
	const struct cred *oldcred;
	struct inode *loweri = inode->i_private;

	if (!loweri->i_op->fiemap)
		return -EOPNOTSUPP;

	oldcred = shiftfs_override_creds(inode->i_sb);
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
		filemap_write_and_wait(loweri->i_mapping);
	err = loweri->i_op->fiemap(loweri, fieinfo, start, len);
	revert_creds(oldcred);

	return err;
}

static int shiftfs_tmpfile(struct inode *dir, struct dentry *dentry,
			   umode_t mode)
{
	int err;
	const struct cred *oldcred;
	struct dentry *lowerd = dentry->d_fsdata;
	struct inode *loweri = dir->i_private;

	if (!loweri->i_op->tmpfile)
		return -EOPNOTSUPP;

	oldcred = shiftfs_override_creds(dir->i_sb);
	err = loweri->i_op->tmpfile(loweri, lowerd, mode);
	revert_creds(oldcred);

	return err;
}

static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct dentry *lowerd = dentry->d_fsdata;
	struct inode *loweri = lowerd->d_inode;
	struct iattr newattr;
	const struct cred *oldcred;
	struct super_block *sb = dentry->d_sb;
	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
	int err;

	err = setattr_prepare(dentry, attr);
	if (err)
		return err;

	newattr = *attr;
	newattr.ia_uid = shift_kuid(sb->s_user_ns, sbinfo->userns, attr->ia_uid);
	newattr.ia_gid = shift_kgid(sb->s_user_ns, sbinfo->userns, attr->ia_gid);

	/*
	 * mode change is for clearing setuid/setgid bits. Allow lower fs
	 * to interpret this in its own way.
	 */
	if (newattr.ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
		newattr.ia_valid &= ~ATTR_MODE;

	inode_lock(loweri);
	oldcred = shiftfs_override_creds(dentry->d_sb);
	err = notify_change(lowerd, &newattr, NULL);
	revert_creds(oldcred);
	inode_unlock(loweri);

	shiftfs_copyattr(loweri, d_inode(dentry));

	return err;
}

static int shiftfs_getattr(const struct path *path, struct kstat *stat,
			   u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct dentry *lowerd = path->dentry->d_fsdata;
	struct inode *loweri = lowerd->d_inode;
	struct shiftfs_super_info *info = path->dentry->d_sb->s_fs_info;
	struct path newpath = { .mnt = info->mnt, .dentry = lowerd };
	struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
	struct user_namespace *to_ns = inode->i_sb->s_user_ns;
	const struct cred *oldcred;
	int err;

	oldcred = shiftfs_override_creds(inode->i_sb);
	err = vfs_getattr(&newpath, stat, request_mask, query_flags);
	revert_creds(oldcred);

	if (err)
		return err;

	/* transform the underlying id */
	stat->uid = shift_kuid(from_ns, to_ns, stat->uid);
	stat->gid = shift_kgid(from_ns, to_ns, stat->gid);
	return 0;
}

#ifdef CONFIG_SHIFT_FS_POSIX_ACL

static int
shift_acl_ids(struct user_namespace *from, struct user_namespace *to,
	      struct posix_acl *acl)
{
	int i;

	for (i = 0; i < acl->a_count; i++) {
		struct posix_acl_entry *e = &acl->a_entries[i];
		switch(e->e_tag) {
		case ACL_USER:
			e->e_uid = shift_kuid(from, to, e->e_uid);
			if (!uid_valid(e->e_uid))
				return -EOVERFLOW;
			break;
		case ACL_GROUP:
			e->e_gid = shift_kgid(from, to, e->e_gid);
			if (!gid_valid(e->e_gid))
				return -EOVERFLOW;
			break;
		}
	}
	return 0;
}

static void
shift_acl_xattr_ids(struct user_namespace *from, struct user_namespace *to,
		    void *value, size_t size)
{
	struct posix_acl_xattr_header *header = value;
	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
	int count;
	kuid_t kuid;
	kgid_t kgid;

	if (!value)
		return;
	if (size < sizeof(struct posix_acl_xattr_header))
		return;
	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
		return;

	count = posix_acl_xattr_count(size);
	if (count < 0)
		return;
	if (count == 0)
		return;

	for (end = entry + count; entry != end; entry++) {
		switch(le16_to_cpu(entry->e_tag)) {
		case ACL_USER:
			kuid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
			kuid = shift_kuid(from, to, kuid);
			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, kuid));
			break;
		case ACL_GROUP:
			kgid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
			kgid = shift_kgid(from, to, kgid);
			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, kgid));
			break;
		default:
			break;
		}
	}
}

static struct posix_acl *shiftfs_get_acl(struct inode *inode, int type)
{
	struct inode *loweri = inode->i_private;
	const struct cred *oldcred;
	struct posix_acl *lower_acl, *acl = NULL;
	struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
	struct user_namespace *to_ns = inode->i_sb->s_user_ns;
	int size;
	int err;

	if (!IS_POSIXACL(loweri))
		return NULL;

	oldcred = shiftfs_override_creds(inode->i_sb);
	lower_acl = get_acl(loweri, type);
	revert_creds(oldcred);

	if (lower_acl && !IS_ERR(lower_acl)) {
		/* XXX: export posix_acl_clone? */
		size = sizeof(struct posix_acl) +
		       lower_acl->a_count * sizeof(struct posix_acl_entry);
		acl = kmemdup(lower_acl, size, GFP_KERNEL);
		posix_acl_release(lower_acl);

		if (!acl)
			return ERR_PTR(-ENOMEM);

		refcount_set(&acl->a_refcount, 1);

		err = shift_acl_ids(from_ns, to_ns, acl);
		if (err) {
			kfree(acl);
			return ERR_PTR(err);
		}
	}

	return acl;
}

static int
shiftfs_posix_acl_xattr_get(const struct xattr_handler *handler,
			   struct dentry *dentry, struct inode *inode,
			   const char *name, void *buffer, size_t size)
{
	struct inode *loweri = inode->i_private;
	int ret;

	ret = shiftfs_xattr_get(NULL, dentry, inode, handler->name,
				buffer, size);
	if (ret < 0)
		return ret;

	inode_lock(loweri);
	shift_acl_xattr_ids(loweri->i_sb->s_user_ns, inode->i_sb->s_user_ns,
			    buffer, size);
	inode_unlock(loweri);
	return ret;
}

static int
shiftfs_posix_acl_xattr_set(const struct xattr_handler *handler,
			    struct dentry *dentry, struct inode *inode,
			    const char *name, const void *value,
			    size_t size, int flags)
{
	struct inode *loweri = inode->i_private;
	int err;

	if (!IS_POSIXACL(loweri) || !loweri->i_op->set_acl)
		return -EOPNOTSUPP;
	if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
		return value ? -EACCES : 0;
	if (!inode_owner_or_capable(inode))
		return -EPERM;

	if (value) {
		shift_acl_xattr_ids(inode->i_sb->s_user_ns,
				    loweri->i_sb->s_user_ns,
				    (void *)value, size);
		err = shiftfs_setxattr(dentry, inode, handler->name, value,
				       size, flags);
	} else {
		err = shiftfs_removexattr(dentry, handler->name);
	}

	if (!err)
		shiftfs_copyattr(loweri, inode);

	return err;
}

static const struct xattr_handler
shiftfs_posix_acl_access_xattr_handler = {
	.name = XATTR_NAME_POSIX_ACL_ACCESS,
	.flags = ACL_TYPE_ACCESS,
	.get = shiftfs_posix_acl_xattr_get,
	.set = shiftfs_posix_acl_xattr_set,
};

static const struct xattr_handler
shiftfs_posix_acl_default_xattr_handler = {
	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
	.flags = ACL_TYPE_DEFAULT,
	.get = shiftfs_posix_acl_xattr_get,
	.set = shiftfs_posix_acl_xattr_set,
};

#else /* !CONFIG_SHIFT_FS_POSIX_ACL */

#define shiftfs_get_acl NULL

#endif /* CONFIG_SHIFT_FS_POSIX_ACL */

static const struct inode_operations shiftfs_dir_inode_operations = {
	.lookup		= shiftfs_lookup,
	.mkdir		= shiftfs_mkdir,
	.symlink	= shiftfs_symlink,
	.unlink		= shiftfs_unlink,
	.rmdir		= shiftfs_rmdir,
	.rename		= shiftfs_rename,
	.link		= shiftfs_link,
	.setattr	= shiftfs_setattr,
	.create		= shiftfs_create,
	.mknod		= shiftfs_mknod,
	.permission	= shiftfs_permission,
	.getattr	= shiftfs_getattr,
	.listxattr	= shiftfs_listxattr,
	.get_acl	= shiftfs_get_acl,
};

static const struct inode_operations shiftfs_file_inode_operations = {
	.fiemap		= shiftfs_fiemap,
	.getattr	= shiftfs_getattr,
	.get_acl	= shiftfs_get_acl,
	.listxattr	= shiftfs_listxattr,
	.permission	= shiftfs_permission,
	.setattr	= shiftfs_setattr,
	.tmpfile	= shiftfs_tmpfile,
};

static const struct inode_operations shiftfs_special_inode_operations = {
	.getattr	= shiftfs_getattr,
	.get_acl	= shiftfs_get_acl,
	.listxattr	= shiftfs_listxattr,
	.permission	= shiftfs_permission,
	.setattr	= shiftfs_setattr,
};

static const struct inode_operations shiftfs_symlink_inode_operations = {
	.getattr	= shiftfs_getattr,
	.get_link	= shiftfs_get_link,
	.listxattr	= shiftfs_listxattr,
	.setattr	= shiftfs_setattr,
};

static struct file *shiftfs_open_realfile(const struct file *file,
					  struct inode *realinode)
{
	struct file *realfile;
	const struct cred *old_cred;
	struct inode *inode = file_inode(file);
	struct dentry *lowerd = file->f_path.dentry->d_fsdata;
	struct shiftfs_super_info *info = inode->i_sb->s_fs_info;
	struct path realpath = { .mnt = info->mnt, .dentry = lowerd };

	old_cred = shiftfs_override_creds(inode->i_sb);
	realfile = open_with_fake_path(&realpath, file->f_flags, realinode,
				       info->creator_cred);
	revert_creds(old_cred);

	return realfile;
}

#define SHIFTFS_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)

static int shiftfs_change_flags(struct file *file, unsigned int flags)
{
	struct inode *inode = file_inode(file);
	int err;

	/* if some flag changed that cannot be changed then something's amiss */
	if (WARN_ON((file->f_flags ^ flags) & ~SHIFTFS_SETFL_MASK))
		return -EIO;

	flags &= SHIFTFS_SETFL_MASK;

	if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
		return -EPERM;

	if (flags & O_DIRECT) {
		if (!file->f_mapping->a_ops ||
		    !file->f_mapping->a_ops->direct_IO)
			return -EINVAL;
	}

	if (file->f_op->check_flags) {
		err = file->f_op->check_flags(flags);
		if (err)
			return err;
	}

	spin_lock(&file->f_lock);
	file->f_flags = (file->f_flags & ~SHIFTFS_SETFL_MASK) | flags;
	spin_unlock(&file->f_lock);

	return 0;
}

static int shiftfs_open(struct inode *inode, struct file *file)
{
	struct file *realfile;

	realfile = shiftfs_open_realfile(file, inode->i_private);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	file->private_data = realfile;
	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO. */
	file->f_mapping = realfile->f_mapping;

	return 0;
}

static int shiftfs_dir_open(struct inode *inode, struct file *file)
{
	struct file *realfile;
	const struct cred *oldcred;
	struct dentry *lowerd = file->f_path.dentry->d_fsdata;
	struct shiftfs_super_info *info = inode->i_sb->s_fs_info;
	struct path realpath = { .mnt = info->mnt, .dentry = lowerd };

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	realfile = dentry_open(&realpath, file->f_flags | O_NOATIME,
			       info->creator_cred);
	revert_creds(oldcred);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	file->private_data = realfile;

	return 0;
}

static int shiftfs_release(struct inode *inode, struct file *file)
{
	struct file *realfile = file->private_data;

	if (realfile)
		fput(realfile);

	return 0;
}

static int shiftfs_dir_release(struct inode *inode, struct file *file)
{
	return shiftfs_release(inode, file);
}

static loff_t shiftfs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct file *realfile = file->private_data;

	return vfs_llseek(realfile, offset, whence);
}

static loff_t shiftfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *realinode = file_inode(file)->i_private;

	return generic_file_llseek_size(file, offset, whence,
					realinode->i_sb->s_maxbytes,
					i_size_read(realinode));
}

/* XXX: Need to figure out what to to about atime updates, maybe other
 * timestamps too ... ref. ovl_file_accessed() */

static rwf_t shiftfs_iocb_to_rwf(struct kiocb *iocb)
{
	int ifl = iocb->ki_flags;
	rwf_t flags = 0;

	if (ifl & IOCB_NOWAIT)
		flags |= RWF_NOWAIT;
	if (ifl & IOCB_HIPRI)
		flags |= RWF_HIPRI;
	if (ifl & IOCB_DSYNC)
		flags |= RWF_DSYNC;
	if (ifl & IOCB_SYNC)
		flags |= RWF_SYNC;

	return flags;
}

static int shiftfs_real_fdget(const struct file *file, struct fd *lowerfd)
{
	struct file *realfile;

	if (file->f_op->open != shiftfs_open &&
	    file->f_op->open != shiftfs_dir_open)
		return -EINVAL;

	realfile = file->private_data;
	lowerfd->flags = 0;
	lowerfd->file = realfile;

	/* Did the flags change since open? */
	if (unlikely(file->f_flags & ~lowerfd->file->f_flags))
		return shiftfs_change_flags(lowerfd->file, file->f_flags);

	return 0;
}

static ssize_t shiftfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct fd lowerfd;
	const struct cred *oldcred;
	ssize_t ret;

	if (!iov_iter_count(iter))
		return 0;

	ret = shiftfs_real_fdget(file, &lowerfd);
	if (ret)
		return ret;

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	ret = vfs_iter_read(lowerfd.file, iter, &iocb->ki_pos,
			    shiftfs_iocb_to_rwf(iocb));
	revert_creds(oldcred);

	shiftfs_file_accessed(file);

	fdput(lowerfd);
	return ret;
}

static ssize_t shiftfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct fd lowerfd;
	const struct cred *oldcred;
	ssize_t ret;

	if (!iov_iter_count(iter))
		return 0;

	inode_lock(inode);
	/* Update mode */
	shiftfs_copyattr(inode->i_private, inode);
	ret = file_remove_privs(file);
	if (ret)
		goto out_unlock;

	ret = shiftfs_real_fdget(file, &lowerfd);
	if (ret)
		goto out_unlock;

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	file_start_write(lowerfd.file);
	ret = vfs_iter_write(lowerfd.file, iter, &iocb->ki_pos,
			     shiftfs_iocb_to_rwf(iocb));
	file_end_write(lowerfd.file);
	revert_creds(oldcred);

	/* Update size */
	shiftfs_copyattr(inode->i_private, inode);

	fdput(lowerfd);

out_unlock:
	inode_unlock(inode);
	return ret;
}

static int shiftfs_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct fd lowerfd;
	const struct cred *oldcred;
	int ret;

	ret = shiftfs_real_fdget(file, &lowerfd);
	if (ret)
		return ret;

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	ret = vfs_fsync_range(lowerfd.file, start, end, datasync);
	revert_creds(oldcred);

	fdput(lowerfd);
	return ret;
}

static int shiftfs_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct file *realfile = file->private_data;
	const struct cred *oldcred;
	int ret;

	if (!realfile->f_op->mmap)
		return -ENODEV;

	if (WARN_ON(file != vma->vm_file))
		return -EIO;

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	vma->vm_file = get_file(realfile);
	ret = call_mmap(vma->vm_file, vma);
	revert_creds(oldcred);

	shiftfs_file_accessed(file);

	if (ret) {
		/*
		 * Drop refcount from new vm_file value and restore original
		 * vm_file value
		 */
		vma->vm_file = file;
		fput(realfile);
	} else {
		/* Drop refcount from previous vm_file value */
		fput(file);
	}

	return ret;
}

static long shiftfs_fallocate(struct file *file, int mode, loff_t offset,
			      loff_t len)
{
	struct inode *inode = file_inode(file);
	struct inode *loweri = inode->i_private;
	struct fd lowerfd;
	const struct cred *oldcred;
	int ret;

	ret = shiftfs_real_fdget(file, &lowerfd);
	if (ret)
		return ret;

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	ret = vfs_fallocate(lowerfd.file, mode, offset, len);
	revert_creds(oldcred);

	/* Update size */
	shiftfs_copyattr(loweri, inode);

	fdput(lowerfd);
	return ret;
}

static int shiftfs_fadvise(struct file *file, loff_t offset, loff_t len,
			   int advice)
{
	struct fd lowerfd;
	const struct cred *oldcred;
	int ret;

	ret = shiftfs_real_fdget(file, &lowerfd);
	if (ret)
		return ret;

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	ret = vfs_fadvise(lowerfd.file, offset, len, advice);
	revert_creds(oldcred);

	fdput(lowerfd);
	return ret;
}

static int shiftfs_override_ioctl_creds(int cmd, const struct super_block *sb,
					const struct cred **oldcred,
					struct cred **newcred)
{
	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
	kuid_t fsuid = current_fsuid();
	kgid_t fsgid = current_fsgid();

	*oldcred = shiftfs_override_creds(sb);

	*newcred = prepare_creds();
	if (!*newcred) {
		revert_creds(*oldcred);
		return -ENOMEM;
	}

	(*newcred)->fsuid = shift_kuid(sb->s_user_ns, sbinfo->userns, fsuid);
	(*newcred)->fsgid = shift_kgid(sb->s_user_ns, sbinfo->userns, fsgid);

	/* clear all caps to prevent bypassing capable() checks */
	cap_clear((*newcred)->cap_bset);
	cap_clear((*newcred)->cap_effective);
	cap_clear((*newcred)->cap_inheritable);
	cap_clear((*newcred)->cap_permitted);

	if (cmd == BTRFS_IOC_SNAP_DESTROY) {
		kuid_t kuid_root = make_kuid(sb->s_user_ns, 0);
		/*
		 * Allow the root user in the container to remove subvolumes
		 * from other users.
		 */
		if (uid_valid(kuid_root) && uid_eq(fsuid, kuid_root))
			cap_raise((*newcred)->cap_effective, CAP_DAC_OVERRIDE);
	}

	put_cred(override_creds(*newcred));
	return 0;
}

static inline void shiftfs_revert_ioctl_creds(const struct cred *oldcred,
					      struct cred *newcred)
{
	return shiftfs_revert_object_creds(oldcred, newcred);
}

static inline bool is_btrfs_snap_ioctl(int cmd)
{
	if ((cmd == BTRFS_IOC_SNAP_CREATE) || (cmd == BTRFS_IOC_SNAP_CREATE_V2))
		return true;

	return false;
}

static int shiftfs_btrfs_ioctl_fd_restore(int cmd, int fd, void __user *arg,
					  struct btrfs_ioctl_vol_args *v1,
					  struct btrfs_ioctl_vol_args_v2 *v2)
{
	int ret;

	if (!is_btrfs_snap_ioctl(cmd))
		return 0;

	if (cmd == BTRFS_IOC_SNAP_CREATE)
		ret = copy_to_user(arg, v1, sizeof(*v1));
	else
		ret = copy_to_user(arg, v2, sizeof(*v2));

	__close_fd(current->files, fd);
	kfree(v1);
	kfree(v2);

	return ret ? -EFAULT: 0;
}

static int shiftfs_btrfs_ioctl_fd_replace(int cmd, void __user *arg,
					  struct btrfs_ioctl_vol_args **b1,
					  struct btrfs_ioctl_vol_args_v2 **b2,
					  int *newfd)
{
	int oldfd, ret;
	struct fd src;
	struct fd lfd = {};
	struct btrfs_ioctl_vol_args *v1 = NULL;
	struct btrfs_ioctl_vol_args_v2 *v2 = NULL;

	*b1 = NULL;
	*b2 = NULL;

	if (!is_btrfs_snap_ioctl(cmd))
		return 0;

	if (cmd == BTRFS_IOC_SNAP_CREATE) {
		v1 = memdup_user(arg, sizeof(*v1));
		if (IS_ERR(v1))
			return PTR_ERR(v1);
		oldfd = v1->fd;
	} else {
		v2 = memdup_user(arg, sizeof(*v2));
		if (IS_ERR(v2))
			return PTR_ERR(v2);
		oldfd = v2->fd;
	}

	src = fdget(oldfd);
	if (!src.file) {
		ret = -EINVAL;
		goto err_free;
	}

	ret = shiftfs_real_fdget(src.file, &lfd);
	if (ret) {
		fdput(src);
		goto err_free;
	}

	/*
	 * shiftfs_real_fdget() does not take a reference to lfd.file, so
	 * take a reference here to offset the one which will be put by
	 * __close_fd(), and make sure that reference is put on fdput(lfd).
	 */
	get_file(lfd.file);
	lfd.flags |= FDPUT_FPUT;
	fdput(src);

	*newfd = get_unused_fd_flags(lfd.file->f_flags);
	if (*newfd < 0) {
		fdput(lfd);
		ret = *newfd;
		goto err_free;
	}

	fd_install(*newfd, lfd.file);

	if (cmd == BTRFS_IOC_SNAP_CREATE) {
		v1->fd = *newfd;
		ret = copy_to_user(arg, v1, sizeof(*v1));
		v1->fd = oldfd;
	} else {
		v2->fd = *newfd;
		ret = copy_to_user(arg, v2, sizeof(*v2));
		v2->fd = oldfd;
	}

	if (!ret) {
		*b1 = v1;
		*b2 = v2;
	} else {
		shiftfs_btrfs_ioctl_fd_restore(cmd, *newfd, arg, v1, v2);
		ret = -EFAULT;
	}

	return ret;

err_free:
	kfree(v1);
	kfree(v2);

	return ret;
}

static long shiftfs_real_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg)
{
	struct fd lowerfd;
	struct cred *newcred;
	const struct cred *oldcred;
	int newfd = -EBADF;
	long err = 0, ret = 0;
	void __user *argp = (void __user *)arg;
	struct super_block *sb = file->f_path.dentry->d_sb;
	struct btrfs_ioctl_vol_args *btrfs_v1 = NULL;
	struct btrfs_ioctl_vol_args_v2 *btrfs_v2 = NULL;

	ret = shiftfs_btrfs_ioctl_fd_replace(cmd, argp, &btrfs_v1, &btrfs_v2,
					     &newfd);
	if (ret < 0)
		return ret;

	ret = shiftfs_real_fdget(file, &lowerfd);
	if (ret)
		goto out_restore;

	ret = shiftfs_override_ioctl_creds(cmd, sb, &oldcred, &newcred);
	if (ret)
		goto out_fdput;

	ret = vfs_ioctl(lowerfd.file, cmd, arg);

	shiftfs_revert_ioctl_creds(oldcred, newcred);

	shiftfs_copyattr(file_inode(lowerfd.file), file_inode(file));
	shiftfs_copyflags(file_inode(lowerfd.file), file_inode(file));

out_fdput:
	fdput(lowerfd);

out_restore:
	err = shiftfs_btrfs_ioctl_fd_restore(cmd, newfd, argp,
					     btrfs_v1, btrfs_v2);
	if (!ret)
		ret = err;

	return ret;
}

static bool in_ioctl_whitelist(int flag, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	u64 flags = 0;

	switch (flag) {
	case BTRFS_IOC_FS_INFO:
		return true;
	case BTRFS_IOC_SNAP_CREATE:
		return true;
	case BTRFS_IOC_SNAP_CREATE_V2:
		return true;
	case BTRFS_IOC_SUBVOL_CREATE:
		return true;
	case BTRFS_IOC_SUBVOL_CREATE_V2:
		return true;
	case BTRFS_IOC_SUBVOL_GETFLAGS:
		return true;
	case BTRFS_IOC_SUBVOL_SETFLAGS:
		if (copy_from_user(&flags, argp, sizeof(flags)))
			return false;

		if (flags & ~BTRFS_SUBVOL_RDONLY)
			return false;

		return true;
	case BTRFS_IOC_SNAP_DESTROY:
		return true;
	}

	return false;
}

static long shiftfs_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		/* fall through */
	case FS_IOC_GETFLAGS:
		/* fall through */
	case FS_IOC_SETFLAGS:
		break;
	default:
		if (!in_ioctl_whitelist(cmd, arg) ||
		    !shiftfs_passthrough_ioctls(file->f_path.dentry->d_sb->s_fs_info))
			return -ENOTTY;
	}

	return shiftfs_real_ioctl(file, cmd, arg);
}

static long shiftfs_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		/* fall through */
	case FS_IOC32_GETFLAGS:
		/* fall through */
	case FS_IOC32_SETFLAGS:
		break;
	default:
		if (!in_ioctl_whitelist(cmd, arg) ||
		    !shiftfs_passthrough_ioctls(file->f_path.dentry->d_sb->s_fs_info))
			return -ENOIOCTLCMD;
	}

	return shiftfs_real_ioctl(file, cmd, arg);
}

enum shiftfs_copyop {
	SHIFTFS_COPY,
	SHIFTFS_CLONE,
	SHIFTFS_DEDUPE,
};

static ssize_t shiftfs_copyfile(struct file *file_in, loff_t pos_in,
				struct file *file_out, loff_t pos_out, u64 len,
				unsigned int flags, enum shiftfs_copyop op)
{
	ssize_t ret;
	struct fd real_in, real_out;
	const struct cred *oldcred;
	struct inode *inode_out = file_inode(file_out);
	struct inode *loweri = inode_out->i_private;

	ret = shiftfs_real_fdget(file_out, &real_out);
	if (ret)
		return ret;

	ret = shiftfs_real_fdget(file_in, &real_in);
	if (ret) {
		fdput(real_out);
		return ret;
	}

	oldcred = shiftfs_override_creds(inode_out->i_sb);
	switch (op) {
	case SHIFTFS_COPY:
		ret = vfs_copy_file_range(real_in.file, pos_in, real_out.file,
					  pos_out, len, flags);
		break;

	case SHIFTFS_CLONE:
		ret = vfs_clone_file_range(real_in.file, pos_in, real_out.file,
					   pos_out, len, flags);
		break;

	case SHIFTFS_DEDUPE:
		ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
						real_out.file, pos_out, len,
						flags);
		break;
	}
	revert_creds(oldcred);

	/* Update size */
	shiftfs_copyattr(loweri, inode_out);

	fdput(real_in);
	fdput(real_out);

	return ret;
}

static ssize_t shiftfs_copy_file_range(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       size_t len, unsigned int flags)
{
	return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
				SHIFTFS_COPY);
}

static loff_t shiftfs_remap_file_range(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t len, unsigned int remap_flags)
{
	enum shiftfs_copyop op;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (remap_flags & REMAP_FILE_DEDUP)
		op = SHIFTFS_DEDUPE;
	else
		op = SHIFTFS_CLONE;

	return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len,
				remap_flags, op);
}

static int shiftfs_iterate_shared(struct file *file, struct dir_context *ctx)
{
	const struct cred *oldcred;
	int err = -ENOTDIR;
	struct file *realfile = file->private_data;

	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
	err = iterate_dir(realfile, ctx);
	revert_creds(oldcred);

	return err;
}

const struct file_operations shiftfs_file_operations = {
	.open			= shiftfs_open,
	.release		= shiftfs_release,
	.llseek			= shiftfs_file_llseek,
	.read_iter		= shiftfs_read_iter,
	.write_iter		= shiftfs_write_iter,
	.fsync			= shiftfs_fsync,
	.mmap			= shiftfs_mmap,
	.fallocate		= shiftfs_fallocate,
	.fadvise		= shiftfs_fadvise,
	.unlocked_ioctl		= shiftfs_ioctl,
	.compat_ioctl		= shiftfs_compat_ioctl,
	.copy_file_range	= shiftfs_copy_file_range,
	.remap_file_range	= shiftfs_remap_file_range,
};

const struct file_operations shiftfs_dir_operations = {
	.open			= shiftfs_dir_open,
	.release		= shiftfs_dir_release,
	.compat_ioctl		= shiftfs_compat_ioctl,
	.fsync			= shiftfs_fsync,
	.iterate_shared		= shiftfs_iterate_shared,
	.llseek			= shiftfs_dir_llseek,
	.read			= generic_read_dir,
	.unlocked_ioctl		= shiftfs_ioctl,
};

static const struct address_space_operations shiftfs_aops = {
	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
	.direct_IO	= noop_direct_IO,
};

static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
			       umode_t mode, dev_t dev, struct dentry *dentry)
{
	struct inode *loweri;

	inode->i_ino = ino;
	inode->i_flags |= S_NOCMTIME;

	mode &= S_IFMT;
	inode->i_mode = mode;
	switch (mode & S_IFMT) {
	case S_IFDIR:
		inode->i_op = &shiftfs_dir_inode_operations;
		inode->i_fop = &shiftfs_dir_operations;
		break;
	case S_IFLNK:
		inode->i_op = &shiftfs_symlink_inode_operations;
		break;
	case S_IFREG:
		inode->i_op = &shiftfs_file_inode_operations;
		inode->i_fop = &shiftfs_file_operations;
		inode->i_mapping->a_ops = &shiftfs_aops;
		break;
	default:
		inode->i_op = &shiftfs_special_inode_operations;
		init_special_inode(inode, mode, dev);
		break;
	}

	if (!dentry)
		return;

	loweri = dentry->d_inode;
	if (!loweri->i_op->get_link)
		inode->i_opflags |= IOP_NOFOLLOW;

	shiftfs_copyattr(loweri, inode);
	shiftfs_copyflags(loweri, inode);
	set_nlink(inode, loweri->i_nlink);
}

static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
{
	struct super_block *sb = dentry->d_sb;
	struct shiftfs_super_info *sbinfo = sb->s_fs_info;

	if (sbinfo->mark)
		seq_show_option(m, "mark", NULL);

	if (sbinfo->passthrough)
		seq_printf(m, ",passthrough=%u", sbinfo->passthrough);

	return 0;
}

static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
	struct dentry *root = sb->s_root;
	struct dentry *realroot = root->d_fsdata;
	struct path realpath = { .mnt = sbinfo->mnt, .dentry = realroot };
	int err;

	err = vfs_statfs(&realpath, buf);
	if (err)
		return err;

	if (!shiftfs_passthrough_statfs(sbinfo))
		buf->f_type = sb->s_magic;

	return 0;
}

static void shiftfs_evict_inode(struct inode *inode)
{
	struct inode *loweri = inode->i_private;

	clear_inode(inode);

	if (loweri)
		iput(loweri);
}

static void shiftfs_put_super(struct super_block *sb)
{
	struct shiftfs_super_info *sbinfo = sb->s_fs_info;

	if (sbinfo) {
		mntput(sbinfo->mnt);
		put_cred(sbinfo->creator_cred);
		kfree(sbinfo);
	}
}

static const struct xattr_handler shiftfs_xattr_handler = {
	.prefix = "",
	.get    = shiftfs_xattr_get,
	.set    = shiftfs_xattr_set,
};

const struct xattr_handler *shiftfs_xattr_handlers[] = {
#ifdef CONFIG_SHIFT_FS_POSIX_ACL
	&shiftfs_posix_acl_access_xattr_handler,
	&shiftfs_posix_acl_default_xattr_handler,
#endif
	&shiftfs_xattr_handler,
	NULL
};

static inline bool passthrough_is_subset(int old_flags, int new_flags)
{
	if ((new_flags & old_flags) != new_flags)
		return false;

	return true;
}

static int shiftfs_super_check_flags(unsigned long old_flags,
				     unsigned long new_flags)
{
	if ((old_flags & SB_RDONLY) && !(new_flags & SB_RDONLY))
		return -EPERM;

	if ((old_flags & SB_NOSUID) && !(new_flags & SB_NOSUID))
		return -EPERM;

	if ((old_flags & SB_NODEV) && !(new_flags & SB_NODEV))
		return -EPERM;

	if ((old_flags & SB_NOEXEC) && !(new_flags & SB_NOEXEC))
		return -EPERM;

	if ((old_flags & SB_NOATIME) && !(new_flags & SB_NOATIME))
		return -EPERM;

	if ((old_flags & SB_NODIRATIME) && !(new_flags & SB_NODIRATIME))
		return -EPERM;

	if (!(old_flags & SB_POSIXACL) && (new_flags & SB_POSIXACL))
		return -EPERM;

	return 0;
}

static int shiftfs_remount(struct super_block *sb, int *flags, char *data)
{
	int err;
	struct shiftfs_super_info new = {};
	struct shiftfs_super_info *info = sb->s_fs_info;

	err = shiftfs_parse_mount_options(&new, data);
	if (err)
		return err;

	err = shiftfs_super_check_flags(sb->s_flags, *flags);
	if (err)
		return err;

	/* Mark mount option cannot be changed. */
	if (info->mark || (info->mark != new.mark))
		return -EPERM;

	if (info->passthrough != new.passthrough) {
		/* Don't allow exceeding passthrough options of mark mount. */
		if (!passthrough_is_subset(info->passthrough_mark,
					   info->passthrough))
			return -EPERM;

		info->passthrough = new.passthrough;
	}

	return 0;
}

static const struct super_operations shiftfs_super_ops = {
	.put_super	= shiftfs_put_super,
	.show_options	= shiftfs_show_options,
	.statfs		= shiftfs_statfs,
	.remount_fs	= shiftfs_remount,
	.evict_inode	= shiftfs_evict_inode,
};

struct shiftfs_data {
	void *data;
	const char *path;
};

static void shiftfs_super_force_flags(struct super_block *sb,
				      unsigned long lower_flags)
{
	sb->s_flags |= lower_flags & (SB_RDONLY | SB_NOSUID | SB_NODEV |
				      SB_NOEXEC | SB_NOATIME | SB_NODIRATIME);

	if (!(lower_flags & SB_POSIXACL))
		sb->s_flags &= ~SB_POSIXACL;
}

static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
			      int silent)
{
	int err;
	struct path path = {};
	struct shiftfs_super_info *sbinfo_mp;
	char *name = NULL;
	struct inode *inode = NULL;
	struct dentry *dentry = NULL;
	struct shiftfs_data *data = raw_data;
	struct shiftfs_super_info *sbinfo = NULL;

	if (!data->path)
		return -EINVAL;

	sb->s_fs_info = kzalloc(sizeof(*sbinfo), GFP_KERNEL);
	if (!sb->s_fs_info)
		return -ENOMEM;
	sbinfo = sb->s_fs_info;

	err = shiftfs_parse_mount_options(sbinfo, data->data);
	if (err)
		return err;

	/* to mount a mark, must be userns admin */
	if (!sbinfo->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	name = kstrdup(data->path, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	err = kern_path(name, LOOKUP_FOLLOW, &path);
	if (err)
		goto out_free_name;

	if (!S_ISDIR(path.dentry->d_inode->i_mode)) {
		err = -ENOTDIR;
		goto out_put_path;
	}

	sb->s_flags |= SB_POSIXACL;

	if (sbinfo->mark) {
		struct cred *cred_tmp;
		struct super_block *lower_sb = path.mnt->mnt_sb;

		/* to mark a mount point, must root wrt lower s_user_ns */
		if (!ns_capable(lower_sb->s_user_ns, CAP_SYS_ADMIN)) {
			err = -EPERM;
			goto out_put_path;
		}

		/*
		 * this part is visible unshifted, so make sure no
		 * executables that could be used to give suid
		 * privileges
		 */
		sb->s_iflags = SB_I_NOEXEC;

		shiftfs_super_force_flags(sb, lower_sb->s_flags);

		/*
		 * Handle nesting of shiftfs mounts by referring this mark
		 * mount back to the original mark mount. This is more
		 * efficient and alleviates concerns about stack depth.
		 */
		if (lower_sb->s_magic == SHIFTFS_MAGIC) {
			sbinfo_mp = lower_sb->s_fs_info;

			/* Doesn't make sense to mark a mark mount */
			if (sbinfo_mp->mark) {
				err = -EINVAL;
				goto out_put_path;
			}

			if (!passthrough_is_subset(sbinfo_mp->passthrough,
						   sbinfo->passthrough)) {
				err = -EPERM;
				goto out_put_path;
			}

			sbinfo->mnt = mntget(sbinfo_mp->mnt);
			dentry = dget(path.dentry->d_fsdata);
			/*
			 * Copy up the passthrough mount options from the
			 * parent mark mountpoint.
			 */
			sbinfo->passthrough_mark = sbinfo_mp->passthrough_mark;
			sbinfo->creator_cred = get_cred(sbinfo_mp->creator_cred);
		} else {
			sbinfo->mnt = mntget(path.mnt);
			dentry = dget(path.dentry);
			/*
			 * For a new mark passthrough_mark and passthrough
			 * are identical.
			 */
			sbinfo->passthrough_mark = sbinfo->passthrough;

			cred_tmp = prepare_creds();
			if (!cred_tmp) {
				err = -ENOMEM;
				goto out_put_path;
			}
			/* Don't override disk quota limits or use reserved space. */
			cap_lower(cred_tmp->cap_effective, CAP_SYS_RESOURCE);
			sbinfo->creator_cred = cred_tmp;
		}
	} else {
		/*
		 * This leg executes if we're admin capable in the namespace,
		 * so be very careful.
		 */
		err = -EPERM;
		if (path.dentry->d_sb->s_magic != SHIFTFS_MAGIC)
			goto out_put_path;

		sbinfo_mp = path.dentry->d_sb->s_fs_info;
		if (!sbinfo_mp->mark)
			goto out_put_path;

		if (!passthrough_is_subset(sbinfo_mp->passthrough,
					   sbinfo->passthrough))
			goto out_put_path;

		sbinfo->mnt = mntget(sbinfo_mp->mnt);
		sbinfo->creator_cred = get_cred(sbinfo_mp->creator_cred);
		dentry = dget(path.dentry->d_fsdata);
		/*
		 * Copy up passthrough settings from mark mountpoint so we can
		 * verify when the overlay wants to remount with different
		 * passthrough settings.
		 */
		sbinfo->passthrough_mark = sbinfo_mp->passthrough;
		shiftfs_super_force_flags(sb, path.mnt->mnt_sb->s_flags);
	}

	sb->s_stack_depth = dentry->d_sb->s_stack_depth + 1;
	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
		printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
		err = -EINVAL;
		goto out_put_path;
	}

	inode = new_inode(sb);
	if (!inode) {
		err = -ENOMEM;
		goto out_put_path;
	}
	shiftfs_fill_inode(inode, dentry->d_inode->i_ino, S_IFDIR, 0, dentry);

	ihold(dentry->d_inode);
	inode->i_private = dentry->d_inode;

	sb->s_magic = SHIFTFS_MAGIC;
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_op = &shiftfs_super_ops;
	sb->s_xattr = shiftfs_xattr_handlers;
	sb->s_d_op = &shiftfs_dentry_ops;
	sb->s_root = d_make_root(inode);
	if (!sb->s_root) {
		err = -ENOMEM;
		goto out_put_path;
	}

	sb->s_root->d_fsdata = dentry;
	sbinfo->userns = get_user_ns(dentry->d_sb->s_user_ns);
	shiftfs_copyattr(dentry->d_inode, sb->s_root->d_inode);

	dentry = NULL;
	err = 0;

out_put_path:
	path_put(&path);

out_free_name:
	kfree(name);

	dput(dentry);

	return err;
}

static struct dentry *shiftfs_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name, void *data)
{
	struct shiftfs_data d = { data, dev_name };

	return mount_nodev(fs_type, flags, &d, shiftfs_fill_super);
}

static struct file_system_type shiftfs_type = {
	.owner		= THIS_MODULE,
	.name		= "shiftfs",
	.mount		= shiftfs_mount,
	.kill_sb	= kill_anon_super,
	.fs_flags	= FS_USERNS_MOUNT,
};

static int __init shiftfs_init(void)
{
	return register_filesystem(&shiftfs_type);
}

static void __exit shiftfs_exit(void)
{
	unregister_filesystem(&shiftfs_type);
}

MODULE_ALIAS_FS("shiftfs");
MODULE_AUTHOR("James Bottomley");
MODULE_AUTHOR("Seth Forshee <seth.forshee@canonical.com>");
MODULE_AUTHOR("Christian Brauner <christian.brauner@ubuntu.com>");
MODULE_DESCRIPTION("id shifting filesystem");
MODULE_LICENSE("GPL v2");
module_init(shiftfs_init)
module_exit(shiftfs_exit)