Merge: Backport the latest s390x KVM fixes from upstream (up to kernel 6.12-rc5)

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/5627

JIRA: https://issues.redhat.com/browse/RHEL-65229

Various important fixes for KVM on s390x from the latest upstream kernel versions.

Signed-off-by: Thomas Huth <thuth@redhat.com>

Approved-by: Rafael Aquini <raquini@redhat.com>
Approved-by: Cédric Le Goater <clg@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>

Merged-by: Rado Vrbovsky <rvrbovsk@redhat.com>
This commit is contained in:
Rado Vrbovsky 2024-11-15 21:04:51 +00:00
commit ed55eea6ed
13 changed files with 219 additions and 67 deletions

View File

@ -146,7 +146,7 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
int gmap_mark_unmergeable(void);
int s390_disable_cow_sharing(void);
void s390_unlist_old_asce(struct gmap *gmap);
int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);

View File

@ -15,7 +15,6 @@
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/seqlock.h>
#include <linux/module.h>

View File

@ -33,6 +33,11 @@ typedef struct {
unsigned int uses_skeys:1;
/* The mmu context uses CMM. */
unsigned int uses_cmm:1;
/*
* The mmu context allows COW-sharing of memory pages (KSM, zeropage).
* Note that COW-sharing during fork() is currently always allowed.
*/
unsigned int allow_cow_sharing:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
} mm_context_t;

View File

@ -36,6 +36,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.has_pgste = 0;
mm->context.uses_skeys = 0;
mm->context.uses_cmm = 0;
mm->context.allow_cow_sharing = 1;
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {

View File

@ -566,10 +566,20 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
}
/*
* In the case that a guest uses storage keys
* faults should no longer be backed by zero pages
* As soon as the guest uses storage keys or enables PV, we deduplicate all
* mapped shared zeropages and prevent new shared zeropages from getting
* mapped.
*/
#define mm_forbids_zeropage mm_has_pgste
#define mm_forbids_zeropage mm_forbids_zeropage
static inline int mm_forbids_zeropage(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	/* No shared zeropages once COW-sharing has been disabled. */
	return !mm->context.allow_cow_sharing;
#else
	return 0;
#endif
}
static inline int mm_uses_skeys(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE

View File

@ -442,7 +442,10 @@ static inline int share(unsigned long addr, u16 cmd)
if (!uv_call(0, (u64)&uvcb))
return 0;
return -EINVAL;
pr_err("%s UVC failed (rc: 0x%x, rrc: 0x%x), possible hypervisor bug.\n",
uvcb.header.cmd == UVC_CMD_SET_SHARED_ACCESS ? "Share" : "Unshare",
uvcb.header.rc, uvcb.header.rrc);
panic("System security cannot be guaranteed unless the system panics now.\n");
}
/*

View File

@ -77,7 +77,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
vcpu->stat.instruction_diagnose_258++;
if (vcpu->run->s.regs.gprs[rx] & 7)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
rc = read_guest_real(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm));
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)

View File

@ -985,6 +985,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
const gfn_t gfn = gpa_to_gfn(gpa);
int rc;
if (!gfn_to_memslot(kvm, gfn))
return PGM_ADDRESSING;
if (mode == GACC_STORE)
rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
else
@ -1142,6 +1144,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
gra += fragment_len;
data += fragment_len;
}
if (rc > 0)
vcpu->arch.pgm.code = rc;
return rc;
}

View File

@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @data (kernel space) to @gra (guest real address).
* It is up to the caller to ensure that the entire guest memory range is
* valid memory before calling this function.
* Guest low address and key protection are not checked.
*
* Returns zero on success or -EFAULT on error.
* Returns zero on success, -EFAULT when copying from @data failed, or
PGM_ADDRESSING in case @gra is outside a memslot. In this case, pgm check info
* is also stored to allow injecting into the guest (if applicable) using
* kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to guest memory.
*/
@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @gra (guest real address) to @data (kernel space).
* It is up to the caller to ensure that the entire guest memory range is
* valid memory before calling this function.
* Guest key protection is not checked.
*
* Returns zero on success or -EFAULT on error.
* Returns zero on success, -EFAULT when copying to @data failed, or
PGM_ADDRESSING in case @gra is outside a memslot. In this case, pgm check info
* is also stored to allow injecting into the guest (if applicable) using
* kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to kernel space.
*/

View File

@ -348,20 +348,29 @@ static inline int plo_test_bit(unsigned char nr)
return cc == 0;
}
static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
static __always_inline void __sortl_query(u8 (*query)[32])
{
asm volatile(
" lghi 0,0\n"
" lgr 1,%[query]\n"
" la 1,%[query]\n"
/* Parameter registers are ignored */
" .insn rrf,%[opc] << 16,2,4,6,0\n"
" .insn rre,0xb9380000,2,4\n"
: [query] "=R" (*query)
:
: [query] "d" ((unsigned long)query), [opc] "i" (opcode)
: "cc", "memory", "0", "1");
: "cc", "0", "1");
}
#define INSN_SORTL 0xb938
#define INSN_DFLTCC 0xb939
/*
 * Query the installed subfunctions of the DFLTCC instruction
 * (opcode 0xb939, see the test_facility(151) caller) by executing it
 * with GPR 0 set to 0 (query) and GPR 1 pointing at the 32-byte
 * result block @query.
 * NOTE(review): "0 = query function code" is inferred from the
 * analogous __sortl_query() pattern — confirm against the s390 PoP.
 */
static __always_inline void __dfltcc_query(u8 (*query)[32])
{
	asm volatile(
		" lghi 0,0\n"
		" la 1,%[query]\n"
		/* Parameter registers are ignored */
		" .insn rrf,0xb9390000,2,4,6,0\n"
		: [query] "=R" (*query)
		:
		: "cc", "0", "1");
}
static void __init kvm_s390_cpu_feat_init(void)
{
@ -415,10 +424,10 @@ static void __init kvm_s390_cpu_feat_init(void)
kvm_s390_available_subfunc.kdsa);
if (test_facility(150)) /* SORTL */
__insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl);
__sortl_query(&kvm_s390_available_subfunc.sortl);
if (test_facility(151)) /* DFLTCC */
__insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc);
__dfltcc_query(&kvm_s390_available_subfunc.dfltcc);
if (MACHINE_HAS_ESOP)
allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
@ -2630,9 +2639,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (r)
break;
mmap_write_lock(current->mm);
r = gmap_mark_unmergeable();
mmap_write_unlock(current->mm);
r = s390_disable_cow_sharing();
if (r)
break;

View File

@ -12,6 +12,7 @@
#include <linux/list.h>
#include <linux/bitmap.h>
#include <linux/sched/signal.h>
#include <linux/io.h>
#include <asm/gmap.h>
#include <asm/mmu_context.h>
@ -361,7 +362,7 @@ end:
case -EACCES:
return set_validity_icpt(scb_s, 0x003CU);
}
scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
scb_s->crycbd = (u32)virt_to_phys(&vsie_page->crycb) | CRYCB_FORMAT2;
return 0;
}
@ -1005,7 +1006,7 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (read_guest_real(vcpu, fac, &vsie_page->fac,
stfle_size() * sizeof(u64)))
return set_validity_icpt(scb_s, 0x1090U);
scb_s->fac = (__u32)(__u64) &vsie_page->fac;
scb_s->fac = (u32)virt_to_phys(&vsie_page->fac);
}
return 0;
}

View File

@ -2547,41 +2547,6 @@ static inline void thp_split_mm(struct mm_struct *mm)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* Remove all empty zero pages from the mapping for lazy refaulting
* - This must be called after mm->context.has_pgste is set, to avoid
* future creation of zero pages
* - This must be called after THP was disabled.
*
* mm contracts with s390, that even if mm were to remove a page table,
* racing with the loop below and so causing pte_offset_map_lock() to fail,
* it will never insert a page table containing empty zero pages once
* mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set.
*/
static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
unsigned long addr;
for (addr = start; addr != end; addr += PAGE_SIZE) {
pte_t *ptep;
spinlock_t *ptl;
ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (!ptep)
break;
if (is_zero_pfn(pte_pfn(*ptep)))
ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
pte_unmap_unlock(ptep, ptl);
}
return 0;
}
static const struct mm_walk_ops zap_zero_walk_ops = {
.pmd_entry = __zap_zero_pages,
.walk_lock = PGWALK_WRLOCK,
};
/*
* switch on pgstes for its userspace process (for kvm)
*/
@ -2599,22 +2564,142 @@ int s390_enable_sie(void)
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
int gmap_mark_unmergeable(void)
/*
 * Page-table walk callback: detect a mapped shared zeropage.
 *
 * Returns 1 when the PTE maps the shared zeropage (the address is
 * stored through walk->private for the caller), -EFAULT when a
 * zeropage is found in a non-COW mapping (unexpected, see below),
 * and 0 to continue the walk.
 */
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
unsigned long *found_addr = walk->private;

/* Return 1 if the page is a zeropage. */
if (is_zero_pfn(pte_pfn(*pte))) {
/*
 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
 * right thing and likely don't care: FAULT_FLAG_UNSHARE
 * currently only works in COW mappings, which is also where
 * mm_forbids_zeropage() is checked.
 */
if (!is_cow_mapping(walk->vma->vm_flags))
return -EFAULT;
*found_addr = addr;
return 1;
}
return 0;
}

/*
 * Walk ops for __s390_unshare_zeropages().
 * NOTE(review): PGWALK_WRLOCK — the caller holds the mmap lock in
 * write mode (see s390_disable_cow_sharing()); confirm this matches
 * the mm_walk_ops locking contract.
 */
static const struct mm_walk_ops find_zeropage_ops = {
.pte_entry = find_zeropage_pte_entry,
.walk_lock = PGWALK_WRLOCK,
};
/*
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing was
 * set to 0, to avoid future mappings of shared zeropages.
 *
 * mm contracts with s390, that even if mm were to remove a page table,
 * and racing with walk_page_range_vma() calling pte_offset_map_lock()
 * would fail, it will never insert a page table containing empty zero
 * pages once mm_forbids_zeropage(mm) i.e.
 * mm->context.allow_cow_sharing is set to 0.
 *
 * Returns 0 on success, -EFAULT when a zeropage is found outside a
 * COW mapping (propagated from find_zeropage_pte_entry()), or -ENOMEM
 * when faulting in a replacement anonymous page fails with OOM.
 */
static int __s390_unshare_zeropages(struct mm_struct *mm)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
unsigned long addr;
vm_fault_t fault;
int rc;

for_each_vma(vmi, vma) {
/*
 * We could only look at COW mappings, but it's more future
 * proof to catch unexpected zeropages in other mappings and
 * fail.
 */
if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
continue;
addr = vma->vm_start;

retry:
/* rc > 0: a zeropage was found and its address stored in addr. */
rc = walk_page_range_vma(vma, addr, vma->vm_end,
&find_zeropage_ops, &addr);
if (rc < 0)
return rc;
else if (!rc)
continue;

/* addr was updated by find_zeropage_pte_entry() */
fault = handle_mm_fault(vma, addr,
FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
NULL);
if (fault & VM_FAULT_OOM)
return -ENOMEM;
/*
 * See break_ksm(): even after handle_mm_fault() returned 0, we
 * must start the lookup from the current address, because
 * handle_mm_fault() may back out if there's any difficulty.
 *
 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
 * maybe they could trigger in the future on concurrent
 * truncation. In that case, the shared zeropage would be gone
 * and we can simply retry and make progress.
 */
cond_resched();
goto retry;
}

return 0;
}
static int __s390_disable_cow_sharing(struct mm_struct *mm)
{
int rc;
if (!mm->context.allow_cow_sharing)
return 0;
mm->context.allow_cow_sharing = 0;
/* Replace all shared zeropages by anonymous pages. */
rc = __s390_unshare_zeropages(mm);
/*
* Make sure to disable KSM (if enabled for the whole process or
* individual VMAs). Note that nothing currently hinders user space
* from re-enabling it.
*/
return ksm_disable(current->mm);
if (!rc)
rc = ksm_disable(mm);
if (rc)
mm->context.allow_cow_sharing = 1;
return rc;
}
EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
/*
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 */
int s390_disable_cow_sharing(void)
{
	struct mm_struct *mm = current->mm;
	int rc;

	mmap_write_lock(mm);
	rc = __s390_disable_cow_sharing(mm);
	mmap_write_unlock(mm);

	return rc;
}
EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
/*
* Enable storage key handling from now on and initialize the storage
@ -2683,7 +2768,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
rc = gmap_mark_unmergeable();
rc = __s390_disable_cow_sharing(mm);
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;

View File

@ -212,6 +212,38 @@ out_release:
goto out;
}
/*
 * Fallback for mfill_atomic_pte_zeropage() when the shared zeropage must
 * not be used (mm_forbids_zeropage()): allocate a zeroed anonymous folio
 * and install it at @dst_addr instead.
 *
 * Returns 0 on success, -ENOMEM when the allocation or the memcg charge
 * fails, or the error from mfill_atomic_install_pte(). The folio reference
 * is dropped on every failure path.
 */
static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
{
struct folio *folio;
int ret = -ENOMEM;

folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
if (!folio)
return ret;

if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
goto out_put;

/*
 * The memory barrier inside __folio_mark_uptodate makes sure that
 * zeroing out the folio becomes visible before mapping the page
 * using set_pte_at(). See do_anonymous_page().
 */
__folio_mark_uptodate(folio);

ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
&folio->page, true, 0);
if (ret)
goto out_put;

return 0;

out_put:
folio_put(folio);
return ret;
}
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@ -220,6 +252,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
spinlock_t *ptl;
int ret;
if (mm_forbids_zeropage(dst_vma->vm_mm))
return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
ret = -EAGAIN;