Merge: Backport the latest s390x KVM fixes from upstream (up to kernel 6.12-rc5)
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/5627
JIRA: https://issues.redhat.com/browse/RHEL-65229

Various important fixes for KVM on s390x from the latest upstream kernel versions.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Approved-by: Rafael Aquini <raquini@redhat.com>
Approved-by: Cédric Le Goater <clg@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: Rado Vrbovsky <rvrbovsk@redhat.com>

commit ed55eea6ed

@@ -146,7 +146,7 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
 void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
                              unsigned long gaddr, unsigned long vmaddr);
-int gmap_mark_unmergeable(void);
+int s390_disable_cow_sharing(void);
 void s390_unlist_old_asce(struct gmap *gmap);
 int s390_replace_asce(struct gmap *gmap);
 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);

@@ -15,7 +15,6 @@
 #include <linux/hrtimer.h>
 #include <linux/interrupt.h>
 #include <linux/kvm_types.h>
-#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/seqlock.h>
 #include <linux/module.h>

@@ -33,6 +33,11 @@ typedef struct {
         unsigned int uses_skeys:1;
         /* The mmu context uses CMM. */
         unsigned int uses_cmm:1;
+        /*
+         * The mmu context allows COW-sharing of memory pages (KSM, zeropage).
+         * Note that COW-sharing during fork() is currently always allowed.
+         */
+        unsigned int allow_cow_sharing:1;
         /* The gmaps associated with this context are allowed to use huge pages. */
         unsigned int allow_gmap_hpage_1m:1;
 } mm_context_t;

@@ -36,6 +36,7 @@ static inline int init_new_context(struct task_struct *tsk,
         mm->context.has_pgste = 0;
         mm->context.uses_skeys = 0;
         mm->context.uses_cmm = 0;
+        mm->context.allow_cow_sharing = 1;
         mm->context.allow_gmap_hpage_1m = 0;
 #endif
         switch (mm->context.asce_limit) {

@@ -566,10 +566,20 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
 }
 
 /*
- * In the case that a guest uses storage keys
- * faults should no longer be backed by zero pages
+ * As soon as the guest uses storage keys or enables PV, we deduplicate all
+ * mapped shared zeropages and prevent new shared zeropages from getting
+ * mapped.
  */
-#define mm_forbids_zeropage mm_has_pgste
+#define mm_forbids_zeropage mm_forbids_zeropage
+static inline int mm_forbids_zeropage(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+        if (!mm->context.allow_cow_sharing)
+                return 1;
+#endif
+        return 0;
+}
 
 static inline int mm_uses_skeys(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE

@@ -442,7 +442,10 @@ static inline int share(unsigned long addr, u16 cmd)
 
         if (!uv_call(0, (u64)&uvcb))
                 return 0;
-        return -EINVAL;
+        pr_err("%s UVC failed (rc: 0x%x, rrc: 0x%x), possible hypervisor bug.\n",
+               uvcb.header.cmd == UVC_CMD_SET_SHARED_ACCESS ? "Share" : "Unshare",
+               uvcb.header.rc, uvcb.header.rrc);
+        panic("System security cannot be guaranteed unless the system panics now.\n");
 }
 
 /*

@@ -77,7 +77,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
         vcpu->stat.instruction_diagnose_258++;
         if (vcpu->run->s.regs.gprs[rx] & 7)
                 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-        rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
+        rc = read_guest_real(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm));
         if (rc)
                 return kvm_s390_inject_prog_cond(vcpu, rc);
         if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)

@@ -985,6 +985,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
         const gfn_t gfn = gpa_to_gfn(gpa);
         int rc;
 
+        if (!gfn_to_memslot(kvm, gfn))
+                return PGM_ADDRESSING;
         if (mode == GACC_STORE)
                 rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
         else
@@ -1142,6 +1144,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
                 gra += fragment_len;
                 data += fragment_len;
         }
+        if (rc > 0)
+                vcpu->arch.pgm.code = rc;
         return rc;
 }
 

@@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
  * @len: number of bytes to copy
  *
  * Copy @len bytes from @data (kernel space) to @gra (guest real address).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
  * Guest low address and key protection are not checked.
  *
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying from @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
  *
  * If an error occurs data may have been copied partially to guest memory.
  */
@@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
  * @len: number of bytes to copy
  *
  * Copy @len bytes from @gra (guest real address) to @data (kernel space).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
  * Guest key protection is not checked.
  *
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying to @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
  *
  * If an error occurs data may have been copied partially to kernel space.
  */

@@ -348,20 +348,29 @@ static inline int plo_test_bit(unsigned char nr)
         return cc == 0;
 }
 
-static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
+static __always_inline void __sortl_query(u8 (*query)[32])
 {
         asm volatile(
                 " lghi 0,0\n"
-                " lgr 1,%[query]\n"
+                " la 1,%[query]\n"
                 /* Parameter registers are ignored */
-                " .insn rrf,%[opc] << 16,2,4,6,0\n"
+                " .insn rre,0xb9380000,2,4\n"
+                : [query] "=R" (*query)
                 :
-                : [query] "d" ((unsigned long)query), [opc] "i" (opcode)
-                : "cc", "memory", "0", "1");
+                : "cc", "0", "1");
 }
 
-#define INSN_SORTL 0xb938
-#define INSN_DFLTCC 0xb939
+static __always_inline void __dfltcc_query(u8 (*query)[32])
+{
+        asm volatile(
+                " lghi 0,0\n"
+                " la 1,%[query]\n"
+                /* Parameter registers are ignored */
+                " .insn rrf,0xb9390000,2,4,6,0\n"
+                : [query] "=R" (*query)
+                :
+                : "cc", "0", "1");
+}
 
 static void __init kvm_s390_cpu_feat_init(void)
 {
@@ -415,10 +424,10 @@ static void __init kvm_s390_cpu_feat_init(void)
                               kvm_s390_available_subfunc.kdsa);
 
         if (test_facility(150)) /* SORTL */
-                __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl);
+                __sortl_query(&kvm_s390_available_subfunc.sortl);
 
         if (test_facility(151)) /* DFLTCC */
-                __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc);
+                __dfltcc_query(&kvm_s390_available_subfunc.dfltcc);
 
         if (MACHINE_HAS_ESOP)
                 allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
@@ -2630,9 +2639,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
                 if (r)
                         break;
 
-                mmap_write_lock(current->mm);
-                r = gmap_mark_unmergeable();
-                mmap_write_unlock(current->mm);
+                r = s390_disable_cow_sharing();
                 if (r)
                         break;
 

@@ -12,6 +12,7 @@
 #include <linux/list.h>
 #include <linux/bitmap.h>
 #include <linux/sched/signal.h>
+#include <linux/io.h>
 
 #include <asm/gmap.h>
 #include <asm/mmu_context.h>
@@ -361,7 +362,7 @@ end:
         case -EACCES:
                 return set_validity_icpt(scb_s, 0x003CU);
         }
-        scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
+        scb_s->crycbd = (u32)virt_to_phys(&vsie_page->crycb) | CRYCB_FORMAT2;
         return 0;
 }
 
@@ -1005,7 +1006,7 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                 if (read_guest_real(vcpu, fac, &vsie_page->fac,
                                     stfle_size() * sizeof(u64)))
                         return set_validity_icpt(scb_s, 0x1090U);
-                scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+                scb_s->fac = (u32)virt_to_phys(&vsie_page->fac);
         }
         return 0;
 }

@@ -2547,41 +2547,6 @@ static inline void thp_split_mm(struct mm_struct *mm)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-/*
- * Remove all empty zero pages from the mapping for lazy refaulting
- * - This must be called after mm->context.has_pgste is set, to avoid
- *   future creation of zero pages
- * - This must be called after THP was disabled.
- *
- * mm contracts with s390, that even if mm were to remove a page table,
- * racing with the loop below and so causing pte_offset_map_lock() to fail,
- * it will never insert a page table containing empty zero pages once
- * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set.
- */
-static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
-                            unsigned long end, struct mm_walk *walk)
-{
-        unsigned long addr;
-
-        for (addr = start; addr != end; addr += PAGE_SIZE) {
-                pte_t *ptep;
-                spinlock_t *ptl;
-
-                ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
-                if (!ptep)
-                        break;
-                if (is_zero_pfn(pte_pfn(*ptep)))
-                        ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
-                pte_unmap_unlock(ptep, ptl);
-        }
-        return 0;
-}
-
-static const struct mm_walk_ops zap_zero_walk_ops = {
-        .pmd_entry = __zap_zero_pages,
-        .walk_lock = PGWALK_WRLOCK,
-};
-
 /*
  * switch on pgstes for its userspace process (for kvm)
  */
@@ -2599,22 +2564,142 @@ int s390_enable_sie(void)
         mm->context.has_pgste = 1;
         /* split thp mappings and disable thp for future mappings */
         thp_split_mm(mm);
-        walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
         mmap_write_unlock(mm);
         return 0;
 }
 EXPORT_SYMBOL_GPL(s390_enable_sie);
 
-int gmap_mark_unmergeable(void)
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+                                   unsigned long end, struct mm_walk *walk)
 {
+        unsigned long *found_addr = walk->private;
+
+        /* Return 1 of the page is a zeropage. */
+        if (is_zero_pfn(pte_pfn(*pte))) {
+                /*
+                 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+                 * right thing and likely don't care: FAULT_FLAG_UNSHARE
+                 * currently only works in COW mappings, which is also where
+                 * mm_forbids_zeropage() is checked.
+                 */
+                if (!is_cow_mapping(walk->vma->vm_flags))
+                        return -EFAULT;
+
+                *found_addr = addr;
+                return 1;
+        }
+        return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+        .pte_entry = find_zeropage_pte_entry,
+        .walk_lock = PGWALK_WRLOCK,
+};
+
+/*
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * mm contracts with s390, that even if mm were to remove a page table,
+ * and racing with walk_page_range_vma() calling pte_offset_map_lock()
+ * would fail, it will never insert a page table containing empty zero
+ * pages once mm_forbids_zeropage(mm) i.e.
+ * mm->context.allow_cow_sharing is set to 0.
+ */
+static int __s390_unshare_zeropages(struct mm_struct *mm)
+{
+        struct vm_area_struct *vma;
+        VMA_ITERATOR(vmi, mm, 0);
+        unsigned long addr;
+        vm_fault_t fault;
+        int rc;
+
+        for_each_vma(vmi, vma) {
+                /*
+                 * We could only look at COW mappings, but it's more future
+                 * proof to catch unexpected zeropages in other mappings and
+                 * fail.
+                 */
+                if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+                        continue;
+                addr = vma->vm_start;
+
+retry:
+                rc = walk_page_range_vma(vma, addr, vma->vm_end,
+                                         &find_zeropage_ops, &addr);
+                if (rc < 0)
+                        return rc;
+                else if (!rc)
+                        continue;
+
+                /* addr was updated by find_zeropage_pte_entry() */
+                fault = handle_mm_fault(vma, addr,
+                                        FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+                                        NULL);
+                if (fault & VM_FAULT_OOM)
+                        return -ENOMEM;
+                /*
+                 * See break_ksm(): even after handle_mm_fault() returned 0, we
+                 * must start the lookup from the current address, because
+                 * handle_mm_fault() may back out if there's any difficulty.
+                 *
+                 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+                 * maybe they could trigger in the future on concurrent
+                 * truncation. In that case, the shared zeropage would be gone
+                 * and we can simply retry and make progress.
+                 */
+                cond_resched();
+                goto retry;
+        }
+
+        return 0;
+}
+
+static int __s390_disable_cow_sharing(struct mm_struct *mm)
+{
+        int rc;
+
+        if (!mm->context.allow_cow_sharing)
+                return 0;
+
+        mm->context.allow_cow_sharing = 0;
+
+        /* Replace all shared zeropages by anonymous pages. */
+        rc = __s390_unshare_zeropages(mm);
         /*
          * Make sure to disable KSM (if enabled for the whole process or
          * individual VMAs). Note that nothing currently hinders user space
          * from re-enabling it.
          */
-        return ksm_disable(current->mm);
+        if (!rc)
+                rc = ksm_disable(mm);
+        if (rc)
+                mm->context.allow_cow_sharing = 1;
+        return rc;
 }
-EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+
+/*
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
+ *
+ * Not that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ */
+int s390_disable_cow_sharing(void)
+{
+        int rc;
+
+        mmap_write_lock(current->mm);
+        rc = __s390_disable_cow_sharing(current->mm);
+        mmap_write_unlock(current->mm);
+        return rc;
+}
+EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
 
 /*
  * Enable storage key handling from now on and initialize the storage
@@ -2683,7 +2768,7 @@ int s390_enable_skey(void)
                 goto out_up;
 
         mm->context.uses_skeys = 1;
-        rc = gmap_mark_unmergeable();
+        rc = __s390_disable_cow_sharing(mm);
         if (rc) {
                 mm->context.uses_skeys = 0;
                 goto out_up;

@@ -212,6 +212,38 @@ out_release:
         goto out;
 }
 
+static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
+                                         struct vm_area_struct *dst_vma,
+                                         unsigned long dst_addr)
+{
+        struct folio *folio;
+        int ret = -ENOMEM;
+
+        folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
+        if (!folio)
+                return ret;
+
+        if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
+                goto out_put;
+
+        /*
+         * The memory barrier inside __folio_mark_uptodate makes sure that
+         * zeroing out the folio become visible before mapping the page
+         * using set_pte_at(). See do_anonymous_page().
+         */
+        __folio_mark_uptodate(folio);
+
+        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+                                       &folio->page, true, 0);
+        if (ret)
+                goto out_put;
+
+        return 0;
+out_put:
+        folio_put(folio);
+        return ret;
+}
+
 static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
                                      struct vm_area_struct *dst_vma,
                                      unsigned long dst_addr)
@@ -220,6 +252,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
         spinlock_t *ptl;
         int ret;
 
+        if (mm_forbids_zeropage(dst_vma->vm_mm))
+                return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
+
         _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
                                          dst_vma->vm_page_prot));
         ret = -EAGAIN;