Merge: s390/mm: add support for UFFDIO_POISON

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/6304

JIRA: https://issues.redhat.com/browse/RHEL-74362

These are complementary to commit 8a13897fb0daa (2097286f41 in RHEL) for
s390x.

Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>

Approved-by: Herton R. Krzesinski <herton@redhat.com>
Approved-by: Thomas Huth <thuth@redhat.com>
Approved-by: Rafael Aquini <raquini@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>

Merged-by: Patrick Talbert <ptalbert@redhat.com>
This commit is contained in:
Patrick Talbert 2025-02-17 12:00:35 -05:00
commit 25be68b708
6 changed files with 133 additions and 39 deletions

View File

@ -17,8 +17,8 @@
#define GMAP_NOTIFY_MPROT 0x1
/* Status bits only for huge segment entries */
#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */
#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */
#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */
#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */
/**
* struct gmap_struct - guest address space

View File

@ -10,6 +10,8 @@
#define _ASM_S390_HUGETLB_H
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/page.h>
#define hugetlb_free_pgd_range free_pgd_range
@ -91,7 +93,7 @@ static inline int huge_pte_none(pte_t pte)
static inline int huge_pte_none_mostly(pte_t pte)
{
return huge_pte_none(pte);
return huge_pte_none(pte) || is_pte_marker(pte);
}
static inline int huge_pte_write(pte_t pte)

View File

@ -259,24 +259,34 @@ static inline int is_module_addr(void *addr)
#define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID)
#define _REGION2_ENTRY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH)
#define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID)
#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH)
#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH | \
_REGION3_ENTRY_PRESENT)
#define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID)
#define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */
#define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */
#define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */
#define _REGION3_ENTRY_COMM 0x0010 /* Common-Region, marks swap entry */
#define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */
#define _REGION3_ENTRY_READ 0x0002 /* SW region read bit */
#define _REGION3_ENTRY_WRITE 0x0001 /* SW region write bit */
#define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */
#define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */
#define _REGION3_ENTRY_SOFT_DIRTY 0x0002 /* SW region soft dirty bit */
#else
#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */
#endif
#define _REGION_ENTRY_BITS 0xfffffffffffff22fUL
/*
* SW region present bit. For non-leaf region-third-table entries, bits 62-63
* indicate the TABLE LENGTH and both must be set to 1. But such entries
* would always be considered as present, so it is safe to use bit 63 as
* PRESENT bit for PUD.
*/
#define _REGION3_ENTRY_PRESENT 0x0001
/* Bits in the segment table entry */
#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL
@ -288,21 +298,29 @@ static inline int is_module_addr(void *addr)
#define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */
#define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */
#define _SEGMENT_ENTRY (0)
#define _SEGMENT_ENTRY (_SEGMENT_ENTRY_PRESENT)
#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID)
#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */
#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */
#define _SEGMENT_ENTRY_COMM 0x0010 /* Common-Segment, marks swap entry */
#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */
#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */
#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */
#define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */
#define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */
#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0002 /* SW segment soft dirty bit */
#else
#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */
#endif
#define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */
/* Common bits in region and segment table entries, for swap entries */
#define _RST_ENTRY_COMM 0x0010 /* Common-Region/Segment, marks swap entry */
#define _RST_ENTRY_INVALID 0x0020 /* invalid region/segment table entry */
#define _CRST_ENTRIES 2048 /* number of region/segment table entries */
#define _PAGE_ENTRIES 256 /* number of page table entries */
@ -434,17 +452,22 @@ static inline int is_module_addr(void *addr)
/*
* Segment entry (large page) protection definitions.
*/
#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_INVALID | \
#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_PRESENT | \
_SEGMENT_ENTRY_INVALID | \
_SEGMENT_ENTRY_PROTECT)
#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PROTECT | \
#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PRESENT | \
_SEGMENT_ENTRY_PROTECT | \
_SEGMENT_ENTRY_READ | \
_SEGMENT_ENTRY_NOEXEC)
#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PROTECT | \
#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PRESENT | \
_SEGMENT_ENTRY_PROTECT | \
_SEGMENT_ENTRY_READ)
#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_READ | \
#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_PRESENT | \
_SEGMENT_ENTRY_READ | \
_SEGMENT_ENTRY_WRITE | \
_SEGMENT_ENTRY_NOEXEC)
#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_READ | \
#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_PRESENT | \
_SEGMENT_ENTRY_READ | \
_SEGMENT_ENTRY_WRITE)
#define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \
_SEGMENT_ENTRY_LARGE | \
@ -471,6 +494,7 @@ static inline int is_module_addr(void *addr)
*/
#define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \
_REGION3_ENTRY_PRESENT | \
_REGION3_ENTRY_LARGE | \
_REGION3_ENTRY_READ | \
_REGION3_ENTRY_WRITE | \
@ -478,12 +502,14 @@ static inline int is_module_addr(void *addr)
_REGION3_ENTRY_DIRTY | \
_REGION_ENTRY_NOEXEC)
#define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \
_REGION3_ENTRY_PRESENT | \
_REGION3_ENTRY_LARGE | \
_REGION3_ENTRY_READ | \
_REGION3_ENTRY_YOUNG | \
_REGION_ENTRY_PROTECT | \
_REGION_ENTRY_NOEXEC)
#define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \
_REGION3_ENTRY_PRESENT | \
_REGION3_ENTRY_LARGE | \
_REGION3_ENTRY_READ | \
_REGION3_ENTRY_WRITE | \
@ -705,7 +731,7 @@ static inline int pud_present(pud_t pud)
{
if (pud_folded(pud))
return 1;
return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL;
return (pud_val(pud) & _REGION3_ENTRY_PRESENT) != 0;
}
static inline int pud_none(pud_t pud)
@ -720,13 +746,18 @@ static inline bool pud_leaf(pud_t pud)
{
if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3)
return 0;
return !!(pud_val(pud) & _REGION3_ENTRY_LARGE);
return (pud_present(pud) && (pud_val(pud) & _REGION3_ENTRY_LARGE) != 0);
}
/*
 * Report whether a segment table entry is present: returns 1 when the
 * software PRESENT bit (_SEGMENT_ENTRY_PRESENT) is set in the entry value,
 * 0 otherwise.
 */
static inline int pmd_present(pmd_t pmd)
{
	if (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT)
		return 1;
	return 0;
}
#define pmd_leaf pmd_leaf
static inline bool pmd_leaf(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
return (pmd_present(pmd) && (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0);
}
static inline int pmd_bad(pmd_t pmd)
@ -758,11 +789,6 @@ static inline int p4d_bad(p4d_t p4d)
return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0;
}
static inline int pmd_present(pmd_t pmd)
{
return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY;
}
static inline int pmd_none(pmd_t pmd)
{
return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY;
@ -1807,7 +1833,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
static inline int pmd_trans_huge(pmd_t pmd)
{
return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE;
return pmd_leaf(pmd);
}
#define has_transparent_hugepage has_transparent_hugepage
@ -1867,6 +1893,53 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
/*
* 64 bit swap entry format for REGION3 and SEGMENT table entries (RSTE)
* Bits 59 and 63 are used to indicate the swap entry. Bit 58 marks the rste
* as invalid.
* A swap entry is indicated by bit pattern (rste & 0x011) == 0x010
* |                     offset                         |Xtype |11TT|S0|
* |0000000000111111111122222222223333333333444444444455|555555|5566|66|
* |0123456789012345678901234567890123456789012345678901|234567|8901|23|
*
* Bits 0-51 store the offset.
* Bits 53-57 store the type.
* Bit 62 (S) is used for softdirty tracking.
* Bits 60-61 (TT) indicate the table type: 0x01 for REGION3 and 0x00 for SEGMENT.
* Bit 52 (X) is unused.
*/
#define __SWP_OFFSET_MASK_RSTE ((1UL << 52) - 1)
#define __SWP_OFFSET_SHIFT_RSTE 12
#define __SWP_TYPE_MASK_RSTE ((1UL << 5) - 1)
#define __SWP_TYPE_SHIFT_RSTE 6
/*
 * Build a swap entry in region/segment table entry (RSTE) format from a
 * swap type and a swap offset. The invalid and common bits are always set;
 * type and offset are masked to their field widths and shifted into place.
 *
 * The TT bits are left at 0x00 == SEGMENT. For REGION3 entries, the caller
 * must add R3 bits 0x01. See also __set_huge_pte_at().
 */
static inline unsigned long mk_swap_rste(unsigned long type, unsigned long offset)
{
	unsigned long offset_bits = (offset & __SWP_OFFSET_MASK_RSTE) << __SWP_OFFSET_SHIFT_RSTE;
	unsigned long type_bits = (type & __SWP_TYPE_MASK_RSTE) << __SWP_TYPE_SHIFT_RSTE;

	return _RST_ENTRY_INVALID | _RST_ENTRY_COMM | offset_bits | type_bits;
}
/*
 * Extract the swap type field from an RSTE-format swap entry: shift the
 * entry value down by __SWP_TYPE_SHIFT_RSTE and keep the bits selected by
 * __SWP_TYPE_MASK_RSTE.
 */
static inline unsigned long __swp_type_rste(swp_entry_t entry)
{
	return __SWP_TYPE_MASK_RSTE & (entry.val >> __SWP_TYPE_SHIFT_RSTE);
}
/*
 * Extract the swap offset field from an RSTE-format swap entry: shift the
 * entry value down by __SWP_OFFSET_SHIFT_RSTE and keep the bits selected by
 * __SWP_OFFSET_MASK_RSTE.
 */
static inline unsigned long __swp_offset_rste(swp_entry_t entry)
{
	return __SWP_OFFSET_MASK_RSTE & (entry.val >> __SWP_OFFSET_SHIFT_RSTE);
}
#define __rste_to_swp_entry(rste) ((swp_entry_t) { rste })
extern int vmem_add_mapping(unsigned long start, unsigned long size);
extern void vmem_remove_mapping(unsigned long start, unsigned long size);
extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc);

View File

@ -307,13 +307,15 @@ static void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
do_no_context(regs);
else
do_sigsegv(regs, SEGV_MAPERR);
} else if (fault & VM_FAULT_SIGBUS) {
} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON |
VM_FAULT_HWPOISON_LARGE)) {
/* Kernel mode? Handle exceptions or die */
if (!user_mode(regs))
do_no_context(regs);
else
do_sigbus(regs);
} else {
pr_emerg("Unexpected fault flags: %08x\n", fault);
BUG();
}
break;

View File

@ -616,7 +616,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
if (pmd_leaf(*pmd)) {
*table = (pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
| _SEGMENT_ENTRY_GMAP_UC;
| _SEGMENT_ENTRY_GMAP_UC
| _SEGMENT_ENTRY;
} else
*table = pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS;
@ -2345,7 +2346,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
pmdp_notify_gmap(gmap, pmdp, gaddr);
WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
_SEGMENT_ENTRY_GMAP_UC));
_SEGMENT_ENTRY_GMAP_UC |
_SEGMENT_ENTRY));
if (purge)
__pmdp_csp(pmdp);
set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
@ -2399,7 +2401,8 @@ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
gaddr = __gmap_segment_gaddr(entry);
pmdp_notify_gmap(gmap, pmdp, gaddr);
WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
_SEGMENT_ENTRY_GMAP_UC));
_SEGMENT_ENTRY_GMAP_UC |
_SEGMENT_ENTRY));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_LOCAL);
@ -2434,7 +2437,8 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
gaddr = __gmap_segment_gaddr(entry);
pmdp_notify_gmap(gmap, pmdp, gaddr);
WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
_SEGMENT_ENTRY_GMAP_UC));
_SEGMENT_ENTRY_GMAP_UC |
_SEGMENT_ENTRY));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_GLOBAL);

View File

@ -24,6 +24,7 @@
static inline unsigned long __pte_to_rste(pte_t pte)
{
swp_entry_t arch_entry;
unsigned long rste;
/*
@ -48,6 +49,7 @@ static inline unsigned long __pte_to_rste(pte_t pte)
*/
if (pte_present(pte)) {
rste = pte_val(pte) & PAGE_MASK;
rste |= _SEGMENT_ENTRY_PRESENT;
rste |= move_set_bit(pte_val(pte), _PAGE_READ,
_SEGMENT_ENTRY_READ);
rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
@ -66,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte)
#endif
rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
_SEGMENT_ENTRY_NOEXEC);
} else if (!pte_none(pte)) {
/* swap pte */
arch_entry = __pte_to_swp_entry(pte);
rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry));
} else
rste = _SEGMENT_ENTRY_EMPTY;
return rste;
@ -73,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte)
static inline pte_t __rste_to_pte(unsigned long rste)
{
swp_entry_t arch_entry;
unsigned long pteval;
int present;
int present, none;
pte_t pte;
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
present = pud_present(__pud(rste));
else
none = pud_none(__pud(rste));
} else {
present = pmd_present(__pmd(rste));
none = pmd_none(__pmd(rste));
}
/*
* Convert encoding pmd / pud bits pte bits
@ -114,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste)
pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
} else if (!none) {
/* swap rste */
arch_entry = __rste_to_swp_entry(rste);
pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry));
pteval = pte_val(pte);
} else
pteval = _PAGE_INVALID;
return __pte(pteval);
@ -148,8 +164,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
unsigned long rste;
rste = __pte_to_rste(pte);
if (!MACHINE_HAS_NX)
rste &= ~_SEGMENT_ENTRY_NOEXEC;
/* Set correct table type for 2G hugepages */
if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
@ -223,11 +237,10 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
p4dp = p4d_offset(pgdp, addr);
if (p4d_present(*p4dp)) {
pudp = pud_offset(p4dp, addr);
if (pud_present(*pudp)) {
if (pud_leaf(*pudp))
return (pte_t *) pudp;
if (sz == PUD_SIZE)
return (pte_t *)pudp;
if (pud_present(*pudp))
pmdp = pmd_offset(pudp, addr);
}
}
}
return (pte_t *) pmdp;