From b2cf20d2eb36759d093e7204ff459abd365942b8 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 4 Feb 2025 09:15:23 -0500 Subject: [PATCH 1/7] s390/pgtable: Switch read and write softbits for puds JIRA: https://issues.redhat.com/browse/RHEL-74362 Tested: by me commit 3e93d49175a7b82adedd43072b021401d81f6b76 Author: Claudio Imbrenda Date: Mon Apr 29 16:34:08 2024 +0200 s390/pgtable: Switch read and write softbits for puds There is no reason for the read and write softbits to be swapped in the puds compared to pmds. They are different only because the softbits for puds were introduced at the same time when the softbits for pmds were swapped. The current implementation is not wrong per se, since the macros are defined correctly; only the documentation does not reflect reality. With this patch, the read and write softbits for large pmd and large puds will have the same layout, and will match the existing documentation. Signed-off-by: Claudio Imbrenda Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/r/20240429143409.49892-2-imbrenda@linux.ibm.com Signed-off-by: Alexander Gordeev Signed-off-by: Aristeu Rozanski --- arch/s390/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 58a64e23c9ab..1e323d0b231d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -266,8 +266,8 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ -#define _REGION3_ENTRY_READ 0x0002 /* SW region read bit */ -#define _REGION3_ENTRY_WRITE 0x0001 /* SW region write bit */ +#define _REGION3_ENTRY_WRITE 0x0002 /* SW region write bit */ +#define _REGION3_ENTRY_READ 0x0001 /* SW region read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */ From 7a8c6bb7145518fe6d2bae3e61ee4bc5133dc62e Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 4 Feb 2025 09:15:23 -0500 Subject: [PATCH 2/7] s390/mm: Fix VM_FAULT_HWPOISON handling in do_exception() JIRA: https://issues.redhat.com/browse/RHEL-74362 Tested: by me Conflicts: due b20c8216c1e0d5 patchset not being backported commit df39038cd89525d465c2c8827eb64116873f141a Author: Gerald Schaefer Date: Mon Jul 15 20:04:16 2024 +0200 s390/mm: Fix VM_FAULT_HWPOISON handling in do_exception() There is no support for HWPOISON, MEMORY_FAILURE, or ARCH_HAS_COPY_MC on s390. Therefore we do not expect to see VM_FAULT_HWPOISON in do_exception(). However, since commit af19487f00f3 ("mm: make PTE_MARKER_SWAPIN_ERROR more general"), it is possible to see VM_FAULT_HWPOISON in combination with PTE_MARKER_POISONED, even on architectures that do not support HWPOISON otherwise. In this case, we will end up on the BUG() in do_exception(). Fix this by treating VM_FAULT_HWPOISON the same as VM_FAULT_SIGBUS, similar to x86 when MEMORY_FAILURE is not configured. Also print unexpected fault flags, for easier debugging. Note that VM_FAULT_HWPOISON_LARGE is not expected, because s390 cannot support swap entries on other levels than PTE level. Cc: stable@vger.kernel.org # 6.6+ Fixes: af19487f00f3 ("mm: make PTE_MARKER_SWAPIN_ERROR more general") Reported-by: Yunseong Kim Tested-by: Yunseong Kim Acked-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Message-ID: <20240715180416.3632453-1-gerald.schaefer@linux.ibm.com> Signed-off-by: Vasily Gorbik Signed-off-by: Aristeu Rozanski --- arch/s390/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 05c908051e93..fa7bb01f94fc 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -307,13 +307,14 @@ static void do_fault_error(struct pt_regs *regs, vm_fault_t fault) do_no_context(regs); else do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & VM_FAULT_SIGBUS) { + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)) { /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) do_no_context(regs); else do_sigbus(regs); } else { + pr_emerg("Unexpected fault flags: %08x\n", fault); BUG(); } break; From 77066111e8d010b663de21d7308400bfabadc8b2 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 4 Feb 2025 09:15:23 -0500 Subject: [PATCH 3/7] s390/mm: Rearrange region-third and segment table entry SW bits JIRA: https://issues.redhat.com/browse/RHEL-74362 Tested: by me commit ae1b9fb2d556e95001399dcd4e228525aead5122 Author: Gerald Schaefer Date: Thu Nov 21 18:45:20 2024 +0100 s390/mm: Rearrange region-third and segment table entry SW bits Rearrange region-third and segment table entry SW bits, in order to make room for future encoding of region/segment table swap entries. Also adjust _SEGMENT_ENTRY_GMAP_UC and _SEGMENT_ENTRY_GMAP_IN bits in gmap code. Those should only apply for gmap PMDs, and not really depend on or conflict with host PMD bits, but for consistency also adjust them: - _SEGMENT_ENTRY_GMAP_UC "dirty (migration)" was using the same bit as _SEGMENT_ENTRY_SOFT_DIRTY in the host PMD -> make it use the new SOFT_DIRTY bit 63 (0x0002) - _SEGMENT_ENTRY_GMAP_IN "invalidation notify bit" was using 0x8000, which was an unused bit in the host PMD, that is now used for _SEGMENT_ENTRY_WRITE -> make it use bit 52 (0x0800) instead, which is still unused in the host PMD This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Aristeu Rozanski --- arch/s390/include/asm/gmap.h | 4 ++-- arch/s390/include/asm/pgtable.h | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 9725586f4259..74bac68cecd4 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -17,8 +17,8 @@ #define GMAP_NOTIFY_MPROT 0x1 /* Status bits only for huge segment entries */ -#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ -#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */ +#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */ +#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */ /** * struct gmap_struct - guest address space diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1e323d0b231d..c2cd69593a50 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -266,11 +266,11 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ -#define _REGION3_ENTRY_WRITE 0x0002 /* SW region write bit */ -#define _REGION3_ENTRY_READ 0x0001 /* SW region read bit */ +#define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */ +#define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */ +#define _REGION3_ENTRY_SOFT_DIRTY 0x0002 /* SW region soft dirty bit */ #else #define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */ #endif @@ -293,12 +293,13 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ + #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ -#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */ -#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */ +#define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */ +#define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */ +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0002 /* SW segment soft dirty bit */ #else #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif From 7a0c62f4c3c0ee8a2b400f3ffb0126f8b72e585c Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 4 Feb 2025 09:15:23 -0500 Subject: [PATCH 4/7] s390/mm: Introduce region-third and segment table entry present bits JIRA: https://issues.redhat.com/browse/RHEL-74362 Tested: by me commit 03e6db16b808afbe23e10617f2d18578846bdce0 Author: Gerald Schaefer Date: Thu Nov 21 18:45:21 2024 +0100 s390/mm: Introduce region-third and segment table entry present bits Introduce region-third and segment table entry present SW bits, and adjust pmd/pud_present() accordingly. Also add pmd/pud_present() checks to pmd/pud_leaf(), to return false for future swap entries. Same logic applies to pmd_trans_huge(), make that return pmd_leaf() instead of duplicating the same check. huge_pte_offset() also needs to be adjusted, current code would return NULL for !pud_present(). Use the same logic as in the generic version, which allows for !pud_present() swap entries. Similar to PTE, bit 63 can be used for the new SW present bit in region and segment table entries. For segment-table entries (PMD) the architecture says that "Bits 62-63 are available for programming", so they are safe to use. The same is true for large leaf region-third-table entries (PUD). However, for non-leaf region-third-table entries, bits 62-63 indicate the TABLE LENGTH and both must be set to 1. But such entries would always be considered as present, so it is safe to use bit 63 as PRESENT bit for PUD. They also should not conflict with bit 62 potentially later used for preserving SOFT_DIRTY in swap entries, because they are not swap entries. Valid PMDs / PUDs should always have the present bit set, so add it to the various pgprot defines, and also _SEGMENT_ENTRY which is OR'ed e.g. in pmd_populate(). _REGION3_ENTRY wouldn't need any change, as the present bit is already included in the TABLE LENGTH, but also explicitly add it there, for completeness, and just in case the bit would ever be changed. gmap code needs some adjustment, to also OR the _SEGMENT_ENTRY, like it is already done gmap_shadow_pgt() when creating new PMDs, but not in __gmap_link(). Otherwise, the gmap PMDs would not be considered present, e.g. when using pmd_leaf() checks in gmap code. The various WARN_ON checks in gmap code also need adjustment, to tolerate the new present bit. This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Aristeu Rozanski --- arch/s390/include/asm/pgtable.h | 51 ++++++++++++++++++++++----------- arch/s390/mm/gmap.c | 12 +++++--- arch/s390/mm/hugetlbpage.c | 8 +++--- 3 files changed, 47 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c2cd69593a50..7bd565ab2750 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -259,7 +259,8 @@ static inline int is_module_addr(void *addr) #define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID) #define _REGION2_ENTRY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH) #define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID) -#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH) +#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH | \ + _REGION3_ENTRY_PRESENT) #define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID) #define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ @@ -277,6 +278,14 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_BITS 0xfffffffffffff22fUL +/* + * SW region present bit. For non-leaf region-third-table entries, bits 62-63 + * indicate the TABLE LENGTH and both must be set to 1. But such entries + * would always be considered as present, so it is safe to use bit 63 as + * PRESENT bit for PUD. + */ +#define _REGION3_ENTRY_PRESENT 0x0001 + /* Bits in the segment table entry */ #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL #define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL @@ -288,7 +297,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ -#define _SEGMENT_ENTRY (0) +#define _SEGMENT_ENTRY (_SEGMENT_ENTRY_PRESENT) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ @@ -304,6 +313,8 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */ + #define _CRST_ENTRIES 2048 /* number of region/segment table entries */ #define _PAGE_ENTRIES 256 /* number of page table entries */ @@ -435,17 +446,22 @@ static inline int is_module_addr(void *addr) /* * Segment entry (large page) protection definitions. */ -#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_INVALID | \ +#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_INVALID | \ _SEGMENT_ENTRY_PROTECT) -#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ) -#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_READ | \ +#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_READ | \ +#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE) #define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \ _SEGMENT_ENTRY_LARGE | \ @@ -472,6 +488,7 @@ static inline int is_module_addr(void *addr) */ #define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_WRITE | \ @@ -479,12 +496,14 @@ static inline int is_module_addr(void *addr) _REGION3_ENTRY_DIRTY | \ _REGION_ENTRY_NOEXEC) #define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_YOUNG | \ _REGION_ENTRY_PROTECT | \ _REGION_ENTRY_NOEXEC) #define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_WRITE | \ @@ -706,7 +725,7 @@ static inline int pud_present(pud_t pud) { if (pud_folded(pud)) return 1; - return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL; + return (pud_val(pud) & _REGION3_ENTRY_PRESENT) != 0; } static inline int pud_none(pud_t pud) @@ -721,13 +740,18 @@ static inline bool pud_leaf(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) return 0; - return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); + return (pud_present(pud) && (pud_val(pud) & _REGION3_ENTRY_LARGE) != 0); +} + +static inline int pmd_present(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0; } #define pmd_leaf pmd_leaf static inline bool pmd_leaf(pmd_t pmd) { - return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; + return (pmd_present(pmd) && (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0); } static inline int pmd_bad(pmd_t pmd) @@ -759,11 +783,6 @@ static inline int p4d_bad(p4d_t p4d) return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; } -static inline int pmd_present(pmd_t pmd) -{ - return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY; -} - static inline int pmd_none(pmd_t pmd) { return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; @@ -1808,7 +1827,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, static inline int pmd_trans_huge(pmd_t pmd) { - return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; + return pmd_leaf(pmd); } #define has_transparent_hugepage has_transparent_hugepage diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b72facdb49f6..5124024957a2 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -616,7 +616,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC; + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; } else *table = pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS; @@ -2345,7 +2346,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (purge) __pmdp_csp(pmdp); set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); @@ -2399,7 +2401,8 @@ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); @@ -2434,7 +2437,8 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c2e8242bd15d..4b366623219d 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -48,6 +48,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) */ if (pte_present(pte)) { rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; rste |= move_set_bit(pte_val(pte), _PAGE_READ, _SEGMENT_ENTRY_READ); rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, @@ -223,11 +224,10 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4dp = p4d_offset(pgdp, addr); if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); - if (pud_present(*pudp)) { - if (pud_leaf(*pudp)) - return (pte_t *) pudp; + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) pmdp = pmd_offset(pudp, addr); - } } } return (pte_t *) pmdp; From bf0362c231baa1d5bb6d78f8d3df01ec45616bd8 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 4 Feb 2025 09:15:23 -0500 Subject: [PATCH 5/7] s390/mm: Introduce region-third and segment table swap entries JIRA: https://issues.redhat.com/browse/RHEL-74362 Tested: by me commit f934f6be76c1d033692f7fbfcbb06ec44a25bf3f Author: Gerald Schaefer Date: Thu Nov 21 18:45:22 2024 +0100 s390/mm: Introduce region-third and segment table swap entries Introduce region-third (PUD) and segment table (PMD) swap entries, and make hugetlbfs RSTE <-> PTE conversion code aware of them, so that they can be used for hugetlbfs PTE_MARKER entries. Future work could also build on this to enable THP_SWAP and THP_MIGRATION for s390. Similar to PTE swap entries, bits 0-51 can be used to store the swap offset, but bits 57-61 cannot be used for swap type because that overlaps with the INVALID and TABLE TYPE bits. PMD/PUD swap entries must be invalid, and have a correct table type so that pud_folded() check still works. Bits 53-57 can be used for swap type, but those include the PROTECT bit. So unlike swap PTEs, the PROTECT bit cannot be used to mark the swap entry. Use the "Common-Segment/Region" bit 59 instead for that. Also remove the !MACHINE_HAS_NX check in __set_huge_pte_at(). Otherwise, that would clear the _SEGMENT_ENTRY_NOEXEC bit also for swap entries, where it is used for encoding the swap type. The architecture only requires this bit to be 0 for PTEs, with !MACHINE_HAS_NX, not for segment or region-third entries. And the check is also redundant, because after __pte_to_rste() conversion, for non-swap PTEs it would only be set if it was already set in the PTE, which should never be the case for !MACHINE_HAS_NX. This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Aristeu Rozanski --- arch/s390/include/asm/pgtable.h | 53 +++++++++++++++++++++++++++++++++ arch/s390/mm/hugetlbpage.c | 23 ++++++++++---- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 7bd565ab2750..2d2d224207bc 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -266,6 +266,7 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ +#define _REGION3_ENTRY_COMM 0x0010 /* Common-Region, marks swap entry */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ #define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */ #define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */ @@ -303,6 +304,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ +#define _SEGMENT_ENTRY_COMM 0x0010 /* Common-Segment, marks swap entry */ #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ #define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */ #define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */ @@ -315,6 +317,10 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */ +/* Common bits in region and segment table entries, for swap entries */ +#define _RST_ENTRY_COMM 0x0010 /* Common-Region/Segment, marks swap entry */ +#define _RST_ENTRY_INVALID 0x0020 /* invalid region/segment table entry */ + #define _CRST_ENTRIES 2048 /* number of region/segment table entries */ #define _PAGE_ENTRIES 256 /* number of page table entries */ @@ -1887,6 +1893,53 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* + * 64 bit swap entry format for REGION3 and SEGMENT table entries (RSTE) + * Bits 59 and 63 are used to indicate the swap entry. Bit 58 marks the rste + * as invalid. + * A swap entry is indicated by bit pattern (rste & 0x011) == 0x010 + * | offset |Xtype |11TT|S0| + * |0000000000111111111122222222223333333333444444444455|555555|5566|66| + * |0123456789012345678901234567890123456789012345678901|234567|8901|23| + * + * Bits 0-51 store the offset. + * Bits 53-57 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 60-61 (TT) indicate the table type: 0x01 for REGION3 and 0x00 for SEGMENT. + * Bit 52 (X) is unused. + */ + +#define __SWP_OFFSET_MASK_RSTE ((1UL << 52) - 1) +#define __SWP_OFFSET_SHIFT_RSTE 12 +#define __SWP_TYPE_MASK_RSTE ((1UL << 5) - 1) +#define __SWP_TYPE_SHIFT_RSTE 6 + +/* + * TT bits set to 0x00 == SEGMENT. For REGION3 entries, caller must add R3 + * bits 0x01. See also __set_huge_pte_at(). + */ +static inline unsigned long mk_swap_rste(unsigned long type, unsigned long offset) +{ + unsigned long rste; + + rste = _RST_ENTRY_INVALID | _RST_ENTRY_COMM; + rste |= (offset & __SWP_OFFSET_MASK_RSTE) << __SWP_OFFSET_SHIFT_RSTE; + rste |= (type & __SWP_TYPE_MASK_RSTE) << __SWP_TYPE_SHIFT_RSTE; + return rste; +} + +static inline unsigned long __swp_type_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_TYPE_SHIFT_RSTE) & __SWP_TYPE_MASK_RSTE; +} + +static inline unsigned long __swp_offset_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_OFFSET_SHIFT_RSTE) & __SWP_OFFSET_MASK_RSTE; +} + +#define __rste_to_swp_entry(rste) ((swp_entry_t) { rste }) + extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 4b366623219d..e5c504449244 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -24,6 +24,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) { + swp_entry_t arch_entry; unsigned long rste; /* @@ -67,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte) #endif rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); } else rste = _SEGMENT_ENTRY_EMPTY; return rste; @@ -74,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + swp_entry_t arch_entry; unsigned long pteval; - int present; + int present, none; + pte_t pte; - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { present = pud_present(__pud(rste)); - else + none = pud_none(__pud(rste)); + } else { present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } /* * Convert encoding pmd / pud bits pte bits @@ -115,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste) pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); } else pteval = _PAGE_INVALID; return __pte(pteval); @@ -149,8 +164,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, unsigned long rste; rste = __pte_to_rste(pte); - if (!MACHINE_HAS_NX) - rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { From b7f7e8e162fc91f25c66b288dce57f305672b896 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 4 Feb 2025 09:15:23 -0500 Subject: [PATCH 6/7] s390/mm: Add PTE_MARKER support for hugetlbfs mappings JIRA: https://issues.redhat.com/browse/RHEL-74362 Tested: by me Conflicts: due b20c8216c1e0d5 patchset not being backported commit 487ef5d4d912f6a32556d8a61cede17870925295 Author: Gerald Schaefer Date: Thu Nov 21 18:45:23 2024 +0100 s390/mm: Add PTE_MARKER support for hugetlbfs mappings Commit 8a13897fb0daa ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs") added support for PTE_MARKER_POISONED for hugetlbfs, but PTE_MARKER also needs support for swap entries. For s390, swap entries were only supported on PTE level, not on the PMD/PUD levels that are used for large hugetlbfs mappings. Therefore, when writing a PTE_MARKER_POISONED entry, the resulting entry on PMD/PUD level would be an invalid / empty entry. Further access would then generate a pagefault loop, instead of the expected SIGBUS. It is a loop inside the kernel, but interruptible and uffd fault handling also calls schedule() in between, so at least it won't completely block the system. Previous commits prepared support for swap entries on PMD/PUD levels. PTE_MARKER support for hugetlbfs can now be enabled by simply adding an extra is_pte_marker() check to huge_pte_none_mostly(). Fault handling code also needs to be adjusted to expect the VM_FAULT_HWPOISON_LARGE fault flag, which was not possible on s390 before. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Aristeu Rozanski --- arch/s390/include/asm/hugetlb.h | 2 +- arch/s390/mm/fault.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index deb198a61039..4d9078ac0c81 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -91,7 +91,7 @@ static inline int huge_pte_none(pte_t pte) static inline int huge_pte_none_mostly(pte_t pte) { - return huge_pte_none(pte); + return huge_pte_none(pte) || is_pte_marker(pte); } static inline int huge_pte_write(pte_t pte) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index fa7bb01f94fc..268fb53713f0 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -307,7 +307,8 @@ static void do_fault_error(struct pt_regs *regs, vm_fault_t fault) do_no_context(regs); else do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)) { + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) do_no_context(regs); From 8abcb489642562a335b0a8d64803e18860fe29ae Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 4 Feb 2025 09:15:23 -0500 Subject: [PATCH 7/7] s390/mm/hugetlbfs: Add missing includes JIRA: https://issues.redhat.com/browse/RHEL-74362 Tested: by me commit adb44a4bfc8a3312d2a13cc09d2b89d5828e7702 Author: Heiko Carstens Date: Thu Nov 28 09:43:29 2024 +0100 s390/mm/hugetlbfs: Add missing includes Add missing includes to fix this randconfig compile error: All errors (new ones prefixed by >>): In file included from mm/pagewalk.c:5: In file included from include/linux/hugetlb.h:798: >> arch/s390/include/asm/hugetlb.h:94:31: error: call to undeclared function 'is_pte_marker'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 94 | return huge_pte_none(pte) || is_pte_marker(pte); | ^ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411281002.IPkRpIcR-lkp@intel.com/ Fixes: 487ef5d4d912 ("s390/mm: Add PTE_MARKER support for hugetlbfs mappings") Signed-off-by: Heiko Carstens Signed-off-by: Aristeu Rozanski --- arch/s390/include/asm/hugetlb.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 4d9078ac0c81..cabfa146bc3b 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -10,6 +10,8 @@ #define _ASM_S390_HUGETLB_H #include +#include +#include #include #define hugetlb_free_pgd_range free_pgd_range