// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2008 Michael Ellerman, IBM Corporation.
 */

#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/random.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/cpuhotplug.h>
#include <linux/uaccess.h>
#include <linux/jump_label.h>

#include <asm/debug.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <asm/text-patching.h>
#include <asm/inst.h>

static int __patch_mem(void *exec_addr, unsigned long val, void *patch_addr, bool is_dword)
{
	if (!IS_ENABLED(CONFIG_PPC64) || likely(!is_dword)) {
		/* For big endian correctness: a plain store of 'val' would use the wrong half */
		u32 val32 = val;

		__put_kernel_nofault(patch_addr, &val32, u32, failed);
	} else {
		__put_kernel_nofault(patch_addr, &val, u64, failed);
	}
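
	/*
	 * Flush the data cache line for the location just written and
	 * invalidate the icache line for the execution address, so the
	 * CPU fetches the new instruction rather than a stale copy.
	 */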
	asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr),
							    "r" (exec_addr));

	return 0;

failed:
	mb(); /* sync */
	return -EPERM;
}
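
/*
 * Write an instruction to its final address. A prefixed instruction is
 * 64 bits wide, so it is written with a single doubleword store.
 */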
int raw_patch_instruction(u32 *addr, ppc_inst_t instr)
{
	if (ppc_inst_prefixed(instr))
		return __patch_mem(addr, ppc_inst_as_ulong(instr), addr, true);
	else
		return __patch_mem(addr, ppc_inst_val(instr), addr, false);
}
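
/*
 * Per-CPU patching state: either a reserved vmalloc area (vm_struct) used to
 * map patch sites through a kernel alias, or a temporary mm used to map them
 * at a userspace address on Radix.
 */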
struct patch_context {
	union {
		struct vm_struct *area;
		struct mm_struct *mm;
	};
	unsigned long addr;
	pte_t *pte;
};

static DEFINE_PER_CPU(struct patch_context, cpu_patching_context);

static int map_patch_area(void *addr, unsigned long text_poke_addr);
static void unmap_patch_area(unsigned long addr);

static bool mm_patch_enabled(void)
{
	return IS_ENABLED(CONFIG_SMP) && radix_enabled();
}

/*
 * The following applies for Radix MMU. Hash MMU has different requirements,
 * and so is not supported.
 *
 * Changing mm requires context synchronising instructions on both sides of
 * the context switch, as well as a hwsync between the last instruction for
 * which the address of an associated storage access was translated using
 * the current context.
 *
 * switch_mm_irqs_off() performs an isync after the context switch. It is
 * the responsibility of the caller to perform the CSI and hwsync before
 * starting/stopping the temp mm.
 */
static struct mm_struct *start_using_temp_mm(struct mm_struct *temp_mm)
{
	struct mm_struct *orig_mm = current->active_mm;

	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(orig_mm, temp_mm, current);

	WARN_ON(!mm_is_thread_local(temp_mm));
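
	/*
	 * HW breakpoints (set by userspace or perf) may refer to addresses
	 * that now fall inside the temporary mm, so keep them all disabled
	 * while it is loaded.
	 */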
	suspend_breakpoints();
	return orig_mm;
}

static void stop_using_temp_mm(struct mm_struct *temp_mm,
			       struct mm_struct *orig_mm)
{
	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(temp_mm, orig_mm, current);
	restore_breakpoints();
}
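
/*
 * CPU-hotplug "online" callback for the vmalloc-area method: reserve a
 * per-CPU virtual page that patch sites will be temporarily mapped into.
 */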
static int text_area_cpu_up(unsigned int cpu)
{
	struct vm_struct *area;
	unsigned long addr;
	int err;

	area = get_vm_area(PAGE_SIZE, 0);
	if (!area) {
		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
			  cpu);
		return -1;
	}

	// Map/unmap the area to ensure all page tables are pre-allocated
	addr = (unsigned long)area->addr;
	err = map_patch_area(empty_zero_page, addr);
	if (err)
		return err;

	unmap_patch_area(addr);

	this_cpu_write(cpu_patching_context.area, area);
	this_cpu_write(cpu_patching_context.addr, addr);
	this_cpu_write(cpu_patching_context.pte, virt_to_kpte(addr));

	return 0;
}
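
/* CPU-hotplug teardown counterpart: release the per-CPU patching area. */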
static int text_area_cpu_down(unsigned int cpu)
{
	free_vm_area(this_cpu_read(cpu_patching_context.area));
	this_cpu_write(cpu_patching_context.area, NULL);
	this_cpu_write(cpu_patching_context.addr, 0);
	this_cpu_write(cpu_patching_context.pte, NULL);
	return 0;
}
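
/*
 * Free the page tables backing the single patching page and drop the
 * reference on the temporary mm.
 */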
static void put_patching_mm(struct mm_struct *mm, unsigned long patching_addr)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	free_pgd_range(&tlb, patching_addr, patching_addr + PAGE_SIZE, 0, 0);
	mmput(mm);
}
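
/*
 * CPU-hotplug "online" callback for the temporary-mm method: allocate a bare
 * mm and pick a random userspace address for this CPU to patch through.
 */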
static int text_area_cpu_up_mm(unsigned int cpu)
{
	struct mm_struct *mm;
	unsigned long addr;
	pte_t *pte;
	spinlock_t *ptl;

	mm = mm_alloc();
	if (WARN_ON(!mm))
		goto fail_no_mm;

	/*
	 * Choose a random page-aligned address from the interval
	 * [PAGE_SIZE .. DEFAULT_MAP_WINDOW - PAGE_SIZE].
	 * The lower address bound is PAGE_SIZE to avoid the zero-page.
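	 *
	 * With 64K pages and a 128TB DEFAULT_MAP_WINDOW this gives roughly
	 * 31 bits of entropy in the patching address.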
	 */
	addr = (1 + (get_random_long() % (DEFAULT_MAP_WINDOW / PAGE_SIZE - 2))) << PAGE_SHIFT;

	/*
	 * PTE allocation uses GFP_KERNEL which means we need to
	 * pre-allocate the PTE here because we cannot do the
	 * allocation during patching when IRQs are disabled.
	 *
	 * Using get_locked_pte() to avoid open coding, the lock
	 * is unnecessary.
	 */
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto fail_no_pte;
	pte_unmap_unlock(pte, ptl);

	this_cpu_write(cpu_patching_context.mm, mm);
	this_cpu_write(cpu_patching_context.addr, addr);

	return 0;

fail_no_pte:
	put_patching_mm(mm, addr);
fail_no_mm:
	return -ENOMEM;
}

static int text_area_cpu_down_mm(unsigned int cpu)
{
	put_patching_mm(this_cpu_read(cpu_patching_context.mm),
			this_cpu_read(cpu_patching_context.addr));

	this_cpu_write(cpu_patching_context.mm, NULL);
	this_cpu_write(cpu_patching_context.addr, 0);

	return 0;
}

static __ro_after_init DEFINE_STATIC_KEY_FALSE(poking_init_done);
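
/*
 * Register the CPU-hotplug callbacks that set up the per-CPU patching area
 * or temporary mm, then flip the static key so patching starts using them.
 */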
void __init poking_init(void)
{
	int ret;

	if (mm_patch_enabled())
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
					"powerpc/text_poke_mm:online",
					text_area_cpu_up_mm,
					text_area_cpu_down_mm);
	else
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
					"powerpc/text_poke:online",
					text_area_cpu_up,
					text_area_cpu_down);

	/* cpuhp_setup_state returns >= 0 on success */
	if (WARN_ON(ret < 0))
		return;

	static_branch_enable(&poking_init_done);
}

static unsigned long get_patch_pfn(void *addr)
{
	if (IS_ENABLED(CONFIG_EXECMEM) && is_vmalloc_or_module_addr(addr))
		return vmalloc_to_pfn(addr);
	else
		return __pa_symbol(addr) >> PAGE_SHIFT;
}

/*
 * This can be called for kernel text or a module.
 */
static int map_patch_area(void *addr, unsigned long text_poke_addr)
{
	unsigned long pfn = get_patch_pfn(addr);

	return map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
}

static void unmap_patch_area(unsigned long addr)
{
	pte_t *ptep;
	pmd_t *pmdp;
	pud_t *pudp;
	p4d_t *p4dp;
	pgd_t *pgdp;

	pgdp = pgd_offset_k(addr);
	if (WARN_ON(pgd_none(*pgdp)))
		return;

	p4dp = p4d_offset(pgdp, addr);
	if (WARN_ON(p4d_none(*p4dp)))
		return;

	pudp = pud_offset(p4dp, addr);
	if (WARN_ON(pud_none(*pudp)))
		return;

	pmdp = pmd_offset(pudp, addr);
	if (WARN_ON(pmd_none(*pmdp)))
		return;

	ptep = pte_offset_kernel(pmdp, addr);
	if (WARN_ON(pte_none(*ptep)))
		return;

	/*
	 * In hash, pte_clear() flushes the TLB; in radix it does not, so
	 * flush the kernel TLB range explicitly here to cover both.
	 */
	pte_clear(&init_mm, addr, ptep);
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
}
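
/*
 * Patch a word or doubleword at 'addr' by writing through this CPU's
 * temporary patching mm instead of the kernel mapping.
 */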
|
|
|
|
|
|
2024-05-15 02:44:41 +00:00
|
|
|
static int __do_patch_mem_mm(void *addr, unsigned long val, bool is_dword)
|
powerpc/code-patching: Use temporary mm for Radix MMU
x86 supports the notion of a temporary mm which restricts access to
temporary PTEs to a single CPU. A temporary mm is useful for situations
where a CPU needs to perform sensitive operations (such as patching a
STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
said mappings to other CPUs. Another benefit is that other CPU TLBs do
not need to be flushed when the temporary mm is torn down.
Mappings in the temporary mm can be set in the userspace portion of the
address-space.
Interrupts must be disabled while the temporary mm is in use. HW
breakpoints, which may have been set by userspace as watchpoints on
addresses now within the temporary mm, are saved and disabled when
loading the temporary mm. The HW breakpoints are restored when unloading
the temporary mm. All HW breakpoints are indiscriminately disabled while
the temporary mm is in use - this may include breakpoints set by perf.
Use the `poking_init` init hook to prepare a temporary mm and patching
address. Initialize the temporary mm using mm_alloc(). Choose a
randomized patching address inside the temporary mm userspace address
space. The patching address is randomized between PAGE_SIZE and
DEFAULT_MAP_WINDOW-PAGE_SIZE.
Bits of entropy with 64K page size on BOOK3S_64:
bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE)
PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB
bits of entropy = log2(128TB / 64K)
bits of entropy = 31
The upper limit is DEFAULT_MAP_WINDOW due to how the Book3s64 Hash MMU
operates - by default the space above DEFAULT_MAP_WINDOW is not
available. Currently the Hash MMU does not use a temporary mm so
technically this upper limit isn't necessary; however, a larger
randomization range does not further "harden" this overall approach and
future work may introduce patching with a temporary mm on Hash as well.
Randomization occurs only once during initialization for each CPU as it
comes online.
The patching page is mapped with PAGE_KERNEL to set EAA[0] for the PTE
which ignores the AMR (so no need to unlock/lock KUAP) according to
PowerISA v3.0b Figure 35 on Radix.
Based on x86 implementation:
commit 4fc19708b165
("x86/alternatives: Initialize temporary mm for patching")
and:
commit b3fd8e83ada0
("x86/alternatives: Use temporary mm for text poking")
From: Benjamin Gray <bgray@linux.ibm.com>
Synchronisation is done according to ISA 3.1B Book 3 Chapter 13
"Synchronization Requirements for Context Alterations". Switching the mm
is a change to the PID, which requires a CSI before and after the change,
and a hwsync between the last instruction that performs address
translation for an associated storage access.
Instruction fetch is an associated storage access, but the instruction
address mappings are not being changed, so it should not matter which
context they use. We must still perform a hwsync to guard arbitrary
prior code that may have accessed a userspace address.
TLB invalidation is local and VA specific. Local because only this core
used the patching mm, and VA specific because we only care that the
writable mapping is purged. Leaving the other mappings intact is more
efficient, especially when performing many code patches in a row (e.g.,
as ftrace would).
Signed-off-by: Christopher M. Riedl <cmr@bluescreens.de>
Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
[mpe: Use mm_alloc() per 107b6828a7cd ("x86/mm: Use mm_alloc() in poking_init()")]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20221109045112.187069-9-bgray@linux.ibm.com
2022-11-09 04:51:11 +00:00
|
|
|
{
|
|
|
|
|
int err;
|
|
|
|
|
u32 *patch_addr;
|
|
|
|
|
unsigned long text_poke_addr;
|
|
|
|
|
pte_t *pte;
|
|
|
|
|
unsigned long pfn = get_patch_pfn(addr);
|
|
|
|
|
struct mm_struct *patching_mm;
|
|
|
|
|
struct mm_struct *orig_mm;
|
2022-12-16 01:43:12 +00:00
|
|
|
spinlock_t *ptl;
|
	patching_mm = __this_cpu_read(cpu_patching_context.mm);
	text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
	patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));

	pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
	if (!pte)
		return -ENOMEM;
	__set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);

	/* order PTE update before use, also serves as the hwsync */
	asm volatile("ptesync": : :"memory");

	/* order context switch after arbitrary prior code */
	isync();

	orig_mm = start_using_temp_mm(patching_mm);

	err = __patch_mem(addr, val, patch_addr, is_dword);
	/* context synchronisation performed by __patch_instruction (isync or exception) */
	stop_using_temp_mm(patching_mm, orig_mm);

	pte_clear(patching_mm, text_poke_addr, pte);
	/*
	 * ptesync to order PTE update before TLB invalidation done
	 * by radix__local_flush_tlb_page_psize (in _tlbiel_va)
	 */
	local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize);

	pte_unmap_unlock(pte, ptl);
	return err;
}
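/*
 * Fallback patching path when no temporary mm is used: the target page is
 * mapped writable through a per-CPU PTE in init_mm, patched, then unmapped
 * and the kernel TLB range is flushed.
 */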
static int __do_patch_mem(void *addr, unsigned long val, bool is_dword)
{
	int err;
	u32 *patch_addr;
	unsigned long text_poke_addr;
	pte_t *pte;
	unsigned long pfn = get_patch_pfn(addr);

	text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK;
	patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));

	pte = __this_cpu_read(cpu_patching_context.pte);
	__set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
	/* See ptesync comment in radix__set_pte_at() */
	if (radix_enabled())
		asm volatile("ptesync": : :"memory");

	err = __patch_mem(addr, val, patch_addr, is_dword);

	pte_clear(&init_mm, text_poke_addr, pte);
	flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE);

	return err;
}
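/*
 * Common entry point: falls back to patching in place before poking_init()
 * has run, otherwise patches through the temporary mm (mm_patch_enabled())
 * or the per-CPU kernel mapping, with interrupts disabled around the poke.
 */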
static int patch_mem(void *addr, unsigned long val, bool is_dword)
{
	int err;
	unsigned long flags;

	/*
	 * During early early boot patch_instruction is called
	 * when text_poke_area is not ready, but we still need
	 * to allow patching. We just do the plain old patching
	 */
	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) ||
	    !static_branch_likely(&poking_init_done))
		return __patch_mem(addr, val, addr, is_dword);

	local_irq_save(flags);
	if (mm_patch_enabled())
		err = __do_patch_mem_mm(addr, val, is_dword);
	else
		err = __do_patch_mem(addr, val, is_dword);
	local_irq_restore(flags);

	return err;
}
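/*
 * Public wrappers. On PPC64, patch_instruction() also handles prefixed
 * (8-byte) instructions, while patch_uint()/patch_ulong() patch naturally
 * aligned data words. Illustrative use only, e.g. replacing an instruction
 * with a nop:
 *
 *	err = patch_instruction(addr, ppc_inst(PPC_RAW_NOP()));
 */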
#ifdef CONFIG_PPC64

int patch_instruction(u32 *addr, ppc_inst_t instr)
{
	if (ppc_inst_prefixed(instr))
		return patch_mem(addr, ppc_inst_as_ulong(instr), true);
	else
		return patch_mem(addr, ppc_inst_val(instr), false);
}
NOKPROBE_SYMBOL(patch_instruction);

int patch_uint(void *addr, unsigned int val)
{
	if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned int)))
		return -EINVAL;

	return patch_mem(addr, val, false);
}
NOKPROBE_SYMBOL(patch_uint);

int patch_ulong(void *addr, unsigned long val)
{
	if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned long)))
		return -EINVAL;

	return patch_mem(addr, val, true);
}
NOKPROBE_SYMBOL(patch_ulong);

#else

int patch_instruction(u32 *addr, ppc_inst_t instr)
{
	return patch_mem(addr, ppc_inst_val(instr), false);
}
NOKPROBE_SYMBOL(patch_instruction)

#endif
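/*
 * Helpers for patch_instructions(): fill the writable alias with a repeated
 * 8-byte or 4-byte value using __put_kernel_nofault(), returning -EPERM if
 * a store faults.
 */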
static int patch_memset64(u64 *addr, u64 val, size_t count)
{
	for (u64 *end = addr + count; addr < end; addr++)
		__put_kernel_nofault(addr, &val, u64, failed);

	return 0;

failed:
	return -EPERM;
}

static int patch_memset32(u32 *addr, u32 val, size_t count)
{
	for (u32 *end = addr + count; addr < end; addr++)
		__put_kernel_nofault(addr, &val, u32, failed);

	return 0;

failed:
	return -EPERM;
}
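/*
 * Copy (or replicate) instructions into the writable alias, then order the
 * stores with smp_wmb() and flush the icache range so the new instructions
 * are visible to instruction fetch.
 */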
static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool repeat_instr)
{
	unsigned long start = (unsigned long)patch_addr;
	int err;

	/* Repeat instruction */
	if (repeat_instr) {
		ppc_inst_t instr = ppc_inst_read(code);

		if (ppc_inst_prefixed(instr)) {
			u64 val = ppc_inst_as_ulong(instr);

			err = patch_memset64((u64 *)patch_addr, val, len / 8);
		} else {
			u32 val = ppc_inst_val(instr);

			err = patch_memset32(patch_addr, val, len / 4);
		}
	} else {
		err = copy_to_kernel_nofault(patch_addr, code, len);
	}

	smp_wmb();	/* smp write barrier */
	flush_icache_range(start, start + len);
	return err;
}
/*
 * A page is mapped and instructions that fit the page are patched.
 * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
 */
static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool repeat_instr)
{
	struct mm_struct *patching_mm, *orig_mm;
	unsigned long pfn = get_patch_pfn(addr);
	unsigned long text_poke_addr;
	spinlock_t *ptl;
	u32 *patch_addr;
	pte_t *pte;
	int err;

	patching_mm = __this_cpu_read(cpu_patching_context.mm);
	text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
	patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));

	pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
	if (!pte)
		return -ENOMEM;

	__set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);

	/* order PTE update before use, also serves as the hwsync */
	asm volatile("ptesync" ::: "memory");

	/* order context switch after arbitrary prior code */
	isync();

	orig_mm = start_using_temp_mm(patching_mm);
powerpc/code-patching: Disable KASAN report during patching via temporary mm
Erhard reports the following KASAN hit on Talos II (power9) with kernel 6.13:
[ 12.028126] ==================================================================
[ 12.028198] BUG: KASAN: user-memory-access in copy_to_kernel_nofault+0x8c/0x1a0
[ 12.028260] Write of size 8 at addr 0000187e458f2000 by task systemd/1
[ 12.028346] CPU: 87 UID: 0 PID: 1 Comm: systemd Tainted: G T 6.13.0-P9-dirty #3
[ 12.028408] Tainted: [T]=RANDSTRUCT
[ 12.028446] Hardware name: T2P9D01 REV 1.01 POWER9 0x4e1202 opal:skiboot-bc106a0 PowerNV
[ 12.028500] Call Trace:
[ 12.028536] [c000000008dbf3b0] [c000000001656a48] dump_stack_lvl+0xbc/0x110 (unreliable)
[ 12.028609] [c000000008dbf3f0] [c0000000006e2fc8] print_report+0x6b0/0x708
[ 12.028666] [c000000008dbf4e0] [c0000000006e2454] kasan_report+0x164/0x300
[ 12.028725] [c000000008dbf600] [c0000000006e54d4] kasan_check_range+0x314/0x370
[ 12.028784] [c000000008dbf640] [c0000000006e6310] __kasan_check_write+0x20/0x40
[ 12.028842] [c000000008dbf660] [c000000000578e8c] copy_to_kernel_nofault+0x8c/0x1a0
[ 12.028902] [c000000008dbf6a0] [c0000000000acfe4] __patch_instructions+0x194/0x210
[ 12.028965] [c000000008dbf6e0] [c0000000000ade80] patch_instructions+0x150/0x590
[ 12.029026] [c000000008dbf7c0] [c0000000001159bc] bpf_arch_text_copy+0x6c/0xe0
[ 12.029085] [c000000008dbf800] [c000000000424250] bpf_jit_binary_pack_finalize+0x40/0xc0
[ 12.029147] [c000000008dbf830] [c000000000115dec] bpf_int_jit_compile+0x3bc/0x930
[ 12.029206] [c000000008dbf990] [c000000000423720] bpf_prog_select_runtime+0x1f0/0x280
[ 12.029266] [c000000008dbfa00] [c000000000434b18] bpf_prog_load+0xbb8/0x1370
[ 12.029324] [c000000008dbfb70] [c000000000436ebc] __sys_bpf+0x5ac/0x2e00
[ 12.029379] [c000000008dbfd00] [c00000000043a228] sys_bpf+0x28/0x40
[ 12.029435] [c000000008dbfd20] [c000000000038eb4] system_call_exception+0x334/0x610
[ 12.029497] [c000000008dbfe50] [c00000000000c270] system_call_vectored_common+0xf0/0x280
[ 12.029561] --- interrupt: 3000 at 0x3fff82f5cfa8
[ 12.029608] NIP: 00003fff82f5cfa8 LR: 00003fff82f5cfa8 CTR: 0000000000000000
[ 12.029660] REGS: c000000008dbfe80 TRAP: 3000 Tainted: G T (6.13.0-P9-dirty)
[ 12.029735] MSR: 900000000280f032 <SF,HV,VEC,VSX,EE,PR,FP,ME,IR,DR,RI> CR: 42004848 XER: 00000000
[ 12.029855] IRQMASK: 0
GPR00: 0000000000000169 00003fffdcf789a0 00003fff83067100 0000000000000005
GPR04: 00003fffdcf78a98 0000000000000090 0000000000000000 0000000000000008
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR12: 0000000000000000 00003fff836ff7e0 c000000000010678 0000000000000000
GPR16: 0000000000000000 0000000000000000 00003fffdcf78f28 00003fffdcf78f90
GPR20: 0000000000000000 0000000000000000 0000000000000000 00003fffdcf78f80
GPR24: 00003fffdcf78f70 00003fffdcf78d10 00003fff835c7239 00003fffdcf78bd8
GPR28: 00003fffdcf78a98 0000000000000000 0000000000000000 000000011f547580
[ 12.030316] NIP [00003fff82f5cfa8] 0x3fff82f5cfa8
[ 12.030361] LR [00003fff82f5cfa8] 0x3fff82f5cfa8
[ 12.030405] --- interrupt: 3000
[ 12.030444] ==================================================================
Commit c28c15b6d28a ("powerpc/code-patching: Use temporary mm for
Radix MMU") is inspired by x86 but, unlike x86, it doesn't disable
KASAN reports during patching. This wasn't a problem at the beginning
because __patch_mem() is not instrumented.
Commit 465cabc97b42 ("powerpc/code-patching: introduce
patch_instructions()") uses copy_to_kernel_nofault() to copy several
instructions at once. But when using the temporary mm the destination is
not regular kernel memory but a kind of kernel-like memory located
in user address space. Because it is not in kernel address space it is
not covered by KASAN shadow memory. Since commit e4137f08816b ("mm,
kasan, kmsan: instrument copy_from/to_kernel_nofault") KASAN reports
bad accesses from copy_to_kernel_nofault(). Here a bad access to user
memory is reported because KASAN detects the lack of shadow memory and
the address is below TASK_SIZE.
Do like x86 in commit b3fd8e83ada0 ("x86/alternatives: Use temporary
mm for text poking") and disable KASAN reports during patching when
using temporary mm.
Reported-by: Erhard Furtner <erhard_f@mailbox.org>
Closes: https://lore.kernel.org/all/20250201151435.48400261@yea/
Fixes: 465cabc97b42 ("powerpc/code-patching: introduce patch_instructions()")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/1c05b2a1b02ad75b981cfc45927e0b4a90441046.1738577687.git.christophe.leroy@csgroup.eu
	kasan_disable_current();

	err = __patch_instructions(patch_addr, code, len, repeat_instr);
	kasan_enable_current();

	/* context synchronisation performed by __patch_instructions */
	stop_using_temp_mm(patching_mm, orig_mm);

	pte_clear(patching_mm, text_poke_addr, pte);
	/*
	 * ptesync to order PTE update before TLB invalidation done
	 * by radix__local_flush_tlb_page_psize (in _tlbiel_va)
	 */
	local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize);

	pte_unmap_unlock(pte, ptl);

	return err;
}
/*
 * A page is mapped and instructions that fit the page are patched.
 * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
 */
static int __do_patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr)
{
	unsigned long pfn = get_patch_pfn(addr);
	unsigned long text_poke_addr;
	u32 *patch_addr;
	pte_t *pte;
	int err;

	text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK;
	patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));

	pte = __this_cpu_read(cpu_patching_context.pte);
	__set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
	/* See ptesync comment in radix__set_pte_at() */
	if (radix_enabled())
		asm volatile("ptesync" ::: "memory");

	err = __patch_instructions(patch_addr, code, len, repeat_instr);

	pte_clear(&init_mm, text_poke_addr, pte);
	flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE);

	return err;
}
/*
 * Patch 'addr' with 'len' bytes of instructions from 'code'.
 *
 * If repeat_instr is true, the same instruction is filled for
 * 'len' bytes.
 */
int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr)
{
	while (len > 0) {
		unsigned long flags;
		size_t plen;
		int err;

		plen = min_t(size_t, PAGE_SIZE - offset_in_page(addr), len);

		local_irq_save(flags);
		if (mm_patch_enabled())
			err = __do_patch_instructions_mm(addr, code, plen, repeat_instr);
		else
			err = __do_patch_instructions(addr, code, plen, repeat_instr);
		local_irq_restore(flags);
		if (err)
			return err;

		len -= plen;
		addr = (u32 *)((unsigned long)addr + plen);
		if (!repeat_instr)
			code = (u32 *)((unsigned long)code + plen);
	}

	return 0;
}
NOKPROBE_SYMBOL(patch_instructions);
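/*
 * Build a branch from 'addr' to 'target' and patch it in, returning -ERANGE
 * if the target is not reachable with the requested branch form.
 */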
int patch_branch(u32 *addr, unsigned long target, int flags)
{
	ppc_inst_t instr;

	if (create_branch(&instr, addr, target, flags))
		return -ERANGE;

	return patch_instruction(addr, instr);
}
/*
 * Helper to check if a given instruction is a conditional branch
 * Derived from the conditional checks in analyse_instr()
 */
bool is_conditional_branch(ppc_inst_t instr)
{
	unsigned int opcode = ppc_inst_primary_opcode(instr);

	if (opcode == 16)	/* bc, bca, bcl, bcla */
		return true;
	if (opcode == 19) {
		switch ((ppc_inst_val(instr) >> 1) & 0x3ff) {
		case 16:	/* bclr, bclrl */
		case 528:	/* bcctr, bcctrl */
		case 560:	/* bctar, bctarl */
			return true;
		}
	}
	return false;
}
NOKPROBE_SYMBOL(is_conditional_branch);
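/*
 * Encode a B-form conditional branch at 'addr' targeting 'target'. Returns
 * 1 if the (possibly relative) offset cannot be represented in the 16-bit
 * BD field, 0 on success.
 */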
int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
		       unsigned long target, int flags)
{
	long offset;

	offset = target;
	if (! (flags & BRANCH_ABSOLUTE))
		offset = offset - (unsigned long)addr;

	/* Check we can represent the target in the instruction format */
	if (!is_offset_in_cond_branch_range(offset))
		return 1;

	/* Mask out the flags and target, so they don't step on each other. */
	*instr = ppc_inst(0x40000000 | (flags & 0x3FF0003) | (offset & 0xFFFC));

	return 0;
}
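/*
 * Relative-branch predicates: a branch is "relative" when the AA bit is
 * clear, and a "link" branch additionally has the LK bit set.
 */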
int instr_is_relative_branch(ppc_inst_t instr)
{
	if (ppc_inst_val(instr) & BRANCH_ABSOLUTE)
		return 0;

	return instr_is_branch_iform(instr) || instr_is_branch_bform(instr);
}
int instr_is_relative_link_branch(ppc_inst_t instr)
{
	return instr_is_relative_branch(instr) && (ppc_inst_val(instr) & BRANCH_SET_LINK);
}
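/*
 * Extract the target address of an I-form (LI field) or B-form (BD field)
 * branch: sign-extend the immediate and, for relative branches, add the
 * address of the instruction itself.
 */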
static unsigned long branch_iform_target(const u32 *instr)
{
	signed long imm;

	imm = ppc_inst_val(ppc_inst_read(instr)) & 0x3FFFFFC;

	/* If the top bit of the immediate value is set this is negative */
	if (imm & 0x2000000)
		imm -= 0x4000000;

	if ((ppc_inst_val(ppc_inst_read(instr)) & BRANCH_ABSOLUTE) == 0)
		imm += (unsigned long)instr;

	return (unsigned long)imm;
}
static unsigned long branch_bform_target(const u32 *instr)
{
	signed long imm;

	imm = ppc_inst_val(ppc_inst_read(instr)) & 0xFFFC;

	/* If the top bit of the immediate value is set this is negative */
	if (imm & 0x8000)
		imm -= 0x10000;

	if ((ppc_inst_val(ppc_inst_read(instr)) & BRANCH_ABSOLUTE) == 0)
		imm += (unsigned long)instr;

	return (unsigned long)imm;
}
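/* Return the target address of a branch instruction, or 0 if it is not a branch. */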
unsigned long branch_target(const u32 *instr)
{
	if (instr_is_branch_iform(ppc_inst_read(instr)))
		return branch_iform_target(instr);
	else if (instr_is_branch_bform(ppc_inst_read(instr)))
		return branch_bform_target(instr);

	return 0;
}
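/*
 * Re-encode the branch at 'src' so that, when placed at 'dest', it still
 * reaches the same target. Returns non-zero if 'src' is not a branch or the
 * target cannot be reached from 'dest'.
 */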
int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src)
{
	unsigned long target;
	target = branch_target(src);

	if (instr_is_branch_iform(ppc_inst_read(src)))
		return create_branch(instr, dest, target,
				     ppc_inst_val(ppc_inst_read(src)));
	else if (instr_is_branch_bform(ppc_inst_read(src)))
		return create_cond_branch(instr, dest, target,
					  ppc_inst_val(ppc_inst_read(src)));

	return 1;
}