diff --git a/kernel/src/vm/vmar/mod.rs b/kernel/src/vm/vmar/mod.rs
index f691a4045..2197d3991 100644
--- a/kernel/src/vm/vmar/mod.rs
+++ b/kernel/src/vm/vmar/mod.rs
@@ -16,7 +16,7 @@ use align_ext::AlignExt;
 use aster_rights::Rights;
 use ostd::{
     cpu::CpuExceptionInfo,
-    mm::{VmSpace, MAX_USERSPACE_VADDR},
+    mm::{PageFlags, PageProperty, VmSpace, MAX_USERSPACE_VADDR},
 };

 use self::{
@@ -220,13 +220,6 @@ impl Vmar_ {
     }

     fn new_root() -> Arc<Self> {
-        fn handle_page_fault_wrapper(
-            vm_space: &VmSpace,
-            trap_info: &CpuExceptionInfo,
-        ) -> core::result::Result<(), ()> {
-            handle_page_fault_from_vm_space(vm_space, &trap_info.try_into().unwrap())
-        }
-
         let mut free_regions = BTreeMap::new();
         let root_region = FreeRegion::new(ROOT_VMAR_LOWEST_ADDR..ROOT_VMAR_CAP_ADDR);
         free_regions.insert(root_region.start(), root_region);
@@ -668,7 +661,9 @@ impl Vmar_ {
             let vm_space = if let Some(parent) = parent {
                 parent.vm_space().clone()
             } else {
-                Arc::new(self.vm_space().fork_copy_on_write())
+                let new_space = VmSpace::new();
+                new_space.register_page_fault_handler(handle_page_fault_wrapper);
+                Arc::new(new_space)
             };
             Vmar_::new(vmar_inner, vm_space, self.base, self.size, parent)
         };
@@ -694,18 +689,43 @@ impl Vmar_ {
         }

         // Clone mappings.
-        for (vm_mapping_base, vm_mapping) in &inner.vm_mappings {
-            let new_mapping = Arc::new(vm_mapping.new_fork(&new_vmar_)?);
-            new_vmar_
-                .inner
-                .lock()
-                .vm_mappings
-                .insert(*vm_mapping_base, new_mapping);
+        {
+            let new_vmspace = new_vmar_.vm_space();
+            let range = self.base..(self.base + self.size);
+            let mut new_cursor = new_vmspace.cursor_mut(&range).unwrap();
+            let cur_vmspace = self.vm_space();
+            let mut cur_cursor = cur_vmspace.cursor_mut(&range).unwrap();
+            for (vm_mapping_base, vm_mapping) in &inner.vm_mappings {
+                // Clone the `VmMapping` to the new VMAR.
+                let new_mapping = Arc::new(vm_mapping.new_fork(&new_vmar_)?);
+                new_vmar_
+                    .inner
+                    .lock()
+                    .vm_mappings
+                    .insert(*vm_mapping_base, new_mapping);
+
+                // Protect the mapping and copy to the new page table for COW.
+                cur_cursor.jump(*vm_mapping_base).unwrap();
+                new_cursor.jump(*vm_mapping_base).unwrap();
+                let mut op = |page: &mut PageProperty| {
+                    page.flags -= PageFlags::W;
+                };
+                new_cursor.copy_from(&mut cur_cursor, vm_mapping.map_size(), &mut op);
+            }
         }
+
         Ok(new_vmar_)
     }
 }

+/// This is for fallible user space write handling.
+fn handle_page_fault_wrapper(
+    vm_space: &VmSpace,
+    trap_info: &CpuExceptionInfo,
+) -> core::result::Result<(), ()> {
+    handle_page_fault_from_vm_space(vm_space, &trap_info.try_into().unwrap())
+}
+
 impl<R> Vmar<R> {
     /// The base address, i.e., the offset relative to the root VMAR.
     ///
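With `VmSpace::fork_copy_on_write` removed (see the `vm_space.rs` hunk at the end of this patch), the fork path above builds an empty `VmSpace`, registers the page fault wrapper, and copies every mapping with the write permission stripped. The following is a minimal sketch of that protect-then-duplicate pattern in isolation, assuming a page-aligned `range` covering the parent's mappings and assuming the `handle_page_fault_wrapper` defined above is in scope; `fork_cow_sketch` is an illustrative name, not an API added by this patch.

// Sketch only: condenses the `Vmar_::new_fork` flow shown above.
use ostd::mm::{PageFlags, PageProperty, VmSpace};

fn fork_cow_sketch(parent: &VmSpace, range: core::ops::Range<usize>) -> VmSpace {
    // Start from an empty VM space; nothing is shared yet.
    let child = VmSpace::new();
    // Reuse the same fault handler that the root VMAR registers above.
    child.register_page_fault_handler(handle_page_fault_wrapper);

    {
        let mut dst = child.cursor_mut(&range).unwrap();
        let mut src = parent.cursor_mut(&range).unwrap();

        // Drop the write permission so that the next write on either side
        // faults and can be resolved as copy-on-write.
        let mut op = |prop: &mut PageProperty| {
            prop.flags -= PageFlags::W;
        };
        dst.copy_from(&mut src, range.len(), &mut op);
    }

    child
}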
diff --git a/ostd/src/mm/page_table/cursor.rs b/ostd/src/mm/page_table/cursor.rs
index a02f490df..750a26b82 100644
--- a/ostd/src/mm/page_table/cursor.rs
+++ b/ostd/src/mm/page_table/cursor.rs
@@ -734,26 +734,97 @@ where
         None
     }

-    pub fn preempt_guard(&self) -> &DisabledPreemptGuard {
-        &self.0.preempt_guard
+    /// Copies the mapping from the given cursor to the current cursor.
+    ///
+    /// All the mappings in the current cursor's range must be empty. The
+    /// given operation `op` is applied to the source mappings before they
+    /// are copied, so it is equivalent to protecting and then duplicating.
+    /// Only the mappings are copied; the mapped pages are not.
+    ///
+    /// It can only copy tracked mappings, since copying untracked mappings
+    /// is not considered useful.
+    ///
+    /// After the operation, both cursors will advance by the specified length.
+    ///
+    /// # Safety
+    ///
+    /// The caller should ensure that
+    /// - copying the range, with the operation applied, does not affect the
+    ///   kernel's memory safety;
+    /// - both of the cursors are in tracked mappings.
+    ///
+    /// # Panics
+    ///
+    /// This function will panic if:
+    /// - either of the ranges to be copied is out of the range where the
+    ///   corresponding cursor is allowed to operate;
+    /// - either of the specified virtual address ranges only covers part
+    ///   of a page;
+    /// - the current cursor's range contains mapped pages.
+    pub unsafe fn copy_from(
+        &mut self,
+        src: &mut Self,
+        len: usize,
+        op: &mut impl FnMut(&mut PageProperty),
+    ) {
+        assert!(len % page_size::<C>(1) == 0);
+        let this_end = self.0.va + len;
+        assert!(this_end <= self.0.barrier_va.end);
+        let src_end = src.0.va + len;
+        assert!(src_end <= src.0.barrier_va.end);
+
+        while self.0.va < this_end && src.0.va < src_end {
+            let cur_pte = src.0.read_cur_pte();
+            if !cur_pte.is_present() {
+                src.0.move_forward();
+                continue;
+            }
+
+            // Go down if it's not a last node.
+            if !cur_pte.is_last(src.0.level) {
+                src.0.level_down();
+
+                // We have gone down a level. If there are no mapped PTEs in
+                // the current node, we can go back up and skip it to save time.
+                if src.0.guards[(src.0.level - 1) as usize]
+                    .as_ref()
+                    .unwrap()
+                    .nr_children()
+                    == 0
+                {
+                    src.0.level_up();
+                    src.0.move_forward();
+                }
+
+                continue;
+            }
+
+            // Do protection.
+            let mut pte_prop = cur_pte.prop();
+            op(&mut pte_prop);
+
+            let idx = src.0.cur_idx();
+            src.cur_node_mut().protect(idx, pte_prop);
+
+            // Do copy.
+            let child = src.cur_node_mut().child(idx, true);
+            let Child::<E, C>::Page(page, prop) = child else {
+                panic!("Unexpected child for source mapping: {:#?}", child);
+            };
+            self.jump(src.0.va).unwrap();
+            let mapped_page_size = page.size();
+            let original = self.map(page, prop);
+            debug_assert!(original.is_none());
+
+            // Only move the source cursor forward; `Self::map` already advanced the destination.
+            // This assertion ensures that they move by the same length.
+            debug_assert_eq!(mapped_page_size, page_size::<C>(src.0.level));
+            src.0.move_forward();
+        }
     }

-    /// Consumes itself and leak the root guard for the caller if it locked the root level.
-    ///
-    /// It is useful when the caller wants to keep the root guard while the cursor should be dropped.
-    pub(super) fn leak_root_guard(mut self) -> Option<PageTableNode<E, C>> {
-        if self.0.guard_level != C::NR_LEVELS {
-            return None;
-        }
-
-        while self.0.level < C::NR_LEVELS {
-            self.0.level_up();
-        }
-
-        self.0.guards[(C::NR_LEVELS - 1) as usize].take()
-
-        // Ok to drop the cursor here because we ensure not to access the page table if the current
-        // level is the root level when running the dropping method.
+    pub fn preempt_guard(&self) -> &DisabledPreemptGuard {
+        &self.0.preempt_guard
     }

     /// Goes down a level assuming the current slot is absent.
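At the page-table layer, the new `copy_from` is `unsafe`: the caller must guarantee that the copied range consists of tracked, user-space mappings and that the destination range is empty. Below is a sketch of the intended calling convention, essentially the same pattern as the updated `test_user_copy_on_write` later in this patch; it assumes the same items in scope as that test module, and `duplicate_cow_sketch` is an illustrative name.

// The "protect" half of protect-then-duplicate: make the source read-only.
fn prot_op(prop: &mut PageProperty) {
    prop.flags -= PageFlags::W;
}

// Sketch of calling the unsafe, page-table-level `CursorMut::copy_from`.
fn duplicate_cow_sketch(src_pt: &PageTable<UserMode>, dst_pt: &PageTable<UserMode>) {
    let range = 0..MAX_USERSPACE_VADDR;
    let mut dst = dst_pt.cursor_mut(&range).unwrap();
    let mut src = src_pt.cursor_mut(&range).unwrap();

    // SAFETY (per the doc comment above): the range is entirely user space,
    // both cursors walk tracked mappings, and the destination range is empty.
    unsafe { dst.copy_from(&mut src, range.len(), &mut prot_op) };
    // Both cursors have now advanced by `range.len()` bytes.
}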
diff --git a/ostd/src/mm/page_table/mod.rs b/ostd/src/mm/page_table/mod.rs
index 9bb1e2cc6..a8294c38d 100644
--- a/ostd/src/mm/page_table/mod.rs
+++ b/ostd/src/mm/page_table/mod.rs
@@ -92,53 +92,29 @@ impl PageTable<UserMode> {
             self.root.activate();
         }
     }
-
-    /// Create a cloned new page table.
-    ///
-    /// This method takes a mutable cursor to the old page table that locks the
-    /// entire virtual address range. The caller may implement the copy-on-write
-    /// mechanism by first protecting the old page table and then clone it using
-    /// this method.
-    ///
-    /// TODO: We may consider making the page table itself copy-on-write.
-    pub fn clone_with(
-        &self,
-        cursor: CursorMut<'_, UserMode, PageTableEntry, PagingConsts>,
-    ) -> Self {
-        let root_node = cursor.leak_root_guard().unwrap();
-
-        const NR_PTES_PER_NODE: usize = nr_subpage_per_huge::<PageTableEntry>();
-        let new_root_node = unsafe {
-            root_node.make_copy(
-                0..NR_PTES_PER_NODE / 2,
-                NR_PTES_PER_NODE / 2..NR_PTES_PER_NODE,
-            )
-        };
-
-        PageTable::<UserMode> {
-            root: new_root_node.into_raw(),
-            _phantom: PhantomData,
-        }
-    }
 }

 impl PageTable<KernelMode> {
     /// Create a new user page table.
     ///
-    /// This should be the only way to create the first user page table, that is
-    /// to fork the kernel page table with all the kernel mappings shared.
-    ///
-    /// Then, one can use a user page table to call [`fork_copy_on_write`], creating
-    /// other child page tables.
+    /// This should be the only way to create a user page table, that is, to
+    /// duplicate the kernel page table with all the kernel mappings shared.
     pub fn create_user_page_table(&self) -> PageTable<UserMode> {
         let root_node = self.root.clone_shallow().lock();
+        let mut new_node = PageTableNode::alloc(PagingConsts::NR_LEVELS);

+        // Make a shallow copy of the root node in the kernel space range.
+        // The user space range is not copied.
         const NR_PTES_PER_NODE: usize = nr_subpage_per_huge::<PageTableEntry>();
-        let new_root_node =
-            unsafe { root_node.make_copy(0..0, NR_PTES_PER_NODE / 2..NR_PTES_PER_NODE) };
+        for i in NR_PTES_PER_NODE / 2..NR_PTES_PER_NODE {
+            let child = root_node.child(i, /* meaningless */ true);
+            if !child.is_none() {
+                let _ = new_node.replace_child(i, child, /* meaningless */ true);
+            }
+        }

         PageTable::<UserMode> {
-            root: new_root_node.into_raw(),
+            root: new_node.into_raw(),
             _phantom: PhantomData,
         }
     }
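`create_user_page_table` now copies only root entries `NR_PTES_PER_NODE / 2..NR_PTES_PER_NODE`, i.e. the kernel half of the root node, and leaves the user half empty. A quick sanity check of that split, assuming the usual x86-64 4-level layout (512 root entries, 48-bit canonical addresses); the constants below are illustrative and not taken from this patch.

// x86-64, 4-level paging: each root (PML4) entry spans 2^39 bytes = 512 GiB.
const NR_PTES_PER_NODE: usize = 512;
const ROOT_ENTRY_SPAN: usize = 1 << 39;

fn main() {
    // The copied half is exactly the upper 256 entries, which map the
    // higher-half kernel range starting at 0xFFFF_8000_0000_0000.
    let kernel_half = NR_PTES_PER_NODE / 2..NR_PTES_PER_NODE;
    assert_eq!(kernel_half, 256..512);

    // The untouched user half covers the lower 2^47 bytes of virtual memory.
    assert_eq!((NR_PTES_PER_NODE / 2) * ROOT_ENTRY_SPAN, 1 << 47);
}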
diff --git a/ostd/src/mm/page_table/node.rs b/ostd/src/mm/page_table/node.rs
index 134cd112a..f39d9daf5 100644
--- a/ostd/src/mm/page_table/node.rs
+++ b/ostd/src/mm/page_table/node.rs
@@ -25,9 +25,7 @@
 //! the initialization of the entity that the PTE points to. This is taken care in this module.
 //!

-use core::{
-    fmt, marker::PhantomData, mem::ManuallyDrop, ops::Range, panic, sync::atomic::Ordering,
-};
+use core::{fmt, marker::PhantomData, mem::ManuallyDrop, panic, sync::atomic::Ordering};

 use super::{nr_subpage_per_huge, page_size, PageTableEntryTrait};
 use crate::{
@@ -374,74 +372,6 @@ where
         }
     }

-    /// Makes a copy of the page table node.
-    ///
-    /// This function allows you to control about the way to copy the children.
-    /// For indexes in `deep`, the children are deep copied and this function will be recursively called.
-    /// For indexes in `shallow`, the children are shallow copied as new references.
-    ///
-    /// You cannot shallow copy a child that is mapped to a page. Deep copying a page child will not
-    /// copy the mapped page but will copy the handle to the page.
-    ///
-    /// You cannot either deep copy or shallow copy a child that is mapped to an untracked page.
-    ///
-    /// The ranges must be disjoint.
-    pub(super) unsafe fn make_copy(&self, deep: Range<usize>, shallow: Range<usize>) -> Self {
-        debug_assert!(deep.end <= nr_subpage_per_huge::<E>());
-        debug_assert!(shallow.end <= nr_subpage_per_huge::<E>());
-        debug_assert!(deep.end <= shallow.start || deep.start >= shallow.end);
-
-        let mut new_pt = Self::alloc(self.level());
-        let mut copied_child_count = self.nr_children();
-        for i in deep {
-            if copied_child_count == 0 {
-                return new_pt;
-            }
-            match self.child(i, true) {
-                Child::PageTable(pt) => {
-                    let guard = pt.clone_shallow().lock();
-                    let new_child = guard.make_copy(0..nr_subpage_per_huge::<E>(), 0..0);
-                    let old = new_pt.replace_child(i, Child::PageTable(new_child.into_raw()), true);
-                    debug_assert!(old.is_none());
-                    copied_child_count -= 1;
-                }
-                Child::Page(page, prop) => {
-                    let old = new_pt.replace_child(i, Child::Page(page.clone(), prop), true);
-                    debug_assert!(old.is_none());
-                    copied_child_count -= 1;
-                }
-                Child::None => {}
-                Child::Untracked(_, _) => {
-                    unreachable!();
-                }
-            }
-        }
-
-        for i in shallow {
-            if copied_child_count == 0 {
-                return new_pt;
-            }
-            debug_assert_eq!(self.level(), C::NR_LEVELS);
-            match self.child(i, /*meaningless*/ true) {
-                Child::PageTable(pt) => {
-                    let old = new_pt.replace_child(
-                        i,
-                        Child::PageTable(pt.clone_shallow()),
-                        /*meaningless*/ true,
-                    );
-                    debug_assert!(old.is_none());
-                    copied_child_count -= 1;
-                }
-                Child::None => {}
-                Child::Page(_, _) | Child::Untracked(_, _) => {
-                    unreachable!();
-                }
-            }
-        }
-
-        new_pt
-    }
-
     /// Splits the untracked huge page mapped at `idx` to smaller pages.
     pub(super) fn split_untracked_huge(&mut self, idx: usize) {
         // These should be ensured by the cursor.
diff --git a/ostd/src/mm/page_table/test.rs b/ostd/src/mm/page_table/test.rs
index 834289a91..6acb5bc22 100644
--- a/ostd/src/mm/page_table/test.rs
+++ b/ostd/src/mm/page_table/test.rs
@@ -81,6 +81,10 @@ fn test_untracked_map_unmap() {

 #[ktest]
 fn test_user_copy_on_write() {
+    fn prot_op(prop: &mut PageProperty) {
+        prop.flags -= PageFlags::W;
+    }
+
     let pt = PageTable::<UserMode>::empty();
     let from = PAGE_SIZE..PAGE_SIZE * 2;
     let page = allocator::alloc_single(FrameMeta::default()).unwrap();
@@ -96,7 +100,14 @@ fn test_user_copy_on_write() {
     unsafe { pt.cursor_mut(&from).unwrap().map(page.clone().into(), prop) };
     assert_eq!(pt.query(from.start + 10).unwrap().0, start_paddr + 10);

-    let child_pt = pt.clone_with(pt.cursor_mut(&(0..MAX_USERSPACE_VADDR)).unwrap());
+    let child_pt = {
+        let child_pt = PageTable::<UserMode>::empty();
+        let range = 0..MAX_USERSPACE_VADDR;
+        let mut child_cursor = child_pt.cursor_mut(&range).unwrap();
+        let mut parent_cursor = pt.cursor_mut(&range).unwrap();
+        unsafe { child_cursor.copy_from(&mut parent_cursor, range.len(), &mut prot_op) };
+        child_pt
+    };
     assert_eq!(pt.query(from.start + 10).unwrap().0, start_paddr + 10);
     assert_eq!(child_pt.query(from.start + 10).unwrap().0, start_paddr + 10);
     assert!(matches!(
@@ -106,7 +117,14 @@ fn test_user_copy_on_write() {
     assert!(pt.query(from.start + 10).is_none());
     assert_eq!(child_pt.query(from.start + 10).unwrap().0, start_paddr + 10);

-    let sibling_pt = pt.clone_with(pt.cursor_mut(&(0..MAX_USERSPACE_VADDR)).unwrap());
+    let sibling_pt = {
+        let sibling_pt = PageTable::<UserMode>::empty();
+        let range = 0..MAX_USERSPACE_VADDR;
+        let mut sibling_cursor = sibling_pt.cursor_mut(&range).unwrap();
+        let mut parent_cursor = pt.cursor_mut(&range).unwrap();
+        unsafe { sibling_cursor.copy_from(&mut parent_cursor, range.len(), &mut prot_op) };
+        sibling_pt
+    };
     assert!(sibling_pt.query(from.start + 10).is_none());
     assert_eq!(child_pt.query(from.start + 10).unwrap().0, start_paddr + 10);
     drop(pt);
diff --git a/ostd/src/mm/vm_space.rs b/ostd/src/mm/vm_space.rs
index 78109d9b8..ea6729372 100644
--- a/ostd/src/mm/vm_space.rs
+++ b/ostd/src/mm/vm_space.rs
@@ -22,7 +22,7 @@ use super::{
     kspace::KERNEL_PAGE_TABLE,
     page::DynPage,
     page_table::{PageTable, UserMode},
-    PageFlags, PageProperty, VmReader, VmWriter, PAGE_SIZE,
+    PageProperty, VmReader, VmWriter, PAGE_SIZE,
 };
 use crate::{
     arch::mm::{current_page_table_paddr, PageTableEntry, PagingConsts},
@@ -173,48 +173,6 @@ impl VmSpace {
         self.page_fault_handler.call_once(|| func);
     }

-    /// Forks a new VM space with copy-on-write semantics.
-    ///
-    /// Both the parent and the newly forked VM space will be marked as
-    /// read-only. And both the VM space will take handles to the same
-    /// physical memory pages.
-    pub fn fork_copy_on_write(&self) -> Self {
-        // Protect the parent VM space as read-only.
-        let end = MAX_USERSPACE_VADDR;
-        let mut cursor = self.cursor_mut(&(0..end)).unwrap();
-        let mut op = |prop: &mut PageProperty| {
-            prop.flags -= PageFlags::W;
-        };
-
-        cursor.protect(end, &mut op);
-
-        let page_fault_handler = {
-            let new_handler = Once::new();
-            if let Some(handler) = self.page_fault_handler.get() {
-                new_handler.call_once(|| *handler);
-            }
-            new_handler
-        };
-
-        let CursorMut {
-            pt_cursor,
-            activation_lock,
-            ..
-        } = cursor;
-
-        let new_pt = self.pt.clone_with(pt_cursor);
-
-        // Release the activation lock after the page table is cloned to
-        // prevent modification to the parent page table while cloning.
-        drop(activation_lock);
-
-        Self {
-            pt: new_pt,
-            page_fault_handler,
-            activation_lock: RwLock::new(()),
-        }
-    }
-
     /// Creates a reader to read data from the user space of the current task.
     ///
     /// Returns `Err` if this `VmSpace` is not belonged to the user space of the current task
@@ -433,6 +391,44 @@ impl CursorMut<'_, '_> {
         self.dispatch_tlb_flush();
     }

+    /// Copies the mapping from the given cursor to the current cursor.
+    ///
+    /// All the mappings in the current cursor's range must be empty. The
+    /// given operation `op` is applied to the source mappings before they
+    /// are copied, so it is equivalent to protecting and then duplicating.
+    /// Only the mappings are copied; the mapped pages are not.
+    ///
+    /// After the operation, both cursors will advance by the specified length.
+    ///
+    /// # Panics
+    ///
+    /// This function will panic if:
+    /// - either of the ranges to be copied is out of the range where the
+    ///   corresponding cursor is allowed to operate;
+    /// - either of the specified virtual address ranges only covers part
+    ///   of a page;
+    /// - the current cursor's range contains mapped pages.
+    pub fn copy_from(
+        &mut self,
+        src: &mut Self,
+        len: usize,
+        op: &mut impl FnMut(&mut PageProperty),
+    ) {
+        let va = src.virt_addr();
+
+        // SAFETY: Operations on user memory spaces are safe as long as they
+        // do not involve dropping any pages.
+        unsafe { self.pt_cursor.copy_from(&mut src.pt_cursor, len, op) };
+
+        if len > TLB_FLUSH_ALL_THRESHOLD * PAGE_SIZE {
+            src.issue_tlb_flush(TlbFlushOp::All, None);
+        } else {
+            src.issue_tlb_flush(TlbFlushOp::Range(va..va + len), None);
+        }
+
+        src.dispatch_tlb_flush();
+    }
+
     fn issue_tlb_flush(&self, op: TlbFlushOp, drop_after_flush: Option<DynPage>) {
         let request = TlbFlushRequest {
             op,
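After the protection step, the safe `copy_from` above flushes only the source cursor's side, presumably because the destination range was required to be empty and so cannot hold stale TLB entries: a full flush once the copied length exceeds `TLB_FLUSH_ALL_THRESHOLD` pages, a ranged flush otherwise. A small sketch of just that decision, assuming the `TlbFlushOp`, `TLB_FLUSH_ALL_THRESHOLD`, and `PAGE_SIZE` items from this file are in scope; `pick_flush_op` is a made-up helper name, not part of the patch.

// Mirrors the flush-selection branch inside `CursorMut::copy_from` above.
fn pick_flush_op(va: usize, len: usize) -> TlbFlushOp {
    if len > TLB_FLUSH_ALL_THRESHOLD * PAGE_SIZE {
        // Past the threshold, one global invalidation is cheaper than many
        // per-page shootdowns.
        TlbFlushOp::All
    } else {
        TlbFlushOp::Range(va..va + len)
    }
}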