struct page {
    unsigned long flags;        /* Atomic flags, some possibly
                                 * updated asynchronously */
                                /* holds the page's status */
    /*
     * Five words (20/40 bytes) are available in this union.
     * WARNING: bit 0 of the first word is used for PageTail(). That
     * means the other users of this union MUST NOT use the bit to
     * avoid collision and false-positive PageTail().
     */
    union {
        struct {    /* Page cache and anonymous pages */
            /**
             * @lru: Pageout list, eg. active_list protected by
             * zone_lru_lock. Sometimes used as a generic list
             * by the page owner.
             */
            struct list_head lru;
            /* See page-flags.h for PAGE_MAPPING_FLAGS */
            struct address_space *mapping;
            pgoff_t index;      /* Our offset within mapping. */
            /**
             * @private: Mapping-private opaque data.
             * Usually used for buffer_heads if PagePrivate.
             * Used for swp_entry_t if PageSwapCache.
             * Indicates order in the buddy system if PageBuddy.
             */
            unsigned long private;
        };
        struct {    /* slab, slob and slub */
            union {
                struct list_head slab_list;    /* uses lru */
                struct {    /* Partial pages */
                    struct page *next;
#ifdef CONFIG_64BIT
                    int pages;      /* Nr of pages left */
                    int pobjects;   /* Approximate count */
#else
                    short int pages;
                    short int pobjects;
#endif
                };
            };
            struct kmem_cache *slab_cache;    /* not slob */
            /* Double-word boundary */
            void *freelist;     /* first free object */
            union {
                void *s_mem;            /* slab: first object */
                unsigned long counters; /* SLUB */
                struct {                /* SLUB */
                    unsigned inuse:16;
                    unsigned objects:15;
                    unsigned frozen:1;
                };
            };
        };
        struct {    /* Tail pages of compound page */
            unsigned long compound_head;    /* Bit zero is set */
        };

        /** @rcu_head: You can use this to free a page by RCU. */
        struct rcu_head rcu_head;
    };

    union {     /* This union is 4 bytes in size. */
        /*
         * If the page can be mapped to userspace, encodes the number
         * of times this page is referenced by a page table.
         */
        atomic_t _mapcount;

        /*
         * If the page is neither PageSlab nor mappable to userspace,
         * the value stored here may help determine what this page
         * is used for. See page-flags.h for a list of page types
         * which are currently stored here.
         */
        unsigned int page_type;
    };

    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;              /* Kernel virtual address (NULL if
                                   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
    int _last_cpupid;
#endif
} _struct_page_alignment;
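Since every physical page frame has one of these descriptors, the quickest way to get a feel for struct page is converting between kernel virtual addresses, descriptors, and page frame numbers. A minimal sketch (the helper name is illustrative, and virt_to_page() is only valid for directly mapped lowmem):

#include <linux/mm.h>

/* Illustrative helper: inspect a few of the fields described above. */
static void inspect_page(void *kaddr)
{
    struct page *page = virt_to_page(kaddr);    /* lowmem only */

    pr_info("pfn=%lu slab=%d mapcount=%d\n",
            page_to_pfn(page),        /* physical frame number */
            PageSlab(page),           /* tests a bit in page->flags */
            page_mapcount(page));     /* decodes page->_mapcount */
}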
enum zone_type {
#ifdef CONFIG_ZONE_DMA
    /*
     * ZONE_DMA is used when there are devices that are not able
     * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
     * carve out the portion of memory that is needed for these devices.
     * The range is arch specific.
     *
     * Some examples
     *
     * Architecture         Limit
     * ---------------------------
     * parisc, ia64, sparc  <4G
     * s390, powerpc        <2G
     * arm                  Various
     * alpha                Unlimited or 0-16MB.
     *
     * i386, x86_64 and multiple other arches
     *                      <16M.
     */
    ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
    /*
     * x86_64 needs two ZONE_DMAs because it supports devices that are
     * only able to do DMA to the lower 16M but also 32 bit devices that
     * can only do DMA areas below 4G.
     */
    ZONE_DMA32,
#endif
    /*
     * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
     * performed on pages in ZONE_NORMAL if the DMA devices support
     * transfers to all addressable memory.
     */
    ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
    /*
     * A memory area that is only addressable by the kernel through
     * mapping portions into its own address space. This is for example
     * used by i386 to allow the kernel to address the memory beyond
     * 900MB. The kernel will set up special mappings (page
     * table entries on i386) for each page that the kernel needs to
     * access.
     */
    ZONE_HIGHMEM,
#endif
    ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
    ZONE_DEVICE,
#endif
    __MAX_NR_ZONES
};
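Which of these zones a page comes from is decided by the GFP zone modifiers the caller passes to the buddy allocator. A minimal sketch of the correspondence (order-0 allocations, failures simply skipped):

#include <linux/gfp.h>

/* Illustrative: the zone modifier selects the highest zone the buddy
 * allocator may satisfy the request from. */
static void zone_demo(void)
{
    struct page *dma  = alloc_pages(GFP_KERNEL | GFP_DMA, 0); /* ZONE_DMA */
    struct page *high = alloc_pages(GFP_HIGHUSER, 0); /* may be ZONE_HIGHMEM */
    struct page *norm = alloc_pages(GFP_KERNEL, 0);   /* at most ZONE_NORMAL */

    if (dma)
        __free_pages(dma, 0);
    if (high)
        __free_pages(high, 0);
    if (norm)
        __free_pages(norm, 0);
}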
// /include/linux/mm_types.h
/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
    /* The first cache line has the info for VMA tree walking. */
    unsigned long vm_start;     /* Our start address within vm_mm. */
    unsigned long vm_end;       /* The first byte after our end address
                                   within vm_mm. */

    /* linked list of VM areas per task, sorted by address */
    struct vm_area_struct *vm_next, *vm_prev;

    struct rb_node vm_rb;

    /*
     * Largest free memory gap in bytes to the left of this VMA.
     * Either between this VMA and vma->vm_prev, or between one of the
     * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
     * get_unmapped_area find a free area of the right size.
     */
    unsigned long rb_subtree_gap;

    /* Second cache line starts here. */

    struct mm_struct *vm_mm;    /* The address space we belong to. */
    pgprot_t vm_page_prot;      /* Access permissions of this VMA. */
    unsigned long vm_flags;     /* Flags, see mm.h. */

    /*
     * For areas with an address space and backing store,
     * linkage into the address_space->i_mmap interval tree.
     */
    struct {
        struct rb_node rb;
        unsigned long rb_subtree_last;
    } shared;

    /*
     * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
     * list, after a COW of one of the file pages. A MAP_SHARED vma
     * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
     * or brk vma (with NULL file) can only be in an anon_vma list.
     */
    struct list_head anon_vma_chain;    /* Serialized by mmap_sem &
                                         * page_table_lock */
    struct anon_vma *anon_vma;          /* Serialized by page_table_lock */

    /* Function pointers to deal with this struct. */
    const struct vm_operations_struct *vm_ops;

    /* Information about our backing store: */
    unsigned long vm_pgoff;     /* Offset (within vm_file) in PAGE_SIZE
                                   units */
    struct file *vm_file;       /* File we map to (can be NULL). */
    void *vm_private_data;      /* was vm_pte (shared mem) */

    atomic_long_t swap_readahead_info;
#ifndef CONFIG_MMU
    struct vm_region *vm_region;    /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
#endif
    struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
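vm_start, vm_end and the rbtree linkage are exactly what find_vma() walks when the kernel needs the VMA covering an address (e.g. in the page fault path). A minimal sketch, assuming the caller can sleep (the helper name is illustrative):

#include <linux/mm.h>

/* Illustrative: is addr covered by a VMA of this address space? */
static bool addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
    struct vm_area_struct *vma;
    bool mapped = false;

    down_read(&mm->mmap_sem);            /* protects the VMA tree here */
    vma = find_vma(mm, addr);            /* first VMA with vm_end > addr */
    if (vma && vma->vm_start <= addr)    /* it may lie entirely above addr */
        mapped = true;
    up_read(&mm->mmap_sem);
    return mapped;
}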
/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
    void (*open)(struct vm_area_struct *area);
    void (*close)(struct vm_area_struct *area);
    int (*split)(struct vm_area_struct *area, unsigned long addr);
    int (*mremap)(struct vm_area_struct *area);
    vm_fault_t (*fault)(struct vm_fault *vmf);
    vm_fault_t (*huge_fault)(struct vm_fault *vmf,
            enum page_entry_size pe_size);
    void (*map_pages)(struct vm_fault *vmf,
            pgoff_t start_pgoff, pgoff_t end_pgoff);
    unsigned long (*pagesize)(struct vm_area_struct *area);

    /* notification that a previously read-only page is about to become
     * writable, if an error is returned it will cause a SIGBUS */
    vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

    /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
    vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

    /* called by access_process_vm when get_user_pages() fails, typically
     * for use by special VMAs that can switch between memory and hardware
     */
    int (*access)(struct vm_area_struct *vma, unsigned long addr,
            void *buf, int len, int write);

    /* Called by the /proc/PID/maps code to ask the vma whether it
     * has a special name. Returning non-NULL will also cause this
     * vma to be dumped unconditionally. */
    const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
    /*
     * set_policy() op must add a reference to any non-NULL @new mempolicy
     * to hold the policy upon return. Caller should pass NULL @new to
     * remove a policy and fall back to surrounding context--i.e. do not
     * install a MPOL_DEFAULT policy, nor the task or system default
     * mempolicy.
     */
    int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

    /*
     * get_policy() op must add reference [mpol_get()] to any policy at
     * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
     * in mm/mempolicy.c will do this automatically.
     * get_policy() must NOT add a ref if the policy at (vma,addr) is not
     * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
     * If no [shared/vma] mempolicy exists at the addr, get_policy() op
     * must return NULL--i.e., do not "fallback" to task or system default
     * policy.
     */
    struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
            unsigned long addr);
#endif
    /*
     * Called by vm_normal_page() for special PTEs to find the
     * page for @addr. This is useful if the default behavior
     * (using pte_page()) would not find the correct page.
     */
    struct page *(*find_special_page)(struct vm_area_struct *vma,
            unsigned long addr);
};
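A driver that backs a mapping with its own pages usually only needs .fault. A minimal sketch of the common pattern (demo_dev, its page-aligned buffer, and all names are hypothetical; bounds checks elided):

#include <linux/mm.h>

struct demo_dev {
    void *buffer;    /* hypothetical page-aligned kernel buffer */
};

/* Resolve a fault by handing the core MM the page behind vmf->pgoff. */
static vm_fault_t demo_fault(struct vm_fault *vmf)
{
    struct demo_dev *dev = vmf->vma->vm_private_data;
    struct page *page;

    page = virt_to_page(dev->buffer + (vmf->pgoff << PAGE_SHIFT));
    get_page(page);     /* reference is dropped when the PTE goes away */
    vmf->page = page;   /* core MM installs the PTE for us */
    return 0;
}

static const struct vm_operations_struct demo_vm_ops = {
    .fault = demo_fault,
};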
// /include/linux/mm_types.h
struct mm_struct {
    struct {
        struct vm_area_struct *mmap;    /* list of VMAs */
        struct rb_root mm_rb;
        u64 vmacache_seqnum;            /* per-thread vmacache */
#ifdef CONFIG_MMU
        unsigned long (*get_unmapped_area)(struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
        unsigned long mmap_base;        /* base of mmap area */
        unsigned long mmap_legacy_base; /* base of mmap area in
                                           bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        /* Base addresses for compatible mmap() */
        unsigned long mmap_compat_base;
        unsigned long mmap_compat_legacy_base;
#endif
        unsigned long task_size;        /* size of task vm space */
        unsigned long highest_vm_end;   /* highest vma end address */
        pgd_t *pgd;
        /**
         * @mm_users: The number of users including userspace.
         *
         * Use mmget()/mmget_not_zero()/mmput() to modify. When this
         * drops to 0 (i.e. when the task exits and there are no other
         * temporary reference holders), we also release a reference on
         * @mm_count (which may then free the &struct mm_struct if
         * @mm_count also drops to 0).
         */
        atomic_t mm_users;

        /**
         * @mm_count: The number of references to &struct mm_struct
         * (@mm_users count as 1).
         *
         * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
         * &struct mm_struct is freed.
         */
        atomic_t mm_count;
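The two counters give two lifetimes: mm_users keeps the address space itself (VMAs, page tables) alive, while mm_count only pins the struct mm_struct allocation. A minimal sketch of the mm_users side (assuming we hold a valid task pointer):

#include <linux/sched/mm.h>

/* Illustrative: safely look at another task's address space. */
static void peek_mm(struct task_struct *task)
{
    struct mm_struct *mm = get_task_mm(task);   /* bumps mm_users, or NULL */

    if (!mm)
        return;     /* kernel thread, or the task is exiting */
    /* ... VMAs and page tables are guaranteed to stay alive here ... */
    mmput(mm);      /* drops mm_users */
}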
#ifdef CONFIG_MMU
        atomic_long_t pgtables_bytes;   /* PTE page table pages */
#endif
        int map_count;                  /* number of VMAs */

        spinlock_t page_table_lock;     /* Protects page tables and some
                                         * counters */
        struct rw_semaphore mmap_sem;

        struct list_head mmlist;        /* List of maybe swapped mm's. These
                                         * are globally strung together off
                                         * init_mm.mmlist, and are protected
                                         * by mmlist_lock */

        unsigned long saved_auxv[AT_VECTOR_SIZE];   /* for /proc/PID/auxv */

        /*
         * Special counters, in some configurations protected by the
         * page_table_lock, in other configurations by being atomic.
         */
        struct mm_rss_stat rss_stat;

        struct linux_binfmt *binfmt;

        /* Architecture-specific MM context */
        mm_context_t context;

        unsigned long flags;    /* Must use atomic bitops to access */

        struct core_state *core_state;  /* coredumping support */
#ifdef CONFIG_MEMBARRIER
        atomic_t membarrier_state;
#endif
#ifdef CONFIG_AIO
        spinlock_t ioctx_lock;
        struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
        /*
         * "owner" points to a task that is regarded as the canonical
         * user/owner of this mm. All of the following must be true in
         * order for it to be changed:
         *
         * current == mm->owner
         * current->mm != mm
         * new_owner->mm == mm
         * new_owner->alloc_lock is held
         */
        struct task_struct __rcu *owner;
#endif
        struct user_namespace *user_ns;

        /* store ref to file /proc/<pid>/exe symlink points to */
        struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
        struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        pgtable_t pmd_huge_pte;     /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
        /*
         * numa_next_scan is the next time that the PTEs will be marked
         * pte_numa. NUMA hinting faults will gather statistics and
         * migrate pages to new nodes if necessary.
         */
        unsigned long numa_next_scan;

        /* Restart point for scanning and setting pte_numa */
        unsigned long numa_scan_offset;

        /* numa_scan_seq prevents two threads setting pte_numa */
        int numa_scan_seq;
#endif
        /*
         * An operation with batched TLB flushing is going on. Anything
         * that can move process memory needs to flush the TLB when
         * moving a PROT_NONE or PROT_NUMA mapped page.
         */
        atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
        /* See flush_tlb_batched_pending() */
        bool tlb_flush_batched;
#endif
        struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
        atomic_long_t hugetlb_usage;
#endif
        struct work_struct async_put_work;

#if IS_ENABLED(CONFIG_HMM)
        /* HMM needs to track a few things per mm */
        struct hmm *hmm;
#endif
    } __randomize_layout;

    /*
     * The mm_cpumask needs to be at the end of mm_struct, because it
     * is dynamically sized based on nr_cpu_ids.
     */
    unsigned long cpu_bitmap[];
};
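mmap, map_count and mmap_sem together are enough to enumerate an address space, which is essentially what /proc/PID/maps does. A minimal sketch:

#include <linux/mm.h>

/* Illustrative: dump every VMA on the sorted per-mm list. */
static void dump_vmas(struct mm_struct *mm)
{
    struct vm_area_struct *vma;

    down_read(&mm->mmap_sem);
    for (vma = mm->mmap; vma; vma = vma->vm_next)
        pr_info("%08lx-%08lx flags=%lx\n",
                vma->vm_start, vma->vm_end, vma->vm_flags);
    up_read(&mm->mmap_sem);
}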
/* /mm/memory.c */
/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target user address to start at
 * @pfn: physical address of kernel memory
 * @size: size of map area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
            unsigned long pfn, unsigned long size, pgprot_t prot)
{
    ...
}
EXPORT_SYMBOL(remap_pfn_range);
A typical caller is a character driver's mmap method; the enclosing function below is an illustrative sketch (only the remap_pfn_range() call and its comment are from the original fragment):

static int demo_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;

    /* remap_pfn_range will mark the range VM_IO */
    if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
                vma->vm_page_prot)) {
        return -EAGAIN;
    }
    return 0;
}
asmlinkage unsigned long
sys_mmap(unsigned long addr, unsigned long len, int prot,
         int flags, int fd, long off)
{
    if (offset_in_page(off) != 0)
        return -EINVAL;

    addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
    if (!IS_ERR((void *)addr))
        force_successful_syscall_return();
    return addr;
}
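For reference, the userspace side that enters sys_mmap looks like this; /dev/mydev is a hypothetical device whose driver implements .mmap (for example via remap_pfn_range() as above):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    int fd = open("/dev/mydev", O_RDWR);
    void *p;

    if (fd < 0)
        return 1;
    p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    ((volatile char *)p)[0] = 1;    /* touch the shared mapping */
    munmap(p, 4096);
    return 0;
}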
    len = ALIGN(len, huge_page_size(hs));
    /*
     * VM_NORESERVE is used because the reservations will be
     * taken when vm_ops->mmap() is called
     * A dummy user value is used because we are not locking
     * memory so no accounting is necessary
     */
    file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                VM_NORESERVE,
                &user, HUGETLB_ANONHUGE_INODE,
                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
    if (IS_ERR(file))
        return PTR_ERR(file);
}
    /*
     * Does the application expect PROT_READ to imply PROT_EXEC?
     *
     * (the exception is when the underlying filesystem is noexec
     *  mounted, in which case we dont add PROT_EXEC.)
     */
    if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
        if (!(file && path_noexec(&file->f_path)))
            prot |= PROT_EXEC;

    /* force arch specific MAP_FIXED handling in get_unmapped_area */
    if (flags & MAP_FIXED_NOREPLACE)
        flags |= MAP_FIXED;

    if (!(flags & MAP_FIXED))
        addr = round_hint_to_min(addr);
    /* Careful about overflows.. */
    len = PAGE_ALIGN(len);      /* page-align the length */
    if (!len)
        return -ENOMEM;

    /* Too many mappings? */
    if (mm->map_count > sysctl_max_map_count)
        return -ENOMEM;         /* mm->map_count is capped */

    /* Obtain the address to map to. we verify (or select) it and ensure
     * that it represents a valid section of the address space.
     */
    addr = get_unmapped_area(file, addr, len, pgoff, flags);
    /* find a suitable linear address */
    if (offset_in_page(addr))
        /* #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK);
         * PAGE_MASK = 0xFFFFF000 for 4 KiB pages on 32-bit.
         * get_unmapped_area() already performs the same alignment check
         * internally and returns -EINVAL if the address is not page-aligned,
         * so a non-aligned value here is an error code being propagated. */
        return addr;
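Why does the offset_in_page(addr) test double as error handling? get_unmapped_area() returns either a page-aligned address or a negative errno cast to unsigned long, and small negative numbers never have zero low bits. A minimal illustration (assuming 4 KiB pages):

#include <linux/mm.h>

/* Illustrative only: valid results are page-aligned, errnos are not. */
static void offset_in_page_demo(void)
{
    unsigned long ok  = 0x7f0000001000UL;           /* page-aligned */
    unsigned long err = (unsigned long)-EINVAL;     /* 0xffff...ffea */

    pr_info("%lu %lu\n",
            offset_in_page(ok),     /* 0: would pass the check */
            offset_in_page(err));   /* non-zero: propagated as error */
}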
    if (prot == PROT_EXEC) {
        pkey = execute_only_pkey(mm);
        if (pkey < 0)
            pkey = 0;
    }

    /* Do simple checking here so the lower-level routines won't have
     * to. we assume access permissions have been handled by the open
     * of the memory object, so we don't do any here.
     */
    vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
            mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

    if (flags & MAP_LOCKED)
        if (!can_do_mlock())
            return -EPERM;

    if (mlock_future_check(mm, vm_flags, len))
        return -EAGAIN;
    if (file) {
        struct inode *inode = file_inode(file);
        unsigned long flags_mask;

        if (!file_mmap_ok(file, inode, pgoff, len))     /* validate len/pgoff
                                                           against overflow */
            return -EOVERFLOW;

        flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
        switch (flags & MAP_TYPE) {
        case MAP_SHARED:
            /*
             * Force use of MAP_SHARED_VALIDATE with non-legacy
             * flags. E.g. MAP_SYNC is dangerous to use with
             * MAP_SHARED as you don't know which consistency model
             * you will get. We silently ignore unsupported flags
             * with MAP_SHARED to preserve backward compatibility.
             */
            flags &= LEGACY_MAP_MASK;
            /* fall through */
        case MAP_SHARED_VALIDATE:
            if (flags & ~flags_mask)
                return -EOPNOTSUPP;
            if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
                return -EACCES;

            /*
             * Make sure we don't allow writing to an append-only
             * file..
             */
            if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                return -EACCES;

            /*
             * Make sure there are no mandatory locks on the file.
             */
            if (locks_verify_locked(file))
                return -EAGAIN;

            /* fall through */
        case MAP_PRIVATE:
            if (!(file->f_mode & FMODE_READ))
                return -EACCES;
            if (path_noexec(&file->f_path)) {
                if (vm_flags & VM_EXEC)
                    return -EPERM;
                vm_flags &= ~VM_MAYEXEC;
            }

            if (!file->f_op->mmap)
                return -ENODEV;
            if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                return -EINVAL;
            break;

        default:
            return -EINVAL;
        }
    } else {
        switch (flags & MAP_TYPE) {
        case MAP_SHARED:
            if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                return -EINVAL;
            /*
             * Ignore pgoff.
             */
            pgoff = 0;
            vm_flags |= VM_SHARED | VM_MAYSHARE;
            break;
        case MAP_PRIVATE:
            /*
             * Set pgoff according to addr for anon_vma.
             */
            pgoff = addr >> PAGE_SHIFT;
            break;
        default:
            return -EINVAL;
        }
    }
    /*
     * Set 'VM_NORESERVE' if we should not account for the
     * memory use of this mapping.
     */
    if (flags & MAP_NORESERVE) {
        /* We honor MAP_NORESERVE if allowed to overcommit */
        if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
            vm_flags |= VM_NORESERVE;

    /* Check against address space limit. */
    if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
        unsigned long nr_pages;

        /*
         * MAP_FIXED may remove pages of mappings that intersects with
         * requested mapping. Account for the pages it would unmap.
         */
        nr_pages = count_vma_pages_range(mm, addr, addr + len);

    /* Clear old maps */
    while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
                  &rb_parent)) {
        if (do_munmap(mm, addr, len, uf))
            return -ENOMEM;
    }

    /*
     * Private writable mapping: check memory availability
     */
    if (accountable_mapping(file, vm_flags)) {
        charged = len >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            return -ENOMEM;
        vm_flags |= VM_ACCOUNT;
    }

    /*
     * Can we just expand an old mapping?
     */
    vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
            NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
    if (vma)
        goto out;

    /*
     * Determine the object being mapped and call the appropriate
     * specific mapper. the address has already been validated, but
     * not unmapped, but the maps are removed from the list.
     */
    vma = vm_area_alloc(mm);
    if (!vma) {
        error = -ENOMEM;
        goto unacct_error;
    }

    if (file) {
        if (vm_flags & VM_DENYWRITE) {
            error = deny_write_access(file);
            if (error)
                goto free_vma;
        }
        if (vm_flags & VM_SHARED) {
            error = mapping_map_writable(file->f_mapping);
            if (error)
                goto allow_write_and_free_vma;
        }
        /* ->mmap() can change vma->vm_file, but must guarantee that
         * vma_link() below can deny write-access if VM_DENYWRITE is set
         * and map writably if VM_SHARED is set. This usually means the
         * new file must not have been exposed to user-space, yet.
         */
        vma->vm_file = get_file(file);
        error = call_mmap(file, vma);
        /* call_mmap() invokes file->f_op->mmap(file, vma): this is where
         * the mapping is really established (page tables set up, etc.) */
        if (error)
            goto unmap_and_free_vma;
        /* Can addr have changed??
         *
         * Answer: Yes, several device drivers can do it in their
         *         f_op->mmap method. -DaveM
         * Bug: If addr is changed, prev, rb_link, rb_parent should
         *      be updated for vma_link()
         */
        WARN_ON_ONCE(addr != vma->vm_start);

        /*
         * New (or expanded) vma always get soft dirty status.
         * Otherwise user-space soft-dirty page tracker won't
         * be able to distinguish situation when vma area unmapped,
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
        vma->vm_flags |= VM_SOFTDIRTY;