Posts

    kernel-mm映射三部曲-1

    read the fuck code!
    A picture is worth a thousand words.

    现在使用的机器是linux3.0.35,arm 32bit需要处理一下内存,一次就处理完
    本文只记录实际的应用,不做科普普及,如需要详尽书面知识请查阅相关wiki.
    不关心整体,因为整体不在本次关注点内

    虚拟地址和物理地址

    下边的代码中v6是虚拟地址加上0x90000000。在内核中找到对应的代码为 __pa(virtual_address)>>PAGE_SHIFT,可知v6为physical address,从而得出物理地址等于虚拟地址加0x90000000(在32位无符号运算下等价于减去0x70000000)。
    #define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT) 为页框转为物理地址.

    反向则为物理地址转页框PFN:物理地址 >> PAGE_SHIFT(即 >> 12)。

    set_bit(10, mem_map + 32 * ((v6 >> 12) - 0x10000)) 其中mem_map是内核的导出符号,从导入表可以看到,mem_map其实保存了page信息。其次 mem_map + 32 * ((v6 >> 12) - 0x10000) 的含义是:v6为__pa得到的物理地址,>>PAGE_SHIFT 后得到页框号pfn,(pfn - 0x10000) << 5 是该页对应的struct page在mem_map中的字节偏移(乘以32说明sizeof(struct page)为32字节),加上mem_map即得到struct page的虚拟地址。这个计算过程对应的宏是pfn_to_page,然后验证一下:

    先看page数据结构:

    /*
     * Common helper functions.
     */
    /*
     * Allocates with alloc_pages(gfp_mask, order) and returns the result of
     * page_address() on the returned page, cast to unsigned long.
     * Returns 0 when alloc_pages() yields NULL.
     */
    unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
    {
        struct page *page;
    
        /*
         * __get_free_pages() returns a 32-bit address, which cannot represent
         * a highmem page
         */
        VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
    
        page = alloc_pages(gfp_mask, order);
        if (!page)
            return 0;
        /* Hand back an address for the page rather than the struct page pointer. */
        return (unsigned long) page_address(page);
    }
    

    直接分page 然后page_address:

    /*
     * When neither HASHED_PAGE_VIRTUAL nor WANT_PAGE_VIRTUAL is defined,
     * page_address() is simply lowmem_page_address(), and the set/init
     * helpers expand to empty statements.
     */
    #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
    #define page_address(page) lowmem_page_address(page)
    #define set_page_address(page, address)  do { } while(0)
    #define page_address_init()  do { } while(0)
    #endif
    

    lowmem_page_address:

    #include <linux/vmstat.h>
    
    /*
     * Converts a struct page pointer into an address in three steps:
     * page -> page frame number (page_to_pfn), pfn -> physical address
     * (PFN_PHYS), physical address -> pointer (__va).
     */
    static __always_inline void *lowmem_page_address(struct page *page)
    {
        return __va(PFN_PHYS(page_to_pfn(page)));
    }
    

    page_to_pfn:

    /* page_to_pfn is an alias for __page_to_pfn. */
    #define page_to_pfn __page_to_pfn
    

    __page_to_pfn: 因为内核分为三种内存模型,要确定是哪一种内存模型:
    * CONFIG_FLATMEM 平坦内存
    * CONFIG_DISCONTIGMEM 不连续内存
    * CONFIG_SPARSEMEM 稀疏内存,支持热插拔

    是arm设备,所以是flatmem了,不用看config文件了。

    /*
     * supports 3 memory models.
     */
    #if defined(CONFIG_FLATMEM)
    
    #define __pfn_to_page(pfn)  (mem_map + ((pfn) - ARCH_PFN_OFFSET))
    #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \
                     ARCH_PFN_OFFSET)
    #elif defined(CONFIG_DISCONTIGMEM)
    

    通过上边的代码可推定ARCH_PFN_OFFSET为0x10000。验证:

    #define ARCH_PFN_OFFSET PHYS_PFN_OFFSET
    #define PHYS_PFN_OFFSET (PHYS_OFFSET >> PAGE_SHIFT)

    /* Fall back to CONFIG_DRAM_BASE when PHYS_OFFSET has not been defined yet. */
    #ifndef PHYS_OFFSET
    #define PHYS_OFFSET         UL(CONFIG_DRAM_BASE)
    #endif
    

    推定上边的代码是在推算page,下面写个内核模块验证一下:

    # include <linux/init.h>
    # include <linux/kernel.h>
    # include <linux/module.h>
    # include <linux/mm_types.h>
    # include <linux/mm.h>
    # include <linux/gfp.h>
    
    /*
     * Module init: allocate a single page, then print its kernel virtual
     * address, its page frame number and its physical address, and finally the
     * virtual address recomputed from the physical address with __va(), so the
     * page -> pfn -> phys -> virt round trip can be compared by eye.
     *
     * Returns 0 on success, or -ENOMEM if the page cannot be allocated.
     */
    static int __init lkm_init(void)
    {
        struct page *page;
        unsigned long virt_address;
        unsigned long virt_address1;
        unsigned long pfn;
        phys_addr_t phys_address;
    
        page = alloc_pages(GFP_KERNEL, 0);
        if (!page)
            return -ENOMEM;
    
        virt_address = (unsigned long)page_address(page);
        printk("virtual addr = 0x%lx\n", virt_address);
    
        /* page_to_pfn() yields an unsigned long, so keep that type and print with %lu. */
        pfn = page_to_pfn(page);
        printk("pfn = %lu\n", pfn);
    
        /* PFN_PHYS() yields a phys_addr_t, which may be wider than unsigned long; widen for printing. */
        phys_address = PFN_PHYS(pfn);
        printk("phys addr = 0x%llx\n", (unsigned long long)phys_address);
    
        virt_address1 = (unsigned long)__va(phys_address);
        printk("virtual addr1 = 0x%lx\n", virt_address1);
    
        /* Release the page through its struct page, mirroring alloc_pages(). */
        __free_pages(page, 0);
    
        return 0;
    }
    
    //Kernel module exit function: only logs a farewell message
    static void __exit lkm_exit(void)
    {
        printk("Goodbye\n");
    }
    
    
    module_init(lkm_init);
    module_exit(lkm_exit);
    
    MODULE_LICENSE("GPL");
    

    用户层的页表映射

    pagemap页映射:

    /proc/pid/pagemap 该文件允许用户空间程序找出每个虚拟页映射到的物理帧(内核书中描写为页框)。每个虚拟页面对应一个64位的值,包含以下数据(fs/proc/task_mmu.c,由pagemap_read方法读取):
    Bits 0-54 page frame number (PFN) if present
    Bits 0-4 swap type if swapped
    Bits 5-54 swap offset if swapped
    Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
    Bit 56 page exclusively mapped (since 4.2)
    Bits 57-60 zero
    Bit 61 page is file-page or shared-anon (since 3.5)
    Bit 62 page swapped
    Bit 63 page present

    其中
    使用/proc/pid/maps可以高效的确定映射的内存区域、跳过未映射的区域。
    /proc/kpagecount:这个文件包含64位计数 , 表示每一页被映射的次数,按照PFN值固定索引。
    /proc/kpageflags:此文件包含为64位的标志集 ,表示该页的属性,按照PFN索引。

    使用下边测试程序来读页指针:

    #!/usr/bin/python
    
    import sys
    import os
    import binascii
    import struct
    
    def read_entry(path, offset, size=8):
      """Read one unsigned 64-bit entry from a binary file.

      path   -- file to read (e.g. /proc/<pid>/pagemap, /proc/kpagecount)
      offset -- absolute byte offset of the entry
      size   -- number of bytes to read; the 'Q' format needs exactly 8

      Returns the entry as an integer in native byte order. struct.error is
      raised if fewer than 8 bytes could be read at that offset.
      """
      # These files hold raw 64-bit values, so open in binary mode: text mode
      # would try to decode the bytes (and returns str, not bytes, on Python 3).
      with open(path, 'rb') as f:
        f.seek(offset, 0)
        return struct.unpack('Q', f.read(size))[0]
    
    # Read /proc/$PID/pagemap
    def get_pagemap_entry(pid, addr):
      """Return the 64-bit pagemap entry describing virtual address addr of pid.

      Prints a message and returns None when /proc/<pid>/pagemap is not a file.
      """
      pagemap_path = "/proc/{0}/pagemap".format(pid)
      if not os.path.isfile(pagemap_path):
        print("Process {0} doesn't exist.".format(pid))
        return None
    
      page_size = os.sysconf("SC_PAGE_SIZE")
      pagemap_entry_size = 8
      # One entry per virtual page: index by virtual page number. Floor division
      # keeps the offset an integer on both Python 2 and Python 3.
      offset = (addr // page_size) * pagemap_entry_size
    
      return read_entry(pagemap_path, offset)
    
    def get_pfn(entry):
      """Return the low bits of a pagemap entry that hold the page frame number."""
      pfn_mask = 0x7FFFFFFFFFFFFF
      pfn = entry & pfn_mask
      return pfn
    
    def is_present(entry):
      """Return True when bit 63 of the pagemap entry is set."""
      present_bit = (entry >> 63) & 1
      return present_bit == 1
    
    def is_file_page(entry):
      """Return True when bit 61 of the pagemap entry is set."""
      file_page_bit = (entry >> 61) & 1
      return file_page_bit == 1
    ##########################################################
    
    # Read /proc/kpagecount
    def get_pagecount(pfn):
      """Return the 64-bit value stored for pfn in /proc/kpagecount."""
      entry_size = 8  # one 8-byte record per page frame number
      return read_entry("/proc/kpagecount", pfn * entry_size)
    
    ##########################################################
    
    # Read /proc/kpageflags
    def get_page_flags(pfn):
      """Return the 64-bit value stored for pfn in /proc/kpageflags."""
      entry_size = 8  # one 8-byte record per page frame number
      return read_entry("/proc/kpageflags", pfn * entry_size)
    
    
    if __name__ == "__main__":
      # Usage: <script> PID ADDRESS
      # ADDRESS is decimal, or hexadecimal when it starts with "0x".
      if len(sys.argv) < 3:
        print("Usage: {0} <pid> <virtual address>".format(sys.argv[0]))
        sys.exit(1)
    
      pid = sys.argv[1]
      # int() replaces the Python-2-only long(); on Python 2 it still promotes
      # to long automatically for large addresses.
      if sys.argv[2].startswith("0x"):
        addr = int(sys.argv[2], 16)
      else:
        addr = int(sys.argv[2])
    
      entry = get_pagemap_entry(pid, addr)
      if entry is None:
        # get_pagemap_entry() already reported the missing process; stop here
        # instead of failing later on a None entry.
        sys.exit(1)
    
      pfn = get_pfn(entry)
      print("PFN: {}".format(hex(pfn)))
      print("Is Present? : {}".format(is_present(entry)))
      print("Is file-page: {}".format(is_file_page(entry)))
      print("Page count: {}".format(get_pagecount(pfn)))
      print("Page flags: {}".format(hex(get_page_flags(pfn))))
    

    线性地址转换(MMU)4-level页表

    4级页表存在19年以前的发行版中,新的发行版已经是默认支持5级页表了。5级页表支持64TB以上的ram内存。

    • 传递给MMU的是线性地址,而非物理地址
    • 64-bit linear address is split into some parts. Only low 48 bits are significant, it means that 2^48 or 256 TBytes of linear-address space may be accessed at any given time.
    • cr3 register stores the address of the top-level (level-4) paging structure.
    • 47:39 bits of the given linear address store an index into the paging structure level-4, 38:30 bits store index into the paging structure level-3, 29:21 bits store an index into the paging structure level-2, 20:12 bits store an index into the paging structure level-1 and 11:0 bits provide the offset into the physical page in byte.

      如图: 分别为page global dir -> page upper dir ->page mid dir ->page table-> offset PGD->PUD->PMD->PTE Every access to a linear address is either a supervisor-mode access or a user-mode access. This access is determined by the CPL (current privilege level). If CPL < 3 it is a supervisor mode access level, otherwise it is a user mode access level. For example, the top level page table entry contains access bits and has the following structure (See arch/x86/include/asm/pgtable_types.h for the bit offset definitions):

    63  62                  52 51                                                    32
     --------------------------------------------------------------------------------
    | N |                     |                                                     |
    |   |     Available       |     Address of the paging structure on lower level  |
    | X |                     |                                                     |
     --------------------------------------------------------------------------------
    31                                              12 11  9 8 7 6 5   4   3 2 1     0
     --------------------------------------------------------------------------------
    |                                                |     | M |I| | P | P |U|W|    |
    | Address of the paging structure on lower level | AVL | B |G|A| C | W | | |  P |
    |                                                |     | Z |N| | D | T |S|R|    |
     --------------------------------------------------------------------------------
    

    Where:

    • 63 bit - N/X bit (No Execute Bit) which presents ability to execute the code from physical pages mapped by the table entry;
    • 62:52 bits - ignored by CPU, used by system software;
    • 51:12 bits - stores physical address of the lower level paging structure;
    • 11: 9 bits - ignored by CPU;
    • MBZ - must be zero bits;
    • Ignored bits;
    • A - accessed bit, indicates whether the physical page or paging structure was accessed;
    • PWT and PCD used for cache;
    • U/S - user/supervisor bit controls user access to all the physical pages mapped by this table entry;
    • R/W - read/write bit controls read/write access to all the physical pages mapped by this table entry;
    • P - present bit, indicates whether the page table or physical page is loaded into primary memory.

    Ok, we know about the paging structures and their entries. Now let’s see some details about 4-level paging in the Linux kernel.

    关闭内核5级页表使用内核启动参数no5lvl;检查是否开启5级页表可用 lscpu | grep -i la57,也可以直接查看CONFIG_X86_5LEVEL配置。5级页表本质上是在PUD的前一级加了一个P4D,线性地址位数从48位扩充到57位。

    Translation Lookaside Buffer (TLB)

    TLB中保存的是线性地址和对应的物理地址 一般分为指令缓存和数据缓存。修改CR3寄存器可使得TLB刷新。

    windows系统的内存分页

    32位的xp系统分非PAE和PAE两种分页模式,分别是10-10-12分页和2-9-9-12分页。差别就是物理地址一个是32位,一个是36位,多了4位的寻址,能支持64GB寻址。