学习和研究中前行,并在分享中提升自己

欢迎订阅阿里内推邮件



关于thp大页研究

阅读次数: 1382| 时间:2017年12月2日 21:35 | 标签:linux

关于thp大页研究

linux 内存分级结构

在研究thp透明大页之前,我们先来复习一下linux四级页表,分别是pgd,pud,pmd,pte。他们分别称为:页全局目录、页上级目录、页中间目录、页表。那下面我们先看一下怎么从一个addr转换成具体物理页

/*
 * Walk the 4-level page tables (PGD -> PUD -> PMD -> PTE) for @address
 * and print the raw entry value found at each level.
 *
 * Bails out early ("out") when an entry is not present, or when the PUD
 * or PMD maps a large page (pud_large/pmd_large) so no lower level
 * exists; prints "BAD" when an entry pointer itself is a bad address.
 */
static void dump_pagetable(unsigned long address)
{
    /* CR3 holds the physical address of the current PGD; __va() turns
     * it into a dereferenceable kernel virtual address. */
    pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
    pgd_t *pgd = base + pgd_index(address);  /* top 9 address bits index the PGD */
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (bad_address(pgd))
        goto bad;

    printk("PGD %lx ", pgd_val(*pgd));

    if (!pgd_present(*pgd))
        goto out;

    /* Next 9 bits of the address select the PUD entry. */
    pud = pud_offset(pgd, address);
    if (bad_address(pud))
        goto bad;

    printk("PUD %lx ", pud_val(*pud));
    /* A large PUD (1GB mapping) has no PMD level below it — stop here. */
    if (!pud_present(*pud) || pud_large(*pud))
        goto out;

    /* Next 9 bits select the PMD entry. */
    pmd = pmd_offset(pud, address);
    if (bad_address(pmd))
        goto bad;

    printk("PMD %lx ", pmd_val(*pmd));
    /* A large PMD (2MB mapping) has no PTE level below it — stop here. */
    if (!pmd_present(*pmd) || pmd_large(*pmd))
        goto out;

    /* Final 9 bits select the PTE; the low 12 bits are the page offset. */
    pte = pte_offset_kernel(pmd, address);
    if (bad_address(pte))
        goto bad;

    printk("PTE %lx", pte_val(*pte));
out:
    printk("\n");
    return;
bad:
    printk("BAD\n");
}

拿到pte之后, pa = (pte_val(*pte) & PAGE_MASK) | (address & ~PAGE_MASK); 即得到物理地址

2M大页和4k小页在linux分级页表上的不同

在一个linux 64位操作系统当中,pgd ,pud,pmd,pte都为9位,而offset为12位。一个4k页(2的12次方)对应的页表分级图如下

同理一个pmd对应的地址范围为2的21次方字节,故对应的页大小为2MB,其分页结构如下

thp的alloc

上面我们复习了一下linux内存的寻址方式,接下来我们看一下如何分配出一个大页。镜头拉回到mm fault入口函数。在linux中提供大页有两种方式:一种是hugetlbfs,另外一种是THP,本文中研究THP

/*
 * Top-level page-fault handler for a VMA: dispatches hugetlbfs faults to
 * hugetlb_fault() and everything else to __handle_mm_fault(), with memcg
 * OOM handling enabled around user-space faults.
 *
 * NOTE(review): the parameter is spelled "mva" here but the body uses
 * "vma" throughout — a transcription typo in this article excerpt; the
 * kernel source declares the parameter as "vma".
 *
 * NOTE(review): this excerpt is truncated by the article — the tail of
 * the function (final return and closing brace) is missing.
 */
int handle_mm_fault(struct vm_area_struct *mva, unsigned long address,
        unsigned int flags)
{
    int ret;

    __set_current_state(TASK_RUNNING);

    count_vm_event(PGFAULT);
    mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);

    /* do counter updates before entering really critical section. */
    check_sync_rss_stat(current);

    /*
     * Enable the memcg OOM handling for faults triggered in user
     * space.  Kernel faults are handled more gracefully.
     */
    if (flags & FAULT_FLAG_USER)
        mem_cgroup_oom_enable();

    if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                        flags & FAULT_FLAG_INSTRUCTION,
                        flags & FAULT_FLAG_REMOTE))
        return VM_FAULT_SIGSEGV;

    if (unlikely(is_vm_hugetlb_page(vma)))
        /* Huge pages provided through hugetlbfs. */
        ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
    else
        /* Normal path — may still install a THP huge page. */
        ret = __handle_mm_fault(vma, address, flags);

    if (flags & FAULT_FLAG_USER) {
        mem_cgroup_oom_disable();
        /*
         * The task may have entered a memcg OOM situation but
         * if the allocation error was handled gracefully (no
         * VM_FAULT_OOM), there is no need to kill anything.
         * Just clean up the OOM state peacefully.
         */
        if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
            mem_cgroup_oom_synchronize(false);
    }

    /* ... (remainder of function truncated in the original article) */

那么具体的调用关系如下:handle_mm_fault->__handle_mm_fault->create_huge_pmd->do_huge_pmd_anonymous_page。下面看一下 do_huge_pmd_anonymous_page的具体逻辑 以下为部分代码的节选

/*
 * Handle an anonymous page fault by trying to install a 2MB transparent
 * huge page (THP) at the faulting address.
 *
 * Falls back to normal 4k pages (VM_FAULT_FALLBACK) when the 2MB-aligned
 * region does not fit the VMA or when no order-9 contiguous block can be
 * allocated; read faults may instead map the shared huge zero page.
 *
 * Fixed vs. article excerpt: the missing closing brace is restored and
 * the Chinese comments are translated.
 */
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                   unsigned long address, pmd_t *pmd,
                   unsigned int flags)
{
    struct page *page;
    /* Round the fault address down to the 2MB boundary the huge page
     * will cover.  (The article claims HPAGE_PMD_MASK is 0xffe00000 —
     * that is the 32-bit value; on x86_64 it is 0xffffffffffe00000.) */
    unsigned long haddr = address & HPAGE_PMD_MASK;

    /* The whole 2MB region must lie inside the VMA, else fall back. */
    if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
        return VM_FAULT_FALLBACK;
    /* Prepare an anon_vma for this VMA (reverse mapping). */
    if (unlikely(anon_vma_prepare(vma)))
        return VM_FAULT_OOM;
    /* Add this VMA's mm to the khugepaged scan list. */
    if (unlikely(khugepaged_enter(vma)))
        return VM_FAULT_OOM;
    /* Read-only fault: map the shared huge zero page instead of
     * allocating a real 2MB block. */
    if (!(flags & FAULT_FLAG_WRITE) &&
            transparent_hugepage_use_zero_page()) {
        spinlock_t *ptl;
        pgtable_t pgtable;
        struct page *zero_page;
        bool set;
        int ret;
        /* Pre-allocate a PTE page to deposit for a possible later split. */
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable))
            return VM_FAULT_OOM;
        zero_page = get_huge_zero_page();
        if (unlikely(!zero_page)) {
            pte_free(mm, pgtable);
            count_vm_event(THP_FAULT_FALLBACK);
            return VM_FAULT_FALLBACK;
        }
        ptl = pmd_lock(mm, pmd);
        ret = 0;
        set = false;
        if (pmd_none(*pmd)) {
            if (userfaultfd_missing(vma)) {
                spin_unlock(ptl);
                ret = handle_userfault(vma, address, flags,
                               VM_UFFD_MISSING);
                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
            } else {
                set_huge_zero_page(pgtable, mm, vma,
                           haddr, pmd,
                           zero_page);
                spin_unlock(ptl);
                set = true;
            }
        } else
            spin_unlock(ptl);
        /* Someone else populated the PMD (or userfaultfd took over):
         * release the resources we prepared. */
        if (!set) {
            pte_free(mm, pgtable);
            put_huge_zero_page();
        }
        return ret;
    }

    /* Write fault: allocate an order-HPAGE_PMD_ORDER (order 9, i.e.
     * 512 contiguous pages = 2MB) block from the buddy allocator. */
    page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
            vma, haddr, numa_node_id(), 0);
    if (unlikely(!page)) {
        count_vm_event(THP_FAULT_FALLBACK);
        return VM_FAULT_FALLBACK;
    }
    return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page,
                        flags);
}
接下来看一下__do_huge_pmd_anonymous_page核心逻辑

        /* NOTE(review): fragment excerpted from __do_huge_pmd_anonymous_page;
         * the surrounding declarations (entry, memcg, pgtable, vmf) are not
         * shown in the article. */
        /* Build the huge PMD entry for the page with the VMA's protections. */
        entry = mk_huge_pmd(page, vma->vm_page_prot);
        /* Mark it dirty, and writable if the VMA allows writes. */
        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
        /* Add the page to the anon reverse map as a huge (PMD) mapping. */
        page_add_new_anon_rmap(page, vma, haddr, true);
        /* Commit the memcg charge for this page. */
        mem_cgroup_commit_charge(page, memcg, false, true);
        /* Put the page on the active or unevictable LRU list. */
        lru_cache_add_active_or_unevictable(page, vma);
        /* Deposit the pre-allocated PTE page table for a future split. */
        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
        /* Install the huge PMD entry — the mapping becomes live here. */
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

1.设置Pmd val为_PAGE_PSE 2.设置pmd PAGE_DIRTY 和PAGE_RW 3.将vma的anon和page pmd map起来 4.给mem cgroup提交一个page消费 5.将page加入到active或者unevictable lru上 6.存储 hugepage的页表 7.更新pmd值为entry

总结

1.4k page为四级页表,而2M页为三级页表。故4k页 page_shift为12,2M页page_shift为21 2.THP的生成依赖于系统连续内存,具体是order为9的连续内存。所以,如果没有连续内存也就无法生成大页